In [32]:
from utils.logging import logger
import pandas as pd
import numpy as np
import mlflow
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics import silhouette_score
import cloudpickle
import sys

In [33]:
# RANDOM_SEED
RANDOM_SEED = 42

# CLUSTER
CLUSTER_MIN = 2
CLUSTER_MAX = 15
CLUSTER_STEP = 1

LOG_ONLY_BEST=True

# data
DATA_PATH = './data/yelp_labelled.txt'

# sentence transformer
# choose one from https://www.sbert.net/docs/pretrained_models.html
MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'

# Eval Metrics
EVAL_METRICS = ['silhouette_score']

In [34]:
# load data
logger.info('Loading file {DATA_PATH}...')
df = pd.read_csv(DATA_PATH, header=None,
                 sep='\t', names=['sentence', 'label'])
logger.info(f'Load complete: {df.shape[0]} rows and {df.shape[1]} columns.')

Loading file {DATA_PATH}...
Load complete: 1000 rows and 2 columns.


In [35]:
from sentcluster import BertEmbedder
logger.info(f'Loading sentence transformer {MODEL_NAME}...')
embedder = BertEmbedder(MODEL_NAME)
logger.info(f'done.')


Loading sentence transformer multi-qa-MiniLM-L6-cos-v1...
done.


In [36]:
pipe.set_params(**{'clusterer__n_clusters':4})
predicted_2d = pipe.fit(df.sentence)
predicted_labels = pipe.predict(df.sentence)

sil_score = silhouette_score(pipe['embedder'].transform(df.sentence),predicted_labels)
print(sil_score)

Batches: 100%|██████████| 32/32 [00:00<00:00, 74.48it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 76.83it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 77.45it/s]

0.04648494





# Training Step

In [37]:
from sys import version_info
PYTHON_VERSION = "{major}.{minor}.{micro}".format(
    major=version_info.major, minor=version_info.minor, micro=version_info.micro
)

conda_env = {
    "channels": ["defaults", "conda-forge"],
    "dependencies": ["python={}".format(PYTHON_VERSION), "pip"],
    "pip": [
        "mlflow",
        "cloudpickle=={}".format(cloudpickle.__version__),
        "vaderSentiment==3.3.2",
    ],
    "name": "mlflow-env",
}


In [39]:
from typing import Iterable, List
#from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score,homogeneity_score,completeness_score,v_measure_score,silhouette_score
from collections import namedtuple
import importlib 
from sklearn.model_selection import cross_val_score,cross_val_predict,cross_validate
from collections import defaultdict
from sklearn.metrics import get_scorer,silhouette_score
import warnings
import time

np.random.seed(RANDOM_SEED)

# Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
pipe = Pipeline([('embedder',embedder),('clusterer',KMeans())])

best_metric = np.Inf
for current_n in range(CLUSTER_MIN, CLUSTER_MAX, CLUSTER_STEP):
    with mlflow.start_run():
        warnings.filterwarnings("ignore")
        mlflow.log_param("n_clusters", current_n)
        model_start_time = time.strftime("%Y%m%d-%H%M%S")
        pipe.set_params(**{'clusterer__n_clusters':current_n})
        predicted_2d = pipe.fit(df.sentence)
        predicted_labels = pipe.predict(df.sentence)

        current_metric = silhouette_score(pipe['embedder'].transform(df.sentence),predicted_labels)
        
        # Print out Silhouette Score and log it in mlflow
        logger.info(f"n_cluster={current_n}, silhouette_score={current_metric:.5f}")

        if LOG_ONLY_BEST:
            if current_metric < best_metric:
                best_metric = current_metric 
                mlflow.sklearn.log_model(pipe, f"{MODEL_NAME}_KMeans_(n={current_n})",conda_env=conda_env)
                mlflow.log_metric('silhouette_score', best_metric)
                mlflow.sklearn.save_model(pipe,f'models/{model_start_time}_{MODEL_NAME}_KMeans_(n={current_n}',conda_env=conda_env)
        else:
                mlflow.sklearn.log_model(pipe, f"{MODEL_NAME}_KMeans_(n={current_n})",conda_env=conda_env)
                mlflow.log_metric('silhouette_score', current_metric)
                mlflow.sklearn.save_model(pipe,f'models/{model_start_time}_{MODEL_NAME}_KMeans_(n={current_n}',conda_env=conda_env)

Batches: 100%|██████████| 32/32 [00:00<00:00, 72.78it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 75.08it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 70.76it/s]
n_cluster=2, silhouette_score=0.06888
Batches: 100%|██████████| 32/32 [00:00<00:00, 74.95it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 65.15it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 70.37it/s]
n_cluster=3, silhouette_score=0.05623
Batches: 100%|██████████| 32/32 [00:00<00:00, 67.76it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 73.01it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 69.07it/s]
n_cluster=4, silhouette_score=0.04648
Batches: 100%|██████████| 32/32 [00:00<00:00, 70.14it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 76.65it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 72.74it/s]
n_cluster=5, silhouette_score=0.04916
Batches: 100%|██████████| 32/32 [00:00<00:00, 70.71it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 71.28it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 71.6