In [48]:
from utils.logging import logger
import pandas as pd
import numpy as np
import mlflow
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
import cloudpickle

from sklearn.metrics import silhouette_score
import warnings
import time

In [49]:
# RANDOM_SEED
RANDOM_SEED = 42

# CLUSTER
CLUSTER_MIN = 5
CLUSTER_MAX = 8
CLUSTER_STEP = 1

LOG_ONLY_BEST=True

# data
DATA_PATH = './data/yelp_labelled.txt'

# sentence transformer
# choose one from https://www.sbert.net/docs/pretrained_models.html
MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'

# Eval Metrics
EVAL_METRICS = ['silhouette_score']

In [50]:
# load data
logger.info('Loading file {DATA_PATH}...')
df = pd.read_csv(DATA_PATH, header=None,
                 sep='\t', names=['sentence', 'label'])
logger.info(f'Load complete: {df.shape[0]} rows and {df.shape[1]} columns.')

Loading file {DATA_PATH}...
Load complete: 1000 rows and 2 columns.


In [51]:
from sentcluster import BertEmbedder
logger.info(f'Loading sentence transformer {MODEL_NAME}...')
embedder = BertEmbedder(MODEL_NAME)
logger.info(f'done.')


Loading sentence transformer multi-qa-MiniLM-L6-cos-v1...
done.


# Config conda env

In [52]:
from sys import version_info
PYTHON_VERSION = "{major}.{minor}.{micro}".format(
    major=version_info.major, minor=version_info.minor, micro=version_info.micro
)

conda_env = {
    "channels": ["defaults", "conda-forge"],
    "dependencies": ["python={}".format(PYTHON_VERSION), "pip"],
    "pip": [
        "mlflow",
        "cloudpickle=={}".format(cloudpickle.__version__),
        "scikit-learn==1.2.2",
        "sentence-transformers==2.2.2"
    ],
    "name": "mlflow-env",
}


# Training Step

In [53]:


np.random.seed(RANDOM_SEED)

# Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
pipe = Pipeline([('embedder',embedder),('clusterer',KMeans())])

best_metric = np.Inf
best_labels = None

for current_n in range(CLUSTER_MIN, CLUSTER_MAX, CLUSTER_STEP):
    with mlflow.start_run():
        warnings.filterwarnings("ignore")
        mlflow.log_param("n_clusters", current_n)
        model_start_time = time.strftime("%Y%m%d-%H%M%S")
        pipe.set_params(**{'clusterer__n_clusters':current_n})
        predicted_2d = pipe.fit(df.sentence)
        predicted_labels = pipe.predict(df.sentence)

        current_metric = silhouette_score(pipe['embedder'].transform(df.sentence),predicted_labels)
        
        # Print out Silhouette Score and log it in mlflow
        logger.info(f"n_cluster={current_n}, silhouette_score={current_metric:.5f}")

        if LOG_ONLY_BEST and current_metric < best_metric:
            best_metric = current_metric 
            mlflow.sklearn.log_model(pipe, f"{MODEL_NAME}_KMeans_(n={current_n})",conda_env=conda_env)
            mlflow.log_metric('silhouette_score', best_metric)
            best_labels = predicted_labels
        else:
            mlflow.sklearn.log_model(pipe, f"{MODEL_NAME}_KMeans_(n={current_n})",conda_env=conda_env)
            mlflow.log_metric('silhouette_score', current_metric)
            best_labels = predicted_labels


Batches: 100%|██████████| 32/32 [00:00<00:00, 73.10it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 72.82it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 71.21it/s]
n_cluster=5, silhouette_score=0.04953
Batches: 100%|██████████| 32/32 [00:00<00:00, 73.75it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 79.81it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 69.25it/s]
n_cluster=6, silhouette_score=0.04882
Batches: 100%|██████████| 32/32 [00:00<00:00, 77.84it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 76.29it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 70.32it/s]
n_cluster=7, silhouette_score=0.04781


# Generate scored dataset with best model

In [54]:
results_df = pd.DataFrame( {'sentence':df.sentence,'label':best_labels})

In [55]:
for r in sorted(results_df.label.unique()):
    logger.info(f'------------------------------------')
    logger.info(f'samples from cluster={r}')
    logger.info(f'------------------------------------')
    for index,row in results_df[results_df.label==r].head(5).iterrows():
        logger.info(row['sentence'])

------------------------------------
samples from cluster=0
------------------------------------
Crust is not good.
Not tasty and the texture was just nasty.
Honeslty it didn't taste THAT fresh.)
The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.
The fries were great too.
------------------------------------
samples from cluster=1
------------------------------------
The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.
I was disgusted because I was pretty sure that was human hair.
Waitress was a little slow in service.
- They never brought a salad we asked for.
Took an hour to get our food only 4 tables in restaurant my food was Luke warm, Our sever was running around like he was totally overwhelmed.
------------------------------------
samples from cluster=2
------------------------------------
The selection on the menu was great and so were the prices.
The food, amazing.
Also t