In [15]:
from utils.logging import logger
import pandas as pd
import numpy as np
import mlflow
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics import silhouette_score
import cloudpickle
import sys

from typing import Iterable, List
from collections import namedtuple
from collections import defaultdict
from sklearn.metrics import get_scorer,silhouette_score
import warnings
import time

In [16]:
# RANDOM_SEED
RANDOM_SEED = 42

# CLUSTER
CLUSTER_MIN = 2
CLUSTER_MAX = 15
CLUSTER_STEP = 1

LOG_ONLY_BEST=True

# data
DATA_PATH = './data/yelp_labelled.txt'

# sentence transformer
# choose one from https://www.sbert.net/docs/pretrained_models.html
MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'

# Eval Metrics
EVAL_METRICS = ['silhouette_score']

In [17]:
# load data
logger.info('Loading file {DATA_PATH}...')
df = pd.read_csv(DATA_PATH, header=None,
                 sep='\t', names=['sentence', 'label'])
logger.info(f'Load complete: {df.shape[0]} rows and {df.shape[1]} columns.')

Loading file {DATA_PATH}...
Load complete: 1000 rows and 2 columns.


In [18]:
from sentcluster import BertEmbedder
logger.info(f'Loading sentence transformer {MODEL_NAME}...')
embedder = BertEmbedder(MODEL_NAME)
logger.info(f'done.')


Loading sentence transformer multi-qa-MiniLM-L6-cos-v1...
done.


# Config conda env

In [19]:
from sys import version_info
PYTHON_VERSION = "{major}.{minor}.{micro}".format(
    major=version_info.major, minor=version_info.minor, micro=version_info.micro
)

conda_env = {
    "channels": ["defaults", "conda-forge"],
    "dependencies": ["python={}".format(PYTHON_VERSION), "pip"],
    "pip": [
        "mlflow",
        "cloudpickle=={}".format(cloudpickle.__version__),
        "vaderSentiment==3.3.2",
    ],
    "name": "mlflow-env",
}


# Training Step

In [20]:


np.random.seed(RANDOM_SEED)

# Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
pipe = Pipeline([('embedder',embedder),('clusterer',KMeans())])

best_metric = np.Inf
best_labels = None

for current_n in range(CLUSTER_MIN, CLUSTER_MAX, CLUSTER_STEP):
    with mlflow.start_run():
        warnings.filterwarnings("ignore")
        mlflow.log_param("n_clusters", current_n)
        model_start_time = time.strftime("%Y%m%d-%H%M%S")
        pipe.set_params(**{'clusterer__n_clusters':current_n})
        predicted_2d = pipe.fit(df.sentence)
        predicted_labels = pipe.predict(df.sentence)

        current_metric = silhouette_score(pipe['embedder'].transform(df.sentence),predicted_labels)
        
        # Print out Silhouette Score and log it in mlflow
        logger.info(f"n_cluster={current_n}, silhouette_score={current_metric:.5f}")

        if LOG_ONLY_BEST:
            if current_metric < best_metric:
                best_metric = current_metric 
                mlflow.sklearn.log_model(pipe, f"{MODEL_NAME}_KMeans_(n={current_n})",conda_env=conda_env)
                mlflow.log_metric('silhouette_score', best_metric)
                mlflow.sklearn.save_model(pipe,f'models/{model_start_time}_{MODEL_NAME}_KMeans_(n={current_n}',conda_env=conda_env)
                best_labels = predicted_labels
        else:
                mlflow.sklearn.log_model(pipe, f"{MODEL_NAME}_KMeans_(n={current_n})",conda_env=conda_env)
                mlflow.log_metric('silhouette_score', current_metric)
                mlflow.sklearn.save_model(pipe,f'models/{model_start_time}_{MODEL_NAME}_KMeans_(n={current_n}',conda_env=conda_env)
                best_labels = predicted_labels


Batches: 100%|██████████| 32/32 [00:00<00:00, 65.21it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 75.02it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 78.65it/s]
n_cluster=2, silhouette_score=0.06888
Batches: 100%|██████████| 32/32 [00:00<00:00, 65.64it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 70.91it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 74.95it/s]
n_cluster=3, silhouette_score=0.05623
Batches: 100%|██████████| 32/32 [00:00<00:00, 73.00it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 78.15it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 74.56it/s]
n_cluster=4, silhouette_score=0.04648
Batches: 100%|██████████| 32/32 [00:00<00:00, 73.63it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 77.70it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 75.44it/s]
n_cluster=5, silhouette_score=0.04916
Batches: 100%|██████████| 32/32 [00:00<00:00, 70.25it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 76.37it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 78.4

# Generate scored dataset with best model

In [21]:
results_df = pd.DataFrame( {'sentence':df.sentence,'label':best_labels})

In [33]:
for r in sorted(results_df.label.unique()):
    logger.info(f'------------------------------------')
    logger.info(f'samples from cluster={r}')
    logger.info(f'------------------------------------')
    for index,row in results_df[results_df.label==r].head(5).iterrows():
        logger.info(row['sentence'])

------------------------------------
samples from cluster=0
------------------------------------
The selection on the menu was great and so were the prices.
The Burrittos Blah!
My first visit to Hiro was a delight!
Their chow mein is so good!
The cocktails are all handmade and delicious.
------------------------------------
samples from cluster=1
------------------------------------
Crust is not good.
The food, amazing.
Also there are combos like a burger, fries, and beer for 23 which is a decent deal.
Ample portions and good prices.
The only thing I did like was the prime rib and dessert section.
------------------------------------
samples from cluster=2
------------------------------------
Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.
I was disgusted because I was pretty sure that was human hair.
I was shocked because no signs indicate cash only.
did not like at all.
So they performed.
------------------------------------
samples from cluste