In [1]:
from utils.logging import logger
import pandas as pd
import numpy as np
import mlflow
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics import silhouette_score
import cloudpickle
import sys

from typing import Iterable, List
#from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score,homogeneity_score,completeness_score,v_measure_score,silhouette_score
from collections import namedtuple
from collections import defaultdict
from sklearn.metrics import get_scorer,silhouette_score
import warnings
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# RANDOM_SEED
RANDOM_SEED = 42

# CLUSTER
CLUSTER_MIN = 2
CLUSTER_MAX = 15
CLUSTER_STEP = 1

LOG_ONLY_BEST=True

# data
DATA_PATH = './data/yelp_labelled.txt'

# sentence transformer
# choose one from https://www.sbert.net/docs/pretrained_models.html
MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'

# Eval Metrics
EVAL_METRICS = ['silhouette_score']

In [3]:
# load data
logger.info('Loading file {DATA_PATH}...')
df = pd.read_csv(DATA_PATH, header=None,
                 sep='\t', names=['sentence', 'label'])
logger.info(f'Load complete: {df.shape[0]} rows and {df.shape[1]} columns.')

Loading file {DATA_PATH}...
Load complete: 1000 rows and 2 columns.


In [4]:
from sentcluster import BertEmbedder
logger.info(f'Loading sentence transformer {MODEL_NAME}...')
embedder = BertEmbedder(MODEL_NAME)
logger.info(f'done.')


Loading sentence transformer multi-qa-MiniLM-L6-cos-v1...
done.


# Config conda env

In [6]:
from sys import version_info
PYTHON_VERSION = "{major}.{minor}.{micro}".format(
    major=version_info.major, minor=version_info.minor, micro=version_info.micro
)

conda_env = {
    "channels": ["defaults", "conda-forge"],
    "dependencies": ["python={}".format(PYTHON_VERSION), "pip"],
    "pip": [
        "mlflow",
        "cloudpickle=={}".format(cloudpickle.__version__),
        "vaderSentiment==3.3.2",
    ],
    "name": "mlflow-env",
}


# Training Step

In [8]:


np.random.seed(RANDOM_SEED)

# Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
pipe = Pipeline([('embedder',embedder),('clusterer',KMeans())])

best_metric = np.Inf
best_labels = None

for current_n in range(CLUSTER_MIN, CLUSTER_MAX, CLUSTER_STEP):
    with mlflow.start_run():
        warnings.filterwarnings("ignore")
        mlflow.log_param("n_clusters", current_n)
        model_start_time = time.strftime("%Y%m%d-%H%M%S")
        pipe.set_params(**{'clusterer__n_clusters':current_n})
        predicted_2d = pipe.fit(df.sentence)
        predicted_labels = pipe.predict(df.sentence)

        current_metric = silhouette_score(pipe['embedder'].transform(df.sentence),predicted_labels)
        
        # Print out Silhouette Score and log it in mlflow
        logger.info(f"n_cluster={current_n}, silhouette_score={current_metric:.5f}")

        if LOG_ONLY_BEST:
            if current_metric < best_metric:
                best_metric = current_metric 
                mlflow.sklearn.log_model(pipe, f"{MODEL_NAME}_KMeans_(n={current_n})",conda_env=conda_env)
                mlflow.log_metric('silhouette_score', best_metric)
                mlflow.sklearn.save_model(pipe,f'models/{model_start_time}_{MODEL_NAME}_KMeans_(n={current_n}',conda_env=conda_env)
                best_labels = predicted_labels
        else:
                mlflow.sklearn.log_model(pipe, f"{MODEL_NAME}_KMeans_(n={current_n})",conda_env=conda_env)
                mlflow.log_metric('silhouette_score', current_metric)
                mlflow.sklearn.save_model(pipe,f'models/{model_start_time}_{MODEL_NAME}_KMeans_(n={current_n}',conda_env=conda_env)
                best_labels = predicted_labels


Batches: 100%|██████████| 32/32 [00:00<00:00, 67.39it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 69.66it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 78.32it/s]
n_cluster=2, silhouette_score=0.06888
Batches: 100%|██████████| 32/32 [00:00<00:00, 74.14it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 75.30it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 75.88it/s]
n_cluster=3, silhouette_score=0.05623
Batches: 100%|██████████| 32/32 [00:00<00:00, 73.00it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 73.18it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 71.61it/s]
n_cluster=4, silhouette_score=0.04648
Batches: 100%|██████████| 32/32 [00:00<00:00, 75.93it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 77.84it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 79.23it/s]
n_cluster=5, silhouette_score=0.04916
Batches: 100%|██████████| 32/32 [00:00<00:00, 72.77it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 69.68it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 72.3

# Generate scored dataset with best model

In [10]:
results_df = pd.DataFrame( {'sentence':df.sentence,'label':best_labels})

In [14]:
for r in results_df.label.unique():
    logger.info(f'samples from cluster={r}')
    logger.info(results_df[results_df.label==r].head(10).sentence.values)
    logger.info('\n')

samples from cluster=10
['Wow... Loved this place.'
 'This place is not worth your time, let alone Vegas.'
 'I could care less... The interior is just beautiful.'
 'I found this place by accident and I could not be happier.'
 'Overall, I like this place a lot.'
 'This place receives stars for their APPETIZERS!!!'
 'We are so glad we found this place.'
 "I guess I should have known that this place would suck, because it is inside of the Excalibur, but I didn't use my common sense."
 'This place has it!'
 'Although I very much liked the look and sound of this place, the actual experience was a bit disappointing.']


samples from cluster=1
['Crust is not good.' 'The food, amazing.'
 'Also there are combos like a burger, fries, and beer for 23 which is a decent deal.'
 'Ample portions and good prices.'
 'The only thing I did like was the prime rib and dessert section.'
 "It's too bad the food is so damn generic."
 'Great food and service, huge portions and they give a military discount.'
 

In [None]:
import mlflow
logged_model = 'runs:/79ab3cd9f79547d48fa11a9475805744/multi-qa-MiniLM-L6-cos-v1_KMeans_(n=13)'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(pd.DataFrame(data))