In [None]:
# uncomment in databricks
# %pip install mlflow
# %pip install sentence-transformers

In [None]:
import mlflow, pickle
import pandas as pd

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('emilyalsentzer/Bio_ClinicalBERT')

In [67]:
sentences = ['not feeling very well, feeling lost',
    'feeling healthy, stopped because of remission', 
    'stopped because of dry eyes and shortness of breath',
    'feeling better, improving today, not feeling pain anymore',
    'reports anxiety, anxious, cannot understand',
     ]

embeddings = model.encode(sentences)

from sklearn.cluster import KMeans

km = KMeans(n_clusters=2)
predicted_2d = km.fit_transform(embeddings)
predicted_labels = km.predict(embeddings)

In [77]:
embeddings = model.encode(sentences)

from sklearn.cluster import KMeans

km = KMeans(n_clusters=2)
cval = cross_validate(km,embeddings,cv=5,return_estimator=True)
cval
#predicted_2d = km.fit_transform(embeddings)
#predicted_labels = km.predict(embeddings)


{'fit_time': array([0.00620198, 0.00384712, 0.00360036, 0.00361276, 0.00530028]),
 'score_time': array([0.00025964, 0.00015521, 0.00014758, 0.00014043, 0.00020623]),
 'estimator': [KMeans(n_clusters=2),
  KMeans(n_clusters=2),
  KMeans(n_clusters=2),
  KMeans(n_clusters=2),
  KMeans(n_clusters=2)],
 'test_score': array([-27.16184807, -27.28355408, -34.28046036, -28.56356239,
        -40.49306488])}

In [127]:
from typing import Iterable, List
#from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score,homogeneity_score,completeness_score,v_measure_score,silhouette_score
from collections import namedtuple
import importlib 
from sklearn.model_selection import cross_val_score,cross_val_predict,cross_validate
from collections import defaultdict
from sklearn.metrics import get_scorer,silhouette_score

import warnings
CLUSTER_EVAL_METRICS_LIST = ['adjusted_rand_score',
                     'adjusted_mutual_info_score',
                     'normalized_mutual_info_score',
                     'homogeneity_score','completeness_score',
                     'v_measure_score']

class SentenceKmeans():

    def __init__(self, 
                 sentences:Iterable[str],
                 true_labels:Iterable[str]=None,
                 cluster_min: int=1,
                 cluster_max: int=3,
                 cluster_step: int=1,
                 sentence_transformer_name: str='emilyalsentzer/Bio_ClinicalBERT',
                 supervised_eval_metrics_list: List=['homogeneity_score','completeness_score'],
                 random_seed: int = 42
                ):
        
        self.sentences = sentences
        self.cluster_min = cluster_min
        self.cluster_max = cluster_max
        self.cluster_step = cluster_step
        self.sentence_transformer = SentenceTransformer(sentence_transformer_name)
        self.embedded_sentences = self.sentence_transformer.encode(self.sentences)
        self.random_seed = 42
        
        if true_labels is None:
            self.true_labels = np.zeros(len(self.sentences))
        self.eval_metrics_list=supervised_eval_metrics_list

    # Evaluate metrics
    def eval_metrics(self,actual, pred):
    
        results = defaultdict(None)
        
        for met in self.eval_metrics_list:
            if met == 'silhouette_score':
                results['silhouette_score']=silhouette_score(self.embedded_sentences,pred)
            else:
                results[met] = get_scorer(met)._score_func(actual,pred)
    
        return results
    
    def train(self):
        # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
        with mlflow.start_run():
            warnings.filterwarnings("ignore")
            np.random.seed(self.random_seed)
            for current_n in range(self.cluster_min, self.cluster_max, self.cluster_step):
                mlflow.log_param("n_clusters", current_n)
                
                km = KMeans(n_clusters=current_n)

                predicted_labels = km.fit_predict(self.embedded_sentences)
                predicted_2d = km.transform(self.embedded_sentences)
                
                metrics_dict = self.eval_metrics(self.true_labels,predicted_labels)

                # Print out ElasticNet model metrics

                print(f"n_cluster={current_n}")
                for met in self.eval_metrics_list:
                    print(f'{met} = {metrics_dict[met]:.4f} \n')
                    mlflow.log_metric(met, metrics_dict[met])
                
                
                mlflow.sklearn.log_model(km, f"KMeans (n={current_n})")
               # mlflow.sklearn.save_model(km,'./my_models')
  

    

In [130]:
class SentenceKmeans():

    def __init__(self, 
                 sentences:Iterable[str],
                 true_labels:Iterable[str]=None,
                 cluster_min: int=1,
                 cluster_max: int=3,
                 cluster_step: int=1,
                 sentence_transformer_name: str='emilyalsentzer/Bio_ClinicalBERT',
                 supervised_eval_metrics_list: List=['homogeneity_score','completeness_score'],
                 random_seed: int = 42
                ):
        
        self.sentences = sentences
        self.cluster_min = cluster_min
        self.cluster_max = cluster_max
        self.cluster_step = cluster_step
        self.sentence_transformer = SentenceTransformer(sentence_transformer_name)
        self.embedded_sentences = self.sentence_transformer.encode(self.sentences)
        self.random_seed = 42
        
        if true_labels is None:
            self.true_labels = np.zeros(len(self.sentences))
        self.eval_metrics_list=supervised_eval_metrics_list

    # Evaluate metrics
    def eval_metrics(self,actual, pred):
    
        results = defaultdict(None)
        
        for met in self.eval_metrics_list:
            if met == 'silhouette_score':
                results['silhouette_score']=silhouette_score(self.embedded_sentences,pred)
            else:
                results[met] = get_scorer(met)._score_func(actual,pred)
    
        return results
    
    def train(self):
        # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
        np.random.seed(self.random_seed)

        for current_n in range(self.cluster_min, self.cluster_max, self.cluster_step):
            with mlflow.start_run():
                warnings.filterwarnings("ignore")
                mlflow.log_param("n_clusters", current_n)

                km = KMeans(n_clusters=current_n)

                predicted_labels = km.fit_predict(self.embedded_sentences)
                predicted_2d = km.transform(self.embedded_sentences)

                metrics_dict = self.eval_metrics(self.true_labels,predicted_labels)

                # Print out ElasticNet model metrics

                print(f"n_cluster={current_n}")
                for met in self.eval_metrics_list:
                    print(f'{met} = {metrics_dict[met]:.4f}')
                    mlflow.log_metric(met, metrics_dict[met])


                mlflow.sklearn.log_model(km, f"KMeans (n={current_n})")
               # mlflow.sklearn.save_model(km,'./my_models')   

In [133]:
SentClust = SentenceKmeans(sentences,supervised_eval_metrics_list=CLUSTER_EVAL_METRICS_LIST)
  

No sentence-transformers model found with name /home/ares/.cache/torch/sentence_transformers/emilyalsentzer_Bio_ClinicalBERT. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/ares/.cache/torch/sentence_transformers/emilyalsentzer_Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSeque

In [134]:
SentClust.train()

n_cluster=1
adjusted_rand_score = 1.0000 

adjusted_mutual_info_score = 1.0000 

normalized_mutual_info_score = 1.0000 

homogeneity_score = 1.0000 

completeness_score = 1.0000 

v_measure_score = 1.0000 

n_cluster=2
adjusted_rand_score = 0.0000 

adjusted_mutual_info_score = 0.0000 

normalized_mutual_info_score = 0.0000 

homogeneity_score = 1.0000 

completeness_score = 0.0000 

v_measure_score = 0.0000 

