# BERT Prediction Error Analysis

This notebook is used to:
- Verify the impact of text preprocessing on BERT.
- Use unsupervised learning to look for patterns in the misclassified data.

To speed up the analysis, only 1% of the total training data (randomly sampled) is used.

In [None]:
import warnings
# Silence the three noisy warning categories for the rest of the session.
for _category in (DeprecationWarning, FutureWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_category)

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import os
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support
from transformers import AdamW,get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification,DistilBertForSequenceClassification
from transformers import BertTokenizer,DistilBertTokenizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [None]:
# Import helper functions

from helper import train_dev_test, convert_examples_to_inputs,get_data_loader,data_evaluation,duple_labels,elbow_plot,train

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Configure root logging once and create the logger used by this notebook.
import logging

logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)

##### Import dataset

In [None]:
import pandas as pd

# Read the full training set from disk.
source_train_data_path = "./01_data/WikiLarge_Train.csv"
source_train_data = pd.read_csv(source_train_data_path)

# Keep a reproducible random sample (PORTION of all rows) to speed up the analysis.
RANDOM_STATE = 1
PORTION = 0.01
size = round(PORTION * len(source_train_data))
train_data = source_train_data.sample(random_state=RANDOM_STATE, n=size)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Sentence-length distribution (in whitespace-separated words) of the sample;
# rows with 5000 or more words are excluded from the plot.
plt.style.use("ggplot")
plt.figure(figsize=(10, 8))

# Note: this adds a 'length' column to train_data itself.
train_data['length'] = train_data['original_text'].map(lambda text: len(text.split()))
within_limit = train_data['length'] < 5000
sns.distplot(train_data.loc[within_limit, 'length'])

plt.title('Frequence of sentences of a given length', fontsize=14)
plt.xlabel('length', fontsize=14)
In [None]:
# Word-count statistics of the sampled sentences.
data_describe = train_data['original_text'].map(lambda text: len(text.split())).describe()
print(data_describe)

# Use the longest sentence (in whitespace-separated words) as the sequence
# length, capped at 512 because the BERT token length should not exceed that.
# NOTE(review): word counts are not tokenizer (wordpiece) counts - confirm that
# the tokenized sentences actually fit within this length.
MAX_SEQ_LENGTH = min(int(data_describe['max']), 512)

#### Verify the impact from text preprocessing

The following scenarios are evaluated in turn:
- Initial status - Pretrain: evaluate the test dataset without training.
- Fine-tune without preprocessing: train BERT, then evaluate the test dataset.
- Remove stopwords, train BERT, then evaluate the test dataset.
- Correct misspellings, train BERT, then evaluate the test dataset.
- Lemmatization, train BERT, then evaluate the test dataset.
- Remove duplicate records (same label), train BERT, then evaluate the test dataset.
- Remove duplicate records (different labels), train BERT, then evaluate the test dataset.


In [None]:
# Hyper-parameters handed to the training / evaluation helpers.
params = dict(
    GRADIENT_ACCUMULATION_STEPS=1,
    NUM_TRAIN_EPOCHS=8,
    LEARNING_RATE=2e-5,
    WARMUP_PROPORTION=0.1,
    MAX_GRAD_NORM=5,
    MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,
    BATCH_SIZE=16,
    NUM_WARMUP_STEPS=600,
)

# Model identifier and the location / file name used for the saved model.
BERT_MODEL = "distilbert-base-uncased"
OUTPUT_DIR = "./tmp/"
MODEL_FILE_NAME = "pytorch_model.bin"

In [None]:
# Scenario names, in the order in which they are evaluated.
list_acc = [
    'Pretrain',
    'Fine tune without preprocessing',
    'Removal of stopwords',
    'Correction of spelling',
    'Lemmatization',
    'Removal of duplicate records with same labels',
    'Removal of duplicate records with different labels',
]

# One row per scenario is written into this (initially empty) frame.
result_columns = ['description', 'accuracy', 'texts', 'labels', 'prediction']
df_acc = pd.DataFrame(columns=result_columns)

In [None]:
from text_preprocessing import preprocess_text
from text_preprocessing import check_spelling,remove_stopword,lemmatize_word

# Scenarios that rewrite the sentence text, mapped to the helper applied to
# each sentence. As in the rest of this notebook, the helper's result is
# joined back into one string with single spaces.
text_transforms = {
    'Removal of stopwords': remove_stopword,
    'Correction of spelling': check_spelling,
    'Lemmatization': lemmatize_word,
}

for i, description in enumerate(list_acc):
    ### Prepare the data for this scenario
    process_data = train_data.copy()

    transform = text_transforms.get(description)
    if transform is not None:
        # Only string cells are rewritten; anything else is passed through as is.
        process_data['original_text'] = process_data['original_text'].apply(
            lambda x: ' '.join(transform(x)) if isinstance(x, str) else x
        )

    elif description == 'Removal of duplicate records with same labels':
        process_data = process_data.drop_duplicates(subset=['original_text', 'label'])

    elif description == 'Removal of duplicate records with different labels':
        # Look for the sentences recorded with different labels
        df_2labels = process_data.copy()
        df_duple_labels = duple_labels(df_2labels)
        # Identify double labels in the data
        df_2labels['duplicated'] = df_2labels.duplicated(subset=['original_text'])
        df_2labels = df_2labels.merge(df_duple_labels, how="left",
                                      left_on="original_text", right_on="original_text")
        # After the left merge, label_y is null when the sentence has no entry
        # in df_duple_labels. '0' means 1 label, '1' means 2 labels.
        df_2labels['label_y'] = df_2labels['label_y'].apply(lambda x: '0' if pd.isnull(x) else '1')
        df_2labels = pd.DataFrame(df_2labels[['original_text', 'label_x', 'label_y', 'duplicated']])
        df_2labels.columns = ['original_text', 'label', 'dulabel', 'duplicated']
        process_data = df_2labels[df_2labels['dulabel'] == '0'].copy()

    _, (train_texts, dev_texts, test_texts), (train_labels, dev_labels, test_labels), \
        (target_names, label2idx) = train_dev_test(process_data, random_state=RANDOM_STATE)

    ### Train the model for every scenario except "Pretrain"
    train_flag = description != 'Pretrain'
    if train_flag:
        train(BERT_MODEL, train_texts, train_labels, dev_texts, dev_labels, target_names, label2idx, params)

    ### Evaluate model
    _, pred, acc = data_evaluation(test_texts, test_labels, BERT_MODEL, params, trained=train_flag,
                                   OUTPUT_DIR=OUTPUT_DIR, MODEL_FILE_NAME=MODEL_FILE_NAME)

    ### Save the output including dataset, accuracy results.
    df_acc.at[i, 'description'] = description
    df_acc.at[i, 'accuracy'] = acc
    df_acc.at[i, 'texts'] = (train_texts, dev_texts, test_texts)
    df_acc.at[i, 'labels'] = (train_labels, dev_labels, test_labels)
    df_acc.at[i, 'prediction'] = pred

In [None]:
import altair as alt

# Hand only the two plotted columns to Altair; the texts, labels and
# predictions stored in df_acc are not needed for the chart.
acc_plot_data = df_acc[['description', 'accuracy']].copy()
acc_plot_data['accuracy'] = acc_plot_data['accuracy'].astype(float)

# Horizontal bars: one per scenario, bar length = accuracy.
bars = alt.Chart(acc_plot_data).mark_bar().encode(
    x='accuracy:Q',
    y="description:O"
)

text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='accuracy:Q'
)

# Reference line at the accuracy of the 'Pretrain' scenario. Accuracy lives on
# the x axis of the bars, so the rule is encoded on x as well.
rule = alt.Chart(acc_plot_data[acc_plot_data['description'] == 'Pretrain']).mark_rule(color='red').encode(
    x='accuracy:Q'
)
(bars + text + rule).properties(height=500)

Compare the embeddings of the error records in the test set with those of the training set + test set (including the error records themselves), find the similar records, and display the number of records per label to check whether the label proportion could affect the classification results.

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): orig_test_texts, orig_test_labels, init_pred and orig_pred are
# assigned in cells further down this notebook; those cells must have been run
# before this one.
df_test=pd.DataFrame(columns=['original_text','label','err','init_pred','best_pred','cnt_1','cnt_0','avg_1','avg_0'])

df_test['original_text']=orig_test_texts
df_test['label']=orig_test_labels
df_test['init_pred']=init_pred
df_test['best_pred']=orig_pred
# err is 1 where the fine-tuned prediction differs from the label, otherwise 0
df_test['err']=df_test['best_pred']-df_test['label']
df_test['err']=df_test['err'].apply(lambda x: 1 if x!=0 else x)

# The sentence embedder has to exist before it is used to encode the test sentences.
onlinemodel='all-mpnet-base-v2'
embedder = SentenceTransformer(onlinemodel)

# Embeddings of all test sentences
query_embeddings=embedder.encode(df_test['original_text'])

In [None]:
#Create train, dev, test data
df_init,(train_texts,dev_texts,test_texts),(train_labels,dev_labels,test_labels),(target_names,label2idx)=train_dev_test(train_data,random_state=RANDOM_STATE)

##### Initial accuracy for pretrain model

In [None]:
_,init_pred,init_acc=data_evaluation(test_texts,test_labels,BERT_MODEL,params,trained=False,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

##### Train model without any preprocessing.

In [None]:
train(BERT_MODEL,train_texts,train_labels,dev_texts,dev_labels,target_names,label2idx,params)

##### Accuracy after training

In [None]:
_,orig_pred,orig_acc=data_evaluation(test_texts,test_labels,BERT_MODEL,params,trained=True,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

In [None]:
# Keep the baseline (no preprocessing) split under separate names, because the
# generic names are overwritten by every later experiment.
orig_train_texts = train_texts
orig_train_labels = train_labels
orig_dev_texts = dev_texts
orig_dev_labels = dev_labels
orig_test_texts = test_texts
orig_test_labels = test_labels

##### Remove stopwords

In [None]:

# Preprocess text using custom preprocess functions in the pipeline 
#STOPWORDS=['-RRB-','-LRB-'] # remove customized stopwords
#preprocess_functions = [to_lower, remove_punctuation,remove_special_character,normalize_unicode,remove_stopword,lemmatize_word]


In [None]:
# Work on a fresh copy of the sample and strip the stopwords from each sentence.
process_data = train_data.copy()
process_data['original_text'] = process_data['original_text'].map(
    lambda sentence: ' '.join(remove_stopword(sentence))
)

In [None]:
process_data

In [None]:
df_sw,(train_texts,dev_texts,test_texts),(train_labels,dev_labels,test_labels),(target_names,label2idx)=train_dev_test(process_data,random_state=RANDOM_STATE)

In [None]:
train(BERT_MODEL,train_texts,train_labels,dev_texts,dev_labels,target_names,label2idx,params)

In [None]:
_,sw_pred,sw_acc=data_evaluation(test_texts,test_labels,BERT_MODEL,params,trained=True,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

##### Spelling correction 

In [None]:
# Work on a fresh copy of the sample and spell-correct each sentence.
process_data = train_data.copy()
# isinstance() is required here: comparing type(x) with the string 'str' is
# never true, which left every sentence uncorrected. Non-string cells are
# passed through unchanged.
process_data['original_text'] = process_data['original_text'].apply(
    lambda x: ' '.join(check_spelling(x)) if isinstance(x, str) else x
)

In [None]:
process_data

In [None]:
df_pc,(train_texts,dev_texts,test_texts),(train_labels,dev_labels,test_labels),(target_names,label2idx)=train_dev_test(process_data,random_state=RANDOM_STATE)

In [None]:
train(BERT_MODEL,train_texts,train_labels,dev_texts,dev_labels,target_names,label2idx,params)

In [None]:
_,pc_pred,pc_acc=data_evaluation(test_texts,test_labels,BERT_MODEL,params,trained=True,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

##### Lemmatization 

In [None]:
# Work on a fresh copy of the sample and lemmatize each sentence.
process_data = train_data.copy()
process_data['original_text'] = process_data['original_text'].map(
    lambda sentence: ' '.join(lemmatize_word(sentence))
)

In [None]:
process_data['original_text']

In [None]:
df_lm,(train_texts,dev_texts,test_texts),(train_labels,dev_labels,test_labels),(target_names,label2idx)=train_dev_test(process_data,random_state=RANDOM_STATE)

In [None]:
train(BERT_MODEL,train_texts,train_labels,dev_texts,dev_labels,target_names,label2idx,params)

In [None]:
_,lm_pred,lm_acc=data_evaluation(test_texts,test_labels,BERT_MODEL,params,trained=True,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

##### Check the duplicated data and create the data set without duplicates

In [None]:
# Copy of the sample with a boolean 'duplicated' column that marks every
# repeated occurrence of a sentence (the first occurrence stays False).
train_data_unique = train_data.assign(
    duplicated=train_data.duplicated(subset=['original_text'])
)

In [None]:
train_data_unique

In [None]:
# Keep only the rows that are not flagged as a repeated sentence.
process_data = train_data_unique.loc[~train_data_unique['duplicated']].copy()

In [None]:
df_dup,(train_texts,dev_texts,test_texts),(train_labels,dev_labels,test_labels),(target_names,label2idx)=train_dev_test(process_data,random_state=RANDOM_STATE)

In [None]:
train(BERT_MODEL,train_texts,train_labels,dev_texts,dev_labels,target_names,label2idx,params)

In [None]:
_,dup_pred,dup_acc=data_evaluation(test_texts,test_labels,BERT_MODEL,params,trained=True,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

##### Remove the records which have the different labels
Sentences that appear with different labels are expected to affect training performance as well as the prediction results.

In [None]:
# look for the records with different labels 
df_duple_labels=duple_labels(train_data)

In [None]:
# Identify the sentences that occur with two different labels.
merged = train_data_unique.merge(
    df_duple_labels,
    how="left",
    left_on="original_text",
    right_on="original_text",
)
# After the left merge, label_y is null when the sentence has no entry in
# df_duple_labels. '0' means 1 label, '1' means 2 labels.
merged['label_y'] = merged['label_y'].isna().map({True: '0', False: '1'})
train_data_unique = merged[['original_text', 'label_x', 'label_y', 'duplicated']].rename(
    columns={'label_x': 'label', 'label_y': 'dulabel'}
)

In [None]:
train_data_unique

##### Accuracy of removing records with duplicate text or different label

In [None]:
# Keep only the sentences that carry a single label (dulabel == '0').
process_data = train_data_unique.loc[train_data_unique['dulabel'] == '0'].copy()

In [None]:
process_data

In [None]:
df_duo,(train_texts,dev_texts,test_texts),(train_labels,dev_labels,test_labels),(target_names,label2idx)=train_dev_test(process_data,random_state=RANDOM_STATE)

In [None]:
train(BERT_MODEL,train_texts,train_labels,dev_texts,dev_labels,target_names,label2idx,params)

In [None]:
_,duo_pred,duo_acc=data_evaluation(test_texts,test_labels,BERT_MODEL,params,trained=True,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

##### Compare the results from different text processing

In [None]:
# Accuracy of every preprocessing variant, in the order the variants were run.
df_acc = pd.DataFrame({
    'description': [
        'Pretrain',
        'Fine tune without preprocessing',
        'Removal of stopwords',
        'Correction of spelling',
        'Lemmatization',
        'Removal of duplicate records',
        'Removal of mislabeling',
    ],
    'accuracy': [init_acc, orig_acc, sw_acc, pc_acc, lm_acc, dup_acc, duo_acc],
})

In [None]:
sns.barplot(data=df_acc, y="description", x="accuracy")

##### Impact investigation on similar text

In [None]:
# Empty frame that collects, per test sentence, the label, the predictions and the error flag.
test_columns = ['original_text', 'label', 'err', 'init_pred', 'best_pred',
                'cnt_1', 'cnt_0', 'avg_1', 'avg_0']
df_test = pd.DataFrame(columns=test_columns)

In [None]:
# Baseline test sentences and labels
df_test['original_text'] = orig_test_texts
df_test['label'] = orig_test_labels

# Predictions before training (init) and after fine-tuning without preprocessing (best)
df_test['init_pred'] = init_pred
df_test['best_pred'] = orig_pred

# err is 1 where the fine-tuned prediction differs from the label, otherwise 0
prediction_gap = df_test['best_pred'] - df_test['label']
df_test['err'] = prediction_gap.apply(lambda gap: 1 if gap != 0 else gap)

In [None]:
df_test

##### Display the length of test data and error data in test set

In [None]:
# Sentence-length distribution (in whitespace-separated words) of the test set;
# rows with 5000 or more words are excluded from the plot.
plt.style.use("ggplot")
plt.figure(figsize=(10, 8))

df_test['length'] = df_test['original_text'].map(lambda text: len(text.split()))
within_limit = df_test['length'] < 5000
sns.distplot(df_test.loc[within_limit, 'length'])

plt.title('Frequence of sentences of a given length', fontsize=14)
plt.xlabel('length', fontsize=14)

In [None]:
# Same length distribution, restricted to the misclassified test sentences.
is_error = df_test['err'] == 1
is_short = df_test['length'] < 5000
sns.distplot(df_test.loc[is_short & is_error, 'length'])

plt.title('Frequence of sentences of a given length', fontsize=14)
plt.xlabel('length', fontsize=14)

In [None]:
from sentence_transformers import SentenceTransformer
onlinemodel='all-mpnet-base-v2'
embedder = SentenceTransformer(onlinemodel)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# work out the similarity and identify the record of self
#str_to_predict=df_test['original_text'].iloc[11]
#print(str_to_predict)
query_embeddings=embedder.encode(df_test['original_text'])
#sim=cosine_similarity([embedder.encode(str_to_predict)],query_embeddings)
#j=np.argmax(sim)
#sim.argsort()[-3:][::-1][0][:-1]

Compare the embeddings of the error records in the test set with those of the training set + test set (including the error records themselves), find the similar records, and display the number of records per label to check whether the label proportion could affect the classification results.

In [None]:
# Comparison set: every sampled sentence with its label. Nothing is filtered
# out here, so repeated sentences and sentences with two labels are kept.
df_train = train_data_unique.copy()
df_comp = df_train[['original_text', 'label']].reset_index(drop=True)

# Expose the positional row number as an 'index' column to identify each row
df_comp = df_comp.reset_index()

# Create the embedding for comparison dataset
query_embeddings = embedder.encode(df_comp['original_text'])

In [None]:
# Give every test row an identifier of the form 'e<row index>'.
df_test['id'] = [f"e{row_label}" for row_label in df_test.index]

In [None]:
# Set the similarity threshold
THRESHOLD = 0.5

# Misclassified test sentences; each one collects the comparison rows whose
# cosine similarity to it exceeds THRESHOLD.
df_err = df_test[df_test['err'] == 1]

# One frame per error sentence is gathered here and concatenated once after
# the loop, instead of growing err_cluster on every iteration.
cluster_columns = list(df_comp.columns) + ['score', 'cluster']
cluster_frames = []

for i in trange(len(df_err)):
    str_to_predict = df_err['original_text'].iloc[i]
    sim = cosine_similarity([embedder.encode(str_to_predict)], query_embeddings)
    sim_rows = np.where(sim[0] > THRESHOLD)[0]
    if sim_rows.size == 0:
        continue

    # .copy() so the new columns are written to an independent frame rather
    # than to a slice of df_comp.
    df_temp = df_comp.iloc[sim_rows].copy()
    df_temp['score'] = sim[0][sim_rows]
    df_temp['cluster'] = df_err['id'].iloc[i]
    cluster_frames.append(df_temp)

if cluster_frames:
    err_cluster = pd.concat(cluster_frames, axis=0, ignore_index=True)
else:
    # No error sentence had a match: keep an empty frame with the expected columns.
    err_cluster = pd.DataFrame(columns=cluster_columns)
            #print(df_temp)

In [None]:
# Give every test row an identifier of the form 'e<row index>'.
df_test['id'] = [f"e{row_label}" for row_label in df_test.index]

In [None]:
df_test

In [None]:
# Set the similarity threshold
THRESHOLD = 0.5

# Misclassified test sentences; each one collects the comparison rows whose
# cosine similarity to it exceeds THRESHOLD.
df_err = df_test[df_test['err'] == 1]

# One frame per error sentence is gathered here and concatenated once after
# the loop, instead of growing err_cluster on every iteration.
cluster_columns = list(df_comp.columns) + ['score', 'cluster']
cluster_frames = []

for i in trange(len(df_err)):
    str_to_predict = df_err['original_text'].iloc[i]
    sim = cosine_similarity([embedder.encode(str_to_predict)], query_embeddings)
    sim_rows = np.where(sim[0] > THRESHOLD)[0]
    if sim_rows.size == 0:
        continue

    # .copy() so the new columns are written to an independent frame rather
    # than to a slice of df_comp.
    df_temp = df_comp.iloc[sim_rows].copy()
    df_temp['score'] = sim[0][sim_rows]
    df_temp['cluster'] = df_err['id'].iloc[i]
    cluster_frames.append(df_temp)

if cluster_frames:
    err_cluster = pd.concat(cluster_frames, axis=0, ignore_index=True)
else:
    # No error sentence had a match: keep an empty frame with the expected columns.
    err_cluster = pd.DataFrame(columns=cluster_columns)
            #print(df_temp)

In [None]:
err_cluster

In [None]:
# Display the clusters whose similar records carry 2 different labels.
# First count rows per (cluster, label) pair, then count those pairs per
# cluster: the resulting 'label' column is the number of distinct labels.
per_cluster_label = err_cluster.groupby(['cluster', 'label']).count().reset_index()
df_group = per_cluster_label.groupby('cluster').count()
df_group[df_group['label'] > 1]

In [None]:
err_cluster[err_cluster['cluster']=='e100']

In [None]:
df_err[df_err['id']=='e100']

Change a label and retrain the model to see whether it affects the evaluation result.

In [None]:
#Create train, dev, test data
df_comp.iloc[4088]

In [None]:
df_comp.at[4088,'label']=0

In [None]:
df_comp.at[4088,'label']

In [None]:
# Training data carrying the edited label(s) from df_comp.
orig_train_text = df_comp['original_text']
orig_train_label = df_comp['label']

# The train() call that follows reads orig_train_texts / orig_train_labels
# (plural), so those names must be rebound as well; otherwise the label edit
# never reaches training.
# NOTE(review): df_comp is built from all of train_data_unique, so it
# presumably also contains the sentences held out for dev/test - confirm that
# training on it is intended.
orig_train_texts = orig_train_text.tolist()
orig_train_labels = orig_train_label.tolist()

In [None]:
train(BERT_MODEL,orig_train_texts,orig_train_labels,orig_dev_texts,orig_dev_labels,target_names,label2idx,params)

In [None]:
_,orig_pred,orig_acc=data_evaluation(orig_test_texts,orig_test_labels,BERT_MODEL,params,trained=True,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

In [None]:
orig_test_texts[100],orig_pred[100]

Apparently, after changing one record's label from 1 to 0, the evaluation result changes as well.
Let's change the label back and run a further test.

In [None]:
df_comp.at[4088,'label']=1

In [None]:
df_comp.at[4088,'label']

In [None]:
err_cluster[err_cluster['cluster']=='e9']

In [None]:
df_err[df_err['id']=='e9']

In [None]:
df_comp.at[3467,'label']=0
df_comp.at[642,'label']=0

In [None]:
df_comp.at[3467,'label'],df_comp.at[642,'label']

In [None]:
# Training data carrying the edited label(s) from df_comp.
orig_train_text = df_comp['original_text']
orig_train_label = df_comp['label']

# The train() call that follows reads orig_train_texts / orig_train_labels
# (plural), so those names must be rebound as well; otherwise the label edit
# never reaches training.
# NOTE(review): df_comp is built from all of train_data_unique, so it
# presumably also contains the sentences held out for dev/test - confirm that
# training on it is intended.
orig_train_texts = orig_train_text.tolist()
orig_train_labels = orig_train_label.tolist()

In [None]:
train(BERT_MODEL,orig_train_texts,orig_train_labels,orig_dev_texts,orig_dev_labels,target_names,label2idx,params)

In [None]:
_,orig_pred,orig_acc=data_evaluation(orig_test_texts,orig_test_labels,BERT_MODEL,params,trained=True,OUTPUT_DIR = OUTPUT_DIR, MODEL_FILE_NAME = MODEL_FILE_NAME)

In [None]:
orig_test_texts[9],orig_pred[9]

In [None]:
err_cluster[err_cluster['cluster']=='e103']

In [None]:
df_err[df_err['id']=='e103']

In [None]:
err_cluster[err_cluster['cluster']=='e111']

In [None]:
df_err[df_err['id']=='e111']

##### Final report 5.3 - Unsupervised learning

Step 1. Create dataset for unsupervised learning

In [None]:
# Training-side rows: the whole labelled sample, tagged 'train' with err '0'.
df_train = train_data_unique.copy()
df_train['type'] = 'train'
df_train['err'] = '0'

# Test-side rows keep the err flag computed earlier and are tagged 'test'.
df_test['type'] = 'test'

# Stack both parts using the columns they have in common.
shared_columns = ['original_text', 'label', 'type', 'err']
df_cluster = pd.concat(
    [df_train[shared_columns], df_test[shared_columns]],
    ignore_index=True,
)

# Assign the index to identify each row
df_cluster = df_cluster.reset_index()


Step 2. Using sentence-BERT to create the embeddings.

In [None]:
cluster_embeddings=embedder.encode(df_cluster['original_text'])

Step 3. Run PCA

In [None]:
# Clustering algorithms
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity

# Using PCA to reduce the dimension to project the result to 2-d scatter plot
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(cluster_embeddings)

In [None]:
# Tabulate the two principal components and tag each row with the row index
# of df_cluster so it can be joined with the clustering result.
pca_columns = ['principal component 1', 'principal component 2']
df_pca = pd.DataFrame(data=principalComponents, columns=pca_columns).assign(
    sentence_id=df_cluster.index
)

Step 4. Run Elbow method to find out optimal K.

In [None]:
# Run Elbow for full training data
elbow_plot(principalComponents,maxK=15)

Step 5. Run Kmeans to cluster the embeddings

In [None]:
num_clusters = 2
# random_state pins the single k-means++ initialisation (n_init=1) so the
# cluster assignment is reproducible, like the other seeded steps in this notebook.
clf = KMeans(n_clusters=num_clusters,
             max_iter=100,
             init='k-means++',
             n_init=1,
             random_state=RANDOM_STATE)
cluster_assignment = clf.fit_predict(cluster_embeddings)

# Build the result table in one step (DataFrame.append no longer exists in
# pandas 2.x and copies the whole frame on every call).
# Row i of df_cluster corresponds to cluster_assignment[i].
cdf = pd.DataFrame({
    "cluster_id": cluster_assignment,
    "sentence_id": np.arange(len(cluster_assignment)),
    "sentence": df_cluster['original_text'].to_numpy(),
    "label": df_cluster['label'].to_numpy(),
    # err mixes '0' strings (train rows) and numbers (test rows); store it as text
    "err": df_cluster['err'].map(str).to_numpy(),
})

cdf.head()

In [None]:
# Combine PCA results with K-means results to see clustering
df_k=df_pca.merge(cdf,right_on=['sentence_id'],left_on=['sentence_id'])

In [None]:
df_k

Step 6. Plot clustering results

In [None]:
import altair as alt

alt.Chart(df_k).mark_point(size=60).encode(
    x='principal component 1',
    y='principal component 2',
    shape='err:N',
    color=alt.Color('cluster_id', scale=alt.Scale(scheme='category20c')),
    tooltip=['sentence','sentence_id','label']
).properties(title='PCA & Kmeans',height=400,width=500).interactive()

Using UMAP & HDBSCAN  

In [None]:
import umap
umap_embeddings = umap.UMAP(n_neighbors=30, 
                            n_components=2, 
                            metric='cosine').fit_transform(cluster_embeddings)

In [None]:
import hdbscan
cluster = hdbscan.HDBSCAN(min_cluster_size=2,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)

In [None]:
umap_embeddings

In [None]:
# Tabulate the two UMAP components and tag each row with the row index of
# df_cluster so it can be joined with the clustering result.
umap_columns = ['UMAP component 1', 'UMAP component 2']
df_uh = pd.DataFrame(data=umap_embeddings, columns=umap_columns).assign(
    sentence_id=df_cluster.index
)

In [None]:
# Cluster label assigned to each row by the fitted HDBSCAN model
uh_cluster_assignment = cluster.labels_

# Build the result table in one step (DataFrame.append no longer exists in
# pandas 2.x and copies the whole frame on every call).
# Row i of df_cluster corresponds to uh_cluster_assignment[i].
uh_cdf = pd.DataFrame({
    "cluster_id": uh_cluster_assignment,
    "sentence_id": np.arange(len(uh_cluster_assignment)),
    "sentence": df_cluster['original_text'].to_numpy(),
    "label": df_cluster['label'].to_numpy(),
    # err mixes '0' strings (train rows) and numbers (test rows); store it as text
    "err": df_cluster['err'].map(str).to_numpy(),
})

uh_cdf.head()

In [None]:
# Join the UMAP coordinates with the HDBSCAN cluster assignment on sentence_id
df_uh_c = pd.merge(df_uh, uh_cdf, left_on=['sentence_id'], right_on=['sentence_id'])

In [None]:
# Scatter plot of the UMAP projection: colour = HDBSCAN cluster, shape = error flag.
# sentence_id is included in the tooltip (as in the PCA chart) so that a point
# can be looked up afterwards by its sentence_id.
alt.Chart(df_uh_c).mark_point(size=60).encode(
    x='UMAP component 1',
    y='UMAP component 2',
    shape='err:N',
    color=alt.Color('cluster_id', scale=alt.Scale(scheme='category20c')),
    tooltip=['sentence','sentence_id','label']
).properties(title='UMAP & HDBSCAN',height=400,width=500).interactive()

In [None]:
df_uh_c[df_uh_c['sentence_id']==4257]