In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import umap
import nltk;
import spacy
import string

import matplotlib.pyplot as plt
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import seaborn as sns
sns.set(style='white', rc={'figure.figsize':(10,8)})

np.random.seed(42)
%config Completer.use_jedi = False
pd.set_option('display.max_columns', 30)
# pd.set_option('display.max_rows', 100)

nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the comments CSV and expose its text column under the name `comments`,
# which is the column name every later cell reads.
df = pd.read_csv('mindvalley_fb_comments.csv').rename(columns={"comment": "comments"})
df.head()

In [None]:
# Spot-check a single comment (index label 286) before any cleaning cell has run.
df.comments[286]

In [None]:
# Strip social-media noise from the comment text.
#
# The result is assigned back to the column instead of calling an in-place
# method on `df.comments`: that form is chained assignment and no longer
# updates `df` once pandas Copy-on-Write is active.
# All patterns are raw strings so the regex escapes reach `re` untouched.
df["comments"] = (
    df["comments"]
    .replace(r"\S*@\S*", "", regex=True)      # any token containing "@"
    .replace(r"\S*#\S*", "", regex=True)      # any token containing "#"
    .replace(r"\S*http\S*", "", regex=True)   # any token containing "http"
    .replace("_", " ", regex=True)            # underscores become spaces
    # Literal caret. An unescaped "^" is the start-of-string anchor, which
    # would only prepend a space to every comment and leave carets in place.
    .replace(r"\^", " ", regex=True)
    .replace(r"\n", " ", regex=True)          # newlines become spaces
)

In [None]:
# Same spot-check, now after the "@"/"#"/"http"/underscore/newline substitutions above.
df.comments[286]

In [None]:
# Remove every ASCII punctuation character in a single pass.
# re.escape makes characters such as "]", "\" and "^" safe inside the
# character class, so no per-character try/except is needed, and the result
# is assigned back to the column rather than mutated through `df.comments`
# (which does not propagate to `df` under pandas Copy-on-Write).
df["comments"] = df["comments"].replace(
    f"[{re.escape(string.punctuation)}]", "", regex=True
)

In [None]:
# Same spot-check once more, after punctuation has been stripped.
df.comments[286]

In [None]:
# Space-separated list of filler tokens, kept as one string and split below.
# NOTE(review): no other cell visible in this notebook reads junk_words or
# junk_words_list, and "an" / "after" each appear twice — confirm whether the
# list is still needed.
junk_words = (
    "ve xx tqvm yessssssssssit youuuuuu nan nt citi kashif a very were yet than "
    "through via here rishisunak more much getting go going ive dont able when "
    "which who year barclays hsbcin amp like an no any since after banking im "
    "there out how starlingbank starling one only again over other then am may "
    "some do cant about banks would could its even their after has them get got "
    "within now all just if what or at had hsbcukbusiness up by into will an was "
    "our us so been hsbcuk co om ke ha gone your we can from as but to the hsbc "
    "and of you for are have they is my hsbc_uk with it in on this ðÿ be not bank "
    "me that"
)
junk_words_list = junk_words.split(" ")

In [None]:
# Compiled once when the cell runs instead of on every call.
# Several entries overlap the two very wide ranges; they are kept so the set
# of removed characters is exactly what it was before.
_EMOJI_PATTERN = re.compile("["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very wide: everything from U+24C2 up, incl. CJK and Hangul
    "\U0001f926-\U0001f937"  # gesture emoji
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"          # gender symbols
    "\u2600-\u2B55"          # miscellaneous symbols
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+", re.UNICODE)


def remove_unicode(text):
    """Strip emoji/symbol characters and newline markers from ``text``.

    Steps:
      1. Best-effort decoding of literal ``\\uXXXX`` / ``\\UXXXXXXXX`` escape
         sequences (and recombination of surrogate pairs). If the text cannot
         take the latin-1 round trip, it is left as it was.
      2. Removal of every character matched by ``_EMOJI_PATTERN``.
      3. Removal of both the two-character sequence backslash + "n" and real
         newline characters. Nothing is inserted in their place, so words
         separated only by a newline end up joined.

    Parameters
    ----------
    text : str or any object
        Input text. Non-string input is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    try:
        text = (
            text.encode("latin_1")
            .decode("raw_unicode_escape")
            .encode('utf-16', 'surrogatepass')
            .decode('utf-16')
        )
    except (UnicodeError, AttributeError):
        # UnicodeError: text holds characters outside latin-1 or a malformed
        # escape sequence. AttributeError: text is not a str.
        # Either way the input is kept unchanged; unrelated exceptions
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        pass

    text = _EMOJI_PATTERN.sub('', str(text))
    text = text.replace('\\n', '')  # literal backslash-n left over in the data
    text = text.replace('\n', '')   # real newline characters

    return text

In [None]:
def word_root(text):
    """Return ``text`` as a space-joined string of lemmas.

    The text is cleaned with ``remove_unicode`` and parsed by the spaCy
    pipeline ``nlp``; only tokens whose text is purely alphabetic
    (``str.isalpha``) contribute their lemma to the result.
    """
    parsed = nlp(remove_unicode(text))
    lemmas = [tok.lemma_ for tok in parsed if tok.text.isalpha()]
    return " ".join(lemmas)


# Lower-case and lemmatise every comment. str() turns missing values into the
# string 'nan' before they reach word_root.
df["comments"] = df["comments"].apply(lambda raw: word_root(str(raw).lower()))

In [None]:
# Frequency of each distinct lemmatised comment string.
df.comments.value_counts()

In [None]:
# Turn placeholder cells into real missing values, then discard incomplete rows:
#   'nan' - what str() produces for a missing comment during lemmatisation
#   ''    - empty string
#   'do'  - cell consisting solely of the word "do"
# Both the replacement and dropna() act on every column of df, not only `comments`.
df = df.replace(['nan', '', 'do'], np.nan).dropna()

In [None]:
# Frequencies again, after rows with placeholder or missing values were dropped.
df.comments.value_counts()

In [None]:
# Distinct comment strings, in order of first appearance; this is what gets embedded.
list_data = df["comments"].drop_duplicates().tolist()

In [None]:
# Number of distinct comments that will be passed to the embedding model.
len(list_data)

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
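# Other sentence-transformers checkpoint names noted here
# (presumably alternatives that were tried; none of them is loaded below):
# distilbert-base-nli-mean-tokens
# paraphrase-distilroberta-base-v1
# roberta-large-nli-stsb-mean-tokens
# allenai-specter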


In [None]:
# Load the pretrained sentence-embedding checkpoint.
# The same `model` object is handed to KeyBERT in a later cell, so the name must stay.
model=SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')
# Encode every distinct comment.
# presumably one embedding vector per entry of list_data — confirm against the sentence-transformers docs
embeddings = model.encode(list_data, show_progress_bar=True)

In [None]:
# Reduce the sentence embeddings to 5 dimensions before clustering.
# random_state pins the projection so that the clustering built on top of it
# is the same on every run and on every re-execution of this cell.
# NOTE: despite its name, `best_model` holds the reduced coordinates returned
# by fit_transform, not a fitted estimator.
best_model = umap.UMAP(n_components=5, random_state=42).fit_transform(embeddings)

# Other UMAP settings noted for experimentation: n_neighbors=10, min_dist=0.0

In [None]:
# Elbow check: fit KMeans for k = 1..9 on the reduced embeddings and plot the
# inertia of each fit against k.
k_values = range(1, 10)

kmeans_per_k = []
for k in k_values:
    kmeans_per_k.append(KMeans(n_clusters=k, random_state=42).fit(best_model))

inertias = [fitted.inertia_ for fitted in kmeans_per_k]

plt.plot(k_values, inertias, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Inertia", fontsize=14)
plt.show()

In [None]:
# from nltk.cluster import KMeansClusterer,euclidean_distance
# clusterer =KMeansClusterer(6,euclidean_distance)
# cluster=np.array(clusterer.cluster(best_model,True))

In [None]:
# Final clustering of the reduced embeddings into 4 groups.
# random_state matches the value used for the elbow fits, so the label
# assigned to each comment is reproducible between runs.
cluster = KMeans(n_clusters=4, init="k-means++", random_state=42).fit_predict(best_model)

In [None]:
def plot(cluster = cluster):
    """Scatter-plot a 2-D UMAP projection of ``embeddings`` coloured by cluster label.

    Also prints a ``{label: count}`` dictionary of cluster sizes.

    Parameters
    ----------
    cluster : array-like of int
        One label per row of the global ``embeddings``. Points with a
        negative label are drawn in translucent grey; all others are coloured
        by label with the 'Accent' colormap.
        The default is the value the global ``cluster`` had when this cell
        was executed. After recomputing the labels, re-run this cell or pass
        the new labels explicitly.
    """
    cluster = np.asarray(cluster)  # allows plain lists as well as arrays
    clustered = (cluster >= 0)

    # Seeded so the picture is the same each time the function is called.
    reduced_dim = umap.UMAP(n_components=2, random_state=42).fit_transform(embeddings)

    unique, counts = np.unique(cluster, return_counts=True)
    print(dict(zip(unique, counts)))

    # A single RGB colour must go through `color=`. Passed via `c=` a 3-tuple
    # is ambiguous with per-point values and triggers a matplotlib warning.
    plt.scatter(reduced_dim[~clustered, 0],
                reduced_dim[~clustered, 1],
                color=(0.5, 0.5, 0.5),
                # s=1,
                alpha=0.5
                )
    plt.scatter(reduced_dim[clustered, 0],
                reduced_dim[clustered, 1],
                c=cluster[clustered],
                # s=1,
                cmap='Accent')
    plt.show()

In [None]:
# Draw the projection with plot's default argument, i.e. the `cluster` labels
# that existed when the cell defining plot was run.
plot()

In [None]:
# Number of cluster labels. The next cell stores them as a column beside
# list_data, so this has to equal len(list_data).
len(cluster)

In [None]:
# One row per distinct comment together with its cluster label.
docs = pd.DataFrame(list_data, columns=["comments"])
docs["cluster"] = cluster

# Concatenate all comments of a cluster into a single space-separated document.
labeled_docs = docs.groupby(["cluster"], as_index=False).agg({"comments": " ".join})

# Collapse every token containing "clas" / "teach" to one canonical word.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `labeled_docs.comments` is chained assignment and does not update the
# frame once pandas Copy-on-Write is active.
labeled_docs["comments"] = (
    labeled_docs["comments"]
    .str.replace(r"\S*clas\S*", "class", regex=True)
    .str.replace(r"\S*teach\S*", "teach", regex=True)
)
labeled_docs

In [None]:
from keybert import KeyBERT

# One concatenated document per cluster, in the row order of labeled_docs.
array_text = labeled_docs.comments.tolist()

# KeyBERT reuses the sentence-transformer already held in `model`.
kw_extractor = KeyBERT(model=model)

# trigram[j] collects the three-word keyphrases extracted for the j-th document.
trigram = []
for j, cluster_text in enumerate(array_text):
    keywords = kw_extractor.extract_keywords(cluster_text, top_n=10, keyphrase_ngram_range=(3, 3))
    print(f'\033[1m   Trigram Cluster {j+1}: \033[0m')
    print(list(keywords))
    # Keep only the first element of each returned entry (the phrase).
    trigram.append([entry[0] for entry in keywords])

In [None]:
# bigram[j] collects the two-word keyphrases extracted for the j-th document.
# NOTE(review): only top_n=3 are requested here, while the label-building cell
# slices bigram[i][0:10] — confirm that 3 is intended.
bigram = []
for j, cluster_text in enumerate(array_text):
    keywords = kw_extractor.extract_keywords(cluster_text, top_n=3, keyphrase_ngram_range=(2, 2))
    print(f'\033[1m   Bigram Cluster {j+1}: \033[0m' )
    print(list(keywords))
    # Keep only the first element of each returned entry (the phrase).
    bigram.append([entry[0] for entry in keywords])

In [None]:
# unigram[j] collects the single-word keywords extracted for the j-th document.
unigram = []
for j, cluster_text in enumerate(array_text):
    keywords = kw_extractor.extract_keywords(cluster_text, top_n=10, keyphrase_ngram_range=(1, 1))
    print(f'\033[1m   Unigram Cluster {j+1}: \033[0m' )
    print(list(keywords))
    # Keep only the first element of each returned entry (the word).
    unigram.append([entry[0] for entry in keywords])

In [None]:
# Assemble a descriptive keyword list for every cluster.
# The loop runs over however many clusters the keyword lists contain, so it
# no longer has to be edited by hand when the number of clusters changes.
for i, (tri, bi, uni) in enumerate(zip(trigram, bigram, unigram)):
    print(f'\033[1m   Cluster {i+1}: \033[0m')

    # Always lead with the top-ranked trigram.
    clusterer = [tri[0]]

    # Second slot: the runner-up trigram, unless each of its words already
    # occurs in the top trigram (substring test) — then take the third one.
    runner_up_is_redundant = all(w in tri[0] for w in tri[1].split(" "))
    clusterer.append(tri[2] if runner_up_is_redundant else tri[1])

    # Walk the bigrams, pairing them alternately with the two leading
    # trigrams, and keep a bigram unless both of its words occur in the
    # trigram it is paired with.
    for m, n in zip(tri[0:2] * 5, bi[0:10]):
        first, second = n.split(" ")
        if not (first in m and second in m):
            clusterer.append(n)

    # At most five phrases, followed by all unigrams.
    clusterer = clusterer[:5]
    clusterer.extend(uni)
    print(clusterer)
    