In [1]:
import time
start = time.time()

# 1. Imports and reading data

In [2]:
%%time
import joblib
import re
import pandas as pd
import numpy as np
import umap
import spacy
import string
import hdbscan

import preprocessor as p

from sentence_transformers import SentenceTransformer

%config Completer.use_jedi = False
nlp = spacy.load("en_core_web_sm")

Wall time: 19.7 s


In [62]:
df = pd.read_csv('hsbc_comments.csv')
df.rename(columns={"message":"comments"},inplace=True)

# 2. Preprocessing

## a. Applying tweet-preprocesser library

In [59]:
p.clean("Notice how they haven\u2019t replied to anyone?\U0001f923\U0001f923\U0001f923 https://google.com #hello @mehmet")

'Notice how they havent replied to anyone?'

In [60]:
df.comments.replace("\n" , " ", regex=True, inplace = True)
df.comments=df.comments.apply(lambda x:p.clean(str(x)))

In [63]:
%%time
df.comments=df.comments.apply(lambda x:p.clean(str(x)))

Wall time: 269 ms


## b. Applying SpsCy lemmatizer, isalpha() and removing short words

In [56]:
%%time
def word_root(text):
    list_data3=[]
    doc = nlp(text)
    for token in doc:
        if token.text.isalpha():
            lemma = token.lemma_
            if len(lemma)>2:
                list_data3.append(lemma)
    return " ".join(list_data3)

df.comments = df.comments.apply(lambda x: word_root(str(x).lower()))  

Wall time: 9.77 s


In [57]:
%%time
df.comments = df.comments.apply(lambda x: word_root(str(x).lower()))  

Wall time: 9.76 s


In [31]:
word_root(p.clean("Notice how they haven\u2019t replied to anyone?\U0001f923\U0001f923\U0001f923 https://google.com #hello @mehmet"))

'notice how they have reply anyone'

## c. Removing ROWS which has NaN data

In [6]:
df.replace('nan', np.nan, inplace = True)
df.replace('', np.nan, inplace = True)
df.replace('do', np.nan, inplace = True)
df = df.dropna()
df.reset_index(drop=True,inplace=True)

In [7]:
list_data = df.comments.unique().tolist()

In [42]:
len(list_data)

884

# 3. Model, Dimension Reduction and Saving Model

##     a. BERT Model

In [36]:
%%time
model=SentenceTransformer('distilbert-base-nli-mean-tokens')


Wall time: 4.07 s


In [37]:
%%time
embeddings = model.encode(list_data, show_progress_bar=True)

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Wall time: 39.6 s


## b. UMAP Dimension Reduction

In [38]:
%%time
best_model = umap.UMAP(n_components=5,min_dist=0.0).fit_transform(embeddings)

# #n_neighbors=10, min_dist=0.0, 

Wall time: 4.6 s


## c. Saving Model

## d. Loading Model

In [11]:
best_model = joblib.load("HSBC_comments_model")

# 4. Clustering and Reducing Clusters with Cosine-Similarity

## a. HDBSCAN Clustering

In [47]:
from sklearn.cluster import KMeans

In [49]:
%%time
cluster = KMeans(n_clusters=5, init="k-means++").fit_predict(best_model)

Wall time: 88.9 ms


In [40]:
%%time
cluster = hdbscan.HDBSCAN().fit_predict(best_model)
KMeans

Wall time: 34.4 ms


In [50]:

docs = pd.DataFrame(list_data,columns=["comments"])
docs["cluster"] = cluster
labeled_docs = docs.groupby(["cluster"], as_index=False).agg({"comments": " ".join})
labeled_docs

Unnamed: 0,cluster,comments
0,0,call the evening the suggestion but but they h...
1,1,melanie humberstone brilliant they card take l...
2,2,imagination run wild wild wild try get your ba...
3,3,would love donate again this year year unfortu...
4,4,would like say massive thank you hsbc and the ...


In [45]:
docs.head(20)

Unnamed: 0,comments,cluster
0,would like say massive thank you hsbc and the ...,1
1,imagination run wild wild wild try get your ba...,1
2,you would send the neccesary number would use ...,1
3,just past hour wait again again for the time t...,1
4,melanie humberstone,0
5,brilliant,0
6,absolutely fume fume sit hold for almost hour ...,1
7,would love donate again this year year unfortu...,1
8,possible you can answer telephone less than hr...,1
9,they card take long opne,0


## b. Merging Similar Clusters with Cosine-Similarity (Applied 2 times)

In [14]:
%%time
array_text = labeled_docs.comments.tolist()
from sentence_transformers import SentenceTransformer, util

sentences = array_text[1:]

#Encode all sentences
embeddings = model.encode(sentences)

#Compute cosine similarity between all pairs
cos_sim = util.pytorch_cos_sim(embeddings, embeddings)

#Add all pairs to a list with their cosine similarity score
all_sentence_combinations = []
for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

#Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)
similar = []
for score, i, j in all_sentence_combinations:
#     print("cluster {} \t cluster {} \t similarity: {:.4f}".format(i+1, j+1, cos_sim[i][j]))
    similar.append((score,i+1,j+1))

Wall time: 6.86 s


In [15]:
%%time
i=0
while similar[i][0]>0.40:
    one, two = sorted([similar[i][1],similar[i][2]])
    if labeled_docs.comments[one] == "" and labeled_docs.comments[two] == "":
        pass
    elif labeled_docs.comments[one] == "" and labeled_docs.comments[two] != "":
        labeled_docs.comments[one] = labeled_docs.comments[two]
    else:
        labeled_docs.comments[one] = labeled_docs.comments[one] +" "+labeled_docs.comments[two]
    labeled_docs.comments[two]=""
    i+=1



Wall time: 166 ms


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
labeled_docs.replace('', np.nan, inplace = True)
labeled_docs=labeled_docs.dropna()
labeled_docs.reset_index(drop=True,inplace=True)

In [17]:
%%time
array_text = labeled_docs.comments.tolist()
from sentence_transformers import SentenceTransformer, util

sentences = array_text[1:]

#Encode all sentences
embeddings = model.encode(sentences)

#Compute cosine similarity between all pairs
cos_sim = util.pytorch_cos_sim(embeddings, embeddings)

#Add all pairs to a list with their cosine similarity score
all_sentence_combinations = []
for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

#Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)
similar = []
for score, i, j in all_sentence_combinations:
#     print("cluster {} \t cluster {} \t similarity: {:.4f}".format(i+1, j+1, cos_sim[i][j]))
    similar.append((score,i+1,j+1))

Wall time: 2.73 s


In [18]:
%%time
i=0
while similar[i][0]>0.40:
    one, two = sorted([similar[i][1],similar[i][2]])
    if labeled_docs.comments[one] == "" and labeled_docs.comments[two] == "":
        pass
    elif labeled_docs.comments[one] == "" and labeled_docs.comments[two] != "":
        labeled_docs.comments[one] = labeled_docs.comments[two]
    else:
        labeled_docs.comments[one] = labeled_docs.comments[one] +" "+labeled_docs.comments[two]
    labeled_docs.comments[two]=""
    i+=1

Wall time: 7 ms


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
labeled_docs.replace('', np.nan, inplace = True)
labeled_docs=labeled_docs.dropna()
labeled_docs.reset_index(drop=True,inplace=True)

# 5. Keyword Extruction with KeyBERT

## a. Bigrams

In [55]:
%%time
from keybert import KeyBERT
array_text = labeled_docs.comments.tolist()
kw_extractor = KeyBERT(model=model)
bigram=[]
for j in range(len(array_text)):
    keywords = kw_extractor.extract_keywords(array_text[j],top_n=10,keyphrase_ngram_range=(2, 2))
    print(f'\033[1m   Bigram Cluster {j+1}: \033[0m' )
    print([word for word in keywords])
    bigram.append([word[0] for word in keywords])

[1m   Bigram Cluster 1: [0m
[('horrendous bank', 0.5735), ('bank disgusting', 0.5733), ('disgusting bank', 0.5686), ('terrible customer', 0.5369), ('terrible bank', 0.5368), ('awful bank', 0.534), ('terrible hsbc', 0.5285), ('bank misfortune', 0.5268), ('dreadful customer', 0.5056), ('hsbc disgusted', 0.5021)]
[1m   Bigram Cluster 2: [0m
[('love colleague', 0.5079), ('amazing love', 0.4717), ('love quote', 0.4372), ('proud nina', 0.427), ('love love', 0.427), ('passion love', 0.421), ('love word', 0.4177), ('happy diwali', 0.4136), ('gallacher amazing', 0.4109), ('beautiful service', 0.4105)]
[1m   Bigram Cluster 3: [0m
[('tuesday application', 0.4406), ('tuesday tuesday', 0.4093), ('following tuesday', 0.3688), ('today tuesday', 0.3675), ('tuesday january', 0.3643), ('hsbc backlog', 0.361), ('husband facebook', 0.3605), ('facebook facebook', 0.3578), ('hsbc twitter', 0.3566), ('new treasurer', 0.3497)]
[1m   Bigram Cluster 4: [0m
[('diabetes diabetes', 0.3337), ('medicine week

## b. Unigrams

In [54]:
%%time
unigram=[]
for j in range(len(array_text)):
    keywords = kw_extractor.extract_keywords(array_text[j],top_n=10,keyphrase_ngram_range=(1, 1))
    print(f'\033[1m   Unigram Cluster {j+1}: \033[0m' )
    print([word for word in keywords])
    unigram.append([word[0] for word in keywords])

[1m   Unigram Cluster 1: [0m
[('horrendous', 0.3536), ('racist', 0.3363), ('bankrupt', 0.3259), ('fraudster', 0.3109), ('incompetent', 0.2908), ('disgusting', 0.2817), ('pandemic', 0.2797), ('disgusted', 0.2791), ('disgraceful', 0.2769), ('sick', 0.2757)]
[1m   Unigram Cluster 2: [0m
[('love', 0.2959), ('happy', 0.2844), ('skyla', 0.2813), ('proud', 0.2746), ('grateful', 0.2696), ('merry', 0.2264), ('beautiful', 0.2182), ('thank', 0.2084), ('aplaude', 0.1961), ('passion', 0.1853)]
[1m   Unigram Cluster 3: [0m
[('tuesday', 0.3282), ('facebook', 0.2335), ('june', 0.2175), ('thursday', 0.2141), ('payday', 0.2069), ('emailing', 0.2053), ('birthday', 0.1853), ('monday', 0.1841), ('christmas', 0.1825), ('google', 0.1815)]
[1m   Unigram Cluster 4: [0m
[('tuesday', 0.2112), ('diabetes', 0.1935), ('netflix', 0.1803), ('millionaire', 0.1235), ('heartbroken', 0.1214), ('instagram', 0.1016), ('phishing', 0.0977), ('plague', 0.0966), ('ebay', 0.095), ('june', 0.0898)]
[1m   Unigram Cluster

## c. Results

In [None]:
tejas shinde
1. Using preprocessing library to clean - removing emoji url @mentions and #mentions
2. function to lemmatize and lower which removes symbols(checking alpha)
3. removing documents which are NA and do 
4. loading BERT model
5. embeddings - model.encode() - 1min 10 sec
6. reducing dimensions  and creating best model- UMAP - 4.6 sec
7. saving and loading embedding and dimensionality reduction model
8. fitting model to predict - (HDBSCAN)34.4 ms sec (KMEANS) 88.9 ms
9. KeyBERT to contribution 30 
"I am applying for bounce back loan"
apply bounce back loan,
apply bounce_back_loan
apply', 'bounce_back_loan'

In [22]:
for i in range(len(array_text)-1):
    clusterer=[]
    bigramer=[]
    print(f'\033[1m   Cluster {i+1}: \033[0m')
    [clusterer.append(j) for j in unigram[i]]
    clusterer.append(bigram[i][0])
    
    for word in bigram[i][0].split(" "):
        if word not in bigramer:
            bigramer.append(word)
    
    for each in bigram[i]:
        first,second=each.split(" ")
        if first != second and not(first in bigramer and second in bigramer):
            clusterer.append(each)
            if bigramer.append(first): first not in bigramer
            if bigramer.append(second): second not in bigramer
    print(clusterer)

[1m   Cluster 1: [0m
['defraud', 'fraud', 'pensioner', 'deposit', 'pension', 'fraudster', 'brazil', 'fraudulent', 'fca', 'check', 'hsbc pension', 'clawback pension', 'great pension', 'huge pension', 'brilliant pension', 'bank pension', 'people pension', 'hsbc pensioner', 'claw pension']
[1m   Cluster 2: [0m
['chima', 'passion', 'inspirational', 'okechukwu', 'inspiration', 'love', 'great', 'deliver', 'work', 'colour', 'love chima', 'chima inspirational', 'colour chima', 'chima great', 'chima inspiration', 'work chima', 'okechukwu chima']
[1m   Cluster 3: [0m
['hypocrisy', 'paradise', 'environment', 'reply', 'ignore', 'action', 'recognise', 'island', 'email', 'england', 'action hypocrisy', 'christma reply', 'environment hsbc', 'notice reply', 'like island', 'island annoying', 'mean island']
[1m   Cluster 4: [0m
['telephone', 'disconnect', 'phone', 'communicate', 'callback', 'replace', 'restore', 'serve', 'operate', 'register', 'telephone wait', 'phone wait', 'phone isolate', 'tel

In [23]:
end = time.time()
print(end - start)

232.08395624160767
