# 1. Imports and reading data

In [1]:
%%time
import time
import re
import pandas as pd
import numpy as np
import umap
import hdbscan
import spacy
import gensim
import sklearn

from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])
stop = stopwords.words('english')
model=SentenceTransformer('distilbert-base-nli-mean-tokens')
kw_extractor = KeyBERT(model=model)

%config Completer.use_jedi = False

Wall time: 23.7 s


In [2]:
# xlm-r-distilroberta-base-paraphrase-v1
# distilbert-base-nli-mean-tokens

In [3]:
start = time.time()

In [4]:
df = pd.read_csv('hsbc_comments.csv')
df.rename(columns={"message":"comments"},inplace=True)

# 2. Preprocessing

In [5]:
#This takes each sentence as inputs and returns list of clean tokens for that sentence
def TextCleaner(doc):
    doc = re.sub("[,.']", "", str(doc))
    doc = [token for token in doc.split(' ') if len(re.sub("[a-zA-Z]", "", token)) == 0]
    doc = nlp(" ".join(doc))
    taglist = ['RB','RBR', 'RBS','JJR','JJ','JJS','NN','NNS','VB','VBG','VBP','VBN']
    poslist = ['ADJ','ADV','NOUN','VERB']
    doc = [token.lemma_.lower() for token in doc if token.tag_ in taglist and token.pos_ in poslist]
    doc = [token for token in doc if not token in stop and len(token)>2 and len(token)<15]
    return doc

In [6]:
# This function converts a list of clean documents to trigrams 
def TextNGram(doc,ngrams = 3):
    bigram = gensim.models.Phrases(doc , min_count=10, threshold=40) # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    if ngrams == 2:
        doc = [bigram_mod[sent] for sent in doc]
        return doc
    elif ngrams == 3:
        trigram = gensim.models.Phrases(bigram[doc], threshold=20)
        trigram_mod = gensim.models.phrases.Phraser(trigram)
        doc = [trigram_mod[bigram_mod[sent]] for sent in doc]
        return doc
    else:
        return doc
        

In [7]:
%%time
#First 
df['Processed'] = [TextCleaner(f) for f in df['comments']]
#Second
df['Processed_ngram'] = TextNGram(df['Processed'])

Wall time: 6.29 s


In [8]:
print(df['Processed_ngram'].tolist())



In [9]:
data = pd.Series(df['Processed_ngram'].apply(lambda x: " ".join(x) if len(x) > 4 else np.nan))

In [10]:
data.dropna(axis=0, inplace=True)
data.reset_index(drop=True,inplace=True)

In [11]:
list_data = data.unique().tolist()

# 3. Model, Dimension Reduction and Saving Model

##     a. BERT Model

In [12]:
%%time
embeddings = model.encode(list_data, show_progress_bar=True)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Wall time: 20.5 s


## b. UMAP Dimension Reduction

In [13]:
embedding_norm = sklearn.preprocessing.normalize(embeddings, norm='l2')

In [14]:
%%time
best_model = umap.UMAP(n_components=5,min_dist=0.0).fit_transform(embedding_norm)

# #n_neighbors=10, min_dist=0.0, 

Wall time: 10.5 s


# 4. Clustering and Reducing Clusters with Cosine-Similarity

## a. HDBSCAN Clustering

In [15]:
%%time
cluster = KMeans(n_clusters=4, init="k-means++").fit_predict(best_model)

Wall time: 63 ms


In [16]:
%%time
docs = pd.DataFrame(list_data,columns=["comments"])
docs["cluster"] = cluster
labeled_docs = docs.groupby(["cluster"], as_index=False).agg({"comments": " ".join})
array_text = labeled_docs.comments.tolist()

Wall time: 8 ms


In [17]:
%%time
for j in range(len(array_text)):
    keywords = kw_extractor.extract_keywords(array_text[j], top_n=10)
    print(f'\033[1m   Cluster {j+1}: \033[0m')
    print([word for word, degree in keywords])

[1m   Cluster 1: [0m
['online_banking', 'telephone_banking', 'financial_ombudsman', 'email_address', 'banker', 'bounce_loan', 'fraud_department', 'staff_pensioner', 'debit_card', 'cash_machine']
[1m   Cluster 2: [0m
['freeze_business', 'online_banking', 'suffer_month', 'fraud_department', 'debit_card', 'issue_debit_card', 'plague', 'unfortunately_freeze', 'telephone_banking', 'time_regret']
[1m   Cluster 3: [0m
['online_banking', 'bounce_loan', 'google', 'telephone_banking', 'holiday_extension', 'email_address', 'facebook', 'online', 'financial_ombudsman', 'debit_card']
[1m   Cluster 4: [0m
['spend_hour', 'working_day', 'telephone_banking', 'bounce_loan', 'hour', 'tomorrow', 'email_address', 'basic_banking', 'send_cheque', 'month_ago']
Wall time: 20.9 s


In [18]:
end = time.time()
print(end - start)

58.50318193435669
