In [1]:
import time
# Wall-clock timestamp taken before any work; the last cell subtracts it from
# a second time.time() reading to report the notebook's total run time.
start = time.time()

# 1. Imports and reading data

In [2]:
%%time
import joblib
import re
import pandas as pd
import numpy as np
import umap
import spacy
import string
import hdbscan

import preprocessor as p

from sentence_transformers import SentenceTransformer

%config Completer.use_jedi = False
nlp = spacy.load("en_core_web_sm")

Wall time: 19.1 s


In [3]:
# Load the raw Facebook comments and expose the text column under the name
# "comments", which every later cell relies on.
df = pd.read_csv('octopus_energy_fb_comments.csv').rename(
    columns={"fb_comment": "comments"}
)

# 2. Preprocessing

## a. Applying the tweet-preprocessor library

In [4]:
# Flatten embedded newlines to spaces, then run every comment through
# tweet-preprocessor's clean().
#
# The result is assigned back to the column explicitly. The previous
# `df.comments.replace(..., inplace=True)` mutated an intermediate Series,
# which under pandas Copy-on-Write does not update `df` at all.
# str() guards against non-string cells (e.g. NaN becomes the text "nan",
# which a later cell turns back into a missing value).
df["comments"] = (
    df["comments"]
    .replace("\n", " ", regex=True)
    .apply(lambda raw_comment: p.clean(str(raw_comment)))
)

## b. Applying spaCy lemmatizer, isalpha() and removing short words

In [5]:
def word_root(text):
    """Return the lemmas of the alphabetic tokens in ``text``, joined by spaces.

    The text is parsed with the notebook-level spaCy pipeline ``nlp``. A token
    contributes its lemma only when the token text is purely alphabetic
    (``str.isalpha``) and the lemma is longer than two characters.

    Parameters
    ----------
    text : str
        Text to lemmatize. The caller is responsible for lower-casing.

    Returns
    -------
    str
        Space-separated lemmas; an empty string when no token qualifies.

    Notes
    -----
    The length check used to sit outside the ``isalpha`` branch, so every
    non-alphabetic token (punctuation, numbers, ...) re-appended the lemma of
    the previous alphabetic token ("hello , world" -> "hello hello world").
    Both conditions are now applied to the same token.

    NOTE(review): the cached projection loaded later in this notebook was
    presumably built from the earlier, duplicated output. If the number of
    unique comments changes, regenerate that cache.
    """
    doc = nlp(text)
    kept_lemmas = [
        token.lemma_
        for token in doc
        if token.text.isalpha() and len(token.lemma_) > 2
    ]
    return " ".join(kept_lemmas)
# Lower-case each comment (coerced to str first) and replace it with its
# filtered lemma string.
df["comments"] = df["comments"].map(lambda raw_comment: word_root(str(raw_comment).lower()))

## c. Removing rows which have NaN data

In [6]:
# Turn placeholder cell values into real missing values, then drop every row
# that has a missing value in any column and renumber the index from 0.
#   'nan' - what str() produces for a missing comment
#   ''    - comments with no surviving lemma
#   'do'  - NOTE(review): presumably a leftover noise token; confirm it is still needed
df = (
    df.replace('nan', np.nan)
    .replace('', np.nan)
    .replace('do', np.nan)
    .dropna()
    .reset_index(drop=True)
)

In [7]:
# Distinct pre-processed comments, in order of first appearance; this list is
# what gets embedded and clustered.
list_data = df["comments"].drop_duplicates().tolist()

# 3. Model, Dimension Reduction and Saving Model

##     a. BERT Model

In [None]:
%%time
# Sentence-embedding model; reused later for cluster-level similarity and as
# the KeyBERT backend.
MODEL_NAME = 'xlm-r-distilroberta-base-paraphrase-v1'
model = SentenceTransformer(MODEL_NAME)

# Comment-level embedding step, kept for reference (disabled because the
# reduced projection is loaded from disk further down):
# embeddings = model.encode(list_data, show_progress_bar=True)

## b. UMAP Dimension Reduction

In [None]:
# %%time
# best_model = umap.UMAP(n_components=5,min_dist=0.0).fit_transform(embeddings)

# #n_neighbors=10, min_dist=0.0, 

## c. Saving Model

In [None]:
# joblib.dump(best_model, "Octopus_facebook_model")

## d. Loading Model

In [None]:
# Load the cached low-dimensional projection of the comment embeddings, or
# rebuild it when the cache is missing or no longer matches the data.
#
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load cache files produced by this notebook.
CACHE_PATH = "Octopus_facebook_model"

try:
    best_model = joblib.load(CACHE_PATH)
except FileNotFoundError:
    best_model = None

# The projection needs exactly one row per entry of list_data, otherwise the
# cluster labels cannot be attached to the comments afterwards.
# (Assumes the cached object is the array produced by UMAP.fit_transform, as
# in the commented-out cells above - confirm.)
if best_model is None or len(best_model) != len(list_data):
    print("Projection cache missing or stale - recomputing embeddings and UMAP projection.")
    embeddings = model.encode(list_data, show_progress_bar=True)
    # Same UMAP settings as the commented-out generation cell.
    best_model = umap.UMAP(n_components=5, min_dist=0.0).fit_transform(embeddings)
    joblib.dump(best_model, CACHE_PATH)

# 4. Clustering and Reducing Clusters with Cosine-Similarity

## a. HDBSCAN Clustering

In [None]:
# Density-based clustering of the reduced embeddings with default HDBSCAN
# settings; `cluster` holds one label per row of best_model.
hdbscan_model = hdbscan.HDBSCAN()
cluster = hdbscan_model.fit_predict(best_model)

In [None]:
# One row per unique comment together with its cluster label ...
docs = pd.DataFrame({"comments": list_data, "cluster": cluster})

# ... then one row per cluster, whose text is all member comments joined by spaces.
labeled_docs = (
    docs
    .groupby("cluster", as_index=False)
    .agg({"comments": " ".join})
)

## b. Merging Similar Clusters with Cosine-Similarity (Applied 2 times)

In [None]:
%%time
from sentence_transformers import SentenceTransformer, util

# Cluster texts in row order. Row 0 is left out of the comparison.
# NOTE(review): presumably row 0 is HDBSCAN's noise group (label -1); if no
# noise points exist, a real cluster is skipped here - confirm.
array_text = labeled_docs.comments.tolist()
sentences = array_text[1:]

# Embed each cluster text and compute the full cosine-similarity matrix.
embeddings = model.encode(sentences)
cos_sim = util.pytorch_cos_sim(embeddings, embeddings)

# Every unordered pair (a < b) with its similarity, most similar first.
n_compared = len(cos_sim)
all_sentence_combinations = [
    [cos_sim[a][b], a, b]
    for a in range(n_compared - 1)
    for b in range(a + 1, n_compared)
]
all_sentence_combinations.sort(key=lambda entry: entry[0], reverse=True)

# Shift positions by one so they are row labels of labeled_docs again
# (undoing the [1:] slice above).
similar = [(score, a + 1, b + 1) for score, a, b in all_sentence_combinations]

In [None]:
%%time
# Merge every pair of clusters whose cosine similarity exceeds the threshold.
# The text of the higher-numbered row is folded into the lower-numbered row
# and the higher-numbered row is blanked (blank rows are dropped in the next
# cell).
#
# Changes from the previous `while similar[i][0] > 0.40` loop:
#   * iterating over `similar` cannot run past the end of the list, so an
#     empty list or a list where every pair is above the threshold no longer
#     raises IndexError;
#   * cells are read and written with DataFrame.at instead of chained
#     `labeled_docs.comments[...] = ...`, which is not a reliable write
#     (it does not modify the frame under pandas Copy-on-Write).
SIMILARITY_THRESHOLD = 0.40

for score, row_a, row_b in similar:
    # `similar` is sorted by descending score: the first pair that is not
    # above the threshold ends the pass.
    if not score > SIMILARITY_THRESHOLD:
        break

    one, two = sorted([row_a, row_b])
    kept_text = labeled_docs.at[one, "comments"]
    absorbed_text = labeled_docs.at[two, "comments"]

    if kept_text == "" and absorbed_text == "":
        # Both rows were already merged elsewhere; nothing to move.
        pass
    elif kept_text == "":
        labeled_docs.at[one, "comments"] = absorbed_text
    else:
        labeled_docs.at[one, "comments"] = kept_text + " " + absorbed_text
    labeled_docs.at[two, "comments"] = ""



In [None]:
# Rows blanked by the merge step become missing values and are removed; the
# index is renumbered from 0 so row labels are contiguous again.
labeled_docs = (
    labeled_docs
    .replace('', np.nan)
    .dropna()
    .reset_index(drop=True)
)

In [None]:
%%time
from sentence_transformers import SentenceTransformer, util

# Second pass: recompute pairwise similarities on the already-merged clusters.
# Row 0 is again left out of the comparison.
# NOTE(review): presumably row 0 is HDBSCAN's noise group (label -1) - confirm.
array_text = labeled_docs.comments.tolist()
sentences = array_text[1:]

# Embed each cluster text and compute the full cosine-similarity matrix.
embeddings = model.encode(sentences)
cos_sim = util.pytorch_cos_sim(embeddings, embeddings)

# Every unordered pair (a < b) with its similarity, most similar first.
n_compared = len(cos_sim)
all_sentence_combinations = [
    [cos_sim[a][b], a, b]
    for a in range(n_compared - 1)
    for b in range(a + 1, n_compared)
]
all_sentence_combinations.sort(key=lambda entry: entry[0], reverse=True)

# Shift positions by one so they are row labels of labeled_docs again
# (undoing the [1:] slice above).
similar = [(score, a + 1, b + 1) for score, a, b in all_sentence_combinations]

In [None]:
%%time
# Second merge pass: fold together every pair of clusters whose cosine
# similarity exceeds the threshold. The text of the higher-numbered row is
# appended to the lower-numbered row and the higher-numbered row is blanked
# (blank rows are dropped in the next cell).
#
# Changes from the previous `while similar[i][0] > 0.40` loop:
#   * iterating over `similar` cannot run past the end of the list, so an
#     empty list or a list where every pair is above the threshold no longer
#     raises IndexError;
#   * cells are read and written with DataFrame.at instead of chained
#     `labeled_docs.comments[...] = ...`, which is not a reliable write
#     (it does not modify the frame under pandas Copy-on-Write).
SIMILARITY_THRESHOLD = 0.40

for score, row_a, row_b in similar:
    # `similar` is sorted by descending score: the first pair that is not
    # above the threshold ends the pass.
    if not score > SIMILARITY_THRESHOLD:
        break

    one, two = sorted([row_a, row_b])
    kept_text = labeled_docs.at[one, "comments"]
    absorbed_text = labeled_docs.at[two, "comments"]

    if kept_text == "" and absorbed_text == "":
        # Both rows were already merged elsewhere; nothing to move.
        pass
    elif kept_text == "":
        labeled_docs.at[one, "comments"] = absorbed_text
    else:
        labeled_docs.at[one, "comments"] = kept_text + " " + absorbed_text
    labeled_docs.at[two, "comments"] = ""

In [None]:
# Remove the rows blanked by the second merge pass and renumber the index
# from 0.
labeled_docs = (
    labeled_docs
    .replace('', np.nan)
    .dropna()
    .reset_index(drop=True)
)

# 5. Keyword Extraction with KeyBERT

## a. Bigrams

In [None]:
%%time
from keybert import KeyBERT

# Final cluster texts; KeyBERT reuses the sentence-embedding model loaded earlier.
array_text = labeled_docs.comments.tolist()
kw_extractor = KeyBERT(model=model)

# For every cluster except row 0: the top-10 two-word keyphrases, keeping only
# the phrase (element 0) of each returned keyword entry.
bigram = [
    [
        entry[0]
        for entry in kw_extractor.extract_keywords(
            cluster_text, top_n=10, keyphrase_ngram_range=(2, 2)
        )
    ]
    for cluster_text in array_text[1:]
]

## b. Unigrams

In [None]:
%%time
# For every cluster except row 0: the top-10 single-word keywords, keeping
# only the word (element 0) of each returned keyword entry.
unigram = [
    [
        entry[0]
        for entry in kw_extractor.extract_keywords(
            cluster_text, top_n=10, keyphrase_ngram_range=(1, 1)
        )
    ]
    for cluster_text in array_text[1:]
]

## c. Results

In [None]:
def _select_keywords(unigrams, bigrams):
    """Combine a cluster's unigram and bigram keywords into one display list.

    All unigrams are kept, followed by the first (top-ranked) bigram. A later
    bigram is added only if its two words differ and at least one of them has
    not yet appeared in an accepted bigram.

    Parameters
    ----------
    unigrams : list of str
        Single-word keywords for the cluster.
    bigrams : list of str
        Two-word keyphrases for the cluster, words separated by one space.

    Returns
    -------
    list of str
        Unigrams followed by the selected bigrams. When ``bigrams`` is empty
        only the unigrams are returned (this case used to raise IndexError).
    """
    selected = list(unigrams)
    if not bigrams:
        return selected

    selected.append(bigrams[0])
    seen_words = set(bigrams[0].split(" "))

    for phrase in bigrams:
        first, second = phrase.split(" ")
        if first != second and not (first in seen_words and second in seen_words):
            selected.append(phrase)
            # Previously written as `if bigramer.append(x): ...`, whose body
            # never ran; the append itself always happened, which is what
            # update() does here.
            seen_words.update((first, second))
    return selected


# unigram[k] and bigram[k] both describe cluster k+1 (row 0 was skipped when
# the keywords were extracted).
for cluster_no, (cluster_unigrams, cluster_bigrams) in enumerate(zip(unigram, bigram), start=1):
    print(f'\033[1m   Cluster {cluster_no}: \033[0m')
    print(_select_keywords(cluster_unigrams, cluster_bigrams))

In [None]:
# Total wall-clock run time of the notebook, in seconds, measured from the
# `start` timestamp taken in the first cell.
end = time.time()
elapsed_seconds = end - start
print(elapsed_seconds)