In [None]:
# library to find the elbow in kmeans clustering
!pip install kneed

In [None]:
# library to encode the sentences
!pip install -U sentence-transformers

In [None]:
# downloading all data and unziping them
!gdown --id 13dBDebbbBLfTg0-jbiJMh6u9NtEJ6T8l
!unzip "/content/changeadvisor-dataset.zip" -d "/content/"

In [5]:
# importing all relevant libraries for use
import os
import re

import numpy as np
import pandas as pd
from kneed import KneeLocator
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans


In [38]:
# Turns review text into sentence embeddings. The result has shape
# (N,768) where N is the number of reviews
def get_embeddings(text):
  """Encode the given reviews with the 'bert-base-uncased' model.

  Args:
    text: the reviews to encode, passed unchanged to SentenceTransformer.encode.

  Returns:
    Whatever SentenceTransformer.encode returns for `text` (one embedding
    per review).
  """
  # the model is instantiated on every call
  encoder = SentenceTransformer('bert-base-uncased')
  return encoder.encode(text)

In [39]:
# getting the best k. The function takes the embeddings and the range of k values to try
def get_k(embeddings,starting_range = 1,ending_range=10):
  """Pick the number of clusters with the elbow method.

  KMeans is fitted once for every candidate k and the inertia (within-cluster
  sum of squares) of each fit is handed to KneeLocator, which looks for the
  elbow of the convex, decreasing inertia curve.

  Args:
    embeddings: one embedding vector per sample, as accepted by KMeans.fit.
    starting_range: first k to try (inclusive).
    ending_range: end of the k range (exclusive, as in range()).

  Returns:
    The k at the elbow as an int. When KneeLocator finds no elbow, the
    smallest k that was tried.

  Raises:
    ValueError: if there is no candidate k left to try.
  """
  # KMeans cannot build more clusters than there are samples, so cap the range
  n_samples = len(embeddings)
  k_range = range(starting_range, min(ending_range, n_samples + 1))
  if len(k_range) == 0:
    raise ValueError(
        f"no k to try between {starting_range} and {ending_range} "
        f"for {n_samples} samples")

  WSS = []
  # looping over all the k from starting to ending
  for k in k_range:
    # n_jobs and precompute_distances are gone from current scikit-learn and
    # algorithm='auto' is no longer accepted; all three were the defaults, so
    # leaving them out keeps the clustering the same. n_init stays explicit
    # because its default changed in newer releases.
    km_cluster = KMeans(n_clusters=k, init='k-means++', n_init=10,
                        max_iter=300, tol=0.0001, random_state=0)
    # fitting the embeddings on kmeans to get the inertia
    km_cluster.fit(embeddings)
    # saving the inertia to find the elbow
    WSS.append(km_cluster.inertia_)

  # after the loop we have the inertia (WSS) for each k, which KneeLocator
  # turns into the elbow
  elbow_locator = KneeLocator(list(k_range), WSS, curve='convex', direction='decreasing')
  elbow_point = elbow_locator.knee

  # knee is None when the curve has no detectable elbow
  if elbow_point is None:
    return k_range[0]

  # knee is already an x value, i.e. a k from k_range, so it is returned as is
  return int(elbow_point)

In [45]:
# this takes in the embeddings, the text and the best k value, clusters the data accordingly
# and stores the result in a csv file
def cluster_reviews(embeddings,text,best_k,filename ='Clustered_Data.csv'):
  """Cluster the embeddings with KMeans and save text/cluster pairs as CSV.

  Args:
    embeddings: one embedding vector per review, as accepted by KMeans.fit.
    text: the reviews, in the same order as `embeddings`.
    best_k: number of clusters to build.
    filename: path of the CSV file to write.

  Returns:
    The DataFrame that was written, with the columns 'Text' and 'Cluster'.
  """
  # initializing the kmeans with best k
  # n_jobs and precompute_distances are gone from current scikit-learn and
  # algorithm='auto' is no longer accepted; all three were the defaults, so
  # leaving them out keeps the clustering the same. n_init stays explicit
  # because its default changed in newer releases.
  clustering_model = KMeans(n_clusters=best_k, init='k-means++', n_init=10,
                            max_iter=300, tol=0.0001, random_state=0)
  # fitting the kmeans on embeddings
  clustering_model.fit(embeddings)
  # getting the labels for each row (review) provided to kmeans
  cluster_assignment = clustering_model.labels_
  # putting it with text and storing in csv file
  print('Saving the clustered result in a csv file')
  print('--------------------------------------- \n')
  # zip stops at the shorter input, so texts without a label are left out
  data = pd.DataFrame(list(zip(text, cluster_assignment)),
               columns =['Text', 'Cluster'])
  data.to_csv(filename)
  print('Process Done!!!')
  return data

In [50]:
# the main function which starts it all
def main(filepath = "/content/reviews/com.achep.acdisplay.txt", max_reviews = 100):
  """Read a review file, embed the reviews, cluster them and save a CSV.

  The result is written to the current working directory under the name of
  the input file with its extension replaced by '.csv'.

  Args:
    filepath: text file with one review per line.
    max_reviews: how many reviews from the top of the file to process;
      None processes the whole file.

  Raises:
    ValueError: if the file contains no non-blank line.
  """
  # name the output after the input, but with a .csv extension so that the
  # csv never lands in a file called *.txt (or on top of the input itself)
  filename = os.path.splitext(os.path.basename(filepath))[0] + '.csv'

  #  reading the files of reviews
  print("Reading the the text data...")
  print('--------------------------------------- \n')
  with open(filepath, "r") as my_file:
    content = my_file.read()
  # blank lines (e.g. the one after a trailing newline) are not reviews
  content_list = [line for line in content.split("\n") if line.strip()]
  if max_reviews is not None:
    content_list = content_list[:max_reviews]
  if not content_list:
    raise ValueError(f"no reviews found in {filepath}")

  # getting the embeddings for the selected reviews
  print("Creating embeddings from the text data...")
  print('--------------------------------------- \n')
  embeddings = get_embeddings(content_list)
  # getting the best k
  print("\nDetermining the best K for the embedded data...")
  print('--------------------------------------- \n')
  k_value = get_k(embeddings,1,10)
  # getting the review with clusters; the same reviews that were embedded are
  # passed on so that every text lines up with its cluster label
  print("Applying Kmeans with the best K to the embedded data...")
  print('-------------------------------------- \n')
  cluster_reviews(embeddings,content_list,k_value,filename)

In [51]:
# Run main() with its default arguments, i.e. on the default review file path
main()

Reading the the text data...
--------------------------------------- 

Creating embeddings from the text data...
--------------------------------------- 



Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



 Determining the best K for the embedded data...
--------------------------------------- 

Applying Kmeans with the best K to the embedded data...
-------------------------------------- 

Saving the clustered result in a csv file
--------------------------------------- 

Process Done!!!
