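# Document Clustering

Cluster similar reviews together: embed each review (first with Doc2Vec, then with the Universal Sentence Encoder) and group the embeddings with K-Means.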

## Read Dataset

In [1]:
import pandas as pd

# Direct-download link to the reviews CSV (columns: review, sentiment).
reviews_url = "https://drive.google.com/uc?export=download&id=10dXjNNV9dbkn5shYLPcKXk_FWHvLmbGT"
reviews_df = pd.read_csv(reviews_url)

In [2]:
# Widen text columns so long reviews are not truncated when a frame is displayed.
# The fully qualified option name is used: abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option("display.max_colwidth", 200)

In [3]:
# Show ten randomly chosen rows as a quick look at the data.
reviews_df.sample(n=10)

Unnamed: 0,review,sentiment
857,A perfect renovated look of Refresh is taking my brethe here Very nice place to have quick bites along with chit chats \n\nI recommend this,1
1915,garlic toast was very bad,0
948,This place is really awesome I love the food and ambience This is one the best cafe in the city Staff is very polite generous and proactive,1
599,pasta was amazing,1
2593,The sandwich was filled with mayonnaise\nTaste can be improved,0
3009,Perfect place to spend a really good timeThumbs up for both tea and coffeeI come here everytime and enjoy my perfect cup of tea and snacks,1
3815,Enjoyed VERY NICE FOOD especially the grilled sandwich staff nd service was very goodI also liked the interiors of the cafe nd its ambience,1
2105,Excellent Im in love with their food Very satisfying,1
2539,I ordered a walnut brownie but I got a chocolate brownie\nPlease revert,0
4434,thick crust looks like thin crust,0


In [4]:
# Column dtypes and non-null counts of the raw frame (reveals missing reviews).
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476 entries, 0 to 5475
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     5322 non-null   object
 1   sentiment  5476 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 85.7+ KB


In [16]:
# Discard every row that has a missing value in any column.
reviews_df = reviews_df.dropna(axis=0, how="any")

In [17]:
# Re-check non-null counts after dropping rows with missing values.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5322 entries, 0 to 5475
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     5322 non-null   object
 1   sentiment  5322 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 124.7+ KB


##### Download NLTK Resources

In [18]:
import nltk

In [19]:
# Fetch the NLTK data packages: the stop-word list and WordNet are used by the
# cleaning functions in this notebook.
# NOTE(review): averaged_perceptron_tagger does not appear to be used in the
# visible cells -- confirm whether it is still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

##### Clean Text and return Tokens

In [20]:
from nltk.tokenize import WhitespaceTokenizer
# One shared tokenizer instance, reused for every review.
tokenizer_w = WhitespaceTokenizer()


def tokenize(text):
    """Lower-case ``text`` and return its whitespace-separated tokens as a list."""
    lowered = text.lower()
    return tokenizer_w.tokenize(lowered)

##### wordnet and lemmatization

We use NLTK's WordNetLemmatizer, which is backed by the WordNet lexical database (word meanings, synonyms, antonyms, and more), to reduce each word to its lemma (root form).

In [21]:

from nltk.stem.wordnet import WordNetLemmatizer

# Build the lemmatizer once and reuse it; the function is called for every
# token of every review, so constructing a new instance per call is wasteful.
_lemmatizer = WordNetLemmatizer()


def get_lemma(word):
    """Return the WordNet lemma of ``word``.

    ``lemmatize`` is called without a part-of-speech argument, so NLTK's
    default part of speech applies.
    """
    return _lemmatizer.lemmatize(word)

##### Filter out stop words

In [22]:
import nltk
# English stop words, held in a set for fast membership checks while filtering tokens.
en_stop = set(nltk.corpus.stopwords.words('english'))

##### Define a function to prepare text for modelling (tokenize, filter, lemmatize)

In [23]:
# Token length threshold. prepare_text_for_lda keeps a token only when
# len(token) > min_token_length, so tokens of exactly this length are dropped.
min_token_length = 3

In [24]:
def prepare_text_for_lda(text):
    """Convert a raw review string into a list of cleaned, lemmatized tokens.

    The text is split by ``tokenize``. A token survives only if its length is
    strictly greater than ``min_token_length`` and it is not in ``en_stop``;
    each surviving token is then mapped through ``get_lemma``.

    NOTE(review): because the length check is strict, tokens of exactly
    ``min_token_length`` characters are discarded -- confirm this is intended.
    """
    return [
        get_lemma(word)
        for word in tokenize(text)
        if len(word) > min_token_length and word not in en_stop
    ]

### Prepare Data

In [25]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [26]:
# Store the cleaned token list of each review in a new column.
reviews_df['new_reviews'] = reviews_df['review'].map(prepare_text_for_lda)

In [27]:
# Wrap each token list in a TaggedDocument whose only tag is the review's
# position in the column, as a string.
train_doc2vec = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(reviews_df['new_reviews'])
]

In [28]:
# Inspect the first tagged document.
train_doc2vec[:1]

[TaggedDocument(words=['pathetic', 'waste', 'money'], tags=['0'])]

In [29]:
# Number of tagged documents (one per review row).
len(train_doc2vec)

5322

##### Doc2Vec with gensim

First, we train a Doc2Vec model on the tagged reviews and save it to disk. The saved model is then reloaded to infer a 50-dimensional vector for each review, and these vectors are clustered with K-Means.

In [30]:
# Train a Doc2Vec model to learn a 50-dimensional representation of each review.
model = Doc2Vec(
    vector_size=50,
    alpha=0.025,
    min_count=5,
    dm=1,
    epochs=100,
)
# Build the vocabulary from the tagged reviews before training.
model.build_vocab(train_doc2vec)
model.train(
    train_doc2vec,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
# Persist the trained model so it can be reloaded for inference.
model.save("d2v.model")
print("Model Saved")

Model Saved


In [31]:
# Infer an embedding for every review using the trained model saved on disk.
model = Doc2Vec.load("d2v.model")
# Run 50 inference passes per document to get a more stable representation.
# The keyword is `epochs`: `steps` is the deprecated former name and is no
# longer accepted by gensim 4.
doc_vectors = [
    model.infer_vector(list_of_tokens, epochs=50)
    for list_of_tokens in reviews_df['new_reviews']
]

In [44]:
# Inspect the inferred vector of the first review.
doc_vectors[:1]

[array([-0.00762218,  0.0062645 ,  0.11593407,  0.04018093,  0.00967872,
         0.01843313, -0.06368793,  0.05949349, -0.10142181, -0.02306454,
         0.04658538,  0.0194962 , -0.07044731,  0.04605113, -0.13581951,
         0.18217945, -0.09189049,  0.1260983 , -0.01662198,  0.01770755,
        -0.05389329, -0.22464792, -0.09044197, -0.00527616,  0.06158732,
         0.0219105 , -0.08901794, -0.08471939,  0.00299248,  0.09755848,
         0.01742232, -0.10582723, -0.07851638, -0.03315408,  0.05420331,
         0.07518912, -0.05302157,  0.01482735,  0.08178735, -0.06101079,
         0.00594147,  0.08248877, -0.02506243, -0.26798683,  0.03006242,
         0.02358549, -0.06595553,  0.07970453, -0.01842817, -0.11404482],
       dtype=float32)]

In [33]:
from sklearn.cluster import KMeans

In [50]:
# K-Means with two clusters; random_state is fixed so the clustering is repeatable.
kmeans = KMeans(n_clusters=2, random_state=100)

In [51]:
# Fit the clusters on the Doc2Vec review vectors.
kmeans.fit(doc_vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=100, tol=0.0001, verbose=0)

In [52]:
# Cluster index assigned to each review, in the same order as doc_vectors.
kmeans.labels_

array([1, 0, 1, ..., 1, 0, 0], dtype=int32)

In [53]:
# Attach the Doc2Vec-based cluster assignment to each review row.
reviews_df['cluster_id'] = kmeans.labels_

In [69]:
# Ten random reviews from cluster 1, selected with a single .loc call
# (row mask and column list together) instead of chained indexing.
reviews_df.loc[reviews_df['cluster_id'] == 1, ['review', 'cluster_id']].sample(10)

Unnamed: 0,review,cluster_id
163,not bad,1
3721,good,1
1915,garlic toast was very bad,1
3137,Overpriced\nDoesnt give you an authentic bill and on explains the 20 tax as food tax\nPoor Service,1
2719,Pepsi I got the small sizeand I had ordered for the medium size,1
2650,The sandwich was filled with mayonnaise\nTaste can be improved,1
4869,Absolutely loved it A place must try The burgers were yum and the juices as well The ambience is really good a peacefull place for a good meal,1
4280,The burger didnt contained cheese at all,1
1181,Worst hot chocolate in the history Please dont name it Dark Chocolate Theres nothing chocolate about it let alone Dark Chocolate,1
1872,Just crust All the way,1


## Using Universal Sentence Encoder from Tensorflow Hub

In [56]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [57]:
# TF Hub handle of the Universal Sentence Encoder (version 4).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# NOTE(review): this rebinds `model`, which held the Doc2Vec model in earlier cells.
model = hub.load(module_url)
print("module {} loaded".format(module_url))

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [58]:
# Encode the raw (uncleaned) review text with the loaded encoder, all rows in one call.
sentence_embeddings = model(reviews_df['review'])

In [59]:
# Inspect the embedding of the first review.
sentence_embeddings[:1]

<tf.Tensor: shape=(1, 512), dtype=float32, numpy=
array([[ 1.45384716e-02, -4.37697917e-02, -4.71408851e-02,
        -1.14987940e-02,  6.72410131e-02,  6.79939464e-02,
         1.03918333e-02, -4.20408025e-02, -3.87788974e-02,
        -8.69151354e-02,  6.63133617e-03,  3.35455574e-02,
        -1.25764329e-02, -7.40172490e-02,  6.46213368e-02,
        -5.83788902e-02,  2.39781849e-02, -3.67021561e-02,
        -7.02580158e-03, -2.60541476e-02,  4.51884307e-02,
         3.16841342e-02,  5.20070605e-02, -1.04249474e-02,
         3.05004083e-02,  2.65507717e-02,  5.90266101e-02,
        -8.20243433e-02, -6.89064618e-03,  6.38538087e-03,
         2.48819664e-02, -2.28606425e-02, -2.82280222e-02,
        -2.88413651e-02, -3.04937623e-02,  5.63430833e-03,
        -2.50107311e-02, -8.52059945e-03, -5.05692698e-02,
         3.62647027e-02, -4.66937423e-02,  4.31796312e-02,
        -5.39595820e-02,  3.27307126e-03, -1.76834911e-02,
         1.22667728e-02, -6.92560971e-02,  2.12305859e-02,
      

In [75]:
# Second K-Means model (same settings as before) for the sentence-encoder embeddings.
kmeans_tf = KMeans(n_clusters=2, random_state=100)

In [76]:
# Cluster the sentence embeddings and store each review's cluster in its own
# column, alongside the Doc2Vec-based cluster_id.
kmeans_tf.fit(sentence_embeddings)
reviews_df['cluster_id_tf'] = kmeans_tf.labels_

In [81]:
# Ten random reviews from sentence-encoder cluster 0, selected with a single
# .loc call instead of chained indexing.
reviews_df.loc[reviews_df['cluster_id_tf'] == 0, ['review', 'cluster_id_tf']].sample(10)

Unnamed: 0,review,cluster_id_tf
4019,worst,0
4226,not bad,0
4948,horrible food,0
1514,super,0
2753,not bad,0
586,good,0
4270,good taste,0
3791,good,0
159,too bad taste,0
318,good,0
