In [33]:
import pandas as pd
import matplotlib.pyplot as plt

In [34]:
with open('lemmatized.txt') as f:
    reviews = f.read().splitlines()

reviews[0]

"['jan', 'be', 'very', 'friendly', 'and', 'welcome', 'host', 'the', 'apartment', 'be', 'great', 'and', 'the', 'area', 'be', 'sooo', 'amaze', 'lot', 'of', 'nice', 'cafe', 'and', 'shop', 'enjoy', 'my', 'time', 'there', 'lot']"

In [35]:
reviews = [review.replace('\'', '').strip('][').split(', ') for review in reviews]
reviews[0]

['jan',
 'be',
 'very',
 'friendly',
 'and',
 'welcome',
 'host',
 'the',
 'apartment',
 'be',
 'great',
 'and',
 'the',
 'area',
 'be',
 'sooo',
 'amaze',
 'lot',
 'of',
 'nice',
 'cafe',
 'and',
 'shop',
 'enjoy',
 'my',
 'time',
 'there',
 'lot']

## Bigrams generation

In [36]:
from gensim.models.phrases import Phrases, Phraser

phrases = Phrases(reviews, min_count=3, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[reviews]
sentences[1]

['it',
 'be',
 'really',
 'nice',
 'area',
 'food',
 'park',
 'transport',
 'be',
 'perfect']

In [37]:
from collections import defaultdict

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

61035

In [38]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['be', 'and', 'the', 'to', 'in', 'very', 'we', 'of', 'great', 'it']

## Word2Vec model

In [39]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(min_count=20,
                     window=4,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=4)

In [40]:
from time import time

t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.24 mins


In [41]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 9.21 mins


In [134]:
# w2v_model.save("word2vec.model")

## Exploring the model

In [43]:
w2v_model.wv.most_similar(positive=["apartment"])

[('flat', 0.8184784054756165),
 ('studio', 0.5035080909729004),
 ('appartment', 0.49468955397605896),
 ('spacious', 0.48489269614219666),
 ('modern', 0.48029956221580505),
 ('clean', 0.4498693346977234),
 ('apt', 0.42693156003952026),
 ('itself', 0.42469674348831177),
 ('beautifully_appoint', 0.41398540139198303),
 ('bright', 0.41148775815963745)]

## Clustering model

In [75]:
from sklearn.cluster import KMeans
import numpy as np

kmeans_model = KMeans(n_clusters=2, max_iter=1000, random_state=42, n_init=50)
kmeans_model.fit(X=w2v_model.wv.vectors.astype('double'))

KMeans(max_iter=1000, n_clusters=2, n_init=50, random_state=42)

In [105]:
w2v_model.wv.similar_by_vector(kmeans_model.cluster_centers_[0], topn=10, restrict_vocab=None)

[('no', 0.6178538799285889),
 ('that', 0.6154983639717102),
 ('useless', 0.6057008504867554),
 ('metal', 0.601493239402771),
 ('rug', 0.5799352526664734),
 ('plastic', 0.5778060555458069),
 ('bit_annoy', 0.5755999088287354),
 ('broken', 0.5718725323677063),
 ('mop', 0.5668021440505981),
 ('odor', 0.5624467730522156)]

In [106]:
negative_cluster_index = 0
negative_cluster_center = kmeans_model.cluster_centers_[negative_cluster_index]
positive_cluster_center = kmeans_model.cluster_centers_[1-negative_cluster_index]

In [107]:
# dfWords = pd.DataFrame(w2v_model.wv.vocab.keys())

dfWords = pd.DataFrame(w2v_model.wv.key_to_index.keys())
dfWords.columns = ['words']
dfWords['vectors'] = dfWords.words.apply(lambda x: w2v_model.wv[f'{x}'])
dfWords['cluster'] = dfWords.vectors.apply(lambda x: kmeans_model.predict([np.array(x)]))
dfWords.cluster = dfWords.cluster.apply(lambda x: x[0])
dfWords.head()

Unnamed: 0,words,vectors,cluster
0,be,"[0.11147587, 0.07957115, 0.0352661, 0.2452787,...",0
1,and,"[-0.30145282, -0.041696303, 0.06714162, -0.166...",2
2,the,"[0.17333335, -0.43380734, 0.6963205, -0.070611...",0
3,to,"[-0.5403165, 0.002703896, 0.1560523, -0.411324...",1
4,in,"[0.41520724, 0.11420307, 0.59832984, -0.116240...",0


In [108]:
dfWords['cluster_value'] = [-1 if i==negative_cluster_index else 1 for i in dfWords.cluster]
dfWords['closeness_score'] = dfWords.apply(lambda x: 1/(kmeans_model.transform([x.vectors]).min()), axis=1)
dfWords['sentiment_coeff'] = dfWords.closeness_score * dfWords.cluster_value
dfWords[dfWords['cluster_value'] == -1].head()

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,be,"[0.11147587, 0.07957115, 0.0352661, 0.2452787,...",0,-1,0.178454,-0.178454
2,the,"[0.17333335, -0.43380734, 0.6963205, -0.070611...",0,-1,0.142254,-0.142254
4,in,"[0.41520724, 0.11420307, 0.59832984, -0.116240...",0,-1,0.134969,-0.134969
9,it,"[0.013412308, -0.42243224, 0.15261401, 0.58773...",0,-1,0.141696,-0.141696
12,for,"[0.12189595, -0.04447002, -0.5717739, -1.14031...",0,-1,0.110504,-0.110504


In [109]:
dfCleanedReviews = pd.DataFrame([' '.join(review) for review in reviews], columns=['comments'])
dfCleanedReviews.head()

Unnamed: 0,comments
0,jan be very friendly and welcome host the apar...
1,it be really nice area food park transport be ...
2,we have very nice stay in berlin thanks to jan...
3,great location close to mauerpark kastanienall...
4,apartment very well locate close to everything...


In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm=None)
transformed = tfidf.fit_transform(dfCleanedReviews['comments'].tolist())
features = pd.Series(tfidf.get_feature_names())

In [111]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score

    inspired  by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    # print('Name:', x.name)
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)
    # print(dictionary.keys())
    # print('Dictionary[there]', dictionary['there'])
    # print('Dictionary[be]', dictionary['be'])
    # print(dictionary.keys())
    # print(x.comments)
    try:
        res = list(map(lambda y:dictionary[f'{y}'], x.comments.split()))
    except KeyError:
        res = [0 for i in x.comments.split()]
    # print('Res[0]:', res[0])
    # return list(map(lambda y:dictionary[f'{y}'], x.comments.split()))
    return res

In [112]:
replaced_tfidf_scores = dfCleanedReviews.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)
type(replaced_tfidf_scores)

pandas.core.series.Series

In [113]:
sentiment_dict = dict(zip(dfWords.words.values, dfWords.sentiment_coeff.values))

In [114]:

def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [115]:
replaced_closeness_scores = dfCleanedReviews['comments'].apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [116]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, dfCleanedReviews['comments']]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
# replacement_df['prediction'] = 'Positive' if replacement_df.sentiment_rate>0 else 'Negative'
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
# replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]
replacement_df.head()

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
0,"[0.08264830965921147, -0.17845424863877163, 0....","[6.45518200427728, 3.6108140688773807, 1.62336...",jan be very friendly and welcome host the apar...,5.850742,1
1,"[-0.14169600930737072, -0.17845424863877163, 0...","[2.034512963851828, 2.407209379251587, 2.64191...",it be really nice area food park transport be ...,1.203945,1
2,"[0.11390161214456318, -0.14736751348122512, 0....","[2.091150860411193, 4.119858950780223, 4.87009...",we have very nice stay in berlin thanks to jan...,8.632043,1
3,"[0.1436133694443176, 0.14592163170361538, 0.09...","[1.8501655371557688, 2.159718877149754, 2.7123...",great location close to mauerpark kastanienall...,7.758851,1
4,"[0.12993493434599254, 0.13770945493730063, 0.0...","[3.927308953862587, 3.246732537406179, 2.84084...",apartment very well locate close to everything...,2.161046,1


In [117]:
dfNegativeSentiment = replacement_df[replacement_df['prediction'] == 0]
dfNegativeSentiment.sort_values(by=['sentiment_rate'], inplace=True)
dfNegativeSentiment['sentence'].head().tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


['accuracy pretty sure this place be low income housing the building be super sketchy the lobby area be disgust dark and full of graffiti tags the elevator be the same way the wall and floor of hallway lead into the apartment look like they ve never be clean the neighbor have their possession scatter about the hallway it say it have three bedroom but there be two plus bed in the living room which be openly connect to the rest of the residence every electrical socket and light fixture be hang out of the wall or ceiling the stove have be disable and be connect to propane tank there be piece of raw plywood hang randomly attach to wall jet out into area of traffic the nicer furniture in the photo have be remove check in from the street it be difficult to see the building we be suppose to enter there be very little lighting in the area and there aren easily visible sign with marking everything around the entrance look be dirty and cover in graffiti tag have make plan to meet chadi between a

In [118]:
# replacement_df.to_csv('sentiment_dataset_2_clusters.csv', sep=',', index=False, header=True)
#

In [119]:
# wcss = []
# for i in range(1, 11):
#     kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
#     kmeans.fit(X=w2v_model.wv.vectors.astype('double'))
#     # inertia_ is sum of squared distance of samples to its closest cluster centers.
#     wcss.append(kmeans.inertia_)
#     print("inertia_", kmeans.inertia_)
#     # print("wcss", within_css(kmeans))
# plt.plot(range(1, 11), wcss)
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()
#

In [120]:
# from sklearn.metrics import silhouette_score
#
# def kmeansSilhouette(X,range_clusters):
#     for i, k in range_clusters :
#
#         # Run the Kmeans algorithm
#         km = KMeans(n_clusters = k, init = 'k-means++', random_state = 42)
#
#         km.fit(X)
#         labels = km.predict(X)
#
#         print("For n_clusters =", k,
#                   "The computed average silhouette_score is :", silhouette_score(X, labels, metric='euclidean'))
#

In [121]:
# rangeClusters = enumerate([2,3,4,5,6,7,8,9,10])
# kmeansSilhouette(w2v_model.wv.vectors.astype('double'), rangeClusters)

## 3-clusters

In [122]:
kmeans_model = KMeans(n_clusters=3, max_iter=1000, random_state=42, n_init=50)
kmeans_model.fit(X=w2v_model.wv.vectors.astype('double'))

KMeans(max_iter=1000, n_clusters=3, n_init=50, random_state=42)

In [140]:
w2v_model.wv.similar_by_vector(kmeans_model.cluster_centers_[2], topn=10, restrict_vocab=None)

[('host', 0.6405877470970154),
 ('franck', 0.5894724130630493),
 ('jonna', 0.5868253111839294),
 ('nasir', 0.5857282876968384),
 ('yasemin', 0.5811430215835571),
 ('frieda', 0.5712906122207642),
 ('cordula', 0.566154956817627),
 ('valentina', 0.5637852549552917),
 ('fanny', 0.5616538524627686),
 ('emre', 0.5603703260421753)]

In [142]:
negative_cluster_index = 0
positive_cluster_index = 1
negative_cluster_center = kmeans_model.cluster_centers_[negative_cluster_index]
positive_cluster_center = kmeans_model.cluster_centers_[positive_cluster_index]

In [125]:
# dfWords = pd.DataFrame(w2v_model.wv.vocab.keys())

dfWords = pd.DataFrame(w2v_model.wv.key_to_index.keys())
dfWords.columns = ['words']
dfWords['vectors'] = dfWords.words.apply(lambda x: w2v_model.wv[f'{x}'])
dfWords['cluster'] = dfWords.vectors.apply(lambda x: kmeans_model.predict([np.array(x)]))
dfWords.cluster = dfWords.cluster.apply(lambda x: x[0])
dfWords.head()

Unnamed: 0,words,vectors,cluster
0,be,"[0.11147587, 0.07957115, 0.0352661, 0.2452787,...",0
1,and,"[-0.30145282, -0.041696303, 0.06714162, -0.166...",2
2,the,"[0.17333335, -0.43380734, 0.6963205, -0.070611...",0
3,to,"[-0.5403165, 0.002703896, 0.1560523, -0.411324...",1
4,in,"[0.41520724, 0.11420307, 0.59832984, -0.116240...",0


In [143]:
dfWords['cluster_value'] = [-1 if i==negative_cluster_index else 1 if i==positive_cluster_index else 0 for i in dfWords.cluster]
dfWords['closeness_score'] = dfWords.apply(lambda x: 1/(kmeans_model.transform([x.vectors]).min()), axis=1)
dfWords['sentiment_coeff'] = dfWords.closeness_score * dfWords.cluster_value
dfWords[dfWords['cluster_value'] == -1].head()

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,be,"[0.11147587, 0.07957115, 0.0352661, 0.2452787,...",0,-1,0.178454,-0.178454
2,the,"[0.17333335, -0.43380734, 0.6963205, -0.070611...",0,-1,0.142254,-0.142254
4,in,"[0.41520724, 0.11420307, 0.59832984, -0.116240...",0,-1,0.134969,-0.134969
9,it,"[0.013412308, -0.42243224, 0.15261401, 0.58773...",0,-1,0.141696,-0.141696
12,for,"[0.12189595, -0.04447002, -0.5717739, -1.14031...",0,-1,0.110504,-0.110504


In [144]:
sentiment_dict = dict(zip(dfWords.words.values, dfWords.sentiment_coeff.values))

In [145]:
replaced_closeness_scores = dfCleanedReviews['comments'].apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [146]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, dfCleanedReviews['comments']]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
# replacement_df['prediction'] = 'Positive' if replacement_df.sentiment_rate>0 else 'Negative'
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
# replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]
replacement_df.head()

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
0,"[0.0, -0.17845424863877163, 0.0, 0.0, 0.0, 0.0...","[6.45518200427728, 3.6108140688773807, 1.62336...",jan be very friendly and welcome host the apar...,-0.632116,0
1,"[-0.14169600930737072, -0.17845424863877163, 0...","[2.034512963851828, 2.407209379251587, 2.64191...",it be really nice area food park transport be ...,0.141791,1
2,"[0.0, -0.14736751348122512, 0.0, 0.0, 0.0, -0....","[2.091150860411193, 4.119858950780223, 4.87009...",we have very nice stay in berlin thanks to jan...,-0.656895,0
3,"[0.0, 0.14592163170361538, 0.09924674485275728...","[1.8501655371557688, 2.159718877149754, 2.7123...",great location close to mauerpark kastanienall...,4.695634,1
4,"[0.0, 0.0, 0.08637751005721282, 0.101727374669...","[3.927308953862587, 3.246732537406179, 2.84084...",apartment very well locate close to everything...,-2.767381,0


In [None]:
# replacement_df.to_csv('sentiment_dataset_3_clusters.csv', sep=',', index=False, header=True)

In [147]:
dfNegativeSentiment = replacement_df[replacement_df['prediction'] == 0]
dfNegativeSentiment.sort_values(by=['sentiment_rate'], inplace=True)
dfNegativeSentiment['sentence'].head().tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


['accuracy pretty sure this place be low income housing the building be super sketchy the lobby area be disgust dark and full of graffiti tags the elevator be the same way the wall and floor of hallway lead into the apartment look like they ve never be clean the neighbor have their possession scatter about the hallway it say it have three bedroom but there be two plus bed in the living room which be openly connect to the rest of the residence every electrical socket and light fixture be hang out of the wall or ceiling the stove have be disable and be connect to propane tank there be piece of raw plywood hang randomly attach to wall jet out into area of traffic the nicer furniture in the photo have be remove check in from the street it be difficult to see the building we be suppose to enter there be very little lighting in the area and there aren easily visible sign with marking everything around the entrance look be dirty and cover in graffiti tag have make plan to meet chadi between a

## Hierarchical clustering

In [129]:
# import scipy.cluster.hierarchy as sch
#
# plt.figure(figsize=(10,10))
# dendrogram = sch.dendrogram(sch.linkage(w2v_model.wv.vectors.astype('double'), method = 'ward'))
# plt.title('Dendrogram')
# plt.xlabel('Words')
# plt.ylabel('Euclidean distances')
# plt.show()
#

In [130]:
# from sklearn.cluster import AgglomerativeClustering
#
# hc = AgglomerativeClustering(n_clusters = 4, linkage = 'ward')
# y_hc = hc.fit_predict(w2v_model.wv.vectors.astype('double'))
#

In [131]:
# dfWords2 = pd.DataFrame(data=[w2v_model.wv.key_to_index.keys(), y_hc]).T
# dfWords2.columns = ['words', 'clusters']
# dfWords2['vectors'] = dfWords2.words.apply(lambda x: w2v_model.wv[f'{x}'])
# dfWords2.head()
#

In [132]:
# batch_nrs = len(w2v_model.wv.key_to_index.keys()) // 500
# batch_nrs
#

In [133]:
# silhouette_scores = []
#
# for i in range(batch_nrs):
#     silhouetteScore = silhouette_score(w2v_model.wv.vectors[i*500:(i+1)*500],
#                                        y_hc[i*500:(i+1)*500],
#                                        metric='euclidean')
#     print(silhouetteScore)
#     silhouette_scores.append(silhouetteScore)
#
# finalSilhouetteScore = np.mean(silhouette_scores)
# finalSilhouetteScore
