### Copyright (C) 2020 Sobhan Moradiyan Daghigh - All Rights Reserved

## Data Mining UniProj - no.4
#### 1/6/2021

In [22]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag

import string
# nltk.download()

### Reading the dataset

In [2]:
quality = pd.read_excel('dataset/keifiat.xlsx')

In [3]:
len(quality['recommend'][quality['recommend'] == '\\N']), len(quality['recommend'][quality['recommend'] != '\\N']), 

(36382, 63618)

In [4]:
dataset = quality.filter(['product_id', 'comment', 'recommend']).copy()
dataset.head()

Unnamed: 0,product_id,comment,recommend
0,3692,واقعا عالیه. من که ازش خیلی راضیم,\N
1,90213,سلام، قبل اینکه نظرم رو بگم میخواستم به یک موض...,recommended
2,59473,گیره های فلزی خیلی سخت تا میشوند و لذا حوله را...,not_recommended
3,120499,همه چیز در رابطه با ظاهر این گوشی بسیار خوب اس...,no_idea
4,67200,اگر ظرفیتش براتون کافیه حتما بخرید.\r\nیه شارژ...,no_idea


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   product_id  100000 non-null  int64 
 1   comment     99883 non-null   object
 2   recommend   100000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [6]:
dataset = dataset[dataset['recommend'] != '\\N']
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63618 entries, 1 to 99999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product_id  63618 non-null  int64 
 1   comment     63586 non-null  object
 2   recommend   63618 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


In [7]:
dataset.dropna(inplace=True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63586 entries, 1 to 99999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product_id  63586 non-null  int64 
 1   comment     63586 non-null  object
 2   recommend   63586 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


In [145]:
dataset.to_csv('preprocessed_dataset.csv', index=False, encoding='utf-8-sig')

In [222]:
translated = pd.read_csv('translate_dataset.csv')
translated.head()

Unnamed: 0,product_id,comment,recommend,eng_comment
0,111178,در کل وسیله ای خوب و مورد نیاز است. اگر یک میل...,recommended,Overall a good and needed tool. If a flat bar ...
1,180451,تو خریدش شک نکنید،عالی و بسیار کاربردی,recommended,"Do not hesitate to buy it, excellent and very ..."
2,195191,فست شارژ داره که خیلی خوبه\r\nکیفیت بدنش عالیه...,recommended,"It has a fast charge, which is very good The ..."
3,195191,بیش از ۲ سال پیش از دیجی خریدم با وجود ناراضی...,recommended,"I bought it from Digi more than 2 years ago, d..."
4,180451,واقعا خیلی ازش راضی هستم\r\nبی نقص,recommended,I am really very satisfied with it flawless


In [223]:
translated = translated.drop(['comment'], axis=1)
translated.iloc[:, 2] += translated.iloc[:, 1]
translated.head()

Unnamed: 0,product_id,recommend,eng_comment
0,111178,recommended,Overall a good and needed tool. If a flat bar ...
1,180451,recommended,"Do not hesitate to buy it, excellent and very ..."
2,195191,recommended,"It has a fast charge, which is very good The ..."
3,195191,recommended,"I bought it from Digi more than 2 years ago, d..."
4,180451,recommended,I am really very satisfied with it flawless r...


In [224]:
translated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   product_id   1176 non-null   int64 
 1   recommend    1176 non-null   object
 2   eng_comment  1176 non-null   object
dtypes: int64(1), object(2)
memory usage: 27.7+ KB


In [225]:
preprocessed_comments = []
for comment in translated['eng_comment']:
    # Lowercase
    comment = comment.lower()
    # Removing Punctuation
    comment = "".join([char for char in comment if char not in string.punctuation])
    # Word Tokenization
    comment = word_tokenize(comment)
    # Stopword Filtering
    stop_words = stopwords.words('english')
#     [stop_words.remove(x) for x in ['not', "isn't", 'very', "don't", "aren't", "haven't"]]
    comment = [word for word in comment if word not in stop_words]
    # Filter 
    comment = pos_tag(comment)
    comment = [word for word, tag in comment if tag in ('VB', 'VBN', 'VBD', 'VBP', 'RB', 'JJ', 'JJR' )]
    # Stemming
    porter = PorterStemmer()
    comment = [porter.stem(word) for word in comment]
    
    preprocessed_comments.append(' '.join(comment))


In [226]:
preprocessed_comments

['overal need flat ad shoulder better recommend',
 'buy excel recommend',
 'fast good excel scratch still healthi energ overal great think buy main recommend',
 'bought ago bought xiaomi work realli well shaki light weight small recommend',
 'realli satisfi recommend',
 'realli satisfi honestli xiaomi great recommend',
 'regular everywher high special high kala also seen test bought recommend',
 'recommend',
 'also got dj almost half weaken batteri damag take went sea antiallerg overal complet satisfi recommend',
 'bought watch good recommend',
 'charg amp good comfort beauti well recommend',
 'notic notic announc automat automat recommend',
 'uniqu charg properli mayb fulli charg recommend',
 'interest new updat english english save name updat mi fit also variou wrist bluetooth poli mobil',
 'second buy great recommend',
 'well well luckili screen light weak satisfi also call',
 'got amaz ago fulli charg high recommend constantli tavern quickli good drum groov recommend',
 'receiv ful

In [227]:
vectorizer = TfidfVectorizer(stop_words={'english'})
X = vectorizer.fit_transform(translated['eng_comment'])

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

Sum_of_squared_distances = []
K = range(1,10)

for k in K:
   km = KMeans(n_clusters=k, max_iter=500, n_init=20)
   km = km.fit(X)
   Sum_of_squared_distances.append(km.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [185]:
# import matplotlib.pyplot as plt
# from sklearn.cluster import KMeans

# Sum_of_squared_distances = []
# K = range(2,10)

# for k in K:
#    km = KMeans(n_clusters=k, max_iter=1000, n_init=100')
#    km = km.fit(X)
#    Sum_of_squared_distances.append(km.inertia_)

# plt.plot(K, Sum_of_squared_distances, 'bx-')
# plt.xlabel('k')
# plt.ylabel('Sum_of_squared_distances')
# plt.title('Elbow Method For Optimal k')
# plt.show()

In [217]:
true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10)
model.fit(X)
labels=model.labels_

translated['cluster'] = labels
translated

Unnamed: 0,product_id,recommend,eng_comment,cluster
0,111178,recommended,Overall a good and needed tool. If a flat bar ...,1
1,180451,recommended,"Do not hesitate to buy it, excellent and very ...",2
2,195191,recommended,"It has a fast charge, which is very good The ...",3
3,195191,recommended,"I bought it from Digi more than 2 years ago, d...",3
4,180451,recommended,I am really very satisfied with it flawless r...,0
...,...,...,...,...
1171,416458,recommended,Overall good. recommended,0
1172,416458,no_idea,The lace part should have a lace edge that hol...,1
1173,319878,recommended,It is well made and of good quality for its pr...,0
1174,416458,recommended,"Excellent, I can buy it in a couple of weeks, ...",2


In [218]:
groups = translated.groupby(by='cluster')
groups.first()

Unnamed: 0_level_0,product_id,recommend,eng_comment
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,180451,recommended,I am really very satisfied with it flawless r...
1,111178,recommended,Overall a good and needed tool. If a flat bar ...
2,180451,recommended,"Do not hesitate to buy it, excellent and very ..."
3,195191,recommended,"It has a fast charge, which is very good The ..."
4,195191,recommended,"We are really satisfied ... Honestly, Xiaomi i..."


In [219]:
for grp, pdf in groups:
    print("grp:", grp, "values:\n", pd.DataFrame(pdf['recommend'].tolist()).value_counts())

grp: 0 values:
 recommended    158
no_idea         20
dtype: int64
grp: 1 values:
 recommended        235
not_recommended    106
no_idea             68
dtype: int64
grp: 2 values:
 recommended    76
no_idea         2
dtype: int64
grp: 3 values:
 recommended        290
no_idea             63
not_recommended     44
dtype: int64
grp: 4 values:
 recommended    109
no_idea          5
dtype: int64


In [214]:
for grp, pdf in groups:
    print("grp:", grp, "values:\n", pd.DataFrame(pdf['recommend'].tolist()).value_counts())

grp: 0 values:
 recommended        381
no_idea             44
not_recommended      6
dtype: int64
grp: 1 values:
 recommended        301
not_recommended    144
no_idea            107
dtype: int64
grp: 2 values:
 recommended    78
no_idea         2
dtype: int64
grp: 3 values:
 recommended    108
no_idea          5
dtype: int64


In [211]:
for grp, pdf in groups:
    print("grp:", grp, "values:\n", pd.DataFrame(pdf['recommend'].tolist()).value_counts())

grp: 0 values:
 recommended    77
no_idea         2
dtype: int64
grp: 1 values:
 recommended        370
no_idea             26
not_recommended      1
dtype: int64
grp: 2 values:
 recommended        421
not_recommended    149
no_idea            130
dtype: int64


In [201]:
for grp, pdf in groups:
    print("grp:", grp, "values:\n", pd.DataFrame(pdf['recommend'].tolist()).value_counts())

grp: 0 values:
 recommended    340
no_idea         13
dtype: int64
grp: 1 values:
 recommended        452
not_recommended    150
no_idea            143
dtype: int64
grp: 2 values:
 recommended    76
no_idea         2
dtype: int64


In [188]:
for grp, pdf in groups:
    print("grp:", grp, "values:\n", pd.DataFrame(pdf['recommend'].tolist()).value_counts())

grp: 0 values:
 recommended    70
no_idea         2
dtype: int64
grp: 1 values:
 recommended        256
no_idea             48
not_recommended     25
dtype: int64
grp: 2 values:
 recommended        133
no_idea              6
not_recommended      2
dtype: int64
grp: 3 values:
 recommended        409
not_recommended    123
no_idea            102
dtype: int64


In [177]:
for grp, pdf in groups:
    print("grp:", grp, "values:\n", pd.DataFrame(pdf['recommend'].tolist()).value_counts())

grp: 0 values:
 recommended        375
no_idea             81
not_recommended     68
dtype: int64
grp: 1 values:
 recommended        319
not_recommended     80
no_idea             70
dtype: int64
grp: 2 values:
 recommended        103
no_idea              5
not_recommended      2
dtype: int64
grp: 3 values:
 recommended    71
no_idea         2
dtype: int64


In [165]:
for grp, pdf in groups:
    print("grp:", grp, "values:\n", pd.DataFrame(pdf['recommend'].tolist()).value_counts())

grp: 0 values:
 recommended    73
no_idea         2
dtype: int64
grp: 1 values:
 recommended        433
no_idea             46
not_recommended     19
dtype: int64
grp: 2 values:
 recommended        362
not_recommended    131
no_idea            110
dtype: int64


In [None]:
preprocessed_comments

In [136]:
text2 = 'Overall a good and needed tool. If a flat bar recommended is added about 10 to recommended 15 inches below the shoulder position, it will provide a better support. '
words = word_tokenize(text2)
stop_words = stopwords.words('english')
filtered_words = [word for word in words if word not in stop_words]
comment = pos_tag(filtered_words)
comment

[('Overall', 'RB'),
 ('good', 'JJ'),
 ('needed', 'VBN'),
 ('tool', 'NN'),
 ('.', '.'),
 ('If', 'IN'),
 ('flat', 'JJ'),
 ('bar', 'NN'),
 ('recommended', 'VBD'),
 ('added', 'VBD'),
 ('10', 'CD'),
 ('recommended', 'VBD'),
 ('15', 'CD'),
 ('inches', 'NNS'),
 ('shoulder', 'JJR'),
 ('position', 'NN'),
 (',', ','),
 ('provide', 'VB'),
 ('better', 'JJR'),
 ('support', 'NN'),
 ('.', '.')]

In [137]:
porter = PorterStemmer()
comment = [porter.stem(word) for word in [word for word, tag in comment if tag in ('VB', 'VBN', 'VBD', 'VBP', 'RB', 'JJ', 'JJR' )]]
comment

['overal',
 'good',
 'need',
 'flat',
 'recommend',
 'ad',
 'recommend',
 'shoulder',
 'provid',
 'better']

In [68]:
print([word for word, tag in comment if tag in ('VB', 'VBN', 'VBD', 'VBP', 'RB', 'JJ', 'JJR' )])

['Overall', 'good', 'needed', 'flat', 'added', 'shoulder', 'provide', 'better']


In [55]:
text = 'I am really very satisfied with it flawless'
words = word_tokenize(text)
stop_words = stopwords.words('english')
filtered_words = [word for word in words if word not in stop_words]
comment = pos_tag(filtered_words)
comment

[('I', 'PRP'), ('really', 'RB'), ('satisfied', 'VBD'), ('flawless', 'NN')]

In [56]:

text = 'Hi, I bought this watch a few months ago, I suggest you buying that, it is really and very nice and useful'
words = word_tokenize(text)
stop_words = stopwords.words('english')
filtered_words = [word for word in words if word not in stop_words]
comment = pos_tag(filtered_words)
comment

[('Hi', 'NNP'),
 (',', ','),
 ('I', 'PRP'),
 ('bought', 'VBD'),
 ('watch', 'JJ'),
 ('months', 'NNS'),
 ('ago', 'IN'),
 (',', ','),
 ('I', 'PRP'),
 ('suggest', 'VBP'),
 ('buying', 'VBG'),
 (',', ','),
 ('really', 'RB'),
 ('nice', 'JJ'),
 ('useful', 'JJ')]

In [57]:
print([(word, tag) for word, tag in comment if tag in ('VB', 'VBN', 'VBD', 'VBP', 'JJ', 'JJR' )])

[('bought', 'VBD'), ('watch', 'JJ'), ('suggest', 'VBP'), ('nice', 'JJ'), ('useful', 'JJ')]


In [None]:
def claculate_score():
    