In [54]:
from hazm import word_tokenize
import pandas as pd
import numpy as np

import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE

from bokeh.io import output_notebook
from bokeh.plotting import show, figure

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation notices raised by the
# libraries imported above; consider narrowing it with the `category=` argument.
warnings.filterwarnings(action='ignore')
%matplotlib inline

In [97]:
# Load the CSV stored under data/cleaned and preview its first row
df = pd.read_csv('data/cleaned/data.csv')
df.head(1)

Unnamed: 0,title,comment,text,rate,verification_status
0,کیفیت حجم صدای عالی,محصول توی بازار اصلا نمیشه تهران گشتم واقعا مح...,کیفیت حجم صدای عالی محصول توی بازار اصلا نمیشه...,10,0


## Preprocessing sentences in comments

In [98]:
# Normalise the text columns to plain strings
def set_types(df):
    """Return a copy of ``df`` whose text columns contain only ``str`` values.

    The ``title``, ``comment`` and ``text`` columns are converted value by
    value with ``str``. Missing values (NaN / None) become the empty string
    rather than the literal text ``'nan'`` / ``'None'``, so they cannot end up
    as fake words once the text is tokenised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``title``, ``comment`` and ``text`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched, which keeps the
        cell safe to re-run.

    Raises
    ------
    KeyError
        If one of the three text columns is absent.
    """
    typed = df.copy()
    for column in ('title', 'comment', 'text'):
        # Bracket access on purpose: attribute-style assignment would create a
        # plain Python attribute instead of a column if the name were missing.
        typed[column] = typed[column].map(
            lambda value: '' if pd.isna(value) else str(value)
        )
    return typed

# Re-bind df to the frame returned by set_types (title/comment/text as str)
df = set_types(df)

In [37]:
# Pull the comment column out of the frame as a plain Python list
comments = df.comment.tolist()

In [38]:
    # Tokenise every comment: `sents` holds one list of words per comment
sents = [word_tokenize(comment) for comment in comments]

# Example: the tokens of the comment at position 45
sents[45]

['بک',
 'ماهه',
 'اسنفاده',
 'میکنم',
 'واصلا',
 'نمیریزه',
 'حواست',
 'نباشه',
 'دسستتو',
 'بزنی',
 'بهش',
 'پاک',
 'نمیشه',
 'حتما',
 'ارایش',
 'پاک',
 'کن',
 'پاک',
 'کنین',
 'اب',
 'صورت',
 'میشه']

## We only train word2vec on the comments

In [39]:
# Train the word embeddings on the tokenised comments.
# NOTE(review): gensim documents that `seed` alone does not give a fully
# reproducible run when more than one worker thread is used — confirm whether
# exact reproducibility matters here before relying on seed=42.
model = Word2Vec(
    sentences=sents,
    size=64,
    window=10,
    min_count=5,
    seed=42,
    workers=5,
)

In [40]:
# Persist the model to disk so it can be reloaded without retraining
model.save('digikala_words.w2v')

## If you already have a trained model, you can load it here

In [10]:
# Restore the model previously written to 'digikala_words.w2v'
model = Word2Vec.load('digikala_words.w2v')

In [57]:
# Look up the embedding of a single word.
# The lookup goes through `model.wv` (the keyed vectors): indexing the
# Word2Vec object directly is deprecated in gensim 3.x and removed in 4.0.
model.wv['دیجیکالا']

array([ 0.27298594,  1.551736  , -3.6564288 , -2.5420392 ,  2.3090856 ,
        1.3484155 , -2.729764  ,  2.8432271 , -1.3368765 , -1.1168908 ,
       -1.9622827 , -3.1754713 ,  0.32199654, -1.1872797 ,  2.0569417 ,
        2.4265778 ,  0.6501076 ,  1.110878  , -0.99614537, -0.05294515,
       -1.9627934 ,  0.61865205, -0.7036291 ,  3.525221  , -0.86941105,
       -1.5919472 , -1.463034  , -1.7985111 ,  0.36313066,  2.8223054 ,
        0.5513516 ,  2.7963915 , -0.40075648, -2.182105  , -0.875631  ,
       -1.2463248 , -2.0293555 ,  0.51488715, -0.75486696,  0.3320152 ,
       -0.40043002, -2.0729272 ,  0.11785657, -0.14828837,  0.45061046,
       -0.96828467, -0.26408488,  0.44675866,  0.98780584, -0.86508644,
       -2.356472  , -3.797717  ,  0.78625757, -0.2029165 ,  2.296204  ,
        1.8761927 ,  0.16572428,  0.16773523,  2.2822046 ,  3.7144063 ,
       -3.7939734 ,  0.8244034 , -0.4197574 ,  0.8304381 ], dtype=float32)

In [56]:
# The size of each vector is 64.
# Accessed via `model.wv`, since indexing the model itself is deprecated.
len(model.wv['دیجیکالا'])

64

In [74]:
# The 15 words most similar to 'دیجیکالا'.
# Queried on `model.wv`; `model.most_similar` is a deprecated alias.
model.wv.most_similar('دیجیکالا', topn=15)

[('دیجی', 0.7308797240257263),
 ('دیحی', 0.7056687474250793),
 ('فروشگاه', 0.7015074491500854),
 ('فروشنده', 0.6769243478775024),
 ('دجی', 0.6692821979522705),
 ('اینترنتی', 0.663563072681427),
 ('مرسوله', 0.643081784248352),
 ('کالایی', 0.6382397413253784),
 ('کالاها', 0.6295326352119446),
 ('تامین', 0.6199561357498169),
 ('توجهتون', 0.6179360151290894),
 ('اجناس', 0.6133273839950562),
 ('ازدیجی', 0.6109930276870728),
 ('اجناسی', 0.6052873134613037),
 ('معیوب', 0.6051090955734253)]

In [73]:
# Another example: the 15 nearest neighbours of 'گوشی'.
# Queried on `model.wv`; `model.most_similar` is a deprecated alias.
model.wv.most_similar('گوشی', topn=15)

[('آیفون', 0.8086047172546387),
 ('گوشیه', 0.8054739236831665),
 ('هواوی', 0.7847117781639099),
 ('سامسونگ', 0.7785416841506958),
 ('نوت', 0.7501413822174072),
 ('نوکیا', 0.746269941329956),
 ('اندروید', 0.7396976947784424),
 ('هوآوی', 0.7322538495063782),
 ('ایفون', 0.7285028696060181),
 ('موبایل', 0.7275124788284302),
 ('پرچمدار', 0.6974676847457886),
 ('دوربین', 0.6935129165649414),
 ('s', 0.6917316913604736),
 ('تبلت', 0.6880324482917786),
 ('اپل', 0.6778054237365723)]

## Impressive work! The model is doing a great job

In [72]:
# Similarity score between two words.
# Queried on `model.wv`; `model.similarity` is a deprecated alias.
model.wv.similarity('دیجیکالا', 'فروشگاه')

0.70150745

## Let's configure settings for model visualization with bokeh

In [75]:
# Number of entries in the model's vocabulary
len(model.wv.vocab)

20881

In [77]:
# Stack the vector of every vocabulary word into one matrix (one row per word,
# in vocabulary iteration order). Indexed through `model.wv` because indexing
# the Word2Vec object directly is deprecated.
X = model.wv[model.wv.vocab]

In [79]:
# t-SNE reducer configured for a 2-D embedding.
# random_state is pinned because t-SNE is stochastic: without it every run
# produces a different layout, so the saved coordinates could not be reproduced.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [80]:
# Fit t-SNE on X and keep the transformed coordinates
X_2d = tsne.fit_transform(X)

In [81]:
# One row per vocabulary word: its two projected coordinates plus the token.
# The tokens come from iterating the same vocabulary mapping, so the order
# matches the rows of X_2d as long as X was built from that mapping.
vocab_tokens = list(model.wv.vocab)
coords_df = pd.DataFrame(X_2d, columns=['x', 'y']).assign(token=vocab_tokens)

# Save for later usage
coords_df.to_csv('digikala_tsne_word_model.csv', index=False)

In [82]:
# Preview the first rows of the coordinate table
coords_df.head()

Unnamed: 0,x,y,token
0,19.45182,34.997715,محصول
1,-28.609201,38.024948,توی
2,28.981407,39.225021,بازار
3,-21.963257,37.621536,اصلا
4,-29.204384,44.717419,نمیشه


In [84]:
# Route bokeh output to the notebook
output_notebook()

In [85]:
# Take a random subset of the tokens to plot.
# random_state is fixed so the same tokens are drawn on every run, and the
# sample size is capped at the table length because sample() raises when n
# exceeds the number of rows.
samples_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [88]:
# Draw every sampled token as a text glyph at its (x, y) position;
# the glyph renderer returned by text() is discarded via `_`.
p = figure()
_ = p.text(
    x=samples_df['x'],
    y=samples_df['y'],
    text=samples_df['token'],
)

In [89]:
# Display the figure
show(p)

As you can see, related words lie close to each other and form groups.

For example, most of the mobile phone brands cluster together:

[Sony, Note, Xperia, سامسونگ, ...]