In [31]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings 
warnings.filterwarnings('ignore')
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
import plotly.graph_objs as go
import chart_studio.plotly as py
import cufflinks
pd.options.display.max_columns = 30
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()
from collections import Counter

In [32]:
import re
from constants import RUSSIAN_STOPWORDS, PUNCTUATION


def clean_text(text):
    text = text.lower()
    text = re.sub(r'(<\s*\w+\s*>)*(<\s*/\w+\s*>)*', '', text)
    text = [word for word in text.split(' ') if word !=' ' and word not in RUSSIAN_STOPWORDS and word not in PUNCTUATION]

    return " ".join(text)

In [33]:
from pymystem3 import Mystem

mystem = Mystem()
def lemmatize_text(text):
    text = mystem.lemmatize(text)
    text = [word for word in text if word != ' ']
    return " ".join(text)

In [34]:
reviews = pd.read_csv('data/2019-12-14/all.csv', index_col=['Unnamed: 0'])

In [35]:
reviews.head()

Unnamed: 0,text,plus,minus,language,rating,category
0,Мой любимый DKNY. Запах просто шикарный. Спаси...,,,russian,5.0,perfumes
1,"Парфюм оригинальный. Всё, как в профессиональн...",,,russian,5.0,perfumes
2,,Шикарный запах! Безумно нравится! Не пожалеете...,,russian,5.0,perfumes
3,,"Аромат очень сладкий, карамель ощущается.",Не стойкий.,russian,2.0,perfumes
4,,"Отличный сервис, восхитительные духи!",,russian,5.0,perfumes


In [36]:
reviews.describe()

Unnamed: 0,rating
count,104662.0
mean,4.708901
std,0.800245
min,1.0
25%,5.0
50%,5.0
75%,5.0
max,5.0


In [37]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104662 entries, 0 to 5665
Data columns (total 6 columns):
text        53988 non-null object
plus        72946 non-null object
minus       33866 non-null object
language    104662 non-null object
rating      104662 non-null float64
category    104662 non-null object
dtypes: float64(1), object(5)
memory usage: 5.6+ MB


In [38]:
reviews['category'].unique()

array(['perfumes', 'smartphones', 'car-electronics', 'memory-cards',
       'wearables', 'tires', 'power-banks', 'portable-speakers',
       'car-audio', 'books', 'beauty', 'small-home-appl',
       'kitchen-home-appl', 'big-home-appl', 'climate-equipment',
       'watches', 'headphones'], dtype=object)

In [39]:
print(f"Number of values in text: {reviews.text.count()}")
print(f"Number of values in plus: {reviews.plus.count()}")
print(f"Number of values in minus: {reviews.minus.count()}")

Number of values in text: 53988
Number of values in plus: 72946
Number of values in minus: 33866


In [40]:
reviews.groupby('language')['rating'].count()

language
kazakh      5867
other        177
russian    98618
Name: rating, dtype: int64

In [41]:
reviews.loc[reviews.language == 'other']

Unnamed: 0,text,plus,minus,language,rating,category
738,,,"Запах норм, но не стойкий. Это меня растроило....",other,3.0,perfumes
819,,.,"Запах мне не понравился. Хотела отправить, но ...",other,1.0,perfumes
1204,Best of the best,,,other,5.0,perfumes
1828,,One love!,,other,5.0,perfumes
1913,Number one!,,,other,5.0,perfumes
...,...,...,...,...,...,...
3232,I liked it very much.,,,other,5.0,headphones
3450,Must have!,"Сейчас я все раскидаю, ребята! Эти наушники ну...",,other,5.0,headphones
3632,,"Keremet estiledi, qysta o'te yn'g'aily.",,other,5.0,headphones
3990,,100/100,,other,5.0,headphones


In [42]:
category_count = reviews.groupby('category')['rating'].mean()

In [43]:
category_count

category
beauty               4.627750
big-home-appl        4.724496
books                4.916515
car-audio            4.670774
car-electronics      4.562733
climate-equipment    4.626050
headphones           4.524709
kitchen-home-appl    4.706973
memory-cards         4.893773
perfumes             4.405961
portable-speakers    4.712575
power-banks          4.792350
small-home-appl      4.696167
smartphones          4.787580
tires                4.822464
watches              4.673987
wearables            4.771816
Name: rating, dtype: float64

In [44]:
category_count.iplot(kind='bar', yTitle='Count', linecolor='black', opacity=0.8,
                                                           title='Reviews per Category', xTitle='Category')


In [45]:
category_avg_rating = reviews.groupby('category')['rating'].mean()

In [46]:
category_avg_rating.iplot(
    kind='bar',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Review Rating Distribution')

In [47]:
reviews = reviews.fillna('')

In [48]:
reviews = reviews.loc[reviews.language == 'russian']

In [49]:
reviews.head(5)


Unnamed: 0,text,plus,minus,language,rating,category
0,Мой любимый DKNY. Запах просто шикарный. Спаси...,,,russian,5.0,perfumes
1,"Парфюм оригинальный. Всё, как в профессиональн...",,,russian,5.0,perfumes
2,,Шикарный запах! Безумно нравится! Не пожалеете...,,russian,5.0,perfumes
3,,"Аромат очень сладкий, карамель ощущается.",Не стойкий.,russian,2.0,perfumes
4,,"Отличный сервис, восхитительные духи!",,russian,5.0,perfumes


In [50]:
reviews['combined_text'] = reviews.text + ' ' + reviews.plus + ' ' + reviews.minus


In [51]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [52]:
common_words = get_top_n_words(reviews['combined_text'], 20)
for word, freq in common_words:
    print(word, freq)

не 47800
очень 37198
все 30876
на 27224
телефон 20054
что 17775
супер 13704
за 13502
но 13171
для 12736
br 12733
хороший 12305
нет 11764
спасибо 11645
отличный 10760
как 10348
отлично 9885
качество 9856
работает 9837
хорошо 9691


In [53]:
top_frequent = pd.DataFrame(common_words, columns = ['review' , 'count'])

In [54]:
top_frequent.groupby('review').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in review before removing stop words')