In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings 
warnings.filterwarnings('ignore')
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
import plotly.graph_objs as go
import chart_studio.plotly as py
import cufflinks
pd.options.display.max_columns = 30
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
# Put cufflinks into offline mode and set its configuration (theme 'pearl').
# NOTE(review): world_readable=True presumably marks charts as publicly
# visible when they are published — confirm this is intended for this data.
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()
from collections import Counter

In [2]:
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single-word tokens in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to tokenize with a default ``CountVectorizer``.
    n : int or None, optional
        Number of leading entries to keep. ``None`` keeps the whole
        vocabulary.

    Returns
    -------
    list of (str, count) tuples
        Every vocabulary term paired with its total count across all
        documents, ordered by count from highest to lowest, cut to ``n``.
    """
    vec = CountVectorizer()
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass. Separate fit() + transform() calls tokenized the corpus twice and
    # yielded all-zero counts when ``corpus`` was a one-shot iterator.
    bag_of_words = vec.fit_transform(corpus)
    # Column totals flattened to 1-D so they can be indexed by feature id.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    # list.sort is stable, so terms with equal counts keep vocabulary order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [3]:
def get_top_n_grams(corpus, n=None, ngram_range=(2, 2)):
    """Return the ``n`` most frequent n-grams in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to tokenize with ``CountVectorizer``.
    n : int or None, optional
        Number of leading entries to keep. ``None`` keeps every n-gram.
    ngram_range : tuple of (int, int), optional
        Passed straight to ``CountVectorizer(ngram_range=...)``. The default
        ``(2, 2)`` counts bigrams only, as this function always did.

    Returns
    -------
    list of (str, count) tuples
        Every extracted n-gram paired with its total count across all
        documents, ordered by count from highest to lowest, cut to ``n``.
    """
    vec = CountVectorizer(ngram_range=ngram_range)
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass. Separate fit() + transform() calls tokenized the corpus twice and
    # yielded all-zero counts when ``corpus`` was a one-shot iterator.
    bag_of_words = vec.fit_transform(corpus)
    # Column totals flattened to 1-D so they can be indexed by feature id.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(gram, totals[idx]) for gram, idx in vec.vocabulary_.items()]
    # list.sort is stable, so n-grams with equal counts keep vocabulary order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [4]:

# Read only the two columns used here and replace every missing value with an
# empty string.
# NOTE(review): fillna('') is applied to `rating` as well as `combined_text` —
# confirm an empty string is the intended placeholder for a missing rating.
cleaned_reviews = pd.read_csv(
    'cleaned_data.csv',
    usecols=['combined_text', 'rating'],
).fillna('')

In [5]:
# Twenty most frequent tokens in the review text, printed one
# "token count" pair per line.
common_cleaned_words = get_top_n_words(cleaned_reviews['combined_text'], n=20)
for word, freq in common_cleaned_words:
    print(f'{word} {freq}')

не 47659
очень 37195
хороший 24906
телефон 23309
отличный 18622
понравиться 17676
супер 13706
качество 12461
удобный 11824
все 11811
нет 11757
цена 11725
спасибо 11649
работать 11514
довольный 10377
отлично 9821
камера 8700
быстро 7599
товар 7135
просто 6943


In [6]:
# Two-column frame for plotting; the 'review' column holds the token text and
# 'count' its total.
top_cleaned_frequent = pd.DataFrame(
    data=common_cleaned_words,
    columns=['review', 'count'],
)

In [7]:
# Bar chart of per-token totals, sorted by count in descending order.
(
    top_cleaned_frequent
    .groupby('review')
    .sum()['count']
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 words after cleaning',
    )
)

In [8]:
# Twenty most frequent bigrams in the review text, printed one
# "bigram count" pair per line.
common_cleaned_ngrams = get_top_n_grams(cleaned_reviews['combined_text'], n=20)
for word, freq in common_cleaned_ngrams:
    print(f'{word} {freq}')

очень понравиться 5657
очень хороший 4842
очень удобный 3774
очень довольный 3600
спасибо каспи 3560
хороший телефон 3154
отличный телефон 2876
не пожалеть 2284
свой деньги 2264
просто супер 2240
цена качество 2113
каспи магазин 1637
телефон очень 1630
очень нравиться 1476
довольный покупка 1391
очень удобно 1352
все советовать 1328
не очень 1285
телефон отличный 1256
такой цена 1228


In [9]:
# Two-column frame for plotting; the 'review' column holds the bigram text and
# 'count' its total.
top_cleaned_ngrams = pd.DataFrame(
    data=common_cleaned_ngrams,
    columns=['review', 'count'],
)


In [10]:
# Bar chart of per-bigram totals, sorted by count in descending order.
(
    top_cleaned_ngrams
    .groupby('review')
    .sum()['count']
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 bigrams',
    )
)
