In [1]:
%load_ext autoreload
%autoreload 2

import warnings
import pickle
warnings.filterwarnings("ignore")

import sys
from os.path import join

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from tqdm import tqdm
# Custom imports: add the scripts directory to sys.path (only once, so re-running
# the cell does not append duplicates) before the imports that follow.
# NOTE(review): presumably data_preparation and text_utils live in ../scripts — confirm.
SCRIPTS_PATH = '../scripts'
if SCRIPTS_PATH not in sys.path:
    sys.path.append(SCRIPTS_PATH)
    
import data_preparation
from text_utils import *

In [2]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sb
%matplotlib inline

In [3]:
# Load the four source tables; the first CSV column is used as the index in each.
dataset1_df = pd.read_csv('../data/new_data/dataset1.csv', index_col=0) # reviews
dataset2_df = pd.read_csv('../data/new_data/dataset2.csv', index_col=0) # product characteristics

# These two files are semicolon-separated.
compare_v3_df = pd.read_csv('../data/new_data/compare_v3.csv', sep=';', index_col=0) # product comparisons
views_df = pd.read_csv('../data/new_data/views.csv', sep=';', index_col=0) # user views (sessions)

In [44]:
# Preview the first rows of the reviews table.
dataset1_df.head()

Unnamed: 0,PRODUCT,CATEGORY_ID,BRAND_ID,CATEGORY_NAME,BRAND_NAME,TYPE,SUBJECT,TEXT,POSTED_DATE,RATING,BENEFITS,DRAWBACKS,RECOMMENDED,LIKES_COUNT,DISLIKES_COUNT
0,30024724,2060202,1900,PRINTERS LASER,Kyocera,REVIEW,,Что особенно ценно - при профилактике принтера...,23.10.16 22:26:00,5.0,"хорошая скорость печати, двусторонняя печать, ...","Рядом ""на стол"" не поставишь, место много зани...",1.0,2.0,0.0
1,30024724,2060202,1900,PRINTERS LASER,Kyocera,REVIEW,,Он у нас три месяца в офисе стоит уже - большо...,17.04.16 11:17:42,5.0,,,,0.0,2.0
2,30024725,2060202,1900,PRINTERS LASER,Kyocera,REVIEW,,"Просто небо и земля с тем принтером, который у...",15.04.16 20:22:47,4.0,,,,0.0,1.0
3,30024727,2060101,1900,ALL-IN-ONE LASER,Kyocera,REVIEW,С такой вроде бы навороченной техникой все смо...,У нас в кабинете недавно его поставили. Первое...,18.04.16 20:01:57,5.0,,,,0.0,0.0
4,30024727,2060101,1900,ALL-IN-ONE LASER,Kyocera,REVIEW,,Очень многофункциональная штука. Есть все нео...,23.04.16 12:03:39,5.0,,,,1.0,0.0


In [45]:
# Per-product summary: mean RATING, plus the number of rows with a non-null
# CATEGORY_ID (used below as the number of reviews for the product).
temp_df = (
    dataset1_df
    .groupby('PRODUCT')
    .agg({'RATING': 'mean', 'CATEGORY_ID': 'count'})
)

In [49]:
# Keep products with more than 4 counted rows and order them from the lowest
# to the highest mean rating.
temp_df.loc[temp_df['CATEGORY_ID'] > 4].sort_values(by='RATING')

Unnamed: 0_level_0,RATING,CATEGORY_ID
PRODUCT,Unnamed: 1_level_1,Unnamed: 2_level_1
20022309,1.000000,5
50048292,1.000000,7
50037426,1.000000,11
30015709,1.000000,7
50037074,1.000000,5
50049381,1.000000,12
50045082,1.166667,6
20022764,1.166667,6
50041631,1.166667,6
50045164,1.200000,5


In [17]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from pymystem3 import Mystem


class ClassificationModel:
    """Text classifier: CountVectorizer -> TfidfTransformer -> SGDClassifier.

    ``fit`` and ``predict`` lemmatize every input text with Mystem before it
    reaches the pipeline. The individual stages stay accessible through
    ``pipeline_stages`` (list of ``(name, estimator)`` tuples) and the
    assembled pipeline through ``pipeline``.

    Parameters
    ----------
    max_features : int, default 100000
        Vocabulary size limit passed to ``CountVectorizer``.
    random_state : int or None, default None
        Seed passed to ``SGDClassifier``. The default keeps the original
        unseeded behaviour; pass an integer for reproducible coefficients.
    """

    def __init__(self, max_features=100000, random_state=None):
        self.mystem = Mystem()
        self.pipeline_stages = [('vect', CountVectorizer(max_features=max_features)),
                                ('tfidf', TfidfTransformer()),
                                ('clf', SGDClassifier(n_jobs=-1, random_state=random_state))]

        self.pipeline = Pipeline(self.pipeline_stages)
        # Feature index -> word. Filled by fit(); callers that fit
        # ``self.pipeline`` directly may also assign it themselves.
        self.inverse_vocabulary = None

    def preprocess_text(self, text):
        """Lemmatize ``text`` with Mystem and return the joined, stripped result."""
        return ''.join(self.mystem.lemmatize(text)).strip()

    def fit(self, x_train, y_train):
        """Lemmatize ``x_train``, fit the pipeline and refresh ``inverse_vocabulary``.

        NOTE(review): the labels go through ``np.round`` before fitting, which
        presumably assumes numeric targets — confirm before passing string labels.
        """
        x_train = [self.preprocess_text(x) for x in x_train]
        self.pipeline.fit(x_train, np.round(y_train))
        vect_voc = self.pipeline_stages[0][1].vocabulary_
        self.inverse_vocabulary = {index: word for word, index in vect_voc.items()}

    def _index_to_word(self):
        """Return the feature index -> word mapping used to label coefficients.

        An explicitly stored ``inverse_vocabulary`` takes precedence. When none
        has been set (the pipeline was fitted without going through ``fit``),
        the mapping is derived from the vectorizer's ``vocabulary_`` instead of
        failing with AttributeError. The derived mapping is not cached, so it
        always reflects the vectorizer's current vocabulary.
        """
        stored = getattr(self, 'inverse_vocabulary', None)
        if stored is not None:
            return stored
        vocabulary = self.pipeline_stages[0][1].vocabulary_
        return {index: word for word, index in vocabulary.items()}

    def get_words_importance(self, coef_idx):
        """Return ``(word, coefficient)`` pairs for row ``coef_idx`` of ``coef_``.

        The pairs cover every feature of the classifier and are sorted by
        coefficient, largest first.
        """
        weights = self.pipeline_stages[-1][1].coef_[coef_idx]
        index_to_word = self._index_to_word()
        pairs = [(index_to_word[i], weight) for i, weight in enumerate(weights)]
        return sorted(pairs, key=lambda pair: pair[1], reverse=True)

    def predict(self, x_data):
        """Lemmatize ``x_data`` and return the pipeline's predictions for it."""
        x_data = [self.preprocess_text(x) for x in x_data]
        return self.pipeline.predict(x_data)
    
def features_extractor(dataset):
    """Return the ``'TEXT'`` and ``'NAME'`` entries of ``dataset`` as a pair.

    The lookup is done on the ``dataset`` argument itself; the previous
    version ignored it and read a global named ``training_data``.
    """
    return dataset['TEXT'], dataset['NAME']


# Create the (still unfitted) model; this also starts a Mystem instance.
clf_model = ClassificationModel()

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 312 µs


In [18]:
# Earlier variant of the inputs, kept for reference:
#X_data = training_data['TEXT']
# y_data = training_data['VALUE']

# Features are the raw review texts; targets are the category names with
# surrounding whitespace removed.
X_data = dataset1_df['TEXT']
y_data = dataset1_df['CATEGORY_NAME'].str.strip()

# The sklearn pipeline is fitted directly rather than through
# ClassificationModel.fit, so the texts are not lemmatized and the labels are
# not rounded.
clf_model.pipeline.fit(X_data, y_data)

# Rebuild the feature index -> word mapping that get_words_importance reads.
vect_voc = clf_model.pipeline_stages[0][1].vocabulary_
clf_model.inverse_vocabulary = {index: word for word, index in vect_voc.items()}

In [8]:
# Show the class labels held by the classifier (the last pipeline stage).
list(clf_model.pipeline_stages[-1][1].classes_)

['3D AUDIO SYSTEM',
 '3D CAMCORDERS DIGITAL',
 '3D DIGITAL CAMERAS',
 'ACCESSORIES FOR ACOUSTIC SYSTEMS',
 'ACCESSORIES FOR APPLE SMARTPHONES',
 'ACCESSORIES FOR APPLE TABLETS',
 'ACCESSORIES FOR CAMCORDERS',
 'ACCESSORIES FOR CLIMATE TECHNICS',
 'ACCESSORIES FOR COOKERS & OVENS',
 'ACCESSORIES FOR COSMETIC TOOLS',
 'ACCESSORIES FOR DISH WASHERS',
 'ACCESSORIES FOR E-BOOKS',
 'ACCESSORIES FOR GAMES CONSOLES DS',
 'ACCESSORIES FOR HAIR CARE',
 'ACCESSORIES FOR HOODS',
 'ACCESSORIES FOR IRON BOARDS',
 'ACCESSORIES FOR MACBOOK',
 'ACCESSORIES FOR MICROWAVES OVENS',
 'ACCESSORIES FOR MUSICAL INSTRUMENTS',
 'ACCESSORIES FOR NINTENDO WII',
 'ACCESSORIES FOR PLAYSTATION 2 (PS 2)',
 'ACCESSORIES FOR PLAYSTATION 3 (PS 3)',
 'ACCESSORIES FOR PLAYSTATION 4 (PS 4)',
 'ACCESSORIES FOR PLAYSTATION PORTABLE (PS',
 'ACCESSORIES FOR PORTABLE SPEAKERS',
 'ACCESSORIES FOR PRINTERS',
 'ACCESSORIES FOR RADIO-CONTROLLED DEVICES',
 'ACCESSORIES FOR REFREGIRATORS',
 'ACCESSORIES FOR SHA',
 'ACCESSORIES FOR SM

In [33]:
# Position of the 'SMARTPHONES' label in the classifier's class list; it is
# used as the coefficient row index below.
cat_id = list(clf_model.pipeline_stages[-1][1].classes_).index('SMARTPHONES')
# NOTE(review): despite its name, tablet_keywords holds the SMARTPHONES weights.
tablet_keywords = clf_model.get_words_importance(cat_id)
# Display the 100 words with the largest coefficients.
tablet_keywords[:100]

[('смартфон', 3.2070159836651086),
 ('новинке', 1.5102663298988503),
 ('xiaomi', 1.3034761929789995),
 ('телефон', 1.1760754025064071),
 ('фронтальная', 0.97215511994372472),
 ('redmi', 0.86432315840739238),
 ('мнение', 0.79269780697509074),
 ('мп', 0.77007407350922819),
 ('htc', 0.75551377735007763),
 ('камера', 0.74250504097177217),
 ('4a', 0.70359570983741904),
 ('коммуникатор', 0.70239346580845108),
 ('приложения', 0.67965757983623576),
 ('симки', 0.64838952746486767),
 ('батарея', 0.58856291060419863),
 ('смартфона', 0.58555807189506781),
 ('приложений', 0.57410396170463063),
 ('андроид', 0.56323140182829001),
 ('nokia', 0.53744097992930862),
 ('телефоном', 0.53645560604228482),
 ('смартфонов', 0.51104386060228479),
 ('телефона', 0.48031927947278519),
 ('сим', 0.47926772618158653),
 ('4g', 0.47755243569828854),
 ('селфи', 0.47475937436075821),
 ('смарт', 0.4746405616619353),
 ('gps', 0.47178871075026063),
 ('задняя', 0.46630326797133997),
 ('смартфоне', 0.46448514023245352),
 ('те

In [39]:
from collections import OrderedDict

# For every class of the classifier, keep the words whose coefficient is
# non-zero, in the order returned by get_words_importance (largest first).
category_keywords = {}
for idx, category_name in enumerate(clf_model.pipeline_stages[-1][1].classes_):
    words_importance = clf_model.get_words_importance(idx)
    category_keywords[category_name] = OrderedDict(
        (word, weight) for word, weight in words_importance if weight != 0
    )

In [40]:
import pickle
# Persist the per-category keyword weights. The context manager closes the
# file (and flushes the data) even if pickling raises.
with open('../dumps/category_keywords.pkl', 'wb') as dump_file:
    pickle.dump(category_keywords, dump_file)

In [318]:
import eli5

# Take one review text of product 20035335. sample() is called without a
# random_state, so a different review may be picked on every run.
product_text = (
    dataset1_df
    .loc[dataset1_df['PRODUCT'] == 20035335, 'TEXT']
    .sample(1)
    .iloc[0]
)

# Show the per-word contributions of that text towards the 'IRONS' target,
# using the fitted vectorizer (first stage) and classifier (last stage).
eli5.show_prediction(
    clf_model.pipeline_stages[-1][1],
    product_text,
    vec=clf_model.pipeline_stages[0][1],
    target_names=list(clf_model.pipeline_stages[-1][1].classes_),
    targets=['IRONS']
)

Contribution?,Feature
12.024,Highlighted in text (sum)
-1.066,<BIAS>
