## Анализ тональности отзывов

In [15]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

import nltk
import re
import string
import pickle

#### Список стоп-слов:

In [4]:
# English stop-word list taken from NLTK's stopwords corpus.
stop_words = nltk.corpus.stopwords.words('english')

#### Загрузим данные:

In [5]:
# Training set: tab-separated file read without a header row; the two
# columns are named 'text' and 'target' here.
train = pd.read_csv(
    'data/products_sentiment_train.tsv',
    sep='\t',
    header=None,
    names=['text', 'target'],
)
# Test set: tab-separated file with a header row; the 'Id' column is the index.
test = pd.read_csv(
    'data/products_sentiment_test.tsv',
    sep='\t',
    index_col='Id',
)

In [6]:
# Preview the first rows of the 'text' column.
train['text'].head()

0            2 . take around 10,000 640x480 pictures .
1    i downloaded a trial version of computer assoc...
2    the wrt54g plus the hga7t is a perfect solutio...
3    i dont especially like how music files are uns...
4    i was using the cheapie pail ... and it worked...
Name: text, dtype: object

#### Функция для преобразования текста отзыва. Удаляем все символы пунктуации, слова с цифрами и стоп-слова, производим лемматизацию:

In [7]:
from nltk.stem import WordNetLemmatizer
# Single module-level lemmatizer instance, used by filt() below.
wordnet_lemmatizer = WordNetLemmatizer()

In [8]:
def hasNumbers(inputString):
    """Return True if at least one character of *inputString* is a digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False

# Character class matching any single character from string.punctuation.
_punct_class = '[%s]' % re.escape(string.punctuation)
regex = re.compile(_punct_class)

def filt(sent):
    """Normalize a review sentence before vectorization.

    Steps, in order:
      1. delete every punctuation character matched by ``regex``;
      2. split on whitespace and drop tokens that contain a digit;
      3. drop tokens that are stop words;
      4. lemmatize the remaining tokens with ``wordnet_lemmatizer``;
      5. drop tokens whose lemma is a stop word.

    Args:
        sent: Raw sentence text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string if
        nothing survives).
    """
    # Build the lookup set once per call instead of scanning the list
    # for every token.
    stop_set = set(stop_words)

    result = []
    # str.split() with no arguments already yields whitespace-free tokens,
    # so no extra strip() is needed.
    for word in regex.sub('', sent).split():
        # Check the surface form *before* lemmatizing: the default (noun)
        # lemmatization can rewrite a stop word into a form that is no
        # longer in the stop list (e.g. "was" -> "wa"), which would let it
        # slip through a lemma-only check.
        if hasNumbers(word) or word in stop_set:
            continue
        lemma = wordnet_lemmatizer.lemmatize(word)
        # Keep the lemma-level check too, for tokens that only become a
        # stop word after lemmatization.
        if lemma in stop_set:
            continue
        result.append(lemma)

    return ' '.join(result)

In [11]:
# Store the normalized text of every training review in a new column.
train['text_filt'] = train['text'].apply(filt)

In [12]:
# Store the normalized text of every test review in a new column.
test['text_filt'] = test['text'].apply(filt)

#### Обучим векторайзер:

In [22]:
# Unigram count vectorizer that is also given the stop-word list.
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=stop_words,
)

In [23]:
# Fit the vectorizer on the normalized training text and transform it.
data = vectorizer.fit_transform(train['text_filt'])

In [24]:
# Serialize the vectorizer to disk with pickle.
with open("models/vectorizer.pkl", "wb") as out_file:
    pickle.dump(vectorizer, out_file)

#### Обучим классификатор:

In [25]:
# Multinomial naive Bayes classifier fitted on the vectorized training
# text against the 'target' column.
clf = MultinomialNB()
clf.fit(data, train['target'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Serialize the classifier to disk with pickle.
with open("models/classif.pkl", "wb") as out_file:
    pickle.dump(clf, out_file)