## Анализ тональности отзывов

In [6]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

import nltk
import re
import string

#### Список стоп-слов:

In [7]:
stop_words = nltk.corpus.stopwords.words('english')

#### Загрузим данные:

In [8]:
train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'target'])
test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col='Id')

In [9]:
sample = pd.read_csv('products_sentiment_sample_submission.csv')

In [10]:
train.text.head()

0            2 . take around 10,000 640x480 pictures .
1    i downloaded a trial version of computer assoc...
2    the wrt54g plus the hga7t is a perfect solutio...
3    i dont especially like how music files are uns...
4    i was using the cheapie pail ... and it worked...
Name: text, dtype: object

#### Функция для преобразования текста отзыва. Удаляем все символы пунктуации, слова с цифрами, производим лемматизацию:

In [11]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [12]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

regex = re.compile('[%s]' % re.escape(string.punctuation))

def filt(sent):
    sent = regex.sub('', sent)
    words = [x.strip() for x in sent.split() if not hasNumbers(x)]
    result = []
    for word in words:
        word = wordnet_lemmatizer.lemmatize(word)
        if word in stop_words:
            continue
        result.append(word)
        #result.append(word)
            
    return ' '.join(result)

In [13]:
train['text_filt'] = train.text.apply(filt)

In [14]:
test['text_filt'] = test.text.apply(filt)

#### Выберем классификатор:

In [15]:
def score(clf):
    scores = cross_val_score(clf, train.text_filt, train.target, cv=5)
    print("CLF score mean = {:.5f}, std = {:.5f}".format(scores.mean(), scores.std()))

In [16]:
for classf in [LogisticRegression, SGDClassifier, LinearSVC, MultinomialNB]:
    clf = make_pipeline(CountVectorizer(ngram_range=(1,1), stop_words='english'), classf())
    score(clf)

CLF score mean = 0.75198, std = 0.01793
CLF score mean = 0.73348, std = 0.01275
CLF score mean = 0.73398, std = 0.01360
CLF score mean = 0.75448, std = 0.01255


#### Обучим классификатор:

In [30]:
clf = make_pipeline(CountVectorizer(ngram_range=(1,1)), MultinomialNB())
clf.fit(train.text_filt, train.target)

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

#### Сохраним предсказания:

In [31]:
predictions = clf.predict(test.text_filt)

In [32]:
out = pd.DataFrame(predictions, index=test.index, columns=['y'])

In [33]:
out.to_csv('submission.csv')