## Анализ тональности отзывов

In [6]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

import nltk
import re
import string

#### Список стоп-слов:

In [7]:
stop_words = nltk.corpus.stopwords.words('english')

#### Загрузим данные:

In [8]:
train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'target'])
test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col='Id')

In [9]:
sample = pd.read_csv('products_sentiment_sample_submission.csv')

In [10]:
train.text.head()

0            2 . take around 10,000 640x480 pictures .
1    i downloaded a trial version of computer assoc...
2    the wrt54g plus the hga7t is a perfect solutio...
3    i dont especially like how music files are uns...
4    i was using the cheapie pail ... and it worked...
Name: text, dtype: object

#### Функция для преобразования текста отзыва. Удаляем все символы пунктуации, слова с цифрами, производим лемматизацию:

In [11]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [12]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

regex = re.compile('[%s]' % re.escape(string.punctuation))

def filt(sent):
    sent = regex.sub('', sent)
    words = [x.strip() for x in sent.split() if not hasNumbers(x)]
    result = []
    for word in words:
        word = wordnet_lemmatizer.lemmatize(word)
        if word in stop_words:
            continue
        result.append(word)
        #result.append(word)
            
    return ' '.join(result)

In [13]:
train['text_filt'] = train.text.apply(filt)

In [14]:
test['text_filt'] = test.text.apply(filt)

#### Выберем классификатор:

In [15]:
def score(clf):
    scores = cross_val_score(clf, train.text_filt, train.target, cv=5)
    print("CLF score mean = {:.5f}, std = {:.5f}".format(scores.mean(), scores.std()))

In [16]:
for classf in [LogisticRegression, SGDClassifier, LinearSVC, MultinomialNB]:
    clf = make_pipeline(CountVectorizer(ngram_range=(1,1), stop_words='english'), classf())
    score(clf)

CLF score mean = 0.75198, std = 0.01793
CLF score mean = 0.73348, std = 0.01275
CLF score mean = 0.73398, std = 0.01360
CLF score mean = 0.75448, std = 0.01255


#### Посмотрим на предложения, которые мы плохо определяем:

In [17]:
from sklearn.cross_validation import train_test_split

ttrain, ttest = train_test_split(train)

In [18]:
clf = make_pipeline(CountVectorizer(ngram_range=(1,1)), MultinomialNB())
clf.fit(ttrain.text_filt, ttrain.target)
predictions = clf.predict(ttest.text_filt)

for x in ttest[ttest.target != predictions].iterrows():
    print("{}: {}".format(x[1]['target'], x[1]['text']))

0: this unit was working fine for the 1st month and a half and then the color signal disappeared . 
0: however , the battery life isn 't as good as i would like .
1: have used it frequently and have had no problems . 
0: they obviously have quality and reliability problems .
0: so , don 't buy this or upgrade .
0: recenly the variable speed control went out .
0: 2 ) the body construction - buttons , casing , etc , are too plastic . 
0: the black fly wheel feels pretty cheap & is uncomfortable to use . 
0: i took my zen micro with me over the weekend , and was forced to lug my laptop with me on the trip for one reason : to keep the zen micro charged .
0: another irriation is that if you hold the scroll button down for too long , it keeps going past the song you may have wanted to stop on . 
1: one other plus is that napster now supports this device via their software . 
0: the best thing to do with this version , is to leave it on the store shelf . it doesn 't work , and will cost you a

#### Обучим классификатор:

In [30]:
clf = make_pipeline(CountVectorizer(ngram_range=(1,1)), MultinomialNB())
clf.fit(train.text_filt, train.target)

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

#### Сохраним предсказания:

In [31]:
predictions = clf.predict(test.text_filt)

In [32]:
out = pd.DataFrame(predictions, index=test.index, columns=['y'])

In [33]:
out.to_csv('submission.csv')

## Word2Vec

In [83]:
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

from random import shuffle

In [115]:
class LabeledLineSentence(object):
    def __init__(self, train_sentences, train_labels, test_sentences):
        self.src_sentences = train_sentences.tolist() + test_sentences.tolist()
        self.src_labels = ['TRIAN_{}_{}'.format(x, i) for i, x in enumerate(train_labels)] + \
                          ['TEST_{}'.format(i) for i in range(len(test_sentences))]
    
    def __iter__(self):
        for i,sentence in enumerate(self.src_sentences):
            yield LabeledSentence(sentence.split(), [self.src_labels[i]])
    
    def to_array(self):
        self.sentences = []
        for i,sentence in enumerate(self.src_sentences):
            self.sentences.append(LabeledSentence(sentence.split(), [self.src_labels[i]]))
        return self.sentences
    
    def sentences_perm(self):
        b = self.sentences
        shuffle(b)
        return b

In [138]:
train_sentences = LabeledLineSentence(train.text_filt, train.target, test.text_filt)

In [139]:
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(train_sentences.to_array())

In [140]:
for epoch in range(10):
    model.train(train_sentences.sentences_perm())



In [142]:
model.most_similar('good')

[('handsfree', 0.9998518228530884),
 ('scratch', 0.9998512864112854),
 ('model', 0.9998455047607422),
 ('incredible', 0.9998373985290527),
 ('quite', 0.9998307824134827),
 ('drop', 0.9998292922973633),
 ('start', 0.999825656414032),
 ('required', 0.9998246431350708),
 ('picture', 0.9998235702514648),
 ('somewhat', 0.9998204708099365)]

In [144]:
model

<gensim.models.doc2vec.Doc2Vec at 0x11ed6a8d0>

In [150]:
#train_arrays = [model[['TRIAN_{}_{}'.format(x, i)]] for i,x in enumerate(train.target)]

In [152]:
train_arrays = [model.docvecs[i] for i in range(len(train.target))]

In [154]:
test_arrays = [model.docvecs[i+len(train.target)] for i in range(test.shape[0])]

In [155]:
classifier = LogisticRegression()
classifier.fit(train_arrays, train.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [157]:
cross_val_score(LogisticRegression(), train_arrays, train.target, cv=5).mean()

0.63250655629097674

#### NLTK Sentiment Analysis

In [162]:
from nltk.sentiment import SentimentAnalyzer

#### NLTK

In [168]:
from nltk.corpus import product_reviews_2

In [169]:
for file in product_reviews_2.fileids():
    print(len(product_reviews_2.reviews(file)))

1
51
49
31
48
50
49
0
0
46


In [19]:
from polyglot.text import Text