# Sentiment analysis example

Let's get a dataset with movie reviews from NLTK:

In [1]:
from nltk.corpus import movie_reviews
 
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

print(negids[:5])

[u'neg/cv000_29416.txt', u'neg/cv001_19502.txt', u'neg/cv002_17424.txt', u'neg/cv003_12683.txt', u'neg/cv004_12641.txt']


In [2]:
import nltk

Prepare the train set:

In [3]:
negfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in negids]
posfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in posids]

texts = negfeats + posfeats
labels = [0] * len(negfeats) + [1] * len(posfeats)

In [4]:
print (texts[0])

plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what ' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn ' t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems with the movie ? well , its main problem is that it ' s simply too jumbled . it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no idea

Importing packages:

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
import gensim

ImportError: No module named gensim

### Estimating quality (cross validation)

In [6]:
def text_classifier(vectorizer, transformer, classifier):
    return Pipeline(
            [("vectorizer", vectorizer),
            ("transformer", transformer),
            ("classifier", classifier)]
        )

In [7]:
for clf in [LogisticRegression, LinearSVC, SGDClassifier]:
    print clf
    print cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), clf()), texts, labels).mean()
    print "\n"

<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.813511115906


<class 'sklearn.svm.classes.LinearSVC'>
0.845507183831


<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
0.831518644393




### Fitting classifier on all data

In [8]:
clf_pipeline = Pipeline(
            [("vectorizer", TfidfVectorizer()),
            ("classifier", LinearSVC())]
        )


clf_pipeline.fit(texts, labels)

print clf_pipeline

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_id...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])


In [9]:
print clf_pipeline.predict(["Amazing film! I will advice it to all my friends. Genious",
                           "Awful film! The man who advised me to watch it is really crazy idiot."])

[1 0]


## Dimensionality reduction and sentiment analysis

In [10]:
%%time
from sklearn.decomposition import NMF, TruncatedSVD

v = CountVectorizer()
mx = v.fit_transform(texts)
mf = TruncatedSVD(10)
u = mf.fit_transform(mx)

CPU times: user 1.69 s, sys: 576 ms, total: 2.27 s
Wall time: 2.14 s


In [12]:
for transform in [TruncatedSVD]:
    print transform
    print cross_val_score(text_classifier(CountVectorizer(), transform(n_components=10), LinearSVC()), texts, labels).mean()
    print "\n"


<class 'sklearn.decomposition.truncated_svd.TruncatedSVD'>
0.549528570487







Если задать n_components=1000:

In [13]:
%%time
print cross_val_score(text_classifier(TfidfVectorizer(), TruncatedSVD(n_components=1000), LinearSVC()),
                      texts, 
                      labels
                     ).mean()

KeyboardInterrupt: 

## Trees ensembles

In [73]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [15]:
%%time
print cross_val_score(
    Pipeline([
            ("vectorizer", CountVectorizer()),
            ("transformer", TruncatedSVD(100)),
            ("classifier", RandomForestClassifier(100))
        ]),
    texts,
    labels
    )

[ 0.68113772  0.71171171  0.68168168]
CPU times: user 12.8 s, sys: 3.1 s, total: 15.9 s
Wall time: 9.7 s


More components and trees:

In [16]:
%%time
print cross_val_score(text_classifier(CountVectorizer(), TruncatedSVD(n_components=1000), RandomForestClassifier(1000)),
                      texts, 
                      labels
                     ).mean()

KeyboardInterrupt: 

Tf*Idf:

In [None]:
%%time
print cross_val_score(text_classifier(TfidfVectorizer(), TruncatedSVD(n_components=1000), RandomForestClassifier(1000)),
                      texts, 
                      labels
                     ).mean()

## Combining Tf*Idf and SVD

In [17]:
from sklearn.pipeline import FeatureUnion

estimators = [('tfidf', TfidfTransformer()), ('svd', TruncatedSVD(1))]
combined = FeatureUnion(estimators)

In [18]:
%%time
print (cross_val_score(
    Pipeline([
            ("vectorizer", CountVectorizer()),
            ("transformer", combined),
            ("classifier", LinearSVC())
        ]),
    texts,
    labels
    ))

[ 0.80239521  0.76426426  0.82132132]
CPU times: user 10.2 s, sys: 892 ms, total: 11.1 s
Wall time: 10.6 s


# Homework

1. Try to improve cross_val_score result using tf*idf, n-gramms, other classifiers, word2vec (see gensim word2vec tutorial) or something else
2. Write in the same ipython notebook (in Markdown cells) the problem formulation for sentiment a

In [26]:
estimators = [('tfidf', TfidfTransformer()), ('svd', TruncatedSVD(1))]
combined = FeatureUnion(estimators)



In [27]:
%%time 
print (cross_val_score(
    Pipeline([
            ("vectorizer", CountVectorizer(ngram_range=(1,3))),
            ("transformer", combined),
            ("classifier", LinearSVC())
        ]),
    texts,
    labels
    ))

[ 0.57185629  0.74924925  0.503003  ]
CPU times: user 1min 42s, sys: 3.91 s, total: 1min 46s
Wall time: 1min 44s


In [7]:
import numpy as np
import gensim.models as g


In [8]:
# trained on english wikipedia
wiki = g.Doc2Vec.load('enwiki_dbow/doc2vec.bin')

In [11]:
len(wiki.vocab)

669549

In [53]:
def to_wv_average(texts):
    wvs = []
    for text in texts:
        words = []
        for word in text.split():
            try:
                words.append(wiki.wv[word.lower()])
            except:
                continue
        wvs.append(np.array(words).mean(axis=0))
    return wvs
        

In [54]:
wvs_average = to_wv_average(texts)

In [59]:
wvs_average = np.array(wvs_average).reshape(-1, 300)

In [60]:
%%time 
print (cross_val_score(
    LinearSVC(),
    wvs_average,
    labels
    ))
# Just average each text as array of wordvecs

[ 0.72155689  0.72522523  0.73273273]
CPU times: user 324 ms, sys: 412 ms, total: 736 ms
Wall time: 1.22 s


In [68]:
from collections import defaultdict

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = 300

    def fit(self, X, y=None):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [70]:
trnsf = TfidfEmbeddingVectorizer(wiki).fit(texts)
wvs_idf = trnsf.transform(texts)

In [71]:
%%time 
print (cross_val_score(
    LinearSVC(),
    wvs_idf,
    labels
    ))

# average wordvecs with tf-idf weights

[ 0.55688623  0.56456456  0.61861862]
CPU times: user 644 ms, sys: 496 ms, total: 1.14 s
Wall time: 1.14 s


In [74]:
%%time 
print (cross_val_score(
    RandomForestClassifier(1000),
    wvs_idf,
    labels
    ))
# average wordvecs with tf-idf weights

[ 0.58383234  0.5990991   0.6006006 ]
CPU times: user 43.7 s, sys: 84 ms, total: 43.8 s
Wall time: 44 s


In [75]:
%%time 
print (cross_val_score(
    RandomForestClassifier(1000),
    wvs_average,
    labels
    ))
# average wordvecs with tf-idf weights

[ 0.70209581  0.72072072  0.73423423]
CPU times: user 42.7 s, sys: 76 ms, total: 42.8 s
Wall time: 42.9 s
