In [16]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import movie_reviews
import nltk

In [2]:
def write_answer(assignment_N, answer):        
    with open("answ_{}.txt".format(assignment_N), "w") as fout:
        if isinstance(answer, str):
            fout.write("{}".format(answer))
        elif isinstance(answer, int):
            fout.write("{}".format(answer))
        else:
            fout.write("{:.2f}".format(answer))

In [3]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

In [4]:
negfeats = [movie_reviews.words(fileids=[f]) for f in negids]
posfeats = [movie_reviews.words(fileids=[f]) for f in posids]

In [5]:
allfeats = [' '.join(x) for x in negfeats] + [' '.join(x) for x in posfeats]
target = [0]*len(negfeats) + [1]*len(posfeats)

In [27]:
for vect in [CountVectorizer, TfidfVectorizer]:
    clf = make_pipeline( vect(), LogisticRegression() )
    scores = cross_val_score(clf, allfeats, target, scoring='accuracy', cv=5)
    print("CLF, mean = {:.2f}, std ={:.2f}".format(scores.mean(), scores.std()))

CLF, mean = 0.84, std =0.02
CLF, mean = 0.82, std =0.00


In [26]:
write_answer(1, "0.84 0.02 0.82 0")

In [11]:
for min_df in [10, 50]:
    clf = make_pipeline( CountVectorizer(min_df=min_df), LogisticRegression() )
    scores = cross_val_score(clf, allfeats, target, scoring='accuracy', cv=5)
    print("CLF(min_df = {}), mean = {:.2f}, std ={:.2f}".format(min_df, scores.mean(), scores.std()))

CLF(min_df = 10), mean = 0.84, std =0.01
CLF(min_df = 50), mean = 0.81, std =0.01


In [12]:
write_answer(2, "0.84 0.81")

In [14]:
for classif in [LogisticRegression, LinearSVC, SGDClassifier]:
    clf = make_pipeline( CountVectorizer(), classif() )
    scores = cross_val_score(clf, allfeats, target, scoring='accuracy', cv=5)
    print("CLF, mean = {:.2f}, std ={:.2f}".format(scores.mean(), scores.std()))

CLF, mean = 0.84, std =0.02
CLF, mean = 0.83, std =0.02
CLF, mean = 0.76, std =0.06


In [25]:
write_answer(3, "0.76")

In [18]:
stop_words = nltk.corpus.stopwords.words('english')

In [20]:
for sw in[stop_words, 'english']:
    clf = make_pipeline( CountVectorizer(stop_words=sw), LogisticRegression() )
    scores = cross_val_score(clf, allfeats, target, scoring='accuracy', cv=5)
    print("CLF, mean = {:.2f}, std ={:.2f}".format(scores.mean(), scores.std()))

CLF, mean = 0.84, std =0.01
CLF, mean = 0.84, std =0.01


In [21]:
write_answer(4, "0.84 0.84")

In [22]:
clfs = [
    make_pipeline(CountVectorizer(ngram_range=(1,2)), LogisticRegression()),
    make_pipeline(CountVectorizer(ngram_range=(3,4),analyzer='char_wb'), LogisticRegression()),
]

In [23]:
for clf in clfs:
    scores = cross_val_score(clf, allfeats, target, scoring='accuracy', cv=5)
    print("CLF, mean = {:.2f}, std ={:.2f}".format(scores.mean(), scores.std()))

CLF, mean = 0.85, std =0.02
CLF, mean = 0.81, std =0.01


In [24]:
write_answer(5, "0.85 0.81")