# Sentiment Analysis on Movie Polarity Dataset

In [139]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [140]:
data = load_files('/almacen/Media/Meta/ml/sentiment/txt_sentoken', shuffle=True )

In [142]:
idx = 2
print "Value =", data.target[idx], "\n-------\n", data.data[idx]

Value = 1 
-------
quaid stars as a man who has taken up the proffesion of dragonslayer after he feels he is betrayed by a dragon early in the movie . 
he runs into the last dragon in existence , and there is a genuinely amusing battle between the two which results in a standoff where quaid is in the dragons mouth , but has his sword pointed at the dragons brain . 
eventually , they decide to call a truce , and they work out a deal . 
since he is the last dragon , he will pretend to die and quaid will be able to get paid for it . 
their scam works at first , until they come to a town without any money . 
instead the town sacrifices a girl to the dragon , but of course , draco is a nice droagon , so he won't eat her . there is however a very amusing scene where draco is hitting on the young girl . 
of course , as you can probably tell by the plot , this is a silly movie , but it does know when to take itself seriously at the right times , unlike eddie , which was serious all the time . 

In [143]:
vect = TfidfVectorizer( min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True )
X = vect.fit_transform(data.data)

In [144]:
X.shape

(2000, 13290)

In [145]:
# See some of the very frequent words
reversed_dict = { v:k for k,v in vect.vocabulary_.iteritems() }
lowest = np.argpartition(vect.idf_,8)[:8]
key = lambda x : vect.idf_[x]
for idf in sorted( lowest, key=key ):
    print "{:10} {}".format( reversed_dict[idf], vect.idf_[idf] )

movie      1.25217150997
out        1.26186463918
like       1.28951617051
was        1.29018395016
they       1.29085217605
there      1.29085217605
so         1.29823228335
you        1.30566726183


In [146]:
# See some of the least frequent words
for i in np.argpartition(vect.idf_,-8)[-8:]:
    print "{:10} {}".format( reversed_dict[i], vect.idf_[i] )

squadron   6.80964286536
priced     6.80964286536
flounder   6.80964286536
stack      6.80964286536
stacked    6.80964286536
focal      6.80964286536
flustered  6.80964286536
flown      6.80964286536


In [147]:
# do we have sparse data?
import scipy.sparse as sp
sp.issparse(X)

True

In [148]:
# The usual train/test split
from sklearn.cross_validation import train_test_split
from sklearn.utils import check_random_state

rs = check_random_state( 1352 )
Xr, Xt, yr, yt = train_test_split( X, data.target, test_size=0.30, random_state=rs )

In [149]:
# Let's try a random forest
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50).fit( Xr, yr )
yp = clf.predict( Xt )

In [151]:
# How good did we get
from sklearn.metrics import accuracy_score, confusion_matrix

print "Accuracy =", accuracy_score( yt, yp )


Accuracy = 0.766666666667


In [152]:
print confusion_matrix( yt, yp )

[[241  52]
 [ 88 219]]


In [156]:
# Do a grid search over the parameter space. Will take a while

from sklearn.grid_search import GridSearchCV

param_grid = [
  {'n_estimators': range(6,100,4) },
 ]

clf = GridSearchCV( RandomForestClassifier(), param_grid, cv=5 )
#                    scoring='%s_weighted' % score)

clf.fit(Xr, yr)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid=[{'n_estimators': [6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 66, 70, 74, 78, 82, 86, 90, 94, 98]}],
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [157]:
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))

Best parameters set found on development set:
()
{'n_estimators': 98}
()
Grid scores on development set:
()
0.627 (+/-0.022) for {'n_estimators': 6}
0.656 (+/-0.048) for {'n_estimators': 10}
0.690 (+/-0.049) for {'n_estimators': 14}
0.709 (+/-0.029) for {'n_estimators': 18}
0.699 (+/-0.025) for {'n_estimators': 22}
0.709 (+/-0.032) for {'n_estimators': 26}
0.739 (+/-0.061) for {'n_estimators': 30}
0.735 (+/-0.035) for {'n_estimators': 34}
0.755 (+/-0.020) for {'n_estimators': 38}
0.750 (+/-0.037) for {'n_estimators': 42}
0.731 (+/-0.033) for {'n_estimators': 46}
0.759 (+/-0.019) for {'n_estimators': 50}
0.761 (+/-0.042) for {'n_estimators': 54}
0.747 (+/-0.045) for {'n_estimators': 58}
0.756 (+/-0.031) for {'n_estimators': 62}
0.751 (+/-0.038) for {'n_estimators': 66}
0.775 (+/-0.017) for {'n_estimators': 70}
0.770 (+/-0.034) for {'n_estimators': 74}
0.774 (+/-0.038) for {'n_estimators': 78}
0.789 (+/-0.044) for {'n_estimators': 82}
0.772 (+/-0.035) for {'n_estimators': 86}
0.773 (+/-0