In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier 
import matplotlib.pyplot as plt
import numpy as np
import nltk
import os
import re
import random
%matplotlib inline

In [36]:
#Load training and test data for the sentiment classifier, and keep the raw test documents.
# Paths are assembled with os.path.join so they are not tied to Windows
# backslash separators; on Windows this yields the same '.\amazon\train' string.
amazon_train = load_files(
    os.path.join('.', 'amazon', 'train'),
    description='amazon movie reviews train',
    categories=['neg', 'pos'],
    # Real boolean: the previous string 'True' only worked because any
    # non-empty string is truthy (so would 'False').
    shuffle=True,
    load_content=True,
    encoding='utf-8',
)
amazon_test = load_files(
    os.path.join('.', 'amazon', 'test'),
    description='amazon movie reviews test',
    categories=['neg', 'pos'],
    shuffle=True,
    load_content=True,
    encoding='utf-8',
)
# Raw test documents, passed to predict() by the classifier cells.
docs_test = amazon_test.data
# Array copies of the training documents and their labels.
X = np.array(amazon_train.data)
Y = np.array(amazon_train.target)

In [59]:
#Pipeline of unigram count vectorizer and multinomial naive Bayes classifier
#(no tf-idf step in this pipeline). Outputs test accuracy of classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(amazon_train.data, amazon_train.target)
predicted = text_clf.predict(docs_test)
# Accuracy = fraction of test documents whose predicted label equals the true one.
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print('%.5f' % np.mean(predicted == amazon_test.target))

0.68500


In [34]:
#Pipeline of unigram count vectorizer, tf-idf transformer, and random forest classifier
#Outputs test accuracy of classifier
text_clf_1 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer()),
    # Seed the forest so re-running the cell reproduces the same score;
    # 50 is the seed the SGDClassifier pipelines in this notebook already use.
    ('clf', RandomForestClassifier(random_state=50)),
])
text_clf_1 = text_clf_1.fit(amazon_train.data, amazon_train.target)
predicted1 = text_clf_1.predict(docs_test)
# Accuracy = fraction of test documents whose predicted label equals the true one.
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print('%.5f' % np.mean(predicted1 == amazon_test.target))

0.66500


In [35]:
# Default count vectorizer -> tf-idf -> SGD classifier with log loss,
# L2 penalty, alpha=1e-3, 5 passes, and a fixed seed for repeatability.
text_clf_2 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3,
                          n_iter=5, random_state=50)),
])
text_clf_2 = text_clf_2.fit(amazon_train.data, amazon_train.target)
predicted2 = text_clf_2.predict(docs_test)
# Test accuracy; parenthesised print is valid on both Python 2 and Python 3.
print('%.5f' % np.mean(predicted2 == amazon_test.target))

0.76500


In [22]:
# Default count vectorizer -> tf-idf -> SVC with all-default settings.
text_clf_3 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])
text_clf_3 = text_clf_3.fit(amazon_train.data, amazon_train.target)
predicted3 = text_clf_3.predict(docs_test)
# Test accuracy; parenthesised print is valid on both Python 2 and Python 3.
print('%.5f' % np.mean(predicted3 == amazon_test.target))

0.64000


In [49]:
# Same SGD settings as text_clf_2, but the vectorizer also emits bigrams
# (ngram_range=(1, 2)) before the tf-idf step.
text_clf_4 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3,
                          n_iter=5, random_state=50)),
])
text_clf_4 = text_clf_4.fit(amazon_train.data, amazon_train.target)
predicted4 = text_clf_4.predict(docs_test)
# Test accuracy; parenthesised print is valid on both Python 2 and Python 3.
print('%.5f' % np.mean(predicted4 == amazon_test.target))

0.73000


In [50]:
# Same pipeline as text_clf_2 with the SGD alpha raised from 1e-3 to 1e-2.
text_clf_5 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-2,
                          n_iter=5, random_state=50)),
])
text_clf_5 = text_clf_5.fit(amazon_train.data, amazon_train.target)
predicted5 = text_clf_5.predict(docs_test)
# Test accuracy; parenthesised print is valid on both Python 2 and Python 3.
print('%.5f' % np.mean(predicted5 == amazon_test.target))

0.74500


In [55]:
# Default count vectorizer -> tf-idf -> SVC with all-default settings.
# NOTE(review): this is the same configuration as text_clf_3 earlier in the
# notebook; one of the two cells is probably redundant.
text_clf_6 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])
text_clf_6 = text_clf_6.fit(amazon_train.data, amazon_train.target)
predicted6 = text_clf_6.predict(docs_test)
# Test accuracy; parenthesised print is valid on both Python 2 and Python 3.
print('%.5f' % np.mean(predicted6 == amazon_test.target))

0.64000


In [52]:
# Default count vectorizer -> tf-idf -> SVC with a linear kernel
# (other SVC settings left at their defaults).
text_clf_7 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(kernel='linear')),
])
text_clf_7 = text_clf_7.fit(amazon_train.data, amazon_train.target)
predicted7 = text_clf_7.predict(docs_test)
# Test accuracy; parenthesised print is valid on both Python 2 and Python 3.
print('%.5f' % np.mean(predicted7 == amazon_test.target))

0.71500


In [53]:
# Linear-kernel SVC as in text_clf_7, with C set explicitly to 0.5.
text_clf_8 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(C=0.5, kernel='linear')),
])
text_clf_8 = text_clf_8.fit(amazon_train.data, amazon_train.target)
predicted8 = text_clf_8.predict(docs_test)
# Test accuracy; parenthesised print is valid on both Python 2 and Python 3.
print('%.5f' % np.mean(predicted8 == amazon_test.target))

0.75000
