In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import nltk
import os
import re
import random

In [2]:
# Load the IMDB review corpus from the local ./imdb directory. Only the 'neg'
# and 'pos' category sub-folders are read, and file contents are decoded as
# UTF-8.
# - os.path.join keeps the relative paths portable; the earlier literal
#   '.\\imdb\\train' only resolved on Windows.
# - shuffle is passed as a real boolean. The earlier string 'True' only worked
#   because any non-empty string is truthy (so 'False' would have shuffled too).
imdb_train = load_files(os.path.join('.', 'imdb', 'train'), description='imdb reviews train',
                        categories=['neg', 'pos'], shuffle=True,
                        load_content=True, encoding='utf-8')
imdb_test = load_files(os.path.join('.', 'imdb', 'test'), description='imdb reviews test',
                       categories=['neg', 'pos'], shuffle=True,
                       load_content=True, encoding='utf-8')
# Raw test documents; the classifier cells below call predict() on this list.
docs_test = imdb_test.data

In [5]:
# Baseline: unigram bag-of-words counts fed straight into a multinomial
# naive Bayes classifier.
# Fits on the training reviews, then prints the fraction of test reviews
# whose predicted label matches the true label, to five decimal places.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('clf', MultinomialNB()),
])
text_clf.fit(imdb_train.data, imdb_train.target)  # fit() hands back the same pipeline object
predicted = text_clf.predict(docs_test)
print('%.5f' % (predicted == imdb_test.target).mean())

0.80597


In [3]:
# Bag-of-words counts -> tf-idf re-weighting -> linear classifier trained by
# stochastic gradient descent (logistic loss, L2 penalty, fixed random_state).
# Fits on the training reviews, then prints test accuracy to five decimals.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3,
                          n_iter=5, random_state=50)),
])
text_clf.fit(imdb_train.data, imdb_train.target)  # fit() hands back the same pipeline object
predicted = text_clf.predict(docs_test)
print('%.5f' % (predicted == imdb_test.target).mean())

0.78109


In [7]:
# Bag-of-words counts -> tf-idf re-weighting -> support vector classifier
# with a linear kernel (other SVC settings left at their defaults).
# Fits on the training reviews, then prints test accuracy to five decimals.
# NOTE(review): the output recorded under this cell has far more than five
# decimals, so it cannot have come from the '%.5f' format below -- it looks
# stale; re-run the cell to confirm the real score.
text_clf_2 = Pipeline(steps=[
    ('vect2', CountVectorizer()),
    ('tfidf2', TfidfTransformer()),
    ('clf2', SVC(kernel='linear')),
])
text_clf_2.fit(imdb_train.data, imdb_train.target)  # fit() hands back the same pipeline object
predicted2 = text_clf_2.predict(docs_test)
print('%.5f' % (predicted2 == imdb_test.target).mean())

0.78109452736318408

In [4]:
# Bag-of-words counts -> tf-idf re-weighting -> linear-kernel support vector
# classifier with the penalty parameter C set to 0.5.
# Fits on the training reviews, then prints test accuracy to five decimals.
text_clf_3 = Pipeline(steps=[
    ('vect3', CountVectorizer()),
    ('tfidf3', TfidfTransformer()),
    ('clf3', SVC(C=0.5, kernel='linear')),
])
text_clf_3.fit(imdb_train.data, imdb_train.target)  # fit() hands back the same pipeline object
predicted3 = text_clf_3.predict(docs_test)
print('%.5f' % (predicted3 == imdb_test.target).mean())

0.81095
