In [1]:
from sklearn.cross_validation import train_test_split
from nltk.corpus import movie_reviews as mr
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import string
from itertools import chain

test_plus_valid_perc = .2
valid_of_test_perc = .5

data = []
# create a useful dictionary from each review
for category in mr.categories():

    if category == 'pos':
        pretty_category_name = 'positive'
    elif category == 'neg':
        pretty_category_name = 'negative'

    for fileid in mr.fileids(category):

        review_words = mr.words(fileid)
        review_text = ''
        
        for word in review_words:
            review_text += ' ' + word

        review_dictionary = {
            'text': review_text,
            'sentiment': pretty_category_name
        }

        data.append(review_dictionary)
        
# create a useful dictionary from each review
train, test = train_test_split(data, test_size=test_plus_valid_perc)
test, valid = train_test_split(test, test_size=valid_of_test_perc)

def get_arrays(dictionaries):
    x = []
    y = []
    for review in dictionaries:
        x.append(review['text'])
        y.append(1 if review['sentiment'] == 'positive' else 0)
    return x, y

 # create simple train and test and validation x - y arrays   
train_x, train_y = get_arrays(train) 
test_x, test_y = get_arrays(test) 
valid_x, valid_y = get_arrays(valid) 





In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def fit_and_analyze(pipeline, x_train, y_train, x_test, y_test, x_valid, y_valid):
    t0 = time()
    
    sentiment_fit = pipeline.fit(x_train, y_train)
    
    y_pred_test = sentiment_fit.predict(x_test)
    y_pred_train = sentiment_fit.predict(x_train)
    y_pred_valid = sentiment_fit.predict(x_valid)
    
    train_test_time = time() - t0
    
    accuracy_test = accuracy_score(y_test, y_pred_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_valid = accuracy_score(y_valid, y_pred_valid)
    print ("accuracy score test: {0:.2f}%".format(accuracy_test*100))
    print ("accuracy score train: {0:.2f}%".format(accuracy_train*100))
    print ("accuracy score valid: {0:.2f}%".format(accuracy_valid*100))
    print ("time: {0:.2f}s".format(train_test_time))
    print ("-"*50)
    return sentiment_fit 


lr = LogisticRegression()
tvec = TfidfVectorizer(min_df=2)

ngram_size = 2
features = range(500, 5000, 500)

for numFeatures in features:
    result = []
    tvec.set_params(stop_words=stopwords.words('english'), max_features=numFeatures, ngram_range=(1, ngram_size))
    checker_pipeline = Pipeline([
        ('vectorizer', tvec),
        ('classifier', lr)
    ])
    print ("Result for {} features".format(numFeatures))
    fit_and_analyze(checker_pipeline, train_x, train_y, test_x, test_y, valid_x, valid_y)


Result for 500 features
accuracy score test: 77.00%
accuracy score train: 86.69%
accuracy score valid: 81.00%
time: 6.47s
--------------------------------------------------
Result for 1000 features
accuracy score test: 83.50%
accuracy score train: 91.38%
accuracy score valid: 83.50%
time: 5.89s
--------------------------------------------------
Result for 1500 features
accuracy score test: 85.50%
accuracy score train: 93.81%
accuracy score valid: 85.00%
time: 5.73s
--------------------------------------------------
Result for 2000 features
accuracy score test: 85.50%
accuracy score train: 94.75%
accuracy score valid: 85.50%
time: 5.61s
--------------------------------------------------
Result for 2500 features
accuracy score test: 85.50%
accuracy score train: 95.25%
accuracy score valid: 87.50%
time: 5.55s
--------------------------------------------------
Result for 3000 features
accuracy score test: 85.50%
accuracy score train: 95.62%
accuracy score valid: 85.50%
time: 5.55s
--------