In [None]:
import string
from itertools import chain
from time import time

# nltk
import nltk
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# scikit learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split

# keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier


In [None]:
# Split sizes, expressed as fractions (not percentages).
test_plus_valid_perc = .2   # share of all reviews held out from training
valid_of_test_perc = .5     # share of the held-out reviews used for validation

# Fixed seed so both splits are identical on every "Restart & Run All".
split_seed = 42

# Human-readable label for each corpus category id.
pretty_category_names = {'pos': 'positive', 'neg': 'negative'}

data = []
# create a useful dictionary from each review
for category in mr.categories():

    # An unexpected category keeps its raw id instead of raising a NameError
    # or silently reusing the label of the previous category.
    pretty_category_name = pretty_category_names.get(category, category)

    for fileid in mr.fileids(category):

        # Join the tokens in one pass; repeated `+=` on a string is quadratic.
        # The text no longer starts with a stray leading space.
        review_text = ' '.join(mr.words(fileid))

        review_dictionary = {
            'text': review_text,
            'sentiment': pretty_category_name
        }

        data.append(review_dictionary)

# split the reviews into train / test / validation sets:
# first hold out `test_plus_valid_perc` of everything, then divide that
# held-out part between test and validation.
# NOTE(review): the import cell takes train_test_split from
# sklearn.cross_validation; current scikit-learn exposes it from
# sklearn.model_selection instead -- confirm against the installed version.
train, test = train_test_split(data, test_size=test_plus_valid_perc, random_state=split_seed)
test, valid = train_test_split(test, test_size=valid_of_test_perc, random_state=split_seed)

In [None]:
def get_arrays(dictionaries):
    """Turn review dictionaries into parallel text and label lists.

    Parameters
    ----------
    dictionaries : iterable of mappings
        Each item is indexed with the keys 'text' and 'sentiment'.

    Returns
    -------
    tuple of (list, list)
        The 'text' values in input order, and a matching list holding 1
        where 'sentiment' equals 'positive' and 0 otherwise.
    """
    # Single pass over the input so one-shot iterables are handled too.
    pairs = [
        (entry['text'], 1 if entry['sentiment'] == 'positive' else 0)
        for entry in dictionaries
    ]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return texts, labels

 # create simple train and test and validation x - y arrays   
# Unpack the text list and label list of each split, in the order
# train, test, validation.
(train_x, train_y), (test_x, test_y), (valid_x, valid_y) = (
    get_arrays(subset) for subset in (train, test, valid)
)

In [None]:
def fit_and_analyze(pipeline, x_train, y_train, x_test, y_test, x_valid, y_valid):
    """Fit `pipeline` on the training data and print its accuracy on all splits.

    Parameters
    ----------
    pipeline : object with `fit(x, y)`; the object `fit` returns must offer
        `predict(x)`.
    x_train, y_train : training inputs and labels (used to fit and to score).
    x_test, y_test : test inputs and labels.
    x_valid, y_valid : validation inputs and labels.

    Returns
    -------
    The object returned by `pipeline.fit(x_train, y_train)`.

    The printed time covers fitting plus the three prediction passes; the
    accuracy computation itself is not included.
    """
    started_at = time()

    fitted = pipeline.fit(x_train, y_train)

    # Predictions are made in the order test, train, valid.
    predicted_test = fitted.predict(x_test)
    predicted_train = fitted.predict(x_train)
    predicted_valid = fitted.predict(x_valid)

    elapsed = time() - started_at

    # All scores are computed before anything is printed.
    report_lines = [
        "accuracy score test: {0:.2f}%".format(accuracy_score(y_test, predicted_test)*100),
        "accuracy score train: {0:.2f}%".format(accuracy_score(y_train, predicted_train)*100),
        "accuracy score valid: {0:.2f}%".format(accuracy_score(y_valid, predicted_valid)*100),
        "time: {0:.2f}s".format(elapsed),
        "-"*50,
    ]
    for line in report_lines:
        print(line)

    return fitted

### Define learners

In [None]:
# Function to create model, required for KerasClassifier
def create_network(optimizer='rmsprop', init='glorot_uniform', dropout=0.5, input_dim=100):
    """Build and compile the feed-forward binary classifier.

    Layers, in order: Dense(128, relu) -> Dropout -> Dense(64, relu) ->
    Dropout -> Dense(1, sigmoid). The model is compiled with
    binary cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    optimizer : optimizer passed to `model.compile`.
    init : kernel initializer used by every Dense layer.
    dropout : rate passed to both Dropout layers.
    input_dim : number of input features of the first Dense layer.
        Defaults to 100, the value that was previously hard-coded; it has
        to equal the number of features fed to the network (in this
        notebook, the `n_components` of the SVD step).

    Returns
    -------
    The compiled `Sequential` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, kernel_initializer=init, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(64, kernel_initializer=init, activation='relu'))
    model.add(Dropout(dropout))
    # Single sigmoid unit: output is used as a binary-class probability.
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
# Candidate classifiers: two scikit-learn estimators with default settings,
# plus the Keras network built by `create_network` behind KerasClassifier.
logistic_regression, svm = LogisticRegression(), SVC()
neural_network = KerasClassifier(
    create_network,
    epochs=10,
    batch_size=16,
    verbose=0,
)

# The list keeps the order logistic regression, SVM, neural network.
models = [
    logistic_regression,
    svm,
    neural_network,
]

In [None]:
# Shared pipeline stages. The same `tvec` and `svd` objects are placed in every
# pipeline built below and are re-configured / re-fitted on each run, so after
# the loop they reflect the last run only.
tvec = TfidfVectorizer(min_df=2)
svd = TruncatedSVD(n_components=100) # if you change n_component you need to change input_dim of the network

ngram_size = 2  # vectorizer uses n-grams from 1 up to this size
features = range(200, 400, 100) #range(500, 5000, 500)

# Loop-invariant: read the English stop-word list once rather than on every run.
english_stop_words = stopwords.words('english')

# Evaluate every model with every vocabulary size.
for model in models:
    print (model.__class__.__name__)
    for numFeatures in features:
        tvec.set_params(stop_words=english_stop_words, max_features=numFeatures, ngram_range=(1, ngram_size))
        pipeline = Pipeline([
            ('vectorizer', tvec),
            ('svd', svd),
            ('classifier', model)
        ])

        print ("Result for {} features".format(numFeatures))
        fit_and_analyze(pipeline, train_x, train_y, test_x, test_y, valid_x, valid_y)

    print("="*20)

In [None]:
# Prints the built-in help text for TfidfVectorizer.
# NOTE(review): looks like a leftover interactive lookup -- consider removing
# this cell before sharing the notebook.
help(TfidfVectorizer)