In [13]:
import numpy as np
from sklearn import svm
from os import listdir
from os.path import isfile, join
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle
import time
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
#Process the data through tf and tf-idf
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
tfidf_transformer_noidf = TfidfTransformer(use_idf=False)
max_abs_scaler = preprocessing.MaxAbsScaler()

In [7]:
parentpath='/home/orlandom/Documents/UdeM/A2017/IFT6390/Project/'

#Load files in the IMDB train set
mypath=parentpath + 'aclImdb/train/pos/'
files_pos_train = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
mypath=parentpath + 'aclImdb/train/neg/'
files_neg_train = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]

#Load files in the IMDB test set
mypath=parentpath + 'aclImdb/test/pos/'
files_pos_test = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
mypath=parentpath + 'aclImdb/test/neg/'
files_neg_test = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]

files = files_pos_train + files_pos_test + files_neg_train + files_neg_test

vocab_file = open(parentpath + 'aclImdb/imdb.vocab', "r")
vocab = vocab_file.read().splitlines()

### Count Vectorizer with supplied vocabulary and using a bigram

In [8]:
#Unigram
cv_wo_stop_w = CountVectorizer(input='filename', stop_words='english', 
                                   min_df=3, max_df=0.95, dtype='int32', vocabulary=vocab)
corpus_wo_stop_w = cv_wo_stop_w.fit_transform(files)

cv_w_stop_w = CountVectorizer(input='filename', min_df=3, max_df=0.95, dtype='int32', vocabulary=vocab)
corpus_w_stop_w = cv_w_stop_w.fit_transform(files)

#Bigram
cv_wo_stop_w_bi = CountVectorizer(input='filename', min_df=3, max_df=0.95, dtype='int32', ngram_range=(1, 2),
                                  stop_words='english') #vocabulary=vocab,
corpus_wo_stop_w_bi = cv_wo_stop_w_bi.fit_transform(files)

cv_w_stop_w_bi = CountVectorizer(input='filename', min_df=3, max_df=0.95, dtype='int32', 
                                 ngram_range=(1, 2)) #vocabulary=vocab,
corpus_w_stop_w_bi = cv_w_stop_w_bi.fit_transform(files)

#Trigram
cv_wo_stop_w_tri = CountVectorizer(input='filename', min_df=3, max_df=0.95, dtype='int32', ngram_range=(1, 3),
                                  stop_words='english') #vocabulary=vocab,
corpus_wo_stop_w_tri = cv_wo_stop_w_tri.fit_transform(files)

cv_w_stop_w_tri = CountVectorizer(input='filename', min_df=3, max_df=0.95, dtype='int32', 
                                 ngram_range=(1, 3)) #vocabulary=vocab,
corpus_w_stop_w_tri = cv_w_stop_w_tri.fit_transform(files)

In [11]:
def analyze_corpus_data(corpus, count_vectorizer):
    size_vocab = len(count_vectorizer.vocabulary_)
    size_docs = np.zeros(corpus.shape[0])

    for i in range(corpus.indptr.shape[0]-1):
        size_docs[i] = corpus.indptr[i+1] - corpus.indptr[i]
        
    print "Vocabulary size:", size_vocab
    print "Minimum size of doc is {} and maximum size is {}".format(min(size_docs), max(size_docs))
    print "Mean size is {} and variance is {}".format(size_docs.mean(), size_docs.var())

print "Removing stop words - trigram"
analyze_corpus_data(corpus_wo_stop_w_tri, cv_wo_stop_w_tri)
print "Leaving stop words - trigram"
analyze_corpus_data(corpus_w_stop_w_tri, cv_w_stop_w_tri)
print "Removing stop words - bigram"
analyze_corpus_data(corpus_wo_stop_w_bi, cv_wo_stop_w_bi)
print "Leaving stop words - bigram"
analyze_corpus_data(corpus_w_stop_w_bi, cv_w_stop_w_bi)
print "Removing stop words - unigram"
analyze_corpus_data(corpus_wo_stop_w, cv_wo_stop_w)
print "Leaving stop words - unigram"
analyze_corpus_data(corpus_w_stop_w, cv_w_stop_w)

Removing stop words - trigram
Vocabulary size: 366483
Minimum size of doc is 3.0 and maximum size is 1040.0
Mean size is 142.20958 and variance is 8696.30005622
Leaving stop words - trigram
Vocabulary size: 905884
Minimum size of doc is 8.0 and maximum size is 3026.0
Mean size is 386.43874 and variance is 61867.7205272
Removing stop words - bigram
Vocabulary size: 308910
Minimum size of doc is 3.0 and maximum size is 1001.0
Mean size is 133.6765 and variance is 7720.09192775
Leaving stop words - bigram
Vocabulary size: 455551
Minimum size of doc is 8.0 and maximum size is 2151.0
Mean size is 298.88416 and variance is 36446.5194211
Removing stop words - unigram
Vocabulary size: 89527
Minimum size of doc is 3.0 and maximum size is 771.0
Mean size is 86.78736 and variance is 3545.91026423
Leaving stop words - unigram
Vocabulary size: 89527
Minimum size of doc is 4.0 and maximum size is 940.0
Mean size is 134.62792 and variance is 6129.57431647


In [10]:
def split_data(corpus, num_train=40000, num_valid=5000, num_test=5000):

    #Create train, valid and test data
    doc_index = np.arange(corpus.shape[0])
    target_pos = np.ones(corpus.shape[0]/2).astype(int) #first 25000 docs have target 1=positive
    target_neg = np.zeros(corpus.shape[0]/2).astype(int) #next 25000 docs have target 0=negative
    target = np.concatenate((target_pos, target_neg), axis=0) #now we have data and target values

    #shuffle all of the data and target
    doc_index, corpus, target = shuffle(doc_index, corpus, target, random_state=0)

    train_data = corpus[:num_train]
    train_targets = target[:num_train]
    valid_data = corpus[num_train:num_train+num_valid]
    valid_targets = target[num_train:num_train+num_valid]
    test_data = corpus[num_train+num_valid:]
    test_targets = target[num_train+num_valid:]

    return train_data, train_targets, valid_data, valid_targets, test_data, test_targets

In [16]:
def eval_clf(clf, data):
    
    train_data = data[0]
    train_targets = data[1]
    valid_data = data[2]
    valid_targets = data[3]
    
    time0 = time.time()

    clf.fit(train_data, train_targets) 
    predictions_train = clf.predict(train_data)
    error_train = 1 - (predictions_train == train_targets).mean()
    predictions_valid = clf.predict(valid_data)
    error_valid = 1 - (predictions_valid == valid_targets).mean()

    time1 = time.time()

    print "It took {} seconds".format(time1-time0)
    print "Training error: ", error_train*100
    print "Validation error: ", error_valid*100
    
    return error_train, error_valid

In [14]:
data_corpus=[corpus_w_stop_w_tri, corpus_wo_stop_w_tri, corpus_w_stop_w_bi, 
             corpus_wo_stop_w_bi, corpus_w_stop_w, corpus_wo_stop_w]

#No transformation
data_raw = []
data_tf = []
data_tfidf = []
data_scaled = []

for corpus in data_corpus:
    data_raw.append(split_data(corpus))
    data_tf.append(split_data(tfidf_transformer_noidf.fit_transform(corpus)))
    data_tfidf.append(split_data(tfidf_transformer.fit_transform(corpus)))
    data_scaled.append(split_data(max_abs_scaler.fit_transform(corpus)))


In [18]:
linearSVM = svm.LinearSVC()

headers = ["Trigram: Run data set with stop words-------------------",
           "Trigram: Run data set without stop words-------------------",
           "Bigram: Run data set with stop words-------------------",
           "Bigram: Run data set with stop words-------------------",
           "Unigram: Run data set with stop words-------------------",
           "Unigram: Run data set without stop words-------------------"]

for (header, corpus) in zip(headers, data_raw):
    print header
    eval_clf(linearSVM, corpus)

Trigram: Run data set with stop words-------------------
It took 35.9928219318 seconds
Training error:  0.0
Validation error:  8.66
Trigram: Run data set without stop words-------------------
It took 11.5295898914 seconds
Training error:  0.0
Validation error:  10.16
Bigram: Run data set with stop words-------------------
It took 23.1913349628 seconds
Training error:  0.0
Validation error:  8.94
Bigram: Run data set with stop words-------------------
It took 11.8183670044 seconds
Training error:  0.0
Validation error:  9.98
Unigram: Run data set with stop words-------------------
It took 8.22762989998 seconds
Training error:  0.005
Validation error:  12.84
Unigram: Run data set without stop words-------------------
It took 5.61096692085 seconds
Training error:  0.01
Validation error:  12.86


### Run data after doing only TF transformation

In [19]:
headers = ["TF Trigram: Run data set with stop words-------------------",
           "TF Trigram: Run data set without stop words-------------------",
           "TF Bigram: Run data set with stop words-------------------",
           "TF Bigram: Run data set with stop words-------------------",
           "TF Unigram: Run data set with stop words-------------------",
           "TF Unigram: Run data set without stop words-------------------"]

for (header, corpus) in zip(headers, data_tf):
    print header
    eval_clf(linearSVM, corpus)

TF Trigram: Run data set with stop words-------------------
It took 3.37059187889 seconds
Training error:  0.825
Validation error:  8.28
TF Trigram: Run data set without stop words-------------------
It took 0.945204973221 seconds
Training error:  0.925
Validation error:  9.0
TF Bigram: Run data set with stop words-------------------
It took 1.9431810379 seconds
Training error:  1.3825
Validation error:  8.38
TF Bigram: Run data set with stop words-------------------
It took 0.866637945175 seconds
Training error:  1.08
Validation error:  8.9
TF Unigram: Run data set with stop words-------------------
It took 0.841536998749 seconds
Training error:  6.4325
Validation error:  9.92
TF Unigram: Run data set without stop words-------------------
It took 0.569452762604 seconds
Training error:  3.8625
Validation error:  9.8


### Run previous data after doing TF-IDF transformation

In [None]:
headers = ["TF-IDF Trigram: Run data set with stop words-------------------",
           "TF-IDF Trigram: Run data set without stop words-------------------",
           "TF-IDF Bigram: Run data set with stop words-------------------",
           "TF-IDF Bigram: Run data set with stop words-------------------",
           "TF-IDF Unigram: Run data set with stop words-------------------",
           "TF-IDF Unigram: Run data set without stop words-------------------"]

for (header, corpus) in zip(headers, data_tfidf):
    print header
    eval_clf(linearSVM, corpus)

T-IDF Trigram: Run data set with stop words-------------------
It took 3.103525877 seconds
Training error:  0.01
Validation error:  7.64
TF-IDF Trigram: Run data set without stop words-------------------
It took 0.957690954208 seconds
Training error:  0.045
Validation error:  8.62
TF-IDF Bigram: Run data set with stop words-------------------


In [13]:
print  "TF-IDF Trigram: Run data set with stop words-------------------"
eval_clf(linearSVM, data_tfidf_corpus_w_stop_w_tri)

print  "TF-IDF Trigram: Run data set without stop words-------------------"
eval_clf(linearSVM, data_tfidf_corpus_wo_stop_w_tri)

print  "TF-IDF Bigram: Run data set with stop words-------------------"
eval_clf(linearSVM, data_tfidf_corpus_w_stop_w_bi)

print  "TF-IDF Bigram: Run data set without stop words-------------------"
eval_clf(linearSVM, data_tfidf_corpus_wo_stop_w_bi)

print  "TF-IDF Unigram: Run data set with stop words-------------------"
eval_clf(linearSVM, data_tfidf_corpus_w_stop_w)

print  "TF-IDF Unigram: Run data set without stop words-------------------"
eval_clf(linearSVM, data_tfidf_corpus_wo_stop_w)

TF-IDF Trigram: Run data set with stop words-------------------
It took 3.2905421257 seconds
Training error:  0.01
Validation error:  7.64
TF-IDF Trigram: Run data set without stop words-------------------
It took 1.02786397934 seconds
Training error:  0.045
Validation error:  8.62
TF-IDF Bigram: Run data set with stop words-------------------
It took 1.83157110214 seconds
Training error:  0.04
Validation error:  7.8
TF-IDF Bigram: Run data set without stop words-------------------
It took 0.92861199379 seconds
Training error:  0.0675
Validation error:  8.74
TF-IDF Unigram: Run data set with stop words-------------------
It took 0.615745782852 seconds
Training error:  1.5775
Validation error:  9.68
TF-IDF Unigram: Run data set without stop words-------------------
It took 0.495291948318 seconds
Training error:  1.36
Validation error:  9.9


(0.013599999999999945, 0.098999999999999977)

### Run previous data after scaling

In [15]:
max_abs_scaler = preprocessing.MaxAbsScaler()

data_scaled_corpus_w_stop_w_tri = split_data(max_abs_scaler.fit_transform(corpus_w_stop_w_tri))
data_scaled_corpus_wo_stop_w_tri = split_data(max_abs_scaler.fit_transform(corpus_wo_stop_w_tri))
data_scaled_corpus_w_stop_w_bi = split_data(max_abs_scaler.fit_transform(corpus_w_stop_w_bi))
data_scaled_corpus_wo_stop_w_bi = split_data(max_abs_scaler.fit_transform(corpus_wo_stop_w_bi))
data_scaled_corpus_w_stop_w = split_data(max_abs_scaler.fit_transform(corpus_w_stop_w))
data_scaled_corpus_wo_stop_w = split_data(max_abs_scaler.fit_transform(corpus_wo_stop_w))

In [16]:
print  "Scale Trigram: Run data set with stop words-------------------"
eval_clf(linearSVM, data_scaled_corpus_w_stop_w_tri)

print  "Scale Trigram: Run data set without stop words-------------------"
eval_clf(linearSVM, data_scaled_corpus_wo_stop_w_tri)

print  "Scale Bigram: Run data set with stop words-------------------"
eval_clf(linearSVM, data_scaled_corpus_w_stop_w_bi)

print  "Scale Bigram: Run data set without stop words-------------------"
eval_clf(linearSVM, data_scaled_corpus_wo_stop_w_bi)

print  "Scale Unigram: Run data set with stop words-------------------"
eval_clf(linearSVM, data_scaled_corpus_w_stop_w)

print  "Scale Unigram: Run data set without stop words-------------------"
eval_clf(linearSVM, data_scaled_corpus_wo_stop_w)

Scale Trigram: Run data set with stop words-------------------
It took 53.3931951523 seconds
Training error:  0.0
Validation error:  8.48
Scale Trigram: Run data set without stop words-------------------
It took 19.4017632008 seconds
Training error:  0.0
Validation error:  10.26
Scale Bigram: Run data set with stop words-------------------
It took 31.1804440022 seconds
Training error:  0.0
Validation error:  8.76
Scale Bigram: Run data set without stop words-------------------
It took 13.9711098671 seconds
Training error:  0.0
Validation error:  10.52
Scale Unigram: Run data set with stop words-------------------
It took 2.75061607361 seconds
Training error:  0.1075
Validation error:  11.78
Scale Unigram: Run data set without stop words-------------------
It took 2.41466188431 seconds
Training error:  0.12
Validation error:  12.32


(0.0011999999999999789, 0.12319999999999998)

In [23]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=15, tol=None)

eval_clf(sgd_clf, data_scaled_corpus_wo_stop_w)

print  "Scale Trigram: Run data set with stop words-------------------"
eval_clf(sgd_clf, data_scaled_corpus_w_stop_w_tri)


eval_clf(sgd_clf, data_scaled_corpus_wo_stop_w_tri)

It took 0.262296915054 seconds
Training error:  7.1875
Validation error:  10.98
Scale Trigram: Run data set with stop words-------------------
It took 1.34645104408 seconds
Training error:  0.025
Validation error:  8.38
It took 0.459006786346 seconds
Training error:  0.78
Validation error:  9.92


(0.0078000000000000291, 0.099199999999999955)

In [None]:
def eval_svm_rbf(corpus, num_train=40000, num_valid=5000, num_test=5000):
    doc_index = np.arange(corpus.shape[0])
    target_pos = np.ones(corpus.shape[0]/2).astype(int) #first 25000 docs have target 1=positive
    target_neg = np.zeros(corpus.shape[0]/2).astype(int) #next 25000 docs have target 0=negative
    target = np.concatenate((target_pos, target_neg), axis=0) #now we have data and target values

    #shuffle all of the data and target
    doc_index, corpus, target = shuffle(doc_index, corpus, target, random_state=0)

    train_data = corpus[:num_train]
    train_targets = target[:num_train]
    valid_data = corpus[num_train:num_train+num_valid]
    valid_targets = target[num_train:num_train+num_valid]
    test_data = corpus[num_train+num_valid:]
    test_targets = target[num_train+num_valid:]
    
    time0 = time.time()

    clf_rbf = svm.SVC()
    clf_rbf.fit(train_data, train_targets) 
    predictions_train = clf_rbf.predict(train_data)
    error_train_rbf = 1 - (predictions_train == train_targets).mean()
    predictions_valid = clf_rbf.predict(valid_data)
    error_valid_rbf = 1 - (predictions_valid == valid_targets).mean()

    time1 = time.time()

    print "It took {} seconds".format(time1-time0)
    print "Training error: ", error_train_rbf
    print "Validation error: ", error_valid_rbf
    
    return error_train_rbf, error_valid_rbf

In [None]:
print "Run with RBF kernel-------------------------------------------------------------"
print  "Preprocessing after TF-IDF Trigram: Run data set with stop words-------------------"
eval_svm_rbf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_w_stop_w_tri)))

print  "Preprocessing after TF-IDF Trigram: Run data set without stop words-------------------"
eval_svm_rbf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_wo_stop_w_tri)))

print  "Preprocessing after TF-IDF Bigram: Run data set with stop words-------------------"
eval_svm_rbf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_w_stop_w_bi)))

print  "Preprocessing after TF-IDF Bigram: Run data set without stop words-------------------"
eval_svm_rbf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_wo_stop_w_bi)))

print  "Preprocessing after TF-IDF Unigram: Run data set with stop words-------------------"
eval_svm_rbf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_w_stop_w)))

print  "Preprocessing after TF-IDF Unigram: Run data set without stop words-------------------"
eval_svm_rbf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_wo_stop_w)))

### 

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=1e-5, random_state=1)

# print  "TF-IDF Trigram: Run data set with stop words-------------------"
# # eval_svm_linear(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), with_mean=False))
# eval_clf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_w_stop_w_tri)))

# print  "TF-IDF Trigram: Run data set without stop words-------------------"
# eval_SGD(tfidf_transformer.fit_transform(corpus_wo_stop_w_tri))

# print  "TF-IDF Bigram: Run data set with stop words-------------------"
# eval_SGD(tfidf_transformer.fit_transform(corpus_w_stop_w_bi))

# print  "TF-IDF Bigram: Run data set without stop words-------------------"
# eval_SGD(tfidf_transformer.fit_transform(corpus_wo_stop_w_bi))

# print  "TF-IDF Unigram: Run data set with stop words-------------------"
# eval_SGD(tfidf_transformer.fit_transform(corpus_w_stop_w))

# print  "TF-IDF Unigram: Run data set without stop words-------------------"
# eval_SGD(tfidf_transformer.fit_transform(corpus_wo_stop_w))
eval_clf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_wo_stop_w)), clf)

In [None]:
eval_clf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_w_stop_w)), clf)

### Test MLP on trigram bow

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=1e-5, random_state=1)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=1e-3, random_state=1)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(10,10,10), alpha=1e-5, random_state=1,
                    learning_rate='adaptive', verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5,5,5), alpha=1e-3, random_state=1)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5,5,5), alpha=1e-3, verbose=True)
eval_clf(preprocessing.maxabs_scale(tfidf_transformer.fit_transform(corpus_w_stop_w_tri)), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5,5,5), alpha=1e-3, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

### Does preprocessing help in this case? No - see two runs above

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5,5,5), alpha=0.01, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5,5,5), alpha=0.1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5,5,5), alpha=0.5, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5,5,5), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=2, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=4, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

### Test MLP on unigram bow

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_wo_stop_w), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=0.1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10), alpha=0.01, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(20,20,20), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(20,20), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(50,50), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(30,30,30), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,10,10, 10), alpha=1, verbose=True)
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

### With sigmoid activation function

In [None]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5,5,5), alpha=0.5, activation='logistic')
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

### Multinomial NB

In [None]:
clf = MultinomialNB()
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

clf = MultinomialNB()
eval_clf(tfidf_transformer.fit_transform(corpus_wo_stop_w), clf)

clf = MultinomialNB()
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

clf = MultinomialNB()
eval_clf(tfidf_transformer.fit_transform(corpus_wo_stop_w_tri), clf)

In [None]:
clf = GaussianNB()
eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w), clf)

clf = GaussianNB()
eval_clf(tfidf_transformer.fit_transform(corpus_wo_stop_w), clf)

# clf = MultinomialNB()
# eval_clf(tfidf_transformer.fit_transform(corpus_w_stop_w_tri), clf)

# clf = MultinomialNB()
# eval_clf(tfidf_transformer.fit_transform(corpus_wo_stop_w_tri), clf)