# Multilayer Perceptron Classifier
This notebook implements an MLP classifier to run the sentiment analysis.

The following two cells are responsible for feature selection. The major options here are
- ngram
- lemmatization

Only run these two cells when you wish to generate new numpy files; otherwise, loading the previously generated files will be quicker.

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import pandas as pd

class FileReader():
    """Read the train/test CSV files and turn their text into n-gram matrices.

    The training CSV must provide ``text`` and ``label`` columns; the test CSV
    must provide ``id`` and ``text`` columns.

    Parameters
    ----------
    train, test : str
        Paths handed to ``pd.read_csv``.
    tfidf : bool, default True
        If true, use ``TfidfVectorizer`` with IDF weighting switched off
        (``use_idf=False``); otherwise use a plain ``CountVectorizer``.
    lemmatize : bool, default False
        If true, tokenize with NLTK and lemmatize every token with WordNet;
        otherwise the vectorizer's built-in tokenizer is used.
    """

    # Extra stop words added on top of NLTK's English list and punctuation.
    _EXTRA_STOP_WORDS = ["'d", "'ll", "'re", "'s", "'ve", '``', 'could', 'might', 'must',
                         "n't", 'need', 'sha', 'wo', 'would']

    def __init__(self, train, test, tfidf=True, lemmatize=False):
        train_df = pd.read_csv(train)
        self.train_text = train_df['text']
        self.train_label = train_df['label']
        test_df = pd.read_csv(test)
        self.test_id = test_df['id']
        self.test_text = test_df['text']
        self.tfidf = tfidf
        self.stop_words = set(stopwords.words('english') + list(string.punctuation)
                              + self._EXTRA_STOP_WORDS)
        if lemmatize:
            # Build the lemmatizer once instead of once per document.
            self._lemmatizer = WordNetLemmatizer()
            self.tokenizer = self._lemma_tokenizer
        else:
            self.tokenizer = None

    @staticmethod
    def _penn_to_wn(tag):
        """Map a Penn Treebank POS tag to a WordNet POS constant (noun if unknown)."""
        if tag in ('JJ', 'JJR', 'JJS'):
            return wn.ADJ
        if tag in ('NN', 'NNS', 'NNP', 'NNPS'):
            return wn.NOUN
        if tag in ('RB', 'RBR', 'RBS'):
            return wn.ADV
        if tag in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):
            return wn.VERB
        return wn.NOUN

    def _lemma_tokenizer(self, text):
        """Tokenize ``text`` and return the list of lemmas of the kept tokens.

        Tokens of length one, stop words and purely numeric tokens are dropped
        before part-of-speech tagging.
        """
        # nltk.pos_tag is documented to take a list of tokens; a one-shot
        # generator may be exhausted before tagging, so materialize it here.
        tokens = [word for word in nltk.word_tokenize(text)
                  if len(word) > 1 and word not in self.stop_words and not word.isnumeric()]
        return [self._lemmatizer.lemmatize(word, self._penn_to_wn(pos))
                for word, pos in nltk.pos_tag(tokens)]

    def _vectorizer_options(self, ngram):
        """Return the keyword arguments for the vectorizer selected by ``self.tfidf``."""
        options = dict(
            # scikit-learn documents stop_words as a list; newer releases reject a set.
            stop_words=sorted(self.stop_words),
            ngram_range=ngram,
            lowercase=True,
            min_df=3,
            max_df=0.999,
            tokenizer=self.tokenizer,
        )
        if self.tfidf:
            # use_idf only exists on TfidfVectorizer; CountVectorizer raises on it.
            options['use_idf'] = False
        return options

    def getLabel(self):
        """Return the ``label`` column of the training CSV."""
        return self.train_label

    def getTestId(self):
        """Return the ``id`` column of the test CSV."""
        return self.test_id

    def genMatrix(self, ngram=(1, 1)):
        """Vectorize the train and test text.

        Parameters
        ----------
        ngram : tuple of (int, int), default (1, 1)
            Passed to the vectorizer as ``ngram_range``.

        Returns
        -------
        tuple
            ``(train_data_matrix, test_data_matrix, vocabulary)`` where the
            vocabulary is the fitted vectorizer's ``vocabulary_`` mapping.
        """
        Vectorizer = TfidfVectorizer if self.tfidf else CountVectorizer
        vectorizer = Vectorizer(**self._vectorizer_options(ngram))
        train_data_matrix = vectorizer.fit_transform(self.train_text)
        # Reuse the fitted vectorizer so the test matrix shares the training vocabulary.
        test_data_matrix = vectorizer.transform(self.test_text)
        return train_data_matrix, test_data_matrix, vectorizer.vocabulary_

In [None]:
def saveMatrices(ngram, lemmatize):
    """Build the feature matrices for one option set and save them to a .npy file.

    Parameters
    ----------
    ngram : tuple of (int, int)
        N-gram range forwarded to ``FileReader.genMatrix``; also used in the
        output filename.
    lemmatize : bool
        Forwarded to ``FileReader``; adds a ``-lemmatized`` suffix to the
        output filename when true.

    The saved file holds a 5-element object array:
    ``[train_data_matrix, test_data_matrix, vocab, train_label, test_id]``.
    """
    filename = '{}-{}{}.npy'.format(ngram[0], ngram[1], '-lemmatized' if lemmatize else '')
    print('Options: ngram =',ngram, 'lemmatize =', lemmatize, 'filename =', filename)
    print('Reading files...')
    # Forward the caller's options; they were previously hard-coded here, so the
    # "-lemmatized" file contained non-lemmatized features.
    fr = FileReader('data/train.csv', 'data/test.csv', lemmatize=lemmatize)
    print('Generating matrices...')
    train_data_matrix, test_data_matrix, vocab = fr.genMatrix(ngram=ngram)
    train_label, test_id = fr.getLabel(), fr.getTestId()
    print('Train:', train_data_matrix.shape, train_label.shape)
    print('Vocab:', len(vocab))
    print('Test:', test_data_matrix.shape, test_id.shape)
    print('Writing numpy file...')
    # Fill an object array slot by slot: np.array() on a list of differently
    # shaped objects raises on current NumPy instead of building an object array.
    items = (train_data_matrix, test_data_matrix, vocab, train_label, test_id)
    obj = np.empty(len(items), dtype=object)
    for slot, item in enumerate(items):
        obj[slot] = item
    np.save(filename, obj)
    print('Done.')
    
# Generate both feature files: plain tokens first, then the lemmatized variant.
for use_lemmas in (False, True):
    saveMatrices((1, 5), use_lemmas)

Magic commands to cache notebook outputs.

In [2]:
# Load the ipycache IPython extension so the %%cache cell magic used later in this notebook is available.
%load_ext ipycache

  from IPython.utils.traitlets import Unicode


The cells below load a binary numpy file which contains the extracted data matrices and labels. Then, we run a randomized search on MLPClassifier to find the best hyperparameters.

In [3]:
import numpy as np
import pandas as pd
# The .npy file stores Python objects (matrices, vocabulary dict, label/id
# columns), so NumPy >= 1.16.3 needs allow_pickle=True to read it.
# NOTE: unpickling can execute arbitrary code; only load files you generated yourself.
p = np.load('1-5.npy', allow_pickle=True)
train_data_matrix, test_data_matrix, vocab, train_label, test_id = p
# Sanity check: row counts must match between matrices and labels/ids.
print('Train:', train_data_matrix.shape, train_label.shape)
print('Vocab:', len(vocab))
print('Test:', test_data_matrix.shape, test_id.shape)

Train: (16000, 123770) (16000,)
Vocab: 123770
Test: (4491, 123770) (4491,)


Note: because the search runs on multiple cores, the output is not captured properly; only one fit's log is shown below, although there were 500 fits in total (100 candidates × 5 folds).

In [19]:
%%cache mlp -d ipycache
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, validation_curve
# Fixed seed so the sampled candidates and the network initialisation are repeatable.
SEED = 42

# Search space sampled by the randomized search.
param_dist = {
    # `num` must be an integer: a float such as 1e4 raises TypeError on current NumPy.
    'alpha': np.logspace(-6, -2, 10000),
    'activation': ['relu', 'logistic'],
    'hidden_layer_sizes': [(5,5), (10,10), (5,5,5), (10,10,10), (25, 25), (50, 50)]
}

# run randomized search: 100 sampled candidates x 5 folds, on all available cores
clf = RandomizedSearchCV(MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=10,
                                       random_state=SEED),
                         param_distributions=param_dist, n_iter=100,
                         cv=5, verbose=2, n_jobs=-1, return_train_score=True,
                         random_state=SEED)
clf.fit(train_data_matrix, train_label)

[Skipped the cell's code and loaded variables  from file '/home/ec2-user/SageMaker/ipycache/mlp'.]
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Iteration 1, loss = 1.44423627
Validation score: 0.429375
Iteration 2, loss = 1.36670521
Validation score: 0.429375
Iteration 3, loss = 1.36011420
Validation score: 0.429375
Iteration 4, loss = 1.34767835
Validation score: 0.429375
Iteration 5, loss = 1.32115302
Validation score: 0.430625
Iteration 6, loss = 1.26329960
Validation score: 0.485625
Iteration 7, loss = 1.16721417
Validation score: 0.518750
Iteration 8, loss = 1.06200391
Validation score: 0.559375
Iteration 9, loss = 0.97503328
Validation score: 0.573750
Iteration 10, loss = 0.90666436
Validation score: 0.605625
Iteration 11, loss = 0.85154996
Validation score: 0.609375
Iteration 12, loss = 0.80479912
Validation score: 0.624375
Iteration 13, loss = 0.76348115
Validation score: 0.616875
Iteration 14, loss = 0.72540222
Validation score: 0.629375
Iteration 15, loss = 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 75.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 277.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 688.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 929.4min finished


In [20]:
# One row per sampled candidate, one column per entry of the search's cv_results_ dict.
pd.DataFrame.from_dict(clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_activation,param_alpha,param_hidden_layer_sizes,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,127.225514,0.016721,0.598500,0.878715,logistic,0.000934753,"(10, 10)","{'hidden_layer_sizes': (10, 10), 'alpha': 0.00...",23,0.602124,...,0.608125,0.930078,0.595313,0.936094,0.608818,0.937197,30.513643,0.000613,0.011291,0.070316
1,695.208497,0.162505,0.597562,0.910736,logistic,0.000818637,"(25, 25)","{'hidden_layer_sizes': (25, 25), 'alpha': 0.00...",24,0.585572,...,0.605625,0.877500,0.598750,0.913672,0.610069,0.867755,182.522692,0.100331,0.009609,0.034717
2,164.714003,0.070489,0.501188,0.646401,logistic,3.12212e-06,"(10, 10, 10)","{'hidden_layer_sizes': (10, 10, 10), 'alpha': ...",66,0.558089,...,0.533438,0.765625,0.430625,0.430703,0.430894,0.430636,69.692542,0.007079,0.058073,0.180057
3,148.061592,0.051578,0.478500,0.558478,logistic,0.000984236,"(10, 10, 10)","{'hidden_layer_sizes': (10, 10, 10), 'alpha': ...",72,0.545909,...,0.430625,0.430703,0.554375,0.765391,0.430894,0.430636,55.136869,0.027205,0.058571,0.156815
4,59.960558,0.030585,0.521687,0.772409,relu,1.63577e-05,"(5, 5, 5)","{'hidden_layer_sizes': (5, 5, 5), 'alpha': 1.6...",63,0.504997,...,0.562813,0.879922,0.521875,0.600781,0.523139,0.804952,48.443197,0.031327,0.023028,0.110243
5,2472.536669,0.145224,0.619625,0.815686,logistic,0.00349264,"(50, 50)","{'hidden_layer_sizes': (50, 50), 'alpha': 0.00...",1,0.626796,...,0.611563,0.823438,0.613437,0.888828,0.628518,0.804952,253.746440,0.053193,0.006886,0.041160
6,213.127925,0.055984,0.567750,0.924329,relu,4.08189e-06,"(10, 10)","{'hidden_layer_sizes': (10, 10), 'alpha': 4.08...",44,0.574641,...,0.530000,0.956641,0.567813,0.959766,0.610694,0.940712,96.942896,0.035129,0.026316,0.059917
7,245.093364,0.035093,0.566438,0.907838,relu,0.0036104,"(10, 10, 10)","{'hidden_layer_sizes': (10, 10, 10), 'alpha': ...",46,0.574641,...,0.551875,0.890000,0.578750,0.958906,0.598186,0.960631,137.663832,0.019472,0.023911,0.073562
8,138.588773,0.048928,0.563312,0.886688,relu,0.00167156,"(10, 10, 10)","{'hidden_layer_sizes': (10, 10, 10), 'alpha': ...",47,0.561836,...,0.582812,0.737344,0.569063,0.891875,0.572233,0.954695,60.097220,0.027736,0.017683,0.079870
9,55.444024,0.047222,0.430688,0.430688,logistic,0.00614298,"(5, 5)","{'hidden_layer_sizes': (5, 5), 'alpha': 0.0061...",81,0.430668,...,0.430625,0.430703,0.430625,0.430703,0.430894,0.430636,18.156570,0.014108,0.000105,0.000026


In [21]:
# Predict the test labels with the fitted search object.
test_data_pre = clf.predict(test_data_matrix)

# Assemble both submission columns in one step and write them without the index.
sub_df = pd.DataFrame({"id": test_id, "pred": test_data_pre}, columns=["id", "pred"])
sub_df.to_csv("submission-mlp_v2.csv", index=False)
print('Saved prediction.')

Saved prediction.
