# CS155 Miniproject 1

zchen@caltech.edu

Sentiment Analysis via Neural Networks

In [26]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt

import sklearn
import sklearn.model_selection

import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import BatchNormalization, LeakyReLU

## Loading Dataset

In [77]:
# Load the test set that the final submission predictions are made on
FV_data = np.loadtxt('../data/test_data.txt',delimiter=' ',skiprows=1)
# Load the training set (skiprows=1 skips the first line of the file)
train_data = np.loadtxt('../data/training_data.txt',delimiter=' ',skiprows=1)

# Split features and labels out of the training set:
# column 0 holds the label, the remaining columns are the features
x_tall = train_data[:,1:]
y_labels = train_data[:,0]

# One-hot encode the labels. `keras.utils.to_categorical` is the public entry
# point (the `np_utils` submodule path is gone from recent Keras releases), and
# `num_classes=2` pins the encoding width to the 2-unit softmax output layer.
y_tall = keras.utils.to_categorical(y_labels, num_classes=2)

## Trying Neural Networks on bag of words dataset

In [82]:
def getModel(layers,Pdrop,input_dim=1000):
    """Build an uncompiled fully connected softmax classifier.

    Every hidden layer is Dense -> BatchNormalization -> LeakyReLU(alpha=0.3).
    Dropout follows each hidden layer except the first one.

    Parameters
    ----------
    layers : sequence of int
        Width of each hidden layer, in order. Must contain at least one entry.
    Pdrop : float
        Dropout rate applied after every hidden layer but the first.
    input_dim : int, optional
        Number of input features. Defaults to 1000, the value that used to be
        hard-coded, so existing two-argument calls behave as before.

    Returns
    -------
    Sequential
        The assembled model, ending in a 2-unit softmax layer.

    Raises
    ------
    ValueError
        If `layers` is empty.
    """
    if len(layers) == 0:
        raise ValueError('layers must contain at least one hidden layer width')

    model = Sequential()

    # The first hidden layer fixes the input size; it is not followed by dropout
    model.add(Dense(layers[0],input_shape=(input_dim,)))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=0.3))

    # Remaining hidden layers, each regularised with dropout
    for width in layers[1:]:
        model.add(Dense(width))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.3))
        model.add(Dropout(Pdrop))

    # Output layer: probabilities of each of the 2 classes
    model.add(Dense(2))
    model.add(Activation('softmax'))
    return model

In [138]:
def trainModels(k,x_tall,y_tall,layers=(500,250,125,75,25),Pdrop=0.4,epochs=2,batch_size=2**8):
    """Evaluate the DNN with k-fold cross-validation.

    A fresh model is built, compiled and fitted for every fold, then scored on
    both the fold's training part and its held-out part.

    Parameters
    ----------
    k : int
        Number of folds.
    x_tall : array
        Feature matrix, indexed by row with the fold indices.
    y_tall : array
        One-hot encoded targets, row-aligned with `x_tall`.
    layers : sequence of int, optional
        Hidden layer widths handed to `getModel`.
    Pdrop : float, optional
        Dropout rate handed to `getModel`.
    epochs : int, optional
        Training epochs per fold.
    batch_size : int, optional
        Mini-batch size used for fitting.

    The optional parameters default to the values that used to be hard-coded,
    so `trainModels(k, x_tall, y_tall)` behaves as before.

    Returns
    -------
    tuple of np.ndarray
        `(trainErr, testErr, acc)`, each with one entry per fold. `trainErr`
        and `testErr` hold whatever `model.evaluate` returned for the training
        and held-out parts (loss plus the compiled accuracy metric); `acc` is
        the held-out accuracy computed by `getAccuracy`.
    """
    # Per-fold results
    trainErr = []
    testErr = []
    acc = []

    # NOTE(review): KFold is used with its defaults, so folds are taken in the
    # stored row order; if the file is sorted by label, consider shuffle=True.
    kf = sklearn.model_selection.KFold(n_splits=k)

    for i,(train,val) in enumerate(kf.split(x_tall, y_tall)):
        # Training and validation data for this fold
        Xtrain, Ytrain = x_tall[train], y_tall[train]
        Xval, Yval = x_tall[val], y_tall[val]

        # New model per fold so no weights carry over between folds
        model = getModel(list(layers),Pdrop)

        # Compile it and fit
        model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])
        model.fit(Xtrain, Ytrain, batch_size=batch_size, epochs=epochs,verbose=1,validation_data=(Xval, Yval))

        # Store training and held-out scores
        trainErr.append(model.evaluate(x=Xtrain, y=Ytrain))
        testErr.append(model.evaluate(x=Xval, y=Yval))
        acc.append(getAccuracy(model,Xval,Yval))

        # Status output
        print('k-iteration = ',i)

    return np.array(trainErr),np.array(testErr),np.array(acc)

def Unencode(out):
    """Undo one-hot encoding for two-column class scores.

    Returns a boolean array that is True for each row whose second column is
    strictly greater than its first column; ties give False.
    """
    return out[:,1] > out[:,0]

def getAccuracy(model,xt,yt):
    """Return the fraction of rows in `xt` that `model` classifies correctly.

    The model's predictions and the one-hot targets `yt` are both reduced to
    boolean labels with `Unencode` before being compared.
    """
    predicted = Unencode(model.predict(xt))
    expected = Unencode(yt)
    n_correct = np.sum(predicted == expected)
    return 1.0*n_correct/len(expected)

In [119]:
# 5-fold cross-validation over the full training set
trainErr, testErr, Vacc = trainModels(k=5, x_tall=x_tall, y_tall=y_tall)

Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
0.85375
k-iteration =  0
Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
0.84725
k-iteration =  1
Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
0.83775
k-iteration =  2
Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
0.8385
k-iteration =  3
Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
0.8475
k-iteration =  4


In [189]:
# Per-fold results followed by their average; trainErr/testErr hold the
# values returned by model.evaluate for each fold
for label, values in (('trainErr', trainErr), ('testErr', testErr)):
    print('\n ' + label)
    print(values)
    print('avg ' + label, np.mean(values, axis=0))

print('\n Accuracy')
print(Vacc)
print('avg acc', np.mean(Vacc))



 trainErr
[[0.28637429 0.8865    ]
 [0.29031374 0.8885625 ]
 [0.30523332 0.887     ]
 [0.28693927 0.8891875 ]
 [0.30363714 0.8849375 ]]
avg trainErr [0.29449955 0.8872375 ]

 testErr
[[0.38926657 0.85375   ]
 [0.43087638 0.84725   ]
 [0.48194636 0.83775   ]
 [0.47047041 0.8385    ]
 [0.43574135 0.8475    ]]
avg testErr [0.44166021 0.84495   ]

 Accuracy
[0.85375 0.84725 0.83775 0.8385  0.8475 ]
avg acc 0.8449500000000001


Hyperparameter plots

Validation accuracy plateaus at about 85% (5-fold average: 0.845).

In [143]:
# Train the final model on the entire training set, then score the test set
model = getModel([500, 250, 125, 75, 25], 0.4)
model.compile(
    loss='categorical_crossentropy',
    optimizer='RMSprop',
    metrics=['accuracy'],
)
fit = model.fit(x_tall, y_tall, batch_size=256, epochs=4, verbose=1)

# Softmax outputs for the test set
out = model.predict(FV_data, batch_size=256, verbose=1)

# Boolean class labels for the test predictions and for the training targets
ypred = Unencode(out)
ytrue = Unencode(y_tall)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [181]:
def writeResults(out,filename='DNN_submission.txt'):
    """Write the final predictions to a submission file.

    The file starts with the header line 'Id,Prediction', followed by one
    'id,label' line per row of `out`. Ids start at 1 and labels are the
    `Unencode` results converted to 0/1.

    Parameters
    ----------
    out : array
        Two-column class scores, as accepted by `Unencode`.
    filename : str, optional
        Destination path. Defaults to 'DNN_submission.txt', the path that used
        to be hard-coded.
    """
    # Decode first so a bad `out` fails before the file is created or truncated
    ypred = Unencode(out)

    # `with` guarantees the file is closed even if a write fails part-way
    with open(filename,'w') as f:
        f.write('Id,Prediction\n')
        for i,label in enumerate(ypred, start=1):
            f.write(str(i)+','+str(int(label))+'\n')

# Write the test-set predictions to DNN_submission.txt
writeResults(out=out)