# CS155 Miniproject 1

zchen@caltech.edu

Sentiment Analysis via Neural Networks

In [64]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt

import sklearn
import sklearn.model_selection

import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import BatchNormalization, LeakyReLU

## Loading Dataset and preprocessing

In [141]:
# Load final validation data set
FV_data = np.loadtxt('../data/test_data.txt',delimiter=' ',skiprows=1)
# Load input training data set
train_data = np.loadtxt('../data/training_data.txt',delimiter=' ',skiprows=1)
utrain_data = np.unique(train_data,axis=0)

# Get header words list
f = open('../data/test_data.txt','r')
words = np.array(f.readline().split())
f.close()

# Splot y_train and x_train from training set
x_tall = train_data[:,1:]
y_tall = train_data[:,0]

x_uall = utrain_data[:,1:]
y_uall = utrain_data[:,0]

# One hot encode categories
y_tall = keras.utils.np_utils.to_categorical(y_tall)

## Useful functions for Neural network debugging

In [None]:
# Function to generate DNN of given depth and width
def getModel(layers,Pdrop):
    model = Sequential()
    model.add(Dense(layers[0],input_shape=(1000,)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(Pdrop))
    for i in layers[1:]:
        model.add(Dense(i))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
    
    # predicting probabilities of each of the 2 classes
    model.add(Dense(2))
    model.add(Activation('softmax'))
    return model

# undo one hot encoding
def Unencode(out):
    ypred = out[:,0] < out[:,1]
    return ypred

# Function to get explicit model accuracy from softmax
def getAccuracy(model,xt,yt):
    out = model.predict(xt)
    ypred = Unencode(out)
    ytrue = Unencode(yt)
    acc = 1.0*np.sum(ypred == ytrue)/len(ytrue)
    return acc

# Function to get bag of words which were misclassified
def getBagOfWords(xtrain,ypred,ytrue,words):
    out = []
    ypredu = Unencode(ypred).astype(int)
    ytrueu = Unencode(ytrue).astype(int)
    # Get locations of bag of words which were misclassified
    idx = np.arange(0,len(ypredu))
    idxErr = idx[ypredu!=ytrueu]
    Xerr = xtrain[ypredu!=ytrueu]
    j = 0
    for i in Xerr:
        out.append([ytrue[idxErr[j]],ypred[idxErr[j]],words[i>0],i[i>0]])
        j=j+1
    return out

# Function to write final predictions
def writeResults(ypred):
    f = open('DNN_submission.txt','w')
    f.write('Id,Prediction\n')
    for i in range(0,len(ypred)):
        f.write(str(i+1)+','+str(int(ypred[i]))+'\n')
    f.close()

## Dense Neural network model for bag of words predictor

Initial testing of single DNN

In [72]:
# Split the training data k-fold number of ways for k-fold validation of the learning algorithm
k=5
kf = sklearn.model_selection.KFold(n_splits=k)
inds = [ind for ind in kf.split(x_tall, y_tall)]

i=0
train,val = inds[0]
# Training and validation data for k fold cross validation
Xtrain = x_tall[train]
Ytrain = y_tall[train]
Xval = x_tall[val]
Yval = y_tall[val]

# Define the DNN model
model = getModel([500,250,125],0.4)

# Compile it and fit
model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])
model.fit(Xtrain, Ytrain, batch_size=2**8, epochs=4,verbose=1,validation_data=(Xval, Yval))
ypred = model.predict(Xval,batch_size=2**8,verbose=1)

Train on 16000 samples, validate on 4000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [73]:
# Debugging bag of words
out = getBagOfWords(Xval,ypred,Yval,words)
print(out[0])

[array([0., 1.]), array([0.6994479 , 0.30055207], dtype=float32), array(['thi', 'veri', 'get', 'onli', 'look', 'want', 'see', 'mani', 'new',
       'peopl', 'still', 'need', 'two', 'feel', 'start', 'long', 'listen',
       'excel', 'enough', 'person', 'cover', 'almost', 'scene', 'instead',
       'famili', 'sever', 'hour', 'els', 'fine', 'talk', 'american',
       'entir', 'lack', 'impress', 'state', 'avail', 'certainli',
       'student', 'danc', 'parent', 'critic', 'centuri', 'train',
       'aspect'], dtype='<U12'), array([3., 2., 1., 1., 2., 1., 2., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1.,
       1., 1., 2., 2., 6., 1., 1., 1., 1., 1.])]


Method  validation for single DNN

In [137]:
# Compile the DNN model
def getDNNCross(k,x_tall,y_tall):
    # Storage for k fold cross validation    
    trainErr = []
    testErr = []
    acc = []
    
    # Split the training data k-fold number of ways for k-fold validation of the learning algorithm
    kf = sklearn.model_selection.KFold(n_splits=k)
    inds = [ind for ind in kf.split(x_tall, y_tall)]
    
    i=0
    for train,val in inds:
        # Training and validation data for k fold cross validation
        Xtrain = x_tall[train]
        Ytrain = y_tall[train]
        Xval = x_tall[val]
        Yval = y_tall[val]
    
        # Define the DNN model
        model = getModel([500,250,125],0.4)
        
        # Compile it and fit
        model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])
        model.fit(Xtrain, Ytrain, batch_size=2**8, epochs=10,verbose=1,validation_data=(Xval, Yval))
        
        # store training and test error
        trainErr.append(model.evaluate(x=Xtrain, y=Ytrain))
        testErr.append(model.evaluate(x=Xval, y=Yval))
        acc.append(getAccuracy(model,Xval,Yval))

        # Status output
        print('k-iteration = ',i)
        i=i+1
        trainErr = np.array(trainErr)
        testErr = np.array(testErr)
        
    print('\n trainErr')
    print(trainErr)
    print('avg trainErr',np.mean(trainErr,axis=0))
    print('\n testErr')
    print(testErr)
    print('avg testErr',np.mean(testErr,axis=0))
    print('\n Accuracy')
    print(acc)
    print('avg acc',np.mean(acc))

In [138]:
getDNNCross(5,x_tall,y_tall)

Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
k-iteration =  0

 trainErr
[[0.00926458 0.99825   ]]
avg trainErr [0.00926458 0.99825   ]

 testErr
[[0.61978289 0.84375   ]]
avg testErr [0.61978289 0.84375   ]

 Accuracy
[0.84375]
avg acc 0.84375
Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


AttributeError: 'numpy.ndarray' object has no attribute 'append'

Trying Random Forest of weakly trained Neural Networks

In [130]:
# Split the training data k-fold number of ways for k-fold validation of the learning algorithm
k=5
kf = sklearn.model_selection.KFold(n_splits=k)
inds = [ind for ind in kf.split(x_tall, y_tall)
train,val = inds[0]
# Training and validation data for k fold cross validation
Xtrain = x_uall[train]
Ytrain = y_uall[train]
Xval = x_uall[val]
Yval = y_uall[val]

# Specify number of Neural networks to train
N_models = 10
Predictions = []
TrainErr = []
TestErr = []

# Define the DNN model
for i in range(0,N_models):
    print('Training DNN ',i)
    model = getModel([500,250,125],0)
    # Compile it and fit
    model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])
    model.fit(Xtrain, Ytrain, batch_size=2**8, epochs=2)
    # Store the models
    TrainErr.append(model.evaluate(x=Xtrain, y=Ytrain))
    TestErr.append(model.evaluate(x=Xval, y=Yval))
    # Use weakly trained model to predict and store predictions
    ypred = model.predict(Xval,batch_size=2**8)
    Predictions.append(ypred)

Predictions = np.array(Predictions)
TrainErr = np.array(TrainErr)
TestErr = np.array(TestErr)

Training DNN  0
Epoch 1/2
Epoch 2/2
Training DNN  1
Epoch 1/2
Epoch 2/2
Training DNN  2
Epoch 1/2
Epoch 2/2
Training DNN  3
Epoch 1/2
Epoch 2/2
Training DNN  4
Epoch 1/2
Epoch 2/2
Training DNN  5
Epoch 1/2
Epoch 2/2
Training DNN  6
Epoch 1/2
Epoch 2/2
Training DNN  7
Epoch 1/2
Epoch 2/2
Training DNN  8
Epoch 1/2
Epoch 2/2
Training DNN  9
Epoch 1/2
Epoch 2/2
Training DNN  10
Epoch 1/2
Epoch 2/2
Training DNN  11
Epoch 1/2
Epoch 2/2
Training DNN  12
Epoch 1/2
Epoch 2/2
Training DNN  13
Epoch 1/2
Epoch 2/2
Training DNN  14
Epoch 1/2
Epoch 2/2
Training DNN  15
Epoch 1/2
Epoch 2/2
Training DNN  16
Epoch 1/2
Epoch 2/2
Training DNN  17
Epoch 1/2
Epoch 2/2
Training DNN  18
Epoch 1/2
Epoch 2/2
Training DNN  19
Epoch 1/2
Epoch 2/2
Training DNN  20
Epoch 1/2
Epoch 2/2
Training DNN  21
Epoch 1/2
Epoch 2/2
Training DNN  22
Epoch 1/2
Epoch 2/2
Training DNN  23
Epoch 1/2
Epoch 2/2
Training DNN  24
Epoch 1/2


Epoch 2/2
Training DNN  25
Epoch 1/2
Epoch 2/2
Training DNN  26
Epoch 1/2
Epoch 2/2
Training DNN  27
Epoch 1/2
Epoch 2/2
Training DNN  28
Epoch 1/2
Epoch 2/2
Training DNN  29
Epoch 1/2
Epoch 2/2
Training DNN  30
Epoch 1/2
Epoch 2/2
Training DNN  31
Epoch 1/2
Epoch 2/2
Training DNN  32
Epoch 1/2
Epoch 2/2
Training DNN  33
Epoch 1/2
Epoch 2/2
Training DNN  34
Epoch 1/2
Epoch 2/2
Training DNN  35
Epoch 1/2
Epoch 2/2
Training DNN  36
Epoch 1/2
Epoch 2/2
Training DNN  37
Epoch 1/2
Epoch 2/2
Training DNN  38
Epoch 1/2
Epoch 2/2
Training DNN  39
Epoch 1/2
Epoch 2/2
Training DNN  40
Epoch 1/2
Epoch 2/2
Training DNN  41
Epoch 1/2
Epoch 2/2
Training DNN  42
Epoch 1/2
Epoch 2/2
Training DNN  43
Epoch 1/2
Epoch 2/2
Training DNN  44
Epoch 1/2
Epoch 2/2
Training DNN  45
Epoch 1/2
Epoch 2/2
Training DNN  46
Epoch 1/2
Epoch 2/2
Training DNN  47
Epoch 1/2
Epoch 2/2
Training DNN  48
Epoch 1/2


Epoch 2/2
Training DNN  49
Epoch 1/2
Epoch 2/2


In [136]:
ypred = []
for i in Predictions:
    ypred.append(Unencode(i).astype(int))

# Get mean and standard deviation of samples
ypmean=np.mean(ypred,axis=0)
std=np.std(ypred,axis=0)

print('Number of samples where stdev > 0, ',np.sum(std>0))
ypred = (ypmean > 0.5).astype(int)
ytrue = Unencode(Yval).astype(int)

acc = 1.0*np.sum(ypred == ytrue)/len(ytrue)
print('Final accuracy of random forest = ',acc)

Number of samples where stdev > 0,  2119
Final accuracy of random forest =  0.86


In [142]:
# Random neuron forest
def getDNNForestCross(k,N_models,x_tall,y_tall):
    # Storage for k fold cross validation
    facc = []
    
    # Split the training data k-fold number of ways for k-fold validation of the learning algorithm
    k=5
    kf = sklearn.model_selection.KFold(n_splits=k)
    inds = [ind for ind in kf.split(x_tall, y_tall)]

    j=0
    for train,val in inds:
        print('Cross fold validation ',j,'/',k)
        # Training and validation data for k fold cross validation
        Xtrain = x_tall[train]
        Ytrain = y_tall[train]
        Xval = x_tall[val]
        Yval = y_tall[val]

        # Store predictions from each tree in DNN forest
        Predictions = []
        
        # Define the DNN model
        for i in range(0,N_models):
            print('Training DNN ',i)
            model = getModel([500,250,125],0)
            # Compile it and fit
            model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])
            model.fit(Xtrain, Ytrain, batch_size=2**8, epochs=2,verbose=1)

            # Use weakly trained model to predict and store predictions
            ypred = model.predict(Xval,batch_size=2**8,verbose=1)
            Predictions.append(ypred)

        Predictions = np.array(Predictions)
        
        ypred = []
        for i in Predictions:
            ypred.append(Unencode(i).astype(int))

        # Get mean and standard deviation of samples
        ypmean=np.mean(ypred,axis=0)
        std=np.std(ypred,axis=0)
        print('Number of samples where stdev > 0, ',np.sum(std>0))
        
        # Compute accuracy of predictions
        ypred = (ypmean > 0.5).astype(int)
        ytrue = Unencode(Yval).astype(int)
        acc = 1.0*np.sum(ypred == ytrue)/len(ytrue)
        print('Final accuracy of random forest = ',acc)
        j=j+1
        
        facc.append(acc)
        
    print('avg Accuracy of all random forests',np.mean(facc))

In [None]:
getDNNForestCross(5,10,x_tall,y_tall)

Cross fold validation  0 / 5
Training DNN  0
Epoch 1/2
Epoch 2/2
Training DNN  1
Epoch 1/2
Epoch 2/2
Training DNN  2
Epoch 1/2
Epoch 2/2
Training DNN  3
Epoch 1/2
Epoch 2/2
Training DNN  4
Epoch 1/2
Epoch 2/2
Training DNN  5
Epoch 1/2
Epoch 2/2
Training DNN  6
Epoch 1/2
Epoch 2/2
Training DNN  7
Epoch 1/2
Epoch 2/2
Training DNN  8
Epoch 1/2
Epoch 2/2
Training DNN  9
Epoch 1/2
Epoch 2/2
Number of samples where stdev > 0,  1412
Final accuracy of random forest =  0.85875
Cross fold validation  1 / 5
Training DNN  0
Epoch 1/2
Epoch 2/2
Training DNN  1
Epoch 1/2
Epoch 2/2
Training DNN  2
Epoch 1/2
Epoch 2/2
Training DNN  3
Epoch 1/2
Epoch 2/2
Training DNN  4
Epoch 1/2
Epoch 2/2
Training DNN  5
Epoch 1/2
Epoch 2/2
Training DNN  6
Epoch 1/2
Epoch 2/2
Training DNN  7
Epoch 1/2
Epoch 2/2
Training DNN  8
Epoch 1/2
Epoch 2/2
Training DNN  9
Epoch 1/2
Epoch 2/2
Number of samples where stdev > 0,  1477
Final accuracy of random forest =  0.8565
Cross fold validation  2 / 5
Training DNN  0
Epoch 1/2


### Output final predictions from the model

In [None]:
# Specify number of Neural networks to train
N_models = 20
Predictions = []

# Define the DNN model
for i in range(0,N_models):
    print('Training DNN ',i)
    model = getModel([500,250,125],0)
    # Compile it and fit
    model.compile(loss='categorical_crossentropy',optimizer='RMSprop', metrics=['accuracy'])
    model.fit(x_uall, y_uall, batch_size=2**8, epochs=2)
    # Use weakly trained model to predict and store predictions
    ypred = model.predict(FV_data,batch_size=2**8)
    Predictions.append(ypred)

In [None]:
Predictions = np.array(Predictions)

ypred = []
for i in Predictions:
    ypred.append(Unencode(i).astype(int))

# Get mean and standard deviation of samples
ypmean=np.mean(ypred,axis=0)
std=np.std(ypred,axis=0)
print('Number of samples where stdev > 0, ',np.sum(std>0))

# Compute final predictions and output it
ypred = (ypmean > 0.5).astype(int)
writeResults(ypred)