# Preprocessing of the data

In this part of the code I import the data from the files, transform the categorical variables into numerical ones, and normalize the values of each variable between 0 and 1.

Here I define some helpers to process the dataset. The class stringToInt (instantiated later as strToIntGenerator) transforms the categorical variables into numerical variables.

In [41]:
import csv
import numpy as np


def is_number(s):
    """Return True when ``s`` can be converted to a float, False otherwise.

    Parameters
    ----------
    s : object
        Value to test, usually a string field read from the CSV file.

    Returns
    -------
    bool
        True if ``float(s)`` succeeds.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: text that is not a number, e.g. '' or 'abc'.
        # TypeError: objects float() cannot handle at all, e.g. None.
        return False
    return True


class stringToInt(object):
    """Convert the string values of a dataset column into floats.

    The conversion is driven by ``datasetDict``, a dictionary with one entry
    per variable name. Each entry is itself a dictionary that provides:

    * ``'isNumber'``: True when the column holds numbers, False for labels;
    * ``'hasEmptySymb'``: True when the column contains empty values;
    * ``'emptySymb'``: value substituted for empty fields of a numeric column;
    * ``'symbols'``: list of the labels of a categorical column.
    """

    def __init__(self, datasetDict):
        self.datasetDict = datasetDict

    def toInt(self, varName, listValue):
        """Return the values of column ``varName`` as a list of floats.

        Parameters
        ----------
        varName : key of ``datasetDict`` describing the column.
        listValue : iterable with the raw (string) values of the column.
        """
        varInfo = self.datasetDict[varName]
        if varInfo['isNumber']:
            return self.toIntFromNumbers(varName, varInfo['hasEmptySymb'], listValue)
        return self.toIntFromLabels(varName, listValue)

    def toIntFromNumbers(self, varName, hasEmpty, listValue):
        """Convert numeric strings to floats, filling the empty fields.

        Raises
        ------
        ValueError
            If an empty value is met but no ``'emptySymb'`` replacement is
            available for the column, or if a value is not a valid number.
        """
        emptySymb = None
        if hasEmpty:
            emptySymb = self.datasetDict[varName].get('emptySymb')

        lstValueFloat = []
        for vl in listValue:
            if vl:
                lstValueFloat.append(float(vl))
            elif emptySymb is None:
                # Without this check the code failed with an unrelated
                # "referenced before assignment" error.
                raise ValueError("variable %r has an empty value but no "
                                 "'emptySymb' replacement" % (varName,))
            else:
                lstValueFloat.append(emptySymb)
        return lstValueFloat

    def toIntFromLabels(self, varName, listValue):
        """Replace every label by its position in the ``'symbols'`` list.

        Raises
        ------
        ValueError
            If a label is not among the symbols of the column.
        """
        # Use the dictionary owned by the instance, not a module-level global.
        symb = self.datasetDict[varName]['symbols']

        # One pass to build the lookup table; setdefault keeps the first
        # position of a repeated symbol, exactly like list.index would.
        position = {}
        for cnt, label in enumerate(symb):
            position.setdefault(label, float(cnt))

        lstValueFloat = []
        for vl in listValue:
            if vl not in position:
                raise ValueError("label %r of variable %r is not in the list "
                                 "of known symbols" % (vl, varName))
            lstValueFloat.append(position[vl])
        return lstValueFloat


Loading the data from the train and test file

In [43]:
def _read_rows(path):
    """Return all the rows of the CSV file at ``path`` as a list of lists."""
    with open(path, 'rb') as csvfile:
        # The reader splits on blanks; the ';'-separated record is later
        # taken from the first field of every row.
        return list(csv.reader(csvfile, delimiter=' ', quotechar='|'))


# Raw rows (header line included) of the train and of the test file.
row_data      = _read_rows('test_ljn/train.csv')
row_data_test = _read_rows('test_ljn/test.csv')




Building a dictionary describing the dataset. The dictionary is called datasetDict.
It has one entry for each of the variables X0,...,X13,Y.

In [44]:
# The first row of the file holds the ';'-separated names of the variables,
# the remaining rows hold the data.
variableName = row_data[0][0].split(';')
row_data     = row_data[1:]
n_entry      = len(row_data[0][0].split(';'))

datasetDict = {}
isNumber    = []

for cnt in range(n_entry):
    dataString = [ln[0].split(';')[cnt] for ln in row_data]

    # Sorted so that the integer code of each label is the same at every run
    # (the iteration order of a set is an implementation detail).
    symbols  = sorted(set(dataString))
    nonEmpty = [sy for sy in symbols if sy != '']

    # A variable is numeric only if all its non-empty values are numbers;
    # looking at a single row fails when that row has a missing value.
    isNumber.append(bool(nonEmpty) and all(is_number(sy) for sy in nonEmpty))

    variableDict = {'isNumber': isNumber[cnt],
                    'n_symbols': len(symbols),
                    'symbols': symbols,
                    # Explicit membership test instead of relying on the
                    # empty string being the first element of the set.
                    'hasEmptySymb': '' in symbols}

    if variableDict['hasEmptySymb'] and isNumber[cnt]:
        # Fill value for the missing fields: mean of the distinct non-empty
        # values of the variable.
        variableDict['emptySymb'] = np.asarray(nonEmpty, dtype=float).mean()

    datasetDict[variableName[cnt]] = variableDict

print("Keys of the datasetDictionary: %s" % (list(datasetDict.keys()),))
print("Keys of one of the dataset entry: %s" % (list(datasetDict['X1'].keys()),))
    

Keys of the datasetDictionary: ['X12', 'X13', 'Y', 'X10', 'X8', 'X9', 'X11', 'X2', 'X3', 'X0', 'X1', 'X6', 'X7', 'X4', 'X5']
Keys of one of the dataset entry: ['symbols', 'isNumber', 'n_symbols', 'hasEmptySymb']


The object strToIntGenerator (an instance of the class stringToInt) transforms the categorical variables into numerical variables.

In [45]:
def _encode_columns(converter, rows, n_columns):
    """Return the first ``n_columns`` variables of ``rows`` as float lists.

    Every row stores its ';'-separated record in its first field. The result
    has one list per variable (column-major layout).
    """
    columns = []
    for col in range(n_columns):
        raw_values = [record[0].split(';')[col] for record in rows]
        columns.append(converter.toInt(variableName[col], raw_values))
    return columns


strToIntGenerator = stringToInt(datasetDict)

# All the variables of the training file, the last one included.
train_set = _encode_columns(strToIntGenerator, row_data, n_entry)

# Drop the header of the test file; its last variable is not converted.
row_data_test = row_data_test[1:]
test_set      = _encode_columns(strToIntGenerator, row_data_test, n_entry - 1)


The values of the variables are normalized between 0 and 1

In [46]:
train_set_norm = []
test_set_norm  = []
for tr, ts in zip(train_set[:-1], test_set):
    tr = np.asarray(tr, dtype=float)
    ts = np.asarray(ts, dtype=float)

    # The statistics must be taken BEFORE rescaling the training column:
    # computing them afterwards gives min=0 and max=1 and leaves the test
    # column unchanged.
    tr_min   = tr.min()
    tr_range = tr.max() - tr_min
    if tr_range == 0:
        # Constant column: avoid 0/0, every value is mapped to 0.
        tr_range = 1.

    # The test set is scaled with the training statistics, so its values may
    # fall slightly outside [0, 1].
    train_set_norm.append((tr - tr_min) / tr_range)
    test_set_norm.append((ts - tr_min) / tr_range)

# The last training column is appended as it is, without rescaling.
train_set_norm.append(np.asarray(train_set[-1]))

# One row per sample, one column per variable.
train_set_norm = np.hstack([col.reshape(-1, 1) for col in train_set_norm])
test_set_norm  = np.hstack([col.reshape(-1, 1) for col in test_set_norm])

The classes are not balanced: the samples belonging to class 0 are many more than the ones
belonging to class 1.

In [47]:
# The last column holds the 0/1 class, so its sum counts the class-1 samples.
# Written as print(...) calls with a single argument so that the same text is
# produced whether print is a statement or a function.
print('Number of data belonging to class 1: %i' % int(train_set_norm[:,-1].sum()))
print('Total number of data: %i' % train_set_norm.shape[0])

Number of data belonging to class 1: 7841
Total number of data: 32561


In order to have datasets with balanced classes I decided to prepare three different datasets.
Each of the three datasets contains all the data belonging to class 1 and 1/3 of the data belonging to class 0.
In this way each dataset is roughly balanced. I will then train three different classifiers on the three datasets and average their outputs to obtain the predicted class for the test set. Each dataset is split into a training and a validation set. The validation set is 1/10 of each dataset.

In [48]:
# Row indices of the two classes (the last column holds the 0/1 class).
idx_one  = np.where(train_set_norm[:,-1]>.5)[0]
idx_zero = np.where(train_set_norm[:,-1]<.5)[0]
np.random.shuffle(idx_zero)

trainGroup = []
n_groups   = 3
# Floor division: the size is used as a slice bound and must be an integer
# (a plain "/" yields a float under true division). The remainder of the
# class-0 samples is left out.
sz_group   = idx_zero.shape[0] // n_groups
for cnt in range(n_groups):
    # Every group gets all the class-1 rows plus its own share of class-0 rows.
    idx_zero_group = idx_zero[cnt*sz_group:(cnt+1)*sz_group]
    trainGroup.append(np.vstack([train_set_norm[idx_one,:], train_set_norm[idx_zero_group,:]]))


trainValidGroup = []
valid_frac      = 0.1
for gr in trainGroup:
    # Random split of the rows of the group into validation and training.
    idx   = np.arange(gr.shape[0])
    np.random.shuffle(idx)

    n_valid = int(gr.shape[0] * valid_frac)
    valid   = gr[idx[:n_valid]]
    train   = gr[idx[n_valid:]]

    # Order matters: element 0 is the validation set, element 1 the training set.
    trainValidGroup.append([valid,train])

# Building Classifiers

In [49]:
import numpy as np
import theano
import theano.tensor as T
import numpy
import numpy as np
import numpy.random as rng

Define a model class: in the following I will build two kinds of models, a logistic regression and a neural network. Both classes inherit from the class classifier that is defined here.

In [50]:
class classifier(object):
    """Base class of the models.

    Subclasses are expected to define the attributes used below:
    ``p_y_given_x`` (class probabilities), ``y_pred`` (predicted class) and
    ``params`` (list of the parameters to train).
    """

    def __init__(self, **kwargs):
        pass

    def negative_log_likelihood(self, y):
        """Return the mean negative log-probability given to the classes ``y``."""
        log_probs   = T.log(self.p_y_given_x)
        sample_rows = T.arange(y.shape[0])
        # For every row select the log-probability of the column given by y.
        return -T.mean(log_probs[sample_rows, y])

    def get_updates(self, cost, learning_rate):
        """Return the gradient-descent updates as ``(param, new_value)`` pairs."""
        updates = []
        for param in self.params:
            grd = T.grad(cost=cost, wrt=param)
            updates.append((param, param - learning_rate * grd))
        return updates

    def errors(self, y):
        """Return the fraction of predictions that differ from ``y``."""
        mismatch = T.neq(self.y_pred, y)
        return T.mean(mismatch)

Define the class LogisticRegression, which inherits from the class classifier.

In [51]:
class LogisticRegression(classifier):
    """Softmax (multinomial logistic) regression.

    Parameters
    ----------
    input : symbolic variable holding the input samples.
    n_in : number of columns of the input.
    n_out : number of classes.
    """

    def __init__(self, input, n_in, n_out):
        float_type = theano.config.floatX
        self.input = input

        # Weights and biases both start from zero.
        W_init = numpy.zeros((n_in, n_out), dtype=float_type)
        b_init = numpy.zeros((n_out,), dtype=float_type)
        self.W = theano.shared(value=W_init, name='W', borrow=True)
        self.b = theano.shared(value=b_init, name='b', borrow=True)
        self.params = [self.W, self.b]

        # Class probabilities and most probable class of every sample.
        activation = T.dot(input, self.W) + self.b
        self.p_y_given_x = T.nnet.softmax(activation)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

Define the class NN, which also inherits from the class classifier.
The following class implements a feed-forward neural network.
n_in: dimension of the input
n_out: dimension of the output
n_hidden: number of nodes in each hidden layer
n_layers: number of hidden-to-hidden layers added after the first hidden layer (the network has n_layers + 1 hidden layers in total)

In [52]:
class NN(classifier):
    """Feed-forward network: ReLU hidden layers followed by a softmax output.

    Parameters
    ----------
    input : symbolic variable holding the input samples.
    n_in : number of columns of the input.
    n_out : number of classes.
    n_hidden : number of units of every hidden layer.
    n_layers : number of hidden-to-hidden layers that follow the first hidden
        layer; the network therefore has ``n_layers + 1`` hidden layers.
    """

    def __init__(self, input, n_in, n_out, n_hidden, n_layers):
        float_type = theano.config.floatX

        def uniform_weights(fan_in, fan_out):
            # Uniform values in [-b, b] with b = sqrt(6 / (fan_in + fan_out)).
            bound = numpy.sqrt(6. / (fan_in + fan_out))
            return np.asarray(rng.uniform(low=-bound, high=bound,
                                          size=(fan_in, fan_out)),
                              dtype=float_type)

        # Sizes of the consecutive layers: input, hidden layers, output.
        sizes = [n_in, n_hidden] + [n_hidden] * n_layers + [n_out]

        W_values = []
        b_values = []
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            W_values.append(uniform_weights(fan_in, fan_out))
            b_values.append(numpy.zeros((fan_out,), dtype=float_type))

        self.Ws = [theano.shared(value=W, borrow=True) for W in W_values]
        self.bs = [theano.shared(value=b, borrow=True) for b in b_values]

        # All the layers but the last one use a ReLU non-linearity.
        hidden = input
        for W, b in zip(self.Ws[:-1], self.bs[:-1]):
            hidden = T.nnet.relu(T.dot(hidden, W) + b)

        # The output layer gives the class probabilities.
        self.p_y_given_x = T.nnet.softmax(T.dot(hidden, self.Ws[-1]) + self.bs[-1])
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.params = self.Ws + self.bs
        self.input = input

# Training the classifier

Cast the datasets into theano shared variables

In [53]:
data = trainValidGroup  # alternative source: np.load('dataTrain.npy')


def _shared_inputs(block):
    """Wrap all the columns of ``block`` but the last into a shared variable."""
    return theano.shared(np.asarray(block[:, :-1], dtype=theano.config.floatX))


def _shared_targets(block):
    """Wrap the last column of ``block`` into a shared variable cast to int32."""
    return T.cast(theano.shared(np.asarray(block[:, -1], dtype=int)), 'int32')


# Every element of data is a [validation, training] pair.
validx = [_shared_inputs(gr[0]) for gr in data]
validy = [_shared_targets(gr[0]) for gr in data]
trainx = [_shared_inputs(gr[1]) for gr in data]
trainy = [_shared_targets(gr[1]) for gr in data]

valid_sets_x = validx
valid_sets_y = validy
train_sets_x = trainx
train_sets_y = trainy

In [54]:
# Training hyper-parameters.
batch_size    = 30     # number of rows of every minibatch slice
n_epochs      = 100    # number of passes of the training loop
learning_rate = 0.01   # step size passed to get_updates

Compiling theano function for training and validating

In [55]:
# Symbolic index of the minibatch and symbolic inputs shared by all the models.
index = T.lscalar()

x = T.matrix('x')
y = T.ivector('y')

n_models    = 3
# Alternative model:
#classifiers = [LogisticRegression(input=x, n_in=14, n_out=2) for _ in range(n_models)]
classifiers = [NN(input=x, n_in=14, n_out=2, n_hidden=50, n_layers=1) for _ in range(n_models)]

# The cost minimized during training is the negative log likelihood of the
# model, in symbolic format.
# The loop variable is called clf and not classifier: where list
# comprehensions leak their variable, the latter name would overwrite the
# base class.
costs = [clf.negative_log_likelihood(y) for clf in classifiers]

# One function per model, returning the error rate on one validation minibatch.
validate_models = [
    theano.function(
        inputs=[index],
        outputs=clf.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    ) for clf, valid_set_x, valid_set_y in zip(classifiers, valid_sets_x, valid_sets_y)]

updates = [clf.get_updates(cost, learning_rate) for clf, cost in zip(classifiers, costs)]

# One function per model: it returns the cost on one training minibatch and
# applies the gradient-descent updates to the parameters of the model.
train_models = [
    theano.function(
        inputs=[index],
        outputs=cost,
        updates=update,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    ) for cost, update, train_set_x, train_set_y in zip(costs, updates, train_sets_x, train_sets_y)]

Training the three models

In [None]:
# Number of complete minibatches available to every model. The sizes are read
# from the lists of shared datasets; the smallest one is used so that each
# minibatch index is valid for all the models.
n_train_batches = min(tx.get_value(borrow=True).shape[0] for tx in train_sets_x) // batch_size
n_valid_batches = min(vx.get_value(borrow=True).shape[0] for vx in valid_sets_x) // batch_size

for epoch in range(1, n_epochs + 1):

    # One pass over the training minibatches; every call also updates the
    # parameters of the model and returns the cost of the minibatch.
    train_error = np.zeros(n_models)
    for minibatch_index in range(n_train_batches):
        for cnt_mdl in range(n_models):
            train_error[cnt_mdl] += train_models[cnt_mdl](minibatch_index)
    train_error = train_error/float(n_train_batches)

    # Mean error rate of every model over the validation minibatches.
    validation_losses = np.zeros(n_models)
    for minibatch_index in range(n_valid_batches):
        for cnt_mdl in range(n_models):
            validation_losses[cnt_mdl] += validate_models[cnt_mdl](minibatch_index)
    validation_losses = validation_losses/float(n_valid_batches)

    if np.mod(epoch, 10)==0:
        print('epoch %i' %epoch)
        for cnt_mdl in range(n_models):
            # The "train error" is the mean training cost (negative log
            # likelihood); the validation error is printed as a percentage.
            print('Model %i: train error %.4f, validation error %.4f' %(cnt_mdl, train_error[cnt_mdl], validation_losses[cnt_mdl] * 100.))
