In [1]:
import pandas as pd
import numpy
import sklearn as sk
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from sklearn import tree
from sklearn import svm

Using Theano backend.


# Data preprocessing

In [2]:
#load dataset
dataframe = pd.DataFrame.from_csv('base_train.csv',sep=';',index_col=None)

Check that all variables are binary.

In [3]:
# check that sum of 1 and 0 values equals to the dimension of the data
dataset = dataframe.values
sum((dataset == 0)*1)+sum((dataset == 1)*1)

array([    0,     0, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 31083, 31083])

Find missing data.

In [4]:
# check the number of missing inputs and outputs
dataframe.isnull().sum()

report_dt        0
ID               0
1.1.             0
1.2.             0
1.3.             0
1.4.             0
1.5.             0
1.6.             0
1.7.             0
1.8.             0
1.9.             0
1.10.            0
1.11.            0
1.12.            0
1.13.            0
1.14.            0
1.15.            0
1.16.            0
1.17.            0
1.18.            0
1.19.            0
2.1.             0
2.2.             0
2.3.             0
2.4.             0
2.5.             0
2.6.             0
2.7.             0
2.8.             0
2.9.             0
2.10.            0
2.11.            0
2.12.            0
2.13.            0
2.14.            0
2.15.            0
2.16.            0
3.1.             0
3.2.             0
3.3.             0
3.4.             0
3.5.             0
3.6.             0
3.7.             0
3.8.             0
3.9.             0
3.10.            0
3.11.            0
3.12.            0
3.13.            0
3.14.            0
3.15.            0
X3          

There are no missing inputs, but about 25% of outputs are missing. We need to exclude those records from the dataset, as they aren't valid both for trainig and testing.

In [5]:
# delete data with missing values
dataframe = dataframe.dropna()

Next we test how the dataset is balanced, i.e. what is the proportion of 0 and 1 values of output variables. 

In [6]:
# number of 1 values in X3 variable
sum((dataframe['X3'] ==1)*1)

966

In [7]:
# number of 1 values in Y3 variable
sum((dataframe['Y3'] ==1)*1)

669

We see that value 1 is obtained by less then 1/30 of the dataset. Which means that we'll need to apply different weights to 0 and 1 during trainig to balance classes.

As there's no a priori information about the data we delete columns report_dt and ID. Also as we don't know the nature of the output variables (if ther's any connection between them) we use separate models to predict X3 and Y3.

In [8]:
del dataframe['report_dt']
del dataframe['ID']

In the last step we split data into training and test sets in proportion 3/1.

In [22]:
dataset = dataframe.values
# split into trainig (75%) and test (25%) sets
train, test = sk.cross_validation.train_test_split(dataset, test_size = 0.25)
# split into input (X) and (Y) variables
Xtrain = train[:,0:train.shape[1]-2]
Ytrain = train[:,train.shape[1]-2:train.shape[1]]
Xtest = test[:,0:test.shape[1]-2]
Ytest = test[:,test.shape[1]-2:test.shape[1]]

# Neural network

We build a fully-connected network with three layers. The network weights are initialized to a small random number drawn from a uniform distribution. As an activation function on the first two layers we use the rectifier function ('relu'). In the last layer we use sigmoid to insure the output is between 0 and 1. First layer has 12 neurons, 2nd layer 8 neurons, the output layer has 1 neuron to predict X3 variable.

In [24]:
# create model
model = Sequential()
model.add(Dense(12, input_dim=train.shape[1]-2, init='uniform', activation='relu'))
model.add(Dense(8, init='uniform', activation='relu'))
model.add(Dense(2, init='uniform', activation='sigmoid'))

To find the best set of weights to make predictions we must specify: the loss function, optimizer, metrics that we want to calculate during training. For binary classification problem we pick "binary_crossentropy" loss. The optimization method is gradient descent "adadelta". We calculate classification accuracy as a metric.

In [25]:
# set model
model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])

Here we train the model. We pick 200 iterations for the optimization algorithm and a batch size of 10. Each epoch separates 20% of data for validation.

In [26]:
# train the model
model.fit(Xtrain,Ytrain, validation_split=0.2, nb_epoch=150, batch_size=32, class_weight="balanced", verbose=0)

<keras.callbacks.History at 0x110b2f0d0>

Finally, after the model is ready, we generate predicitons on the test set.

In [32]:
# make prediction on test set
Ypred= model.predict(Xtest)
#Ypred[Ypred<0.2] = 0
#Ypred[Ypred>=0.2] = 1

In [35]:
Ypred[Ypred<0.5]=0
Ypred[Ypred>=0.5]=1

# Decision tree

Here we build the decision tree model using sklearn framework.

In [28]:
# create and fit model
modeltree = tree.DecisionTreeClassifier()
modeltree.fit(Xtrain, Ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [30]:
# make prediction on test set
Ypredtree = modeltree.predict(Xtest)

# SVM

The next model is the SVM classifier.

In [38]:
# create and fit model
modelsvm = svm.SVC(class_weight='auto')
modelsvm.fit(Xtrain, Ytrain)

ValueError: bad input shape (23312, 2)

In [39]:
# make prediction on test set
Ypredsvm = modelsvm.predict(Xtest)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

# Evaluate the models

Here we define a function that evaluates performance of the model through different metrics. Short description for each metric is given in the code below.

In [40]:
def evaluation(Xtest,Ytest,Ypred):
    
    # accuracy on test set
    score = sk.metrics.accuracy_score(Ytest,Ypred)
    print 'accuracy:', round(score*100,2), '%'
    
    # precision on test set
    # The precision is the ratio tp / (tp + fp) where tp is the
    # number of true positives and fp the number of false positives.
    # The best value is 1 and the worst value is 0.
    score = sk.metrics.precision_score(Ytest,Ypred)
    print 'precision:', round(score*100,2), '%' 
    
    # recall on test set
    # The recall is the ratio tp / (tp + fn) where tp is the number of
    # true positives and fn the number of false negatives.
    # The best value is 1 and the worst value is 0.
    score = sk.metrics.recall_score(Ytest,Ypred)
    print 'recall:', round(score*100,2), '%'
    
    # f1 score on test set
    # The F1 score can be interpreted as a weighted average of the 
    # precision and recall, where an F1 score reaches its best value at
    # 1 and worst score at 0.
    score = sk.metrics.f1_score(Ytest,Ypred)
    print 'f1:', score
    
    # Area Under the Curve from prediction scores
    # The best performance is 1
    score = sk.metrics.roc_auc_score(Ytest,Ypred)
    print 'AUC:', score  
    
    # Confusion matrix
    # By definition a confusion matrix C is such that C_{i, j} 
    # is equal to the number of observations known to be in group i 
    # but predicted to be in group j.
    print 'Cofusion matrix:'
    print sk.metrics.confusion_matrix(Ytest,Ypred)
    
    # Matthews correlation coefficient (phi coefficient)
    # Is used in machine learning 
    # as a measure of the quality of binary (two-class) classifications.
    # It takes into account true and false positives and negatives and 
    # is generally regarded as a balanced measure which can be used even
    # if the classes are of very different sizes. 
    #+1 perfect prediction; 0 random prediction; -1 inverse prediction
    score = sk.metrics.matthews_corrcoef(Ytest,Ypred)
    print 'MCC:', score

Now lets evaluate each model performance. First goes the neural network.

In [41]:
evaluation(Xtest,Ytest,Ypred)

accuracy: 98.07 %
precision: 80.21 %
recall: 65.0 %
f1: 0.711718064584
AUC: 0.831426470567
Cofusion matrix:


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


ValueError: multilabel-indicator is not supported

Now the decision tree.

In [42]:
evaluation(Xtest,Ytest,Ypredtree)

accuracy: 98.13 %
precision: 81.08 %
recall: 68.1 %
f1: 0.736199193636
AUC: 0.845406280716
Cofusion matrix:


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


ValueError: multilabel-indicator is not supported

Finally, the svm.

In [None]:
evaluation(Xtest,Ytest,Ypredsvm)