In [16]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt

#from keras.models import Sequential
#from keras.layers import Dense, Dropout

import sklearn as sk
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier 
from sklearn import svm

# Data preprocessing

In [2]:
#load dataset
dataframe = pd.DataFrame.from_csv('base_train.csv',sep=';',index_col=None)

In [3]:
# convert dates to datetime format
dataframe['report_dt'] = pd.to_datetime(dataframe['report_dt'])

In [4]:
month = numpy.zeros(len(dataframe))
for i in range(len(dataframe)):
    month[i] = dataframe['report_dt'][i].month

In [5]:
dataframe['report_dt'] = month

In [6]:
dataframe

Unnamed: 0,report_dt,ID,1.1.,1.2.,1.3.,1.4.,1.5.,1.6.,1.7.,1.8.,...,3.8.,3.9.,3.10.,3.11.,3.12.,3.13.,3.14.,3.15.,X3,Y3
0,4,1-15JF8Z,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,
1,7,1-6IW9T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,1-2712QP,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,,
3,9,1-121YJOD,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,
4,1,1-12667,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,1-88MWJ,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,1-7TKF7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,9,1-165460,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,
8,7,1-563FI7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9,1-6B0W56,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
dataframe.groupby(['report_dt','X3']).size()

Check that all variables are binary.

In [7]:
# check that sum of 1 and 0 values equals to the dimension of the data
dataset = dataframe.values
sum((dataset == 0)*1)+sum((dataset == 1)*1)

array([ 3703,     0, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 31083, 31083])

Find missing data.

In [8]:
# check the number of missing inputs and outputs
dataframe.isnull().sum()

report_dt        0
ID               0
1.1.             0
1.2.             0
1.3.             0
1.4.             0
1.5.             0
1.6.             0
1.7.             0
1.8.             0
1.9.             0
1.10.            0
1.11.            0
1.12.            0
1.13.            0
1.14.            0
1.15.            0
1.16.            0
1.17.            0
1.18.            0
1.19.            0
2.1.             0
2.2.             0
2.3.             0
2.4.             0
2.5.             0
2.6.             0
2.7.             0
2.8.             0
2.9.             0
2.10.            0
2.11.            0
2.12.            0
2.13.            0
2.14.            0
2.15.            0
2.16.            0
3.1.             0
3.2.             0
3.3.             0
3.4.             0
3.5.             0
3.6.             0
3.7.             0
3.8.             0
3.9.             0
3.10.            0
3.11.            0
3.12.            0
3.13.            0
3.14.            0
3.15.            0
X3          

There are no missing inputs, but about 25% of outputs are missing. We need to exclude those records from the dataset, as they aren't valid both for trainig and testing.

In [9]:
# delete data with missing values
dataframe = dataframe.dropna()

Next we test how the dataset is balanced, i.e. what is the proportion of 0 and 1 values of output variables. 

In [10]:
# number of 1 values in X3 variable
sum((dataframe['X3'] ==1)*1)

966

In [11]:
# number of 1 values in Y3 variable
sum((dataframe['Y3'] ==1)*1)

669

We see that value 1 is obtained by less then 1/30 of the dataset. Which means that we'll need to apply different weights to 0 and 1 during trainig to balance classes.

As there's no a priori information about dependencies in the date we delete columns report_dt and ID. Also as we don't know the nature of the output variables (if ther's any connection between them) we use separate models to predict X3 and Y3. Next we predict X3 variable (to predict Y3 change "del dataframe['Y3']" to "del dataframe['X3']" in the code box below).

In [12]:
#del dataframe['report_dt']
del dataframe['ID']
del dataframe['Y3']

Next we select features that have the highest scores using chi2 function for scoring. However we should note that this is not required if we use a neural network (usually it's assumed that this step is performed by the network itself).

In [17]:
# calculate scores of featuress
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
values = SelectKBest(chi2, k='all').fit(dataframe.drop('X3',axis=1).values,dataframe['X3'].values)
scores = -numpy.log10(values.pvalues_)

# uncomment code below to plot results
#%matplotlib inline
#plt.bar(range(len(dataframe.columns)-1), scores)
#plt.xticks(range(len(dataframe.columns)-1), dataframe.columns, rotation='vertical')
#plt.show()

Now we leave 30 features with highest scores and drop others.

In [18]:
# select K best features and transform dataset
dataset_reduced = SelectKBest(chi2, k=30).fit_transform(dataframe.drop('X3',axis=1).values,dataframe['X3'].values)
dataset_reduced = numpy.append(dataset_reduced,numpy.reshape(dataframe['X3'].values,(len(dataframe['X3'].values),1)),axis=1)

In the last step we split data into training and test sets in proportion 3/1.

In [19]:
# split into trainig (75%) and test (25%) sets
train, test = sk.cross_validation.train_test_split(dataset_reduced, test_size = 0.25)
# split into input (X) and (Y) variables
Xtrain = train[:,0:train.shape[1]-1]
Ytrain = train[:,train.shape[1]-1]
Xtest = test[:,0:test.shape[1]-1]
Ytest = test[:,test.shape[1]-1]

# Neural network

We build a fully-connected network with three layers. The network weights are initialized to a small random number drawn from a uniform distribution. As an activation function on the first two layers we use the rectifier function 'relu'. In the last layer we use sigmoid to insure the output is between 0 and 1. First layer has 35 neurons, 2nd layer 24 neurons, the output layer has 1 neuron to predict the output. Between hiden layers we also perform dropout operation for better distribution of weights between neurons.

In [20]:
# create model
model = Sequential()
model.add(Dense(35, input_dim=train.shape[1]-1, init='uniform', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(24, init='uniform', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, init='uniform', activation='sigmoid'))

NameError: name 'Sequential' is not defined

To find the best set of weights we must specify: the loss function, optimizer and metrics that we want to calculate during training. For binary classification problem we pick "binary_crossentropy" loss. The optimization method is gradient descent "adadelta". We calculate classification accuracy as a metric.

In [13]:
# set model
model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])

Here we train the model. We pick 300 iterations for the optimization algorithm and a batch size of 32. Each epoch separates 20% of data for validation.

In [14]:
# train the model
model.fit(Xtrain,Ytrain, validation_split=0.2, nb_epoch=300, batch_size=32, class_weight="balanced", verbose=0)

<keras.callbacks.History at 0x1165ee450>

Finally, after the model is ready, we generate predicitons on the test set.

In [15]:
# make prediction on test set
Ypred= model.predict_classes(Xtest)



# Decision tree

Here we build the decision tree model using sklearn framework.

In [21]:
# create and fit model
modeltree = tree.DecisionTreeClassifier()
modeltree.fit(Xtrain, Ytrain)

# make prediction on test set
Ypredtree = modeltree.predict(Xtest)

# Random forest 

Random forest is a batch of trees, we'll check it's perpormance too.

In [22]:
# Create the random forest
modelforest = RandomForestClassifier(n_estimators = 100)
modelforest.fit(Xtrain,Ytrain)

# make prediction on test set
Ypredforest = modelforest.predict(Xtest)

# SVM

The next model is the SVM classifier.

In [23]:
# create and fit model
modelsvm = svm.SVC(class_weight='auto')
modelsvm.fit(Xtrain, Ytrain)

# make prediction on test set
Ypredsvm = modelsvm.predict(Xtest)

# Evaluate the models

Here we define a function that evaluates performance of the model through different metrics. Short description for each metric is given in the code below.

In [24]:
def evaluation(Xtest,Ytest,Ypred):
    
    # accuracy on test set
    score = sk.metrics.accuracy_score(Ytest,Ypred)
    print 'accuracy:', round(score*100,2), '%'
    
    # precision on test set
    # The precision is the ratio tp / (tp + fp) where tp is the
    # number of true positives and fp the number of false positives.
    # The best value is 1 and the worst value is 0.
    score = sk.metrics.precision_score(Ytest,Ypred)
    print 'precision:', round(score*100,2), '%' 
    
    # recall on test set
    # The recall is the ratio tp / (tp + fn) where tp is the number of
    # true positives and fn the number of false negatives.
    # The best value is 1 and the worst value is 0.
    score = sk.metrics.recall_score(Ytest,Ypred)
    print 'recall:', round(score*100,2), '%'
    
    # f1 score on test set
    # The F1 score can be interpreted as a weighted average of the 
    # precision and recall, where an F1 score reaches its best value at
    # 1 and worst score at 0.
    score = sk.metrics.f1_score(Ytest,Ypred)
    print 'f1:', score
    
    # Area Under the Curve from prediction scores
    # The best performance is 1
    score = sk.metrics.roc_auc_score(Ytest,Ypred)
    print 'AUC:', score  
    
    # Confusion matrix
    # By definition a confusion matrix C is such that C_{i, j} 
    # is equal to the number of observations known to be in group i 
    # but predicted to be in group j.
    print 'Confusion matrix:'
    print sk.metrics.confusion_matrix(Ytest,Ypred)
    
    # Matthews correlation coefficient (phi coefficient)
    # Is used in machine learning 
    # as a measure of the quality of binary (two-class) classifications.
    # It takes into account true and false positives and negatives and 
    # is generally regarded as a balanced measure which can be used even
    # if the classes are of very different sizes. 
    #+1 perfect prediction; 0 random prediction; -1 inverse prediction
    score = sk.metrics.matthews_corrcoef(Ytest,Ypred)
    print 'MCC:', score

Now lets evaluate each model performance. First goes the neural network:

In [20]:
evaluation(Xtest,Ytest,Ypred)

accuracy: 98.21 %
precision: 80.77 %
recall: 58.57 %
f1: 0.678983833718
AUC: 0.790501557599
Confusion matrix:
[[7485   35]
 [ 104  147]]
MCC: 0.679197801722


Now the decision tree:

In [25]:
evaluation(Xtest,Ytest,Ypredtree)

accuracy: 98.17 %
precision: 76.38 %
recall: 61.54 %
f1: 0.681614349776
AUC: 0.804568969043
Confusion matrix:
[[7477   47]
 [  95  152]]
MCC: 0.676482674224


Random forest:

In [26]:
evaluation(Xtest,Ytest,Ypredforest)

accuracy: 98.22 %
precision: 77.95 %
recall: 61.54 %
f1: 0.68778280543
AUC: 0.804834785098
Confusion matrix:
[[7481   43]
 [  95  152]]
MCC: 0.683801580812


Finally, the SVM:

In [27]:
evaluation(Xtest,Ytest,Ypredsvm)

accuracy: 95.08 %
precision: 37.61 %
recall: 83.0 %
f1: 0.517676767677
AUC: 0.892385392385
Confusion matrix:
[[7184  340]
 [  42  205]]
MCC: 0.539098926708


You should not be confused with a value of accuracy which is so high just due to the unbalanced dataset. Precision, recall and f1 score (as well as other metrics) show similar performance for all the models (except for SVM which is different due to assignment of non-equal weights to classes).

Remark: though variables X3 and Y3 are correlated (with coefficient approx 0.82) we decided not to combine them into a single model as usually some additional knowledge of variables domain is needed to do so. Without explicit knowledge of the dependencies it is prefered to treat outputs separately.