In [1]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import tree
from sklearn import svm

# Data preprocessing

In [2]:
#load dataset
dataframe = pd.DataFrame.from_csv('base_train.csv',sep=';',index_col=None)

In [3]:
# now check that all the variables are binary
dataset = dataframe.values
sum((dataset == 0)*1)+sum((dataset == 1)*1)

array([    0,     0, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602, 42602,
       42602, 42602, 42602, 42602, 42602, 42602, 42602, 31083, 31083])

In [4]:
# check the number of missing inputs and outputs
dataframe.isnull().sum()

report_dt        0
ID               0
1.1.             0
1.2.             0
1.3.             0
1.4.             0
1.5.             0
1.6.             0
1.7.             0
1.8.             0
1.9.             0
1.10.            0
1.11.            0
1.12.            0
1.13.            0
1.14.            0
1.15.            0
1.16.            0
1.17.            0
1.18.            0
1.19.            0
2.1.             0
2.2.             0
2.3.             0
2.4.             0
2.5.             0
2.6.             0
2.7.             0
2.8.             0
2.9.             0
2.10.            0
2.11.            0
2.12.            0
2.13.            0
2.14.            0
2.15.            0
2.16.            0
3.1.             0
3.2.             0
3.3.             0
3.4.             0
3.5.             0
3.6.             0
3.7.             0
3.8.             0
3.9.             0
3.10.            0
3.11.            0
3.12.            0
3.13.            0
3.14.            0
3.15.            0
X3          

There are no missing inputs, but about 25% of outputs are missing. We need to exclude those records from the dataset, as they aren't valid both for trainig and testing.

In [5]:
# delete data with missing values
dataframe = dataframe.dropna()

Next we test how the dataset is balanced, i.e. what is the proportion of 0 and 1 values of output variables. 

In [6]:
# number of 1 values in X3 variable
sum((dataframe['X3'] ==1)*1)

966

In [7]:
# number of 1 values in Y3 variable
sum((dataframe['Y3'] ==1)*1)

669

We see that value 1 is obtained by less then 1/30 of the dataset. Which means that we'll need to apply different weights to 0 and 1 during trainig to balance classes. 

As there's no a priori information about the data we delete columns report_dt and ID. Also as we don't know the nature of the output variables (if ther's any connection between them) we use separate models to predict X3 and Y3.

In [8]:
del dataframe['report_dt']
del dataframe['ID']
del dataframe['Y3']

Next we select features that have the highest scores using chi2 function for scoring. However we should note that this is not required if we use a neural network (usually it's assumed that this step is performed by the network itself).

In [13]:
# calculate scores of featuress
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
values = SelectKBest(chi2, k='all').fit(dataframe.drop('X3',axis=1).values,dataframe['X3'].values)
scores = -numpy.log10(values.pvalues_)

# plot results
#%matplotlib inline
#plt.bar(range(len(dataframe.columns)-1), scores)
#plt.xticks(range(len(dataframe.columns)-1), dataframe.columns, rotation='vertical')
#plt.show()

Now we leave 20 features with highest scores and drop others.

In [24]:
dataset_reduced = SelectKBest(chi2, k=20).fit_transform(dataframe.drop('X3',axis=1).values,dataframe['X3'].values)

In [25]:
dataset_reduced = numpy.append(dataset_reduced,numpy.reshape(dataframe['X3'].values,(len(dataframe['X3'].values),1)),axis=1)

In the last step we split data into training and test sets in proportion 3/1.

In [33]:
# split into trainig (75%) and test (25%) sets
train, test = sk.cross_validation.train_test_split(dataset_reduced, test_size = 0.25)
# split into input (X) and (Y) variables
Xtrain = train[:,0:dataset_reduced.shape[1]-1]
Ytrain = train[:,dataset_reduced.shape[1]-1]
Xtest = test[:,0:dataset_reduced.shape[1]-1]
Ytest = test[:,dataset_reduced.shape[1]-1]

# Decision tree

Here we build the decision tree model using sklearn framework.

In [38]:
# create and fit model
modeltree = tree.DecisionTreeClassifier()
modeltree.fit(Xtrain, Ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [39]:
# make prediction on test set
Ypredtree = modeltree.predict(Xtest)

# SVM

The next model is the SVM classifier.

In [41]:
# create and fit model
modelsvm = svm.SVC()
modelsvm.fit(Xtrain, Ytrain)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [42]:
# make prediction on test set
Ypredsvm = modelsvm.predict(Xtest)

# Evaluate models

Here we define a function that evaluates performance of the model through different metrics. Short description for each metric is given in the code below.

In [43]:
def evaluation(Xtest,Ytest,Ypred):
    
    # accuracy on test set
    score = sk.metrics.accuracy_score(Ytest,Ypred)
    print 'accuracy:', round(score*100,2), '%'
    
    # precision on test set
    # The precision is the ratio tp / (tp + fp) where tp is the
    # number of true positives and fp the number of false positives.
    # The best value is 1 and the worst value is 0.
    score = sk.metrics.precision_score(Ytest,Ypred)
    print 'precision:', round(score*100,2), '%' 
    
    # recall on test set
    # The recall is the ratio tp / (tp + fn) where tp is the number of
    # true positives and fn the number of false negatives.
    # The best value is 1 and the worst value is 0.
    score = sk.metrics.recall_score(Ytest,Ypred)
    print 'recall:', round(score*100,2), '%'
    
    # f1 score on test set
    # The F1 score can be interpreted as a weighted average of the 
    # precision and recall, where an F1 score reaches its best value at
    # 1 and worst score at 0.
    score = sk.metrics.f1_score(Ytest,Ypred)
    print 'f1:', score
    
    # Area Under the Curve from prediction scores
    # The best performance is 1
    score = sk.metrics.roc_auc_score(Ytest,Ypred)
    print 'AUC:', score  
    
    # Confusion matrix
    # By definition a confusion matrix C is such that C_{i, j} 
    # is equal to the number of observations known to be in group i 
    # but predicted to be in group j.
    print 'Cofusion matrix:'
    print sk.metrics.confusion_matrix(Ytest,Ypred)
    
    # Matthews correlation coefficient (phi coefficient)
    # Is used in machine learning 
    # as a measure of the quality of binary (two-class) classifications.
    # It takes into account true and false positives and negatives and 
    # is generally regarded as a balanced measure which can be used even
    # if the classes are of very different sizes. 
    #+1 perfect prediction; 0 random prediction; -1 inverse prediction
    score = sk.metrics.matthews_corrcoef(Ytest,Ypred)
    print 'MCC:', score

Now the decision tree.

In [44]:
evaluation(Xtest,Ytest,Ypredtree)

accuracy: 98.24 %
precision: 76.27 %
recall: 58.7 %
f1: 0.663390663391
AUC: 0.790693484315
Cofusion matrix:
[[7499   42]
 [  95  135]]
MCC: 0.660422334399


Finally, the svm.

In [45]:
evaluation(Xtest,Ytest,Ypredsvm)

accuracy: 97.85 %
precision: 66.32 %
recall: 55.65 %
f1: 0.605200945626
AUC: 0.773951096326
Cofusion matrix:
[[7476   65]
 [ 102  128]]
MCC: 0.596658005283
