Import Libraries
-------------------
In this project we need to import a few libraries, such as **pandas** and **scikit-learn**.

In [1]:
import pandas as pd
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.ensemble import ExtraTreesClassifier

Import Data
-------------------
We can easily handle the data import task with the **pandas** library.

In [2]:
# Read the training set and preview its first rows.
df = pd.read_csv('train.csv')
print(df.head())

# Labels come from the 'target' column; the feature matrix is every
# remaining column except 'id'.
y = df['target'].values
X = df.drop(['id', 'target'], axis=1).values

# Report the dimensions of the feature matrix, then of the label vector.
for array in (X, y):
    print(array.shape)

   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \
0   1       1       0       0       0       0       0       0       0       0   
1   2       0       0       0       0       0       0       0       1       0   
2   3       0       0       0       0       0       0       0       1       0   
3   4       1       0       0       1       6       1       5       0       0   
4   5       0       0       0       0       0       0       0       0       0   

    ...     feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  feat_91  \
0   ...           1        0        0        0        0        0        0   
1   ...           0        0        0        0        0        0        0   
2   ...           0        0        0        0        0        0        0   
3   ...           0        1        2        0        0        0        0   
4   ...           1        0        0        0        0        1        0   

   feat_92  feat_93   target  
0        0        0

Action!
---
#### First Try:
Predictions on the training data (the same data the model was fitted on).

In [3]:
# Fit a decision tree with default parameters and score it on the very same
# samples (X, y) it was fitted on, so this is a training loss.
clf = tree.DecisionTreeClassifier().fit(X, y)
tree_preds = clf.predict_proba(X)
train_loss = log_loss(y, tree_preds)
print("Loss Value of DecisionTreeClassifier: %f" % train_loss)

Loss Value of DecisionTreeClassifier: 0.000000


#### Second Try:
How noise changes the loss value (again evaluated on the training data).

In [4]:
# Repeat the training-loss experiment on the data from 'train_noise.csv'.
df_noise = pd.read_csv('train_noise.csv')

y_noise = df_noise['target'].values
X_noise = df_noise.drop(['id', 'target'], axis=1).values

# Fit and evaluate on the same samples, as in the previous experiment.
clf = tree.DecisionTreeClassifier().fit(X_noise, y_noise)
tree_preds_noise = clf.predict_proba(X_noise)
noise_loss = log_loss(y_noise, tree_preds_noise)
print("Loss Value of DecisionTreeClassifier: %f" % noise_loss)

# Inspect the last sample: its predicted class probabilities and its label.
print(tree_preds_noise[-1])
print(y_noise[-1])

Loss Value of DecisionTreeClassifier: 0.000022
[ 0.5  0.   0.   0.   0.   0.   0.   0.   0.5]
Class_1


#### Third try:
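The serious one! Evaluating a decision tree classifier with default parameters using 5-fold cross-validation. The scores are negated log loss (`neg_log_loss`), so values closer to zero are better.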

In [5]:
# 5-fold cross-validation of a decision tree with default parameters.
# The 'neg_log_loss' scorer yields negated log loss, so values closer to
# zero are better.
clf = tree.DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='neg_log_loss', cv=5)
mean_score = scores.mean()
spread = scores.std() * 2  # two standard deviations across the folds
print("Loss Value of DecisionTreeClassifier: %0.2f (+/- %0.2f)" % (mean_score, spread))

Loss Value of DecisionTreeClassifier: -9.89 (+/- 0.30)


#### ...new try:
As we have already seen above, overfitting occurs.
   >Tuning the maximum number of features considered at each split (default is **None**, i.e. all n_features)

In [6]:
# Sweep max_features over 50, 55, ..., 85 and run 5-fold cross-validation
# for each setting. The 'neg_log_loss' scorer returns negated log loss, so
# values closer to zero are better.
for max_features in range(50, 90, 5):
    clf = tree.DecisionTreeClassifier(max_features=max_features)
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
    # Print the tuned value so every output row can be matched to its setting.
    print("max_features=%d -> Loss Value of DecisionTreeClassifier: %0.2f (+/- %0.2f)"
          % (max_features, scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -9.98 (+/- 0.31)
Loss Value of DecisionTreeClassifier: -10.08 (+/- 0.20)
Loss Value of DecisionTreeClassifier: -9.97 (+/- 0.27)
Loss Value of DecisionTreeClassifier: -9.94 (+/- 0.25)
Loss Value of DecisionTreeClassifier: -9.93 (+/- 0.42)
Loss Value of DecisionTreeClassifier: -9.92 (+/- 0.14)
Loss Value of DecisionTreeClassifier: -9.91 (+/- 0.20)
Loss Value of DecisionTreeClassifier: -10.04 (+/- 0.14)


   >Tuning the maximum depth of the tree (default is **None**: nodes are expanded until all leaves are pure or hold fewer than `min_samples_split` samples)

In [7]:
# Sweep max_depth over 5..14 and run 5-fold cross-validation for each
# setting. The 'neg_log_loss' scorer returns negated log loss, so values
# closer to zero are better.
for max_depth in range(5, 15):
    clf = tree.DecisionTreeClassifier(max_depth=max_depth)
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
    # Print the tuned value so every output row can be matched to its setting.
    print("max_depth=%d -> Loss Value of DecisionTreeClassifier: %0.2f (+/- %0.2f)"
          % (max_depth, scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -1.35 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -1.27 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -1.24 (+/- 0.04)
Loss Value of DecisionTreeClassifier: -1.20 (+/- 0.04)
Loss Value of DecisionTreeClassifier: -1.18 (+/- 0.06)
Loss Value of DecisionTreeClassifier: -1.19 (+/- 0.07)
Loss Value of DecisionTreeClassifier: -1.23 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -1.29 (+/- 0.06)
Loss Value of DecisionTreeClassifier: -1.40 (+/- 0.06)
Loss Value of DecisionTreeClassifier: -1.58 (+/- 0.07)


   >Tuning the minimum number of samples required to split an internal node (default is **2**, so we always split!)

In [8]:
# Sweep min_samples_split over 800, 810, ..., 890 and run 5-fold
# cross-validation for each setting. The 'neg_log_loss' scorer returns
# negated log loss, so values closer to zero are better.
for min_samples_split in range(800, 900, 10):
    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples_split)
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
    # Print the tuned value so every output row can be matched to its setting.
    print("min_samples_split=%d -> Loss Value of DecisionTreeClassifier: %0.2f (+/- %0.2f)"
          % (min_samples_split, scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -1.05 (+/- 0.06)
Loss Value of DecisionTreeClassifier: -1.05 (+/- 0.06)
Loss Value of DecisionTreeClassifier: -1.05 (+/- 0.05)
Loss Value of DecisionTreeClassifier: -1.05 (+/- 0.06)
Loss Value of DecisionTreeClassifier: -1.04 (+/- 0.06)
Loss Value of DecisionTreeClassifier: -1.04 (+/- 0.05)
Loss Value of DecisionTreeClassifier: -1.04 (+/- 0.05)
Loss Value of DecisionTreeClassifier: -1.04 (+/- 0.05)
Loss Value of DecisionTreeClassifier: -1.04 (+/- 0.04)
Loss Value of DecisionTreeClassifier: -1.04 (+/- 0.05)


   >Tuning the minimum number of samples required at a leaf node (default is **1**, so any single sample can be a leaf node!)

In [9]:
# Sweep min_samples_leaf over 180, 190, ..., 270 and run 5-fold
# cross-validation for each setting. The 'neg_log_loss' scorer returns
# negated log loss, so values closer to zero are better.
for min_samples_leaf in range(180, 280, 10):
    clf = tree.DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
    # Print the tuned value so every output row can be matched to its setting.
    print("min_samples_leaf=%d -> Loss Value of DecisionTreeClassifier: %0.2f (+/- %0.2f)"
          % (min_samples_leaf, scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -1.01 (+/- 0.05)
Loss Value of DecisionTreeClassifier: -1.00 (+/- 0.02)
Loss Value of DecisionTreeClassifier: -1.00 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -1.00 (+/- 0.02)
Loss Value of DecisionTreeClassifier: -1.00 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -1.01 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -1.00 (+/- 0.04)
Loss Value of DecisionTreeClassifier: -1.01 (+/- 0.04)
Loss Value of DecisionTreeClassifier: -1.00 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -1.00 (+/- 0.03)


#### ...time for Extra Trees + calibration:
#### ...maybe a calibrated classifier helps:

In [9]:
# Baseline: an Extra Trees ensemble of 10 trees, 5-fold cross-validated.
# The 'neg_log_loss' scorer returns negated log loss, so values closer to
# zero are better.
clf = ExtraTreesClassifier(n_estimators=10)
scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
# The message names the estimator that is actually being evaluated here.
print("Loss Value of ExtraTreesClassifier: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -1.54 (+/- 0.12)


In [19]:
# Sweep min_samples_split over 25..29 for a 10-tree Extra Trees ensemble,
# running 5-fold cross-validation for each setting. The 'neg_log_loss'
# scorer returns negated log loss, so values closer to zero are better.
for min_samples_split in range(25, 30):
    clf = ExtraTreesClassifier(n_estimators=10, min_samples_split=min_samples_split)
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
    # Name the real estimator and the tuned value so rows are attributable.
    print("min_samples_split=%d -> Loss Value of ExtraTreesClassifier: %0.2f (+/- %0.2f)"
          % (min_samples_split, scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -0.71 (+/- 0.04)
Loss Value of DecisionTreeClassifier: -0.71 (+/- 0.01)
Loss Value of DecisionTreeClassifier: -0.71 (+/- 0.02)
Loss Value of DecisionTreeClassifier: -0.71 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -0.71 (+/- 0.02)


In [22]:
# Sweep min_samples_leaf over 1..4 for a 10-tree Extra Trees ensemble,
# running 5-fold cross-validation for each setting. The 'neg_log_loss'
# scorer returns negated log loss, so values closer to zero are better.
for min_samples_leaf in range(1, 5):
    clf = ExtraTreesClassifier(n_estimators=10, min_samples_leaf=min_samples_leaf)
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
    # Name the real estimator and the tuned value so rows are attributable.
    print("min_samples_leaf=%d -> Loss Value of ExtraTreesClassifier: %0.2f (+/- %0.2f)"
          % (min_samples_leaf, scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -1.53 (+/- 0.07)
Loss Value of DecisionTreeClassifier: -0.82 (+/- 0.07)
Loss Value of DecisionTreeClassifier: -0.73 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -0.73 (+/- 0.01)


In [28]:
# Sweep max_depth over 30..34 for a 10-tree Extra Trees ensemble, running
# 5-fold cross-validation for each setting. The 'neg_log_loss' scorer
# returns negated log loss, so values closer to zero are better.
for max_depth in range(30, 35):
    clf = ExtraTreesClassifier(n_estimators=10, max_depth=max_depth)
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
    # Name the real estimator and the tuned value so rows are attributable.
    print("max_depth=%d -> Loss Value of ExtraTreesClassifier: %0.2f (+/- %0.2f)"
          % (max_depth, scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -0.83 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -0.83 (+/- 0.01)
Loss Value of DecisionTreeClassifier: -0.81 (+/- 0.02)
Loss Value of DecisionTreeClassifier: -0.82 (+/- 0.03)
Loss Value of DecisionTreeClassifier: -0.81 (+/- 0.02)


In [13]:
# Wrap a 10-tree Extra Trees ensemble in isotonic probability calibration
# and evaluate the calibrated model with 5-fold cross-validation. The
# 'neg_log_loss' scorer returns negated log loss: closer to zero is better.
clf = ExtraTreesClassifier(n_estimators=10)
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic')
scores = cross_val_score(calibrated_clf, X, y, cv=5, scoring='neg_log_loss')
# The message names the model that is actually being evaluated here.
print("Loss Value of calibrated ExtraTreesClassifier: %0.2f (+/- %0.2f)"
      % (scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -0.59 (+/- 0.02)


In [40]:
# Sweep min_samples_split over 15..19 for an isotonic-calibrated 10-tree
# Extra Trees ensemble, running 5-fold cross-validation for each setting.
# The 'neg_log_loss' scorer returns negated log loss: closer to zero is better.
for min_samples_split in range(15, 20):
    clf = ExtraTreesClassifier(n_estimators=10, min_samples_split=min_samples_split)
    calibrated_clf = CalibratedClassifierCV(clf, method='isotonic')
    scores = cross_val_score(calibrated_clf, X, y, cv=5, scoring='neg_log_loss')
    # Name the real model and the tuned value so rows are attributable.
    print("min_samples_split=%d -> Loss Value of calibrated ExtraTreesClassifier: %0.2f (+/- %0.2f)"
          % (min_samples_split, scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -0.56 (+/- 0.01)
Loss Value of DecisionTreeClassifier: -0.56 (+/- 0.01)
Loss Value of DecisionTreeClassifier: -0.57 (+/- 0.02)
Loss Value of DecisionTreeClassifier: -0.57 (+/- 0.01)
Loss Value of DecisionTreeClassifier: -0.56 (+/- 0.02)


In [20]:
# Same calibrated setup with a larger ensemble (100 trees), evaluated with
# 5-fold cross-validation. The 'neg_log_loss' scorer returns negated log
# loss, so values closer to zero are better.
clf = ExtraTreesClassifier(n_estimators=100)
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic')
scores = cross_val_score(calibrated_clf, X, y, cv=5, scoring='neg_log_loss')
# The message names the model that is actually being evaluated here.
print("Loss Value of calibrated ExtraTreesClassifier: %0.2f (+/- %0.2f)"
      % (scores.mean(), scores.std() * 2))

Loss Value of DecisionTreeClassifier: -0.48 (+/- 0.02)


In [None]:
# Train the final calibrated model on the full training set and write the
# class probabilities for the test set to 'cikti7.csv'.
# Relies on pd, ExtraTreesClassifier and CalibratedClassifierCV from the
# import cell, and on X / y from the data-loading cell.

# Load the test set with pandas, like the training set. np.genfromtxt with
# dtype=None types every column as text because of the header row, leaving
# string features to be passed to the model.
test_df = pd.read_csv('test.csv')
# The first column is taken to be the row id and the rest the features,
# mirroring the positional slicing used for the training data.
test_ids = test_df.iloc[:, 0].values
test_data_x = test_df.iloc[:, 1:].values

clf = ExtraTreesClassifier(n_estimators=1000)
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic')
calibrated_clf.fit(X, y)
cikti_x = calibrated_clf.predict_proba(test_data_x)

# One probability column per class, named from the fitted model so column
# order always matches predict_proba's output. Probabilities are written at
# full precision: rounding to two decimals turns small probabilities into
# 0, which a log-loss metric penalises heavily.
submission = pd.DataFrame(cikti_x, columns=calibrated_clf.classes_)
# Use the ids read from the test file rather than a synthetic 1..n counter.
submission.insert(0, 'id', test_ids)
submission.to_csv('cikti7.csv', index=False)

# Sanity check: number of prediction rows written.
print(len(submission))