In [3]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import KFold

In [4]:
df = pd.read_csv('all_data_combined.csv')
df.head()
df.shape

Unnamed: 0,subject_id,age,urea_n_min,urea_n_max,urea_n_mean,resprate_min,resprate_max,resprate_mean,glucose_min,glucose_max,...,oasis,lods,gender_F,gender_M,marital_status_DIVORCED,marital_status_MARRIED,marital_status_SEPARATED,marital_status_SINGLE,marital_status_WIDOWED,death
0,15057,58.831224,16.0,37.0,26.111111,10.0,52,20.104478,67.0,405.0,...,36.5,5.5,0,1,0,1,0,0,0,1
1,79262,81.618606,10.0,15.0,13.5,11.0,31,18.704225,119.0,348.0,...,48.0,4.0,0,1,0,1,0,0,0,1
2,77191,79.102744,16.0,56.0,35.5,20.0,31,25.744681,90.0,188.0,...,35.0,6.0,1,0,1,0,0,0,0,0
3,84966,88.232043,8.0,27.0,17.692308,10.0,51,17.770833,92.0,271.0,...,40.5,4.0,1,0,0,0,0,1,0,1
4,94997,90.0,38.0,48.0,42.25,4.0,40,26.36,62.0,135.0,...,35.0,5.0,1,0,0,0,0,1,0,0


In [6]:
def decisionTreeMethod(df,method):
    
    kf = KFold(n_splits=3, random_state=0)
    
    result = []
    
    for train, test in kf.split(df):

        train_data = df.iloc[train,:]
        test_data =  df.iloc[test,:]

        trainx = train_data.iloc[:,1:(train_data.shape[1]-2)]
        trainy = train_data.iloc[:,train_data.shape[1]-1]

        testx = test_data.iloc[:,1:(test_data.shape[1]-2)]
        testy = test_data.iloc[:,test_data.shape[1]-1]

        model = tree.DecisionTreeClassifier(criterion = method)
        model.fit(trainx, trainy)
        result.append(model.score(testx, testy))
               
    return np.average(result)



def decisionTreeParams(df,method,params,params_value):
    
    kf = KFold(n_splits=3, random_state=0)
    
    result = []
    weight = []
    
    for train, test in kf.split(df):

        train_data = df.iloc[train,:]
        test_data =  df.iloc[test,:]

        trainx = train_data.iloc[:,1:(train_data.shape[1]-2)]
        trainy = train_data.iloc[:,train_data.shape[1]-1]

        testx = test_data.iloc[:,1:(test_data.shape[1]-2)]
        testy = test_data.iloc[:,test_data.shape[1]-1]
        
        if params=='depth':
            model = tree.DecisionTreeClassifier(criterion = method,max_depth=params_value)
            
        elif params=='sample':
#             model = tree.DecisionTreeClassifier(criterion = method,min_samples_split=params_value)
            model = tree.DecisionTreeClassifier(criterion = method,min_samples_leaf=params_value)
            
        model.fit(trainx, trainy)
        result.append(model.score(testx, testy))
     
    return np.average(result)

In [8]:
df_para = pd.read_csv('tree_params.csv')
print(df_para.head())

    method  depth  samples
0  entropy      1        2
1  entropy      2        8
2  entropy      4       32
3  entropy      8       64
4  entropy     16      128


In [9]:
scores =[]
for indx,row in df_para.iterrows():
    scores.append(decisionTreeMethod(df,method = row['method']))
df_para['accuraccy']=scores

df_para

Unnamed: 0,method,depth,samples,accuraccy
0,entropy,1,2,0.704933
1,entropy,2,8,0.705402
2,entropy,4,32,0.703999
3,entropy,8,64,0.706335
4,entropy,16,128,0.711441
5,entropy,32,256,0.710512
6,entropy,64,500,0.701681


In [10]:
scores =[]
for indx,row in df_para.iterrows():
    scores.append(decisionTreeParams(df,method = row['method'],params='depth',params_value=row['depth']))
df_para['accuraccy']=scores

df_para

Unnamed: 0,method,depth,samples,accuraccy
0,entropy,1,2,0.707253
1,entropy,2,8,0.70958
2,entropy,4,32,0.732351
3,entropy,8,64,0.699359
4,entropy,16,128,0.71005
5,entropy,32,256,0.701681
6,entropy,64,500,0.704007


In [11]:
scores =[]
for indx,row in df_para.iterrows():
    scores.append(decisionTreeParams(df,method = row['method'],params='sample',params_value=row['samples']))
df_para['accuraccy']=scores

df_para

Unnamed: 0,method,depth,samples,accuraccy
0,entropy,1,2,0.712367
1,entropy,2,8,0.711448
2,entropy,4,32,0.71933
3,entropy,8,64,0.723046
4,entropy,16,128,0.729557
5,entropy,32,256,0.690984
6,entropy,64,500,0.707253


In [12]:
def decisionTreeBest(df):
    
    kf = KFold(n_splits=3, random_state=0)
    
    result = []
    weight = []
    
    for train, test in kf.split(df):

        train_data = df.iloc[train,:]
        test_data =  df.iloc[test,:]

        trainx = train_data.iloc[:,1:(train_data.shape[1]-2)]
        trainy = train_data.iloc[:,train_data.shape[1]-1]

        testx = test_data.iloc[:,1:(test_data.shape[1]-2)]
        testy = test_data.iloc[:,test_data.shape[1]-1]

        model = tree.DecisionTreeClassifier(criterion = 'entropy',max_depth=4,min_samples_leaf=50)
        model.fit(trainx, trainy)
        result.append(model.score(testx, testy))
               
    return np.average(result)

In [13]:
scores =[]
for indx,row in df_para.iterrows():
    scores.append(decisionTreeBest(df))
df_para['accuraccy']=scores

df_para

Unnamed: 0,method,depth,samples,accuraccy
0,entropy,1,2,0.720733
1,entropy,2,8,0.720733
2,entropy,4,32,0.720733
3,entropy,8,64,0.720733
4,entropy,16,128,0.720733
5,entropy,32,256,0.720733
6,entropy,64,500,0.720733
