In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_digits, load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from DecisionTree import DecisionTree

### Classification

We will use the digits dataset from sklearn.

In [3]:
# Load scikit-learn's digits dataset; features are in digits.data, labels in digits.target.
digits = load_digits()

Split the data with `train_test_split`, using a test size of 0.2.

In [4]:
# Hold out 20% of the samples for testing; random_state=17 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.2, random_state=17)

Create a tree of depth 4 and fit it. It's not tuned, so the results are not that great.<br>
The default criterion is gini.

In [5]:
# Baseline: depth-4 tree with the default criterion, evaluated on the held-out split.
tr = DecisionTree(max_depth=4)
tr.fit(X_train, y_train)
preds = tr.predict(X_test)
# Kept as the last expression so the notebook displays the test accuracy.
accuracy_score(y_test, preds)

0.5666666666666667

Let's try the entropy criterion.

In [6]:
# Same depth-4 setup, but splitting on the entropy criterion.
tre = DecisionTree(max_depth=4, criterion='entropy')
tre.fit(X_train, y_train)
preds1 = tre.predict(X_test)
# Kept as the last expression so the notebook displays the test accuracy.
accuracy_score(y_test, preds1)

0.6361111111111111

Let's do a grid search over max_depth from 3 to 10 (`range(3, 11)` excludes 11), first with the gini criterion and then with entropy.

In [7]:
# Candidate depths 3..10 (the upper bound of range is exclusive).
tree_params = {'max_depth': range(3, 11)}

In [8]:
# Tree with default settings (gini criterion); the grid search supplies
# max_depth for each candidate.
tr = DecisionTree()
tr_grid = GridSearchCV(
    estimator=tr,
    param_grid=tree_params,
    scoring='accuracy',
    cv=5,
    verbose=True,
)
# 5-fold cross-validation over every depth in tree_params.
tr_grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  8.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='gini', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(3, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [9]:
# Winning depth and its cross-validated accuracy for the gini search.
tr_grid.best_params_, tr_grid.best_score_

({'max_depth': 9}, 0.8399443284620738)

In [10]:
# Predict the held-out digits through the fitted grid search.
grid_pred = tr_grid.predict(X_test)

In [11]:
# Test accuracy of the tuned gini tree.
accuracy_score(y_test, grid_pred)

0.8305555555555556

Now let's try entropy.

In [12]:
# Same depth search as before, this time splitting on entropy.
tr1 = DecisionTree(criterion='entropy')
tr_grid1 = GridSearchCV(
    estimator=tr1,
    param_grid=tree_params,
    scoring='accuracy',
    cv=5,
    verbose=True,
)
# 5-fold cross-validation over every depth in tree_params.
tr_grid1.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  8.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='entropy', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(3, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [13]:
# Winning depth and its cross-validated accuracy for the entropy search.
tr_grid1.best_params_, tr_grid1.best_score_

({'max_depth': 8}, 0.8663883089770354)

In [14]:
# Predict the held-out digits through the fitted entropy grid search.
grid_pred1 = tr_grid1.predict(X_test)

In [15]:
# Test accuracy of the tuned entropy tree.
accuracy_score(y_test, grid_pred1)

0.8833333333333333

### Regression Tree

We'll use the Boston house-pricing dataset from sklearn for the regression problem. The criteria we use here are variance and mad_median, and for scoring we'll use mean_squared_error / neg_mean_squared_error.

In [16]:
# Boston house-price data; features are in ds.data, targets in ds.target.
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases — confirm the pinned version before re-running this notebook.
ds = load_boston()

In [17]:
# Rebinds X_train/X_test/y_train/y_test to the regression data (80/20 split, seed 17);
# the digits split is no longer reachable through these names.
X_train, X_test, y_train, y_test = train_test_split(ds.data, ds.target, test_size=0.2, random_state=17)

In [18]:
# Regression baseline: depth-4 tree using the variance criterion.
tr_reg = DecisionTree(max_depth=4, criterion='variance')
tr_reg.fit(X_train, y_train)
pred_reg = tr_reg.predict(X_test)
# Kept as the last expression so the notebook displays the test MSE.
mean_squared_error(y_test, pred_reg)

13.626382527384985

In [19]:
# Same depth-4 setup with the mad_median criterion.
tr_reg1 = DecisionTree(max_depth=4, criterion='mad_median')
tr_reg1.fit(X_train, y_train)
pred_reg1 = tr_reg1.predict(X_test)
# Kept as the last expression so the notebook displays the test MSE.
mean_squared_error(y_test, pred_reg1)

40.2708660909879

Grid search over max_depth from 3 to 10 (`range(3, 11)` excludes 11).

In [20]:
# Candidate depths 3..10 (the upper bound of range is exclusive).
tree_params = {'max_depth': range(3, 11)}

In [21]:
# Depth search for the variance-criterion regressor. The scorer is the
# negated MSE, so higher (closer to zero) is better.
tr_reg2 = DecisionTree(criterion='variance')
tr_reg_grid = GridSearchCV(
    estimator=tr_reg2,
    param_grid=tree_params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=True,
)
# 5-fold cross-validation over every depth in tree_params.
tr_reg_grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='variance', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(3, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=True)

In [22]:
# Winning depth and its cross-validated score (negative because scoring is neg_mean_squared_error).
tr_reg_grid.best_params_, tr_reg_grid.best_score_

({'max_depth': 5}, -21.35204908672389)

In [23]:
# Predict the held-out house prices through the fitted variance grid search.
tr_reg_preds = tr_reg_grid.predict(X_test)

In [24]:
# Test MSE of the tuned variance tree.
mean_squared_error(y_test, tr_reg_preds)

12.778854727585674

With criterion mad_median

In [25]:
# Depth search for the mad_median-criterion regressor. The scorer is the
# negated MSE, so higher (closer to zero) is better.
tr_reg3 = DecisionTree(criterion='mad_median')
tr_reg_grid1 = GridSearchCV(
    estimator=tr_reg3,
    param_grid=tree_params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=True,
)
# 5-fold cross-validation over every depth in tree_params.
tr_reg_grid1.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  3.9min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='mad_median', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(3, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=True)

In [26]:
# Winning depth and its cross-validated score (negative because scoring is neg_mean_squared_error).
tr_reg_grid1.best_params_, tr_reg_grid1.best_score_

({'max_depth': 10}, -46.26327320232432)

In [27]:
# Predict the held-out house prices through the fitted mad_median grid search.
tr_reg_preds1 = tr_reg_grid1.predict(X_test)

In [28]:
# Test MSE of the tuned mad_median tree.
mean_squared_error(y_test, tr_reg_preds1)

38.663449747070636

### Using a pandas DataFrame with another dataset (`train.csv`)

In [22]:
# Semicolon-delimited CSV read from the working directory; the 'id' column becomes the index.
df = pd.read_csv('train.csv', index_col='id', sep=';')

In [23]:
# Whole years of age; presumably 'age' is stored in days (hence 365.25) — verify against the dataset.
df['age_in_years'] = np.floor(df['age']/365.25)

In [24]:
# Set the target column aside; 'cardio' is dropped from the feature frame further down.
labels = df['cardio']

In [25]:
# Shift the gender codes down by one using vectorized arithmetic instead of a
# per-element lambda (presumably remapping 1/2 to 0/1 — confirm against the data).
# NOTE: the column is overwritten, so re-running this cell shifts the codes again.
df['gender'] = df['gender'] - 1

In [26]:
# Indicator columns for half-open age brackets [lower, upper), built with
# vectorized comparisons rather than one Python lambda call per row.
# Ages outside 40-65 (and missing ages) get 0 in every bracket.
age_years = df['age_in_years']
for lower, upper in [(40, 50), (50, 55), (55, 60), (60, 65)]:
    in_bracket = (age_years >= lower) & (age_years < upper)
    df['Age_{}-{}'.format(lower, upper)] = in_bracket.astype('int64')

In [27]:
# Indicator columns for half-open ap_hi brackets [lower, upper), built with
# vectorized comparisons rather than one Python lambda call per row.
# Readings outside 120-180 (and missing readings) get 0 in every bracket.
ap_hi = df['ap_hi']
for lower, upper in [(120, 140), (140, 160), (160, 180)]:
    in_bracket = (ap_hi >= lower) & (ap_hi < upper)
    df['aphi_{}-{}'.format(lower, upper)] = in_bracket.astype('int64')

In [28]:
# One-hot encode the cholesterol level: the original column is replaced by
# cholesterol_<level> indicator columns.
df = pd.get_dummies(
    df,
    columns=['cholesterol'],
    prefix=['cholesterol'],
)

In [29]:
def f(a, b):
    """Return ``b`` divided by the square of ``a / 100``.

    The notebook calls this with a height as ``a`` and a weight as ``b`` to
    build the ``bmi`` column (presumably centimetres and kilograms — confirm
    against the dataset).
    """
    scaled = a / 100
    return b / scaled ** 2

In [30]:
# Call f once on whole columns instead of once per row via df.apply(axis=1);
# f uses only arithmetic that works element-wise on pandas Series.
# Values match the row-wise version up to floating-point rounding.
df['bmi'] = f(df['height'], df['weight'])

In [35]:
# Remove the raw columns that were binned or combined above, the measurements
# that are not used as features, and the target (already saved in `labels`).
# NOTE: df is overwritten, so re-running this cell raises a KeyError.
df = df.drop(
    columns=[
        'gluc',
        'ap_lo',
        'alco',
        'age',
        'cardio',
        'age_in_years',
        'ap_hi',
        'height',
        'weight',
    ]
)

In [36]:
# Truncate BMI to whole units (the result stays a float column).
df['bmi'] = np.floor(df['bmi'])

In [38]:
# Preview the final feature frame.
df.head()

Unnamed: 0_level_0,gender,smoke,active,Age_40-50,Age_50-55,Age_55-60,Age_60-65,aphi_120-140,aphi_140-160,aphi_160-180,cholesterol_1,cholesterol_2,cholesterol_3,bmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1,0,1,0,1,0,0,0,0,0,1,0,0,21.0
1,0,0,1,0,0,1,0,0,1,0,0,0,1,34.0
2,0,0,0,0,1,0,0,1,0,0,0,0,1,23.0
3,1,0,1,1,0,0,0,0,1,0,1,0,0,28.0
4,0,0,0,1,0,0,0,0,0,0,1,0,0,23.0


In [39]:
# 70/30 train/validation split with seed 17; X_train and y_train are rebound
# once more, this time to pandas objects.
X_train, X_valid, y_train, y_valid = train_test_split(df, labels, test_size=0.3, random_state=17)

In [40]:
# Shallow baseline tree (depth 3, default criterion).
tr_pd = DecisionTree(max_depth=3)

In [41]:
# Fit directly on the pandas DataFrame / Series.
tr_pd.fit(X_train, y_train)

In [42]:
# Predictions of the depth-3 baseline on the validation split.
pred = tr_pd.predict(X_valid)

In [43]:
# Validation accuracy of the depth-3 baseline.
accuracy_score(y_valid, pred)

0.704047619047619

In [44]:
# Fresh tree with default settings for the grid search below (rebinds `tr`).
tr = DecisionTree()

In [46]:
# Candidate depths 3..11, materialized as an explicit list.
tree_params = {'max_depth': list(range(3, 12))}
tree_grid = GridSearchCV(
    estimator=tr,
    param_grid=tree_params,
    scoring='accuracy',
    cv=5,
    verbose=True,
)

In [47]:
# 5-fold cross-validation over every depth in tree_params; this is the slowest
# cell in the notebook (see the elapsed time in its output).
tree_grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 25.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='gini', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [48]:
# Validation predictions through the fitted grid search (rebinds `preds`).
preds = tree_grid.predict(X_valid)

In [49]:
# Validation accuracy of the tuned tree.
accuracy_score(y_valid, preds)

0.7147142857142857

In [50]:
# Winning depth and its cross-validated accuracy.
tree_grid.best_params_, tree_grid.best_score_

({'max_depth': 9}, 0.7148367346938775)