In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load (or reload) the autoreload extension and re-import modified modules before
# each cell runs, so edits to imported local modules take effect without a restart.
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_digits, load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from DecisionTree import DecisionTree

## Classification

We will use the digits dataset from sklearn.

In [3]:
# Load the sklearn digits dataset; its .data and .target are split in the next cell.
digits = load_digits()

Split the data with `train_test_split`, using a test size of 0.3.

In [4]:
# Hold out 30% of the digits for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
    random_state=17,
)

#### Sklearn DecisionTreeClassifier result for reference

In [5]:
from sklearn.tree import DecisionTreeClassifier

Using gini criterion

In [6]:
# Reference model: sklearn classifier capped at depth 4, criterion left at its default.
tr = DecisionTreeClassifier(max_depth=4, random_state=17).fit(X_train, y_train)
preds = tr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.5648148148148148

Using entropy criterion

In [7]:
# Reference model: same depth-4 sklearn classifier, now splitting on entropy.
tre = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=17,
).fit(X_train, y_train)
preds1 = tre.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds1)

0.6666666666666666

#### Now Using DecisionTree implementation.

Create a tree of depth 4 and fit it. It's not tuned, so the results are not that great.<br>
The default criterion is gini

In [8]:
# Custom implementation with the same depth-4 setup; criterion is left at its default.
tr = DecisionTree(random_seed=17, max_depth=4)
tr.fit(X_train, y_train)
preds = tr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.5648148148148148

Let's try entropy criterion

In [9]:
# Custom implementation again, this time with the entropy criterion.
tre = DecisionTree(criterion='entropy', random_seed=17, max_depth=4)
tre.fit(X_train, y_train)
preds1 = tre.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds1)

0.6666666666666666

Let's do a grid search over max_depth from 3 to 10, first with the gini criterion and then with entropy.

In [10]:
# Candidate depths 3..10 (the upper bound of range is exclusive).
tree_params = dict(max_depth=range(3, 11))

In [11]:
# 5-fold cross-validated search over the depth grid, scored by accuracy.
tr = DecisionTree(random_seed=17)
tr_grid = GridSearchCV(
    estimator=tr,
    param_grid=tree_params,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
tr_grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  3.8min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTree(criterion='gini', max_depth=inf,
                                    max_features=0, min_samples_split=2,
                                    random_seed=17),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': range(3, 11)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring='accuracy',
             verbose=1)

In [12]:
# Best depth found by the search and its mean cross-validated accuracy.
tr_grid.best_params_, tr_grid.best_score_

({'max_depth': 8}, 0.8337380636185416)

In [13]:
# Predict on the held-out digits through the fitted search object.
grid_pred = tr_grid.predict(X_test)

In [14]:
# Test-set accuracy of the tuned tree from the search above.
accuracy_score(y_test, grid_pred)

0.8425925925925926

Now let's try entropy

In [15]:
# Same 5-fold depth search, but with the entropy criterion.
tr1 = DecisionTree(random_seed=17, criterion='entropy')
tr_grid1 = GridSearchCV(
    estimator=tr1,
    param_grid=tree_params,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
tr_grid1.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  3.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTree(criterion='entropy', max_depth=inf,
                                    max_features=0, min_samples_split=2,
                                    random_seed=17),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': range(3, 11)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring='accuracy',
             verbose=1)

In [16]:
# Best depth for the entropy search and its mean cross-validated accuracy.
tr_grid1.best_params_, tr_grid1.best_score_

({'max_depth': 8}, 0.8472933662176689)

In [17]:
# Predict on the held-out digits through the fitted entropy search object.
grid_pred1 = tr_grid1.predict(X_test)

In [18]:
# Test-set accuracy of the tuned entropy tree.
accuracy_score(y_test, grid_pred1)

0.8407407407407408

## Regression Tree

We'll use the Boston house-pricing dataset from sklearn for the regression problem. Here the criteria we use are variance and mad_median (selected in the cells below with `criterion='mse'` and `criterion='mae'`), and for scoring we'll use mean_squared_error/neg_mean_squared_error.

In [19]:
# Load the Boston housing data; ds.data and ds.target are split in the next cell.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell needs an older release.
ds = load_boston()

In [20]:
# Rebind the train/test names to a 70/30 split of the regression data (same seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    ds.data,
    ds.target,
    test_size=0.3,
    random_state=17,
)

#### Sklearn DecisionTreeRegressor results for reference

In [21]:
from sklearn.tree import DecisionTreeRegressor

Using criterion mse 

In [22]:
# Reference model: sklearn regressor capped at depth 4 with the mse criterion.
tr_reg = DecisionTreeRegressor(
    criterion='mse',
    max_depth=4,
    random_state=17,
).fit(X_train, y_train)
pred_reg = tr_reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred_reg)

13.725796084941395

Using criterion mae

In [23]:
# Reference model: same depth-4 sklearn regressor with the mae criterion (still scored by MSE).
tr_reg1 = DecisionTreeRegressor(
    criterion='mae',
    max_depth=4,
    random_state=17,
).fit(X_train, y_train)
pred_reg1 = tr_reg1.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred_reg1)

13.869539473684211

#### Now using DecisionTree implementation

In [24]:
# Custom implementation as a regressor: depth 4, mse criterion.
tr_reg = DecisionTree(criterion='mse', random_seed=17, max_depth=4)
tr_reg.fit(X_train, y_train)
pred_reg = tr_reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred_reg)

13.725796084941402

In [25]:
# Custom implementation as a regressor: depth 4, mae criterion (still scored by MSE).
tr_reg1 = DecisionTree(criterion='mae', random_seed=17, max_depth=4)
tr_reg1.fit(X_train, y_train)
pred_reg1 = tr_reg1.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred_reg1)

12.43847059639562

Grid search with max_depth from 3 to 10

In [26]:
# Candidate depths 3..10 for the regression searches (the upper bound of range is exclusive).
tree_params = dict(max_depth=range(3, 11))

In [27]:
# 5-fold depth search for the mse regressor, scored by negated MSE (higher is better).
# NOTE(review): unlike the other cells, no random_seed is passed here, so the
# estimator's default applies (the recorded repr shows random_seed=0).
tr_reg2 = DecisionTree(criterion='mse')
tr_reg_grid = GridSearchCV(
    estimator=tr_reg2,
    param_grid=tree_params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
tr_reg_grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTree(criterion='mse', max_depth=inf,
                                    max_features=0, min_samples_split=2,
                                    random_seed=0),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': range(3, 11)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=1)

In [28]:
# Best depth and its mean cross-validated score (negated MSE, hence the minus sign).
tr_reg_grid.best_params_, tr_reg_grid.best_score_

({'max_depth': 9}, -25.77324439957825)

In [29]:
# Predict on the held-out rows through the fitted mse search object.
tr_reg_preds = tr_reg_grid.predict(X_test)

In [30]:
# Test-set MSE of the tuned mse tree.
mean_squared_error(y_test, tr_reg_preds)

13.241544976025546

With criterion mae

In [31]:
# Same 5-fold depth search for the mae regressor, still scored by negated MSE.
tr_reg3 = DecisionTree(random_seed=17, criterion='mae')
tr_reg_grid1 = GridSearchCV(
    estimator=tr_reg3,
    param_grid=tree_params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=True,
)
tr_reg_grid1.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.0min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTree(criterion='mae', max_depth=inf,
                                    max_features=0, min_samples_split=2,
                                    random_seed=17),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': range(3, 11)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=True)

In [32]:
# Best depth for the mae search and its mean cross-validated score (negated MSE).
tr_reg_grid1.best_params_, tr_reg_grid1.best_score_

({'max_depth': 3}, -27.754810892767996)

In [33]:
# Predict on the held-out rows through the fitted mae search object.
tr_reg_preds1 = tr_reg_grid1.predict(X_test)

In [34]:
# Test-set MSE of the tuned mae tree.
mean_squared_error(y_test, tr_reg_preds1)

17.591161638032855

# Using pandas with another dataset.

#### Problem

Predict the presence or absence of cardiovascular disease (CVD) using the patient examination results.

#### Data description

There are 3 types of input features:

- *Objective*: factual information;
- *Examination*: results of medical examination;
- *Subjective*: information given by the patient.

| Feature | Variable Type | Variable      | Value Type |
|---------|--------------|---------------|------------|
| Age | Objective Feature | age | int (days) |
| Height | Objective Feature | height | int (cm) |
| Weight | Objective Feature | weight | float (kg) |
| Gender | Objective Feature | gender | categorical code |
| Systolic blood pressure | Examination Feature | ap_hi | int |
| Diastolic blood pressure | Examination Feature | ap_lo | int |
| Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
| Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
| Smoking | Subjective Feature | smoke | binary |
| Alcohol intake | Subjective Feature | alco | binary |
| Physical activity | Subjective Feature | active | binary |
| Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

All of the dataset values were collected at the moment of medical examination.

In [35]:
# The file is semicolon-delimited; the 'id' column becomes the index.
df = pd.read_csv(
    'train.csv',
    sep=';',
    index_col='id',
)

In [36]:
# 'age' is recorded in days (see the feature table); convert to whole years.
df['age_in_years'] = np.floor(df['age'].div(365.25))

In [37]:
# Keep the target column aside; 'cardio' is dropped from the feature frame later.
labels = df['cardio']

In [38]:
# Shift the gender code down by one with a vectorized subtraction instead of a
# per-element Python lambda. Not idempotent: re-running the cell shifts again.
df['gender'] = df['gender'] - 1

In [39]:
# One 0/1 indicator column per age band [lower, upper) in years.
# Built with vectorized comparisons rather than four copy-pasted per-element lambdas;
# column names, column order and the int64 dtype match the original cell.
age_years = df['age_in_years']
for lower, upper in [(40, 50), (50, 55), (55, 60), (60, 65)]:
    in_band = (age_years >= lower) & (age_years < upper)
    df['Age_{}-{}'.format(lower, upper)] = in_band.astype('int64')

In [40]:
# One 0/1 indicator column per systolic-pressure band [lower, upper).
# Built with vectorized comparisons rather than three copy-pasted per-element lambdas;
# column names, column order and the int64 dtype match the original cell.
systolic = df['ap_hi']
for lower, upper in [(120, 140), (140, 160), (160, 180)]:
    in_band = (systolic >= lower) & (systolic < upper)
    df['aphi_{}-{}'.format(lower, upper)] = in_band.astype('int64')

In [41]:
# Replace the 3-level cholesterol code with one indicator column per level.
df = pd.get_dummies(
    df,
    columns=['cholesterol'],
    prefix=['cholesterol'],
)

In [42]:
def f(a, b):
    """Return ``b`` divided by the square of ``a / 100``.

    The notebook calls this with height in centimetres as ``a`` and weight in
    kilograms as ``b``, which yields the body-mass index (kg / m^2).
    """
    scaled = a / 100
    return b / scaled ** 2

In [43]:
# f only uses elementwise arithmetic, so it can be applied to the whole columns
# at once instead of being called row by row through df.apply(axis=1).
df['bmi'] = f(df['height'], df['weight'])

In [44]:
# Remove the target and the raw columns that are not kept as model features.
df = df.drop(
    columns=[
        'gluc', 'ap_lo', 'alco', 'age', 'cardio',
        'age_in_years', 'ap_hi', 'height', 'weight',
    ],
)

In [45]:
# Round BMI down to a whole number (values stay floats).
df = df.assign(bmi=lambda frame: np.floor(frame['bmi']))

In [46]:
# Preview the engineered feature frame.
df.head()

Unnamed: 0_level_0,gender,smoke,active,Age_40-50,Age_50-55,Age_55-60,Age_60-65,aphi_120-140,aphi_140-160,aphi_160-180,cholesterol_1,cholesterol_2,cholesterol_3,bmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1,0,1,0,1,0,0,0,0,0,1,0,0,21.0
1,0,0,1,0,0,1,0,0,1,0,0,0,1,34.0
2,0,0,0,0,1,0,0,1,0,0,0,0,1,23.0
3,1,0,1,1,0,0,0,0,1,0,1,0,0,28.0
4,0,0,0,1,0,0,0,0,0,0,1,0,0,23.0


In [47]:
# 70/30 train/validation split of the engineered features and the cardio labels.
X_train, X_valid, y_train, y_valid = train_test_split(
    df,
    labels,
    test_size=0.3,
    random_state=17,
)

In [48]:
# Untuned baseline on the cardio data: a depth-3 tree with the default criterion.
tr_pd = DecisionTree(max_depth=3, random_seed=17)

In [49]:
# Fit on the training split, which comes from splitting the pandas objects df and labels.
tr_pd.fit(X_train, y_train)

In [50]:
# Predict on the validation split with the baseline tree.
pred = tr_pd.predict(X_valid)

In [51]:
# Validation accuracy of the untuned depth-3 tree.
accuracy_score(y_valid, pred)

0.704047619047619

In [52]:
# Base estimator for the grid search below; searched parameters are set per candidate.
tr = DecisionTree(random_seed=17)

In [53]:
# Search space: depths 3..11, four max_features settings, both classification criteria.
tree_params = dict(
    max_depth=list(range(3, 12)),
    max_features=[0, 4, 8, 12],
    criterion=['gini', 'entropy'],
)
tree_grid = GridSearchCV(
    estimator=tr,
    param_grid=tree_params,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

In [54]:
# Run the search: 72 candidates x 5 folds; the recorded run took about 24 minutes.
tree_grid.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 23.8min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTree(criterion='gini', max_depth=inf,
                                    max_features=0, min_samples_split=2,
                                    random_seed=17),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11],
                         'max_features': [0, 4, 8, 12]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [55]:
# Best parameter combination and its mean cross-validated accuracy.
tree_grid.best_params_, tree_grid.best_score_

({'criterion': 'gini', 'max_depth': 6, 'max_features': 8}, 0.7153673469387756)

In [56]:
# Predict on the validation split through the fitted search object.
preds = tree_grid.predict(X_valid)

In [57]:
# Validation accuracy of the tuned tree.
accuracy_score(y_valid, preds)

0.7148095238095238