In [1]:
from ensemble_fns import (adaboost,
                          adaboost_pred,
                          adaboost_acc,
                          random_forest, 
                          bagging, 
                          forest_pred,
                          forest_acc)
import sys
sys.path.append('..')
from datasets import get_bank_data
from decision_tree_fns import predict, tree_maker, ID3
import os
import pickle
import numpy as np
import pandas as pd

In [14]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Show which files are present in the credit card dataset folder.
credit_card_dir = '../datasets/credit_card'
os.listdir(credit_card_dir)

['default of credit card clients.xls']

In [6]:
! pip install xlrd ## for pd.read_excel()

Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.5/96.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-2.0.1


In [4]:
# Locate the credit card spreadsheet and load it into a DataFrame.
# os.listdir() returns entries in arbitrary order and includes any stray file
# or folder, so pick the .xls file explicitly instead of taking entry [0].
credit_card_dir = '../datasets/credit_card'
xls_files = sorted(
    name for name in os.listdir(credit_card_dir) if name.lower().endswith('.xls')
)
if not xls_files:
    raise FileNotFoundError(f'no .xls file found in {credit_card_dir}')

dataset_fp = os.path.join(credit_card_dir, xls_files[0])
# read_excel is called with its defaults, exactly as before, so the shape and
# layout of `df` are unchanged for the cells that slice it afterwards.
df = pd.read_excel(dataset_fp)

In [5]:
# Print a labelled text rendering of the loaded frame.
preview_label = 'preview df'
print(preview_label, df)

preview df       Unnamed: 0         X1   X2         X3        X4   X5     X6     X7  \
0             ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2   
1              1      20000    2          2         1   24      2      2   
2              2     120000    2          2         2   26     -1      2   
3              3      90000    2          2         2   34      0      0   
4              4      50000    2          2         1   37      0      0   
...          ...        ...  ...        ...       ...  ...    ...    ...   
29996      29996     220000    1          3         1   39      0      0   
29997      29997     150000    1          3         2   43     -1     -1   
29998      29998      30000    1          2         2   37      4      3   
29999      29999      80000    1          3         1   41      1     -1   
30000      30000      50000    1          2         1   46      0      0   

          X8     X9  ...        X15        X16        X17       X18       X1

In [6]:
# Convert the frame to a raw array and carve it into features, labels and names.
dataset = df.to_numpy()
header_row = dataset[0]   # first row is headers
data_rows = dataset[1:]   # all rows except first

feat_names = header_row[1:-1]  # header entries for all columns except first/last
X = data_rows[:, 1:-1]         # all cols except first or last
y = data_rows[:, -1]           # only last column

In [7]:
# Report the shapes of the arrays built from the spreadsheet.
load_summary = f'loaded credit card data from xls file {y.shape=} {X.shape}, {feat_names.shape=}'
print(load_summary)

loaded credit card data from xls file y.shape=(30000,) (30000, 23), feat_names.shape=(23,)


### Shuffle, Split

In [8]:
# Reproducible shuffle followed by a fixed-size train/test split.
np.random.seed(seed=0)

# Permute however many rows X actually has instead of assuming exactly 30000.
n_examples = len(X)
shuffle = np.random.choice(n_examples, n_examples, replace=False)
split = 24000  # the first `split` shuffled rows train, the remainder test
assert 0 < split < n_examples, f'split={split} must fall inside the {n_examples} rows'

train_idx, test_idx = shuffle[:split], shuffle[split:]

# be careful that columns of X_train might not have all the feature values that X_test does
# Index X / y through the permutation rather than rebinding them to shuffled
# copies: X[shuffle][:split] and X[shuffle[:split]] select the same rows, but
# leaving X and y untouched means re-running this cell gives the same split.
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]


In [9]:
# Report the shapes of the train and test partitions.
split_summary = f'split data, {X_train.shape=}, {y_train.shape=}, {X_test.shape=}, {y_test.shape=}'
print(split_summary)

split data, X_train.shape=(24000, 23), y_train.shape=(24000,), X_test.shape=(6000, 23), y_test.shape=(6000,)


### handle continuous features using median splitting 

In [10]:
# continuous features are limit bal [0], age [4], bill amt 1 [11], rest of features [12:]
# [] denotes the column
numeric_features = [0, 4, *range(11, 23)]
print('continous features are', feat_names[numeric_features])

# Thresholds are computed on the training split only and reused for the test split.
medians = np.median(X_train[:, numeric_features], axis=0)

# Replace each continuous column by the boolean "value is above the train median".
# NOTE(review): both arrays are overwritten in place, so re-running this cell
# would take medians of columns that are already binarised.
for features in (X_train, X_test):
    features[:, numeric_features] = features[:, numeric_features] > medians

continous features are ['LIMIT_BAL' 'AGE' 'BILL_AMT1' 'BILL_AMT2' 'BILL_AMT3' 'BILL_AMT4'
 'BILL_AMT5' 'BILL_AMT6' 'PAY_AMT1' 'PAY_AMT2' 'PAY_AMT3' 'PAY_AMT4'
 'PAY_AMT5' 'PAY_AMT6']


### single tree

In [13]:
# Fit one tree with ID3 (max_depth=100, IG_metric='entropy') and pickle it.
# Create the output folder first: without it, a missing './3' would only raise
# FileNotFoundError after the fit has already finished.
os.makedirs('./3', exist_ok=True)

single_tree = ID3(X_train,y_train,feat_names, max_depth=100, IG_metric='entropy')

with open('./3/single_tree.pkl','wb') as f:
    pickle.dump(single_tree, f)

In [14]:
# Predict every row of each split with the single tree, then score the
# predictions against the labels as the fraction of exact matches.
train_preds = [predict(row, single_tree, feat_names) for row in X_train]
test_preds = [predict(row, single_tree, feat_names) for row in X_test]

tree_train_acc = (y_train == train_preds).mean()
tree_test_acc = (y_test == test_preds).mean()

print(f'single tree accuracies, {tree_train_acc} train, {tree_test_acc} test')


single tree accuracies, 0.940375 train, 0.7555 test


### adaboost

In [15]:
# Fit an AdaBoost ensemble (num_stumps=500, IG_metric='entropy') and pickle the
# (stumps, amount_of_says) pair that adaboost() returns.
# Create the output folder first: without it, a missing './3' would only raise
# FileNotFoundError after the fit has already finished.
os.makedirs('./3', exist_ok=True)

stumps, amount_of_says = adaboost(X_train,
                                  y_train,
                                  feat_names, 
                                  num_stumps=500, 
                                  IG_metric='entropy')
with open('./3/adaboost.pkl','wb') as f:
    pickle.dump((stumps,amount_of_says), f)

In [16]:
# Score the boosted ensemble on the train split first, then on the test split.
ada_train_acc, ada_test_acc = (
    adaboost_acc(features, labels, stumps, feat_names, amount_of_says)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f'adaboost ensemble accuracies, {ada_train_acc} train, {ada_test_acc} test')

adaboost ensemble accuracies, 0.821625 train, 0.8115 test


### random forest

In [15]:
# Fit a random forest (tree_count=500, max_depth=100, IG_metric='entropy') and
# pickle the result.
# Create the output folder first: without it, a missing './3' would only raise
# FileNotFoundError after the fit has already finished.
os.makedirs('./3', exist_ok=True)

rf = random_forest(X_train,
                   y_train,
                   feat_names,
                   tree_count=500,
                   max_depth=100, 
                   IG_metric='entropy')

with open('./3/random_forest.pkl','wb') as f:
    pickle.dump(rf, f)

In [16]:
# Score the random forest on the train split first, then on the test split.
rf_scores = [
    forest_acc(features, labels, rf, feat_names)
    for features, labels in ((X_train, y_train), (X_test, y_test))
]
rf_train_acc, rf_test_acc = rf_scores

print(f'random forest accuracies {rf_train_acc} train, {rf_test_acc} test')

random forest accuracies 0.8859166666666667 train, 0.797 test


### bagged forest

In [17]:
# Fit a bagged ensemble (tree_count=500, max_depth=100, IG_metric='entropy')
# and pickle the result.
# Create the output folder first: without it, a missing './3' would only raise
# FileNotFoundError after the fit has already finished.
os.makedirs('./3', exist_ok=True)

bagged_forest = bagging(X_train,
                        y_train,
                        feat_names,
                        tree_count=500,
                        max_depth=100,
                        IG_metric='entropy')

with open('./3/bagged_forest.pkl','wb') as f:
    pickle.dump(bagged_forest, f)

In [18]:

# Score the bagged ensemble on the train split first, then on the test split.
bagged_train_acc, bagged_test_acc = (
    forest_acc(features, labels, bagged_forest, feat_names)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f'bagged forest accuracies {bagged_train_acc} train, {bagged_test_acc} test')

bagged forest accuracies 0.9250416666666667 train, 0.7883333333333333 test
