In [300]:
import numpy as np
from numpy import log2
from decision_tree_fns import subtree, predict, mode, calc_IG
from datasets import get_bank_data
import pickle

In [286]:
data = np.array([[0,0,1,0,0],
                  [0,1,0,0,0],
                  [0,0,1,1,1],
                  [1,0,0,1,1],
                  [0,1,1,0,0],
                  [1,1,0,0,0],
                  [0,1,0,1,0]])

y = data[:,-1:]
X = data[:,:-1]
feat_names=np.array(['x1','x2','x3','x4'])
# ID3(X,y,feat_names,IG_metric='entropy', max_depth=1)

weights =  np.ones((X.shape[0])) / X.shape[0]



# clean up some of the names, comment out

In [56]:
play_tennis = ['shhw','shhs','ohhw','rmhw','rcnw','rcns','ocns','smhw','scnw','rmnw','smns','omhs','ohnw','rmhs']
play_tennis = np.array([[*ex] for ex in play_tennis])
feat_names = ['outlook', 'temp', 'humidity', 'wind']
labels = np.array([0,0,1,1,1,0,1,0,1,1,1,1,1,0])
ID3(play_tennis,labels,feat_names, IG_metric='entropy')

# {'outlook': {'o': 1,
#   'r': {'wind': {'s': 0, 'w': 1}},
#   's': {'humidity': {'h': 0, 'n': 1}}}}

{'outlook': {'o': 1,
  'r': {'wind': {'s': 0, 'w': 1}},
  's': {'humidity': {'h': 0, 'n': 1}}}}

In [57]:
ID3(play_tennis,labels,feat_names, IG_metric='gini')

# {'outlook': {'o': 1,
#   'r': {'wind': {'s': 0, 'w': 1}},
#   's': {'humidity': {'h': 0, 'n': 1}}}}

{'outlook': {'o': 1,
  'r': {'wind': {'s': 0, 'w': 1}},
  's': {'humidity': {'h': 0, 'n': 1}}}}

In [58]:
ID3(play_tennis,labels,feat_names, IG_metric='me')

# {'outlook': {'o': 1,
#   'r': {'wind': {'s': 0, 'w': 1}},
#   's': {'humidity': {'h': 0, 'n': 1}}}}

{'outlook': {'o': 1,
  'r': {'wind': {'s': 0, 'w': 1}},
  's': {'humidity': {'h': 0, 'n': 1}}}}

---------------------------
## CAR EVALUATION DATASET
---------------------------

In [101]:
! pip install ucimlrepo

from ucimlrepo import fetch_ucirepo
# fetch dataset
car_evaluation = fetch_ucirepo(id=19)
# data (as pandas dataframes)
X = car_evaluation.data.features
y = car_evaluation.data.targets




In [102]:
np.random.seed(seed=1)

feat_names = np.array(X.columns)
X = X.to_numpy()
y = y.to_numpy()
shuffle = np.random.permutation(len(X))
train_test_split = 1000
train_X,train_y = X[shuffle][:train_test_split], y[shuffle][:train_test_split]
test_X, test_y = X[shuffle][train_test_split:], y[shuffle][train_test_split:]




In [11]:
max_depth_range = np.arange(start=1,stop=7)
train_accs, test_accs = np.zeros((len(max_depth_range),3)), np.zeros((len(max_depth_range),3))
for i, max_depth in enumerate(max_depth_range):
    car_tree_me = ID3(train_X,train_y,feat_names, max_depth=max_depth, IG_metric='me')
    car_tree_h = ID3(train_X,train_y,feat_names, max_depth=max_depth, IG_metric='entropy')
    car_tree_gini = ID3(train_X,train_y,feat_names, max_depth=max_depth, IG_metric='gini')

    car_trees = (car_tree_me, car_tree_h, car_tree_gini)
    for j, car_tree in enumerate(car_trees):
        train_accs[i,j] = (train_y == [predict(train_X[ex],car_tree,feat_names) for ex in range(len(train_X))]).mean()
        test_accs[i,j] = (test_y == [predict(test_X[ex],car_tree,feat_names) for ex in range(len(test_X))]).mean()



In [12]:
# depth is 1-6 across rows, IG metric is me,h,gini across columns
train_accs 

array([[0.709   , 0.709   , 0.709   ],
       [0.709   , 0.554304, 0.554304],
       [0.54166 , 0.549768, 0.549768],
       [0.543708, 0.547948, 0.548436],
       [0.548872, 0.548872, 0.548684],
       [0.55398 , 0.55398 , 0.55398 ]])

In [13]:
test_accs

array([[0.68818681, 0.68818681, 0.68818681],
       [0.68818681, 0.52300824, 0.52300824],
       [0.51845339, 0.52722724, 0.52722724],
       [0.51394193, 0.52554794, 0.52491019],
       [0.53460293, 0.5327708 , 0.53181228],
       [0.54785239, 0.54984679, 0.546975  ]])

In [14]:
train_accs - test_accs

array([[0.02081319, 0.02081319, 0.02081319],
       [0.02081319, 0.03129576, 0.03129576],
       [0.02320661, 0.02254076, 0.02254076],
       [0.02976607, 0.02240006, 0.02352581],
       [0.01426907, 0.0161012 , 0.01687172],
       [0.00612761, 0.00413321, 0.007005  ]])

In [302]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---------------------------
## BANK DATASET
----------------------------

In [304]:
train,test,feat_names = get_bank_data(train_fp='datasets/bank/train.csv', 
                                      test_fp='datasets/bank/test.csv') # converts median to binary
X_train,y_train = train
X_test,y_test = test

In [305]:
# running tree w/o substituting unknown values 
max_depth_range = np.arange(start=1,stop=17)
train_accs, test_accs = np.zeros((len(max_depth_range),3)), np.zeros((len(max_depth_range),3))
for i, max_depth in enumerate(max_depth_range):
    bank_tree_me = ID3(X_train,y_train,feat_names, max_depth=max_depth, IG_metric='me')
    bank_tree_h = ID3(X_train,y_train,feat_names, max_depth=max_depth, IG_metric='entropy')
    bank_tree_gini = ID3(X_train,y_train,feat_names, max_depth=max_depth, IG_metric='gini')

    bank_trees = (bank_tree_me, bank_tree_h, bank_tree_gini)
    for j, bank_tree in enumerate(bank_trees):
        train_accs[i,j] = (y_train == [predict(X_train[ex],bank_tree,feat_names) for ex in range(len(X_train))]).mean()
        test_accs[i,j] = (y_test == [predict(X_test[ex],bank_tree,feat_names) for ex in range(len(X_test))]).mean()



In [306]:
# depth is 1-16 across rows, IG metric is me,h,gini across columns
train_accs

array([[0.85764736, 0.8808    , 0.85764736],
       [0.85505792, 0.86434944, 0.85505792],
       [0.8472896 , 0.8480512 , 0.84393856],
       [0.83982592, 0.83632256, 0.8358656 ],
       [0.83266688, 0.82962048, 0.82992512],
       [0.82535552, 0.82261376, 0.82261376],
       [0.81728256, 0.81560704, 0.81438848],
       [0.81789184, 0.80860032, 0.80707712],
       [0.81469312, 0.80479232, 0.8054016 ],
       [0.80905728, 0.8023552 , 0.80220288],
       [0.8069248 , 0.80022272, 0.7985472 ],
       [0.8031168 , 0.79885184, 0.79915648],
       [0.79991808, 0.79885184, 0.79885184],
       [0.79885184, 0.79885184, 0.79885184],
       [0.79885184, 0.79885184, 0.79885184],
       [0.79885184, 0.79885184, 0.79885184]])

In [113]:
test_accs

array([[0.84713504, 0.8752    , 0.84713504],
       [0.84488384, 0.85463904, 0.84488384],
       [0.83963104, 0.83572896, 0.83783008],
       [0.83497856, 0.83527872, 0.83407808],
       [0.82822496, 0.82897536, 0.82987584],
       [0.8204208 , 0.81952032, 0.81907008],
       [0.81021536, 0.81411744, 0.81126592],
       [0.81186624, 0.807664  , 0.8031616 ],
       [0.80961504, 0.80271136, 0.80106048],
       [0.80511264, 0.7979088 , 0.79805888],
       [0.80181088, 0.79700832, 0.79460704],
       [0.79700832, 0.7949072 , 0.79445696],
       [0.79220576, 0.79445696, 0.79400672],
       [0.792656  , 0.79445696, 0.79400672],
       [0.792656  , 0.79445696, 0.79400672],
       [0.792656  , 0.79445696, 0.79400672]])

In [114]:
train_accs - test_accs

array([[1.051232e-02, 5.600000e-03, 1.051232e-02],
       [1.017408e-02, 9.710400e-03, 1.017408e-02],
       [7.658560e-03, 1.232224e-02, 6.108480e-03],
       [4.847360e-03, 1.043840e-03, 1.787520e-03],
       [4.441920e-03, 6.451200e-04, 4.928000e-05],
       [4.934720e-03, 3.093440e-03, 3.543680e-03],
       [7.067200e-03, 1.489600e-03, 3.122560e-03],
       [6.025600e-03, 9.363200e-04, 3.915520e-03],
       [5.078080e-03, 2.080960e-03, 4.341120e-03],
       [3.944640e-03, 4.446400e-03, 4.144000e-03],
       [5.113920e-03, 3.214400e-03, 3.940160e-03],
       [6.108480e-03, 3.944640e-03, 4.699520e-03],
       [7.712320e-03, 4.394880e-03, 4.845120e-03],
       [6.195840e-03, 4.394880e-03, 4.845120e-03],
       [6.195840e-03, 4.394880e-03, 4.845120e-03],
       [6.195840e-03, 4.394880e-03, 4.845120e-03]])

---------------------------
# BANK DATASET w/ filled in MISSING VALUES using majority attribute value
----------------------------

In [115]:
cols_w_unknown = [1,3,8,15]
modes = [mode(X_train[np.where(X_train[:,col] != 'unknown')[0],col]) for col in cols_w_unknown]

In [116]:
feat_names[cols_w_unknown]

array(['job', 'education', 'contact', 'poutcome'], dtype='<U11')

In [117]:
# of columns w/ unknowns in them
modes

['blue-collar', 'secondary', 'cellular', 'failure']

In [118]:
# fill in X_train, X_test w/ modes

for i, col in enumerate(cols_w_unknown):
    X_train[np.where(X_train[:,col] == 'unknown')[0],col] = modes[i]
    X_test[np.where(X_test[:,col] == 'unknown')[0],col] = modes[i]



In [119]:
print((X_train[:,cols_w_unknown] == 'unknown').sum(),
      (X_test[:,cols_w_unknown] == 'unknown').sum()) # all values replaced

0 0


In [120]:
max_depth_range = np.arange(start=1,stop=17)
train_accs, test_accs = np.zeros((len(max_depth_range),3)), np.zeros((len(max_depth_range),3))
for i, max_depth in enumerate(max_depth_range):
    bank_tree_me = ID3(X_train,y_train,feat_names, max_depth=max_depth, IG_metric='me')
    bank_tree_h = ID3(X_train,y_train,feat_names, max_depth=max_depth, IG_metric='entropy')
    bank_tree_gini = ID3(X_train,y_train,feat_names, max_depth=max_depth, IG_metric='gini')

    bank_trees = (bank_tree_me, bank_tree_h, bank_tree_gini)
    for j, bank_tree in enumerate(bank_trees):
        train_accs[i,j] = (y_train == [predict(X_train[ex],bank_tree,feat_names) for ex in range(len(X_train))]).mean()
        test_accs[i,j] = (y_test == [predict(X_test[ex],bank_tree,feat_names) for ex in range(len(X_test))]).mean()



In [121]:
# depth is 1-16 across rows, IG metric is me,h,gini across columns
train_accs


array([[0.85764736, 0.8808    , 0.85764736],
       [0.8602368 , 0.86434944, 0.86282624],
       [0.85155456, 0.85079296, 0.85170688],
       [0.8495744 , 0.85064064, 0.85079296],
       [0.84058752, 0.83677952, 0.83906432],
       [0.83297152, 0.82840192, 0.82855424],
       [0.8252032 , 0.82078592, 0.819872  ],
       [0.82535552, 0.81515008, 0.81271296],
       [0.82093824, 0.81164672, 0.8107328 ],
       [0.81286528, 0.80524928, 0.80494464],
       [0.80981888, 0.80509696, 0.80433536],
       [0.80905728, 0.80265984, 0.80281216],
       [0.80570624, 0.80250752, 0.80250752],
       [0.80433536, 0.80250752, 0.80250752],
       [0.80250752, 0.80250752, 0.80250752],
       [0.80250752, 0.80250752, 0.80250752]])

In [122]:

test_accs

array([[0.84713504, 0.8752    , 0.84713504],
       [0.85103712, 0.85463904, 0.85358848],
       [0.84323296, 0.83888064, 0.84038144],
       [0.84233248, 0.84323296, 0.8436832 ],
       [0.83512864, 0.82852512, 0.83272736],
       [0.82612384, 0.81952032, 0.82102112],
       [0.81726912, 0.81261664, 0.81276672],
       [0.81786944, 0.8084144 , 0.80781408],
       [0.81426752, 0.80256128, 0.80406208],
       [0.80631328, 0.79970976, 0.8016608 ],
       [0.803912  , 0.79850912, 0.80016   ],
       [0.80331168, 0.79685824, 0.79895936],
       [0.79760864, 0.79685824, 0.79895936],
       [0.79460704, 0.7971584 , 0.79925952],
       [0.79445696, 0.7971584 , 0.79925952],
       [0.79445696, 0.7971584 , 0.79925952]])

In [123]:
train_accs - test_accs

array([[0.01051232, 0.0056    , 0.01051232],
       [0.00919968, 0.0097104 , 0.00923776],
       [0.0083216 , 0.01191232, 0.01132544],
       [0.00724192, 0.00740768, 0.00710976],
       [0.00545888, 0.0082544 , 0.00633696],
       [0.00684768, 0.0088816 , 0.00753312],
       [0.00793408, 0.00816928, 0.00710528],
       [0.00748608, 0.00673568, 0.00489888],
       [0.00667072, 0.00908544, 0.00667072],
       [0.006552  , 0.00553952, 0.00328384],
       [0.00590688, 0.00658784, 0.00417536],
       [0.0057456 , 0.0058016 , 0.0038528 ],
       [0.0080976 , 0.00564928, 0.00354816],
       [0.00972832, 0.00534912, 0.003248  ],
       [0.00805056, 0.00534912, 0.003248  ],
       [0.00805056, 0.00534912, 0.003248  ]])

In [127]:
## printing one of these big ole trees

In [126]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(bank_tree_h)

{   'duration': {   False: {   'month': {   'apr': {   'job': {   'admin.': {   'education': {   'primary': 'no',
                                                                                                 'secondary': 'no',
                                                                                                 'tertiary': {   'housing': {   'no': 'yes',
                                                                                                                                'yes': 'no'}}}},
                                                                  'blue-collar': {   'poutcome': {   'failure': 'no',
                                                                                                     'other': 'no',
                                                                                                     'success': 'yes'}},
                                                                  'entrepreneur': 'no',
                                         