In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from decision_tree import DecisionTreeClassifier as mytree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier as sktree

In [2]:
# Load the raw table from the workbook that sits next to this notebook.
data = pd.read_excel('./part1.xlsx')

# Integer-encode every column. pd.factorize numbers the distinct values of a
# column in order of first appearance (it does not sort), so code 0 is the
# value found in the first row of that column. Only the codes are kept; the
# array of unique labels that factorize also returns is not used.
ds = pd.DataFrame(
    {col: pd.factorize(data[col])[0] for col in data.columns},
    columns=data.columns  # keep the workbook's column order
)

# Feature matrix: every column except the target. Target: 'profitable'.
X = ds.drop(['profitable'], axis=1).values
y = ds['profitable'].values

# Single-argument print(...) prints the same text under Python 2 and is also
# valid Python 3, unlike the bare print statement.
print(data)

  price    maintenance  capacity airbag  profitable
0      low         low         2      no        yes
1      low         med         4     yes         no
2      low        high         4      no         no
3      med         med         4      no         no
4      med         med         4     yes        yes
5      med        high         2     yes         no
6     high         med         4     yes        yes
7     high        high         2     yes         no
8     high        high         5     yes        yes


In [3]:
# Own decision tree, Gini criterion.
# The model is fitted on the whole data set and scored on the same rows, so
# the number reported below is training accuracy.
tree = mytree(max_depth=3, min_size=2, criteria='gini').fit(X, y)
predict = tree.predict(X)

# Side-by-side table: model output next to the true labels (cast to float).
result = pd.DataFrame()
result['predicted'] = predict
result['true'] = y.astype('float')

# Single-argument print(...) calls print the same text under Python 2 and
# are also valid Python 3.
print('Using Gini: my model')
print(result)
print('\n')

accuracy = accuracy_score(y, predict)
print('  Accuracy = %0.3f' % accuracy)
print('\n')

print('  Generated Tree:  ')
tree.print_tree()

Using Gini: my model
   predicted  true
0        0.0   0.0
1        0.0   1.0
2        1.0   1.0
3        0.0   1.0
4        0.0   0.0
5        1.0   1.0
6        0.0   0.0
7        1.0   1.0
8        0.0   0.0


  Accuracy = 0.778


  Generated Tree:  
[feature2 < 1.000]
	[0]
	[feature3 < 2.000]
		[feature2 < 2.000]
			[0]
			[1]
		[0]


In [4]:
# Own decision tree, information-gain criterion.
# The model is fitted on the whole data set and scored on the same rows, so
# the number reported below is training accuracy.
tree = mytree(max_depth=3, min_size=2, criteria='information gain').fit(X, y)
predict = tree.predict(X)

# Side-by-side table: model output next to the true labels (cast to float).
result = pd.DataFrame()
result['predicted'] = predict
result['true'] = y.astype('float')

# Single-argument print(...) calls print the same text under Python 2 and
# are also valid Python 3.
print('Using Information Gain: my model')
print(result)
print('\n')

accuracy = accuracy_score(y, predict)
print('  Accuracy = %0.3f' % accuracy)
print('\n')

print('  Generated Tree:  ')
tree.print_tree()

Using Information Gain: my model
   predicted  true
0        0.0   0.0
1        1.0   1.0
2        1.0   1.0
3        0.0   1.0
4        0.0   0.0
5        1.0   1.0
6        0.0   0.0
7        1.0   1.0
8        0.0   0.0


  Accuracy = 0.889


  Generated Tree:  
[feature2 < 1.000]
	[0]
	[feature1 < 1.000]
		[1]
		[feature3 < 1.000]
			[1]
			[0]


In [5]:
# scikit-learn decision tree, Gini criterion.
# random_state is fixed because scikit-learn permutes the features at every
# split; when several splits are equally good the chosen one can otherwise
# change from run to run, which makes the comparison non-reproducible.
tree = sktree(max_depth=3, min_samples_split=2, criterion='gini',
              random_state=0).fit(X, y)

# Scored on the same rows used for fitting, i.e. training accuracy.
predict = tree.predict(X)

# Side-by-side table: model output next to the true labels (cast to float).
result = pd.DataFrame()
result['predicted'] = predict
result['true'] = y.astype('float')

# Single-argument print(...) calls print the same text under Python 2 and
# are also valid Python 3.
print('Using Gini: scikit-learn classifier')
print(result)
print('\n')

accuracy = accuracy_score(y, predict)
print('  Accuracy = %0.3f' % accuracy)

Using Gini: scikit-learn classifier
   predicted  true
0          0   0.0
1          0   1.0
2          1   1.0
3          0   1.0
4          0   0.0
5          1   1.0
6          0   0.0
7          1   1.0
8          0   0.0


  Accuracy = 0.778


In [6]:
# scikit-learn decision tree, entropy (information-gain) criterion.
# random_state is fixed because scikit-learn permutes the features at every
# split; when several splits are equally good the chosen one can otherwise
# change from run to run, which makes the comparison non-reproducible.
tree = sktree(max_depth=3, min_samples_split=2, criterion='entropy',
              random_state=0).fit(X, y)

# Scored on the same rows used for fitting, i.e. training accuracy.
predict = tree.predict(X)

# Side-by-side table: model output next to the true labels (cast to float).
result = pd.DataFrame()
result['predicted'] = predict
result['true'] = y.astype('float')

# The banner previously said "Using Gini" although this cell fits with
# criterion='entropy'; it now names the criterion that is actually used.
print('Using Information Gain: scikit-learn classifier')
print(result)
print('\n')

accuracy = accuracy_score(y, predict)
print('  Accuracy = %0.3f' % accuracy)

Using Gini: scikit-learn classifier
   predicted  true
0          0   0.0
1          0   1.0
2          1   1.0
3          0   1.0
4          0   0.0
5          1   1.0
6          0   0.0
7          1   1.0
8          0   0.0


  Accuracy = 0.778
