In [22]:
import numpy as np
import h5py as h5
import scipy as sp
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import graphviz as gviz
import pydotplus as pdp
import collections as clcs

import ML_func_defs as ML
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score
from sklearn import tree

# Prove 05 - Experimentation with Trees

## Car dataset

This has 
- Categorical data
- Classification

### Part 1 - Organize the data

In [36]:
car_data = pd.read_csv("car.txt", names=['price','maint_cost', 'doors',
                                         'persons','lug_boot','safety','quality_of_deal'],
                       na_values=["?"])

car_data = np.array( car_data.dropna(axis=0) )

car_train  = np.array( car_data[:,range(0,6)] )
car_target = np.array( car_data[:,6]          )

# Label encoding stuff
price     = {'vhigh':0,'high':1, 'med':2,  'low':3 }
doors     = {    '2':0,   '3':1,   '4':2,'5more':3 }
persons   = {    '2':0,   '4':1,'more':3 }
lug_booty = {'small':0, 'med':1, 'big':2 }
safety    = {  'low':0, 'med':1,'high':2 }
quality   = { 'unacc':0, 'acc':1, 'good':2, 'vgood':3 }

for i,[k0,k1,k2,k3,k4,k5] in enumerate(car_train):
    car_train[i][0] = price[     car_train[i][0] ]
    car_train[i][1] = price[     car_train[i][1] ]
    car_train[i][2] = doors[     car_train[i][2] ]
    car_train[i][3] = persons[   car_train[i][3] ]
    car_train[i][4] = lug_booty[ car_train[i][4] ]
    car_train[i][5] = safety[    car_train[i][5] ]

for i, k in enumerate(car_target):
    car_target[i] = quality[ car_target[i] ] 


# I chose 95% because vgood and good make up only about  ~6% of the data.
# Because of this small percentage the accuracy will change everytime you run it. 
X_train, X_test, y_train, y_test = train_test_split(car_train, car_target, train_size= 0.95)
X_train = X_train.astype('int')
y_train = y_train.astype('int')
X_test = X_test.astype('int')
y_test = y_test.astype('int')



### Part 2 -  Generate png file to visualize the tree

In [33]:
tree_clsfr = tree.DecisionTreeClassifier()
tree_clsfr.fit(X_train, y_train)
tree_clsfr.score(X_test, y_test)

tree_graph = tree.export_graphviz(tree_clsfr)
graph = pdp.graph_from_dot_data(tree_graph)
colors = ('turquoise', 'orange')
edges = clcs.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('car_tree.png')

True

### Part 3 - Trying different approaches for catagorical data

In [77]:
car_data = pd.read_csv("car.txt", names=['price','maint_cost', 'doors',
                                         'persons','lug_boot','safety','quality_of_deal'],
                       na_values=["?"])

##################
# LABEL ENCODING
##################
car_data = np.array(   car_data.dropna(axis=0) )
car_train  = np.array( car_data[:,range(0,6)]  )
car_target = np.array( car_data[:,6]           )

price     = {'vhigh':0,'high':1, 'med':2,  'low':3 }
doors     = {    '2':0,   '3':1,   '4':2,'5more':3 }
persons   = {    '2':0,   '4':1,'more':3 }
lug_booty = {'small':0, 'med':1, 'big':2 }
safety    = {  'low':0, 'med':1,'high':2 }
quality   = { 'unacc':0, 'acc':1, 'good':2, 'vgood':3 }

for i,[k0,k1,k2,k3,k4,k5] in enumerate(car_train):
    car_train[i][0] = price[     car_train[i][0] ]
    car_train[i][1] = price[     car_train[i][1] ]
    car_train[i][2] = doors[     car_train[i][2] ]
    car_train[i][3] = persons[   car_train[i][3] ]
    car_train[i][4] = lug_booty[ car_train[i][4] ]
    car_train[i][5] = safety[    car_train[i][5] ]

for i, k in enumerate(car_target):
    car_target[i] = quality[ car_target[i] ] 
# I chose 95% because vgood and good make up only about  ~6% of the data.
# Because of this small percentage the accuracy will change everytime you run it. 
X_train, X_test, y_train, y_test = train_test_split(car_train, car_target, train_size= 0.90)
X_train = X_train.astype('int')
y_train = y_train.astype('int')
X_test = X_test.astype('int')
y_test = y_test.astype('int')

tree_clsfr = tree.DecisionTreeClassifier()
tree_clsfr.fit(X_train, y_train)
print("label encoding: ",tree_clsfr.score(X_test, y_test) )

######################
# ONE-HOT-ENCODING
######################
pd_car_data = pd.read_csv("car.txt", names=['price','maint_cost', 'doors',
                                         'persons','lug_boot','safety','quality_of_deal'],
                       na_values=["?"])
pd_car_data = pd.get_dummies(pd_car_data, columns=['price','maint_cost', 'doors',
                                                'persons','lug_boot','safety'])
hot_car_data = np.array( pd_car_data.dropna(axis=0) )
    
hot_car_train  = np.array( hot_car_data[:,range(1,22)] )
hot_car_target = np.array( hot_car_data[:,0]          )

# label encode the target
quality   = { 'unacc':0, 'acc':1, 'good':2, 'vgood':3 }
for i, k in enumerate(hot_car_target):
    hot_car_target[i] = quality[ hot_car_target[i] ] 

X_train, X_test, y_train, y_test = train_test_split(hot_car_train, hot_car_target, train_size= 0.90)
X_train = X_train.astype('int')
y_train = y_train.astype('int')
X_test = X_test.astype('int')
y_test = y_test.astype('int')


tree_clsfr = tree.DecisionTreeClassifier()
tree_clsfr.fit(X_train, y_train)
print("one-hot encoding: ",tree_clsfr.score(X_test, y_test) )

label encoding:  0.9884393063583815
one-hot encoding:  0.9479768786127167




### Part 5 - Pruning the tree

In [111]:
car_data = pd.read_csv("car.txt", names=['price','maint_cost', 'doors',
                                         'persons','lug_boot','safety','quality_of_deal'],
                       na_values=["?"])

car_data = np.array( car_data.dropna(axis=0) )

car_train  = np.array( car_data[:,range(0,6)] )
car_target = np.array( car_data[:,6]          )

# Label encoding stuff
price     = {'vhigh':0,'high':1, 'med':2,  'low':3 }
doors     = {    '2':0,   '3':1,   '4':2,'5more':3 }
persons   = {    '2':0,   '4':1,'more':3 }
lug_booty = {'small':0, 'med':1, 'big':2 }
safety    = {  'low':0, 'med':1,'high':2 }
quality   = { 'unacc':0, 'acc':1, 'good':2, 'vgood':3 }

for i,[k0,k1,k2,k3,k4,k5] in enumerate(car_train):
    car_train[i][0] = price[     car_train[i][0] ]
    car_train[i][1] = price[     car_train[i][1] ]
    car_train[i][2] = doors[     car_train[i][2] ]
    car_train[i][3] = persons[   car_train[i][3] ]
    car_train[i][4] = lug_booty[ car_train[i][4] ]
    car_train[i][5] = safety[    car_train[i][5] ]

for i, k in enumerate(car_target):
    car_target[i] = quality[ car_target[i] ] 


# I chose 95% because vgood and good make up only about  ~6% of the data.
# Because of this small percentage the accuracy will change everytime you run it. 
X_train, X_test, y_train, y_test = train_test_split(car_train, car_target, train_size= 0.95)
X_train = X_train.astype('int')
y_train = y_train.astype('int')
X_test = X_test.astype('int')
y_test = y_test.astype('int')

for X in np.arange(3,14):
    tree_clsfr = tree.DecisionTreeClassifier(max_depth=X)
    tree_clsfr.fit(X_train, y_train)
    print("For max_depth = %d, the accuracy = %f.5" % (X, tree_clsfr.score(X_test, y_test) ) )

For max_depth = 3, the accuracy = 0.712644.5
For max_depth = 4, the accuracy = 0.839080.5
For max_depth = 5, the accuracy = 0.816092.5
For max_depth = 6, the accuracy = 0.942529.5
For max_depth = 7, the accuracy = 0.908046.5
For max_depth = 8, the accuracy = 0.965517.5
For max_depth = 9, the accuracy = 0.931034.5
For max_depth = 10, the accuracy = 0.977011.5
For max_depth = 11, the accuracy = 0.977011.5
For max_depth = 12, the accuracy = 0.977011.5
For max_depth = 13, the accuracy = 0.977011.5




## Iris Data

This has 
- Numeric data
- Classification

### Part 1 - Organize the data

In [13]:
iris = datasets.load_iris()

x_tr, x_test, y_tr, y_test = train_test_split(iris.data, iris.target, train_size= 0.7)



0.9555555555555556

### Part 2 - Generate png file to visualize the tree

In [27]:
tree_clsfr = tree.DecisionTreeClassifier()
tree_clsfr.fit(x_tr, y_tr)
tree_clsfr.score(x_test, y_test)

tree_graph = tree.export_graphviz(tree_clsfr)
graph = pdp.graph_from_dot_data(tree_graph)
colors = ('turquoise', 'orange')
edges = clcs.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('iris_tree.png')

True

### Part 5 - Pruning the tree

In [114]:
iris = datasets.load_iris()

x_tr, x_test, y_tr, y_test = train_test_split(iris.data, iris.target, train_size= 0.7)

for X in np.arange(3,7):
    tree_clsfr = tree.DecisionTreeClassifier()
    tree_clsfr.fit(x_tr, y_tr)
    print("For max_depth = %d, the accuracy = %f.5" % (X, tree_clsfr.score(x_test, y_test) ) )

For max_depth = 3, the accuracy = 0.911111.5
For max_depth = 4, the accuracy = 0.911111.5
For max_depth = 5, the accuracy = 0.933333.5
For max_depth = 6, the accuracy = 0.933333.5




## Balance Data

This has:
- Categorical data
- Classification

### Part 1 - Organize the data

In [69]:
data = open("balance_scale.txt","rt")
contents = data.readlines()
data.close()
#print( contents )  # it's ugly. 

# Turns the string version of the data into floats in a 150x4 array
bal_data = [[float(x.split(',')[1]),
             float(x.split(',')[2]),
             float(x.split(',')[3]),
             float((x.split(',')[4]).split('\n')[0])] for x in contents]
#print(bal_data)
## gets the iris names in stings 'setosa', 'versicolor', and 'virginica'
bal_target = [ x.split(',')[0] for x in contents ]
## dictionary to make recasting iris names intointegers easy
targ_vals = {'L':0, 'B':1, 'R':2}
## loop to recasting iris names into integers
for i,name in enumerate(bal_target):
    bal_target[i] = targ_vals[name]
    
x_tr, x_test, y_tr, y_test = train_test_split(bal_data, bal_target, train_size= 0.7)



### Part 2 - Generate png file to visualize the tree

In [70]:
tree_clsfr = tree.DecisionTreeClassifier()
tree_clsfr.fit(x_tr, y_tr)
tree_clsfr.score(x_test, y_test)

tree_graph = tree.export_graphviz(tree_clsfr)
graph = pdp.graph_from_dot_data(tree_graph)
colors = ('turquoise', 'orange')
edges = clcs.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('balance_tree.png')

True

### Part 3 - Trying different approaches for catagorical data

In [100]:
tr_sz = .85
################
# LABEL ENCODING 
################
data = open("balance_scale.txt","rt")
contents = data.readlines()
data.close()
#print( contents )  # it's ugly. 

# Turns the string version of the data into floats in a 150x4 array
bal_data = [[float(x.split(',')[1]),
             float(x.split(',')[2]),
             float(x.split(',')[3]),
             float((x.split(',')[4]).split('\n')[0])] for x in contents]
#print(bal_data)
## gets the iris names in stings 'setosa', 'versicolor', and 'virginica'
bal_target = [ x.split(',')[0] for x in contents ]
## dictionary to make recasting iris names intointegers easy
targ_vals = {'L':0, 'B':1, 'R':2}
## loop to recasting iris names into integers
for i,name in enumerate(bal_target):
    bal_target[i] = targ_vals[name]
    
x_tr, x_test, y_tr, y_test = train_test_split(bal_data, bal_target, train_size= tr_sz)
tree_clsfr = tree.DecisionTreeClassifier()
tree_clsfr.fit(x_tr, y_tr)
print("label encoding: ",tree_clsfr.score(x_test, y_test) )


######################
# ONE-HOT-ENCODING
######################
pd_bal_data = pd.read_csv("balance_scale.txt", names=['result','lw','ld','rw','rd'])
pd_bal_data = pd.get_dummies(pd_bal_data, columns=['lw','ld','rw','rd'])

bal_data = np.array( pd_bal_data )
bal_train = np.array(  bal_data[:,range(1,21)] )
bal_target = np.array( bal_data[:,0]           )

targ_vals = {'L':0, 'B':1, 'R':2}
## loop to recasting iris names into integers
for i,name in enumerate(bal_target):
    bal_target[i] = targ_vals[name]
    
X_train, X_test, y_train, y_test = train_test_split(bal_train, bal_target, train_size= tr_sz)
X_train = X_train.astype('int')
y_train = y_train.astype('int')
X_test = X_test.astype('int')
y_test = y_test.astype('int')

tree_clsfr = tree.DecisionTreeClassifier()
tree_clsfr.fit(X_train, y_train)
print("one-hot encoding: ",tree_clsfr.score(X_test, y_test) )

label encoding:  0.7127659574468085
one-hot encoding:  0.7659574468085106




### Part 5 - Pruning the tree

In [118]:
data = open("balance_scale.txt","rt")
contents = data.readlines()
data.close()
#print( contents )  # it's ugly. 

# Turns the string version of the data into floats in a 150x4 array
bal_data = [[float(x.split(',')[1]),
             float(x.split(',')[2]),
             float(x.split(',')[3]),
             float((x.split(',')[4]).split('\n')[0])] for x in contents]
#print(bal_data)
## gets the iris names in stings 'setosa', 'versicolor', and 'virginica'
bal_target = [ x.split(',')[0] for x in contents ]
## dictionary to make recasting iris names intointegers easy
targ_vals = {'L':0, 'B':1, 'R':2}
## loop to recasting iris names into integers
for i,name in enumerate(bal_target):
    bal_target[i] = targ_vals[name]
    
x_tr, x_test, y_tr, y_test = train_test_split(bal_data, bal_target, train_size= 0.7)



for X in np.arange(3,11):
    tree_clsfr = tree.DecisionTreeClassifier()
    tree_clsfr.fit(x_tr, y_tr)
    print("For max_depth = %d, the accuracy = %f.5" % (X, tree_clsfr.score(x_test, y_test) ) )

For max_depth = 3, the accuracy = 0.781915.5
For max_depth = 4, the accuracy = 0.792553.5
For max_depth = 5, the accuracy = 0.771277.5
For max_depth = 6, the accuracy = 0.787234.5
For max_depth = 7, the accuracy = 0.776596.5
For max_depth = 8, the accuracy = 0.776596.5
For max_depth = 9, the accuracy = 0.776596.5
For max_depth = 10, the accuracy = 0.792553.5


