In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import KFold, cross_val_score
from sklearn import preprocessing
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri, r



#### 8.3 Lab Decision Trees

In [2]:
pandas2ri.activate()
r.load('Carseats.rda')
df = r['Carseats']

  res = PandasDataFrame.from_items(items)


In [3]:
df['High'] = df.Sales >= 8

In [4]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,High
0,9.5,138.0,73.0,11.0,276.0,120.0,Bad,42.0,17.0,Yes,Yes,True
1,11.22,111.0,48.0,16.0,260.0,83.0,Good,65.0,10.0,Yes,Yes,True
2,10.06,113.0,35.0,10.0,269.0,80.0,Medium,59.0,12.0,Yes,Yes,True
3,7.4,117.0,100.0,4.0,466.0,97.0,Medium,55.0,14.0,Yes,Yes,False
4,4.15,141.0,64.0,3.0,340.0,128.0,Bad,38.0,13.0,Yes,No,False


In [5]:
df['High'],class_names = pd.factorize(df['High'])
df['ShelveLoc'],_= pd.factorize(df['ShelveLoc'])
df['Urban'], _ = pd.factorize(df['Urban'])
df['US'], _ = pd.factorize(df['US'])
df.columns

Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
       'ShelveLoc', 'Age', 'Education', 'Urban', 'US', 'High'],
      dtype='object')

In [6]:
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df)
df_normalized = pd.DataFrame(np_scaled, columns= ['Sales', 'CompPrice',
                                                  'Income', 'Advertising',
                                                  'Population', 'Price',
                                                  'ShelveLoc', 'Age', 
                                                  'Education', 'Urban',
                                                  'US', 'High'])
df_normalized.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,High
0,0.583897,0.622449,0.525253,0.37931,0.533066,0.57485,0.0,0.309091,0.875,0.0,0.0,0.0
1,0.689613,0.346939,0.272727,0.551724,0.501002,0.353293,0.5,0.727273,0.0,0.0,0.0,0.0
2,0.618316,0.367347,0.141414,0.344828,0.519038,0.335329,1.0,0.618182,0.25,0.0,0.0,0.0
3,0.454825,0.408163,0.79798,0.137931,0.913828,0.437126,1.0,0.545455,0.5,0.0,0.0,1.0
4,0.255071,0.653061,0.434343,0.103448,0.661323,0.622754,0.0,0.236364,0.375,0.0,1.0,1.0


In [7]:
## In classification problems we don't need to normalize data but in regression problems normalization is a good idea.

X_normalized = df_normalized.loc[:, 'CompPrice':'US']
y = df.iloc[:, -1]
dtree_entropy = tree.DecisionTreeClassifier(criterion='entropy',  min_samples_split= 10, min_samples_leaf= 5)
dtree_entropy.fit(X_normalized, y)
y_entropy_normalized = dtree_entropy.predict(X_normalized)
count_entropy_normalized_misclassified = (y != y_entropy_normalized).sum()
print( 'Count misclassified with entropy: ', count_entropy_normalized_misclassified) 


Count misclassified with entropy:  27


In [8]:
# Exclude the Sales column and 'High' column and the rest will be our attributes
# 'High' column will be the column that we would like to predict.

X = df.loc[:, 'CompPrice':'US']
y = df.iloc[:, -1]

# Create a tree object and fit the data -- we will use both entropy and gini criteria and then compare them

dtree_entropy = tree.DecisionTreeClassifier(criterion='entropy',  min_samples_split= 10, min_samples_leaf= 5)
dtree_entropy.fit(X, y)

dtree_gini = tree.DecisionTreeClassifier(criterion='gini',  min_samples_split= 10, min_samples_leaf= 5)
dtree_gini.fit(X, y)

y_entropy_train = dtree_entropy.predict(X)
y_gini_train = dtree_gini.predict(X)
count_entropy_misclassified = (y != y_entropy_train).sum()
count_gini_misclassified = (y != y_gini_train ).sum()

print( 'Count misclassified with entropy: ', count_entropy_misclassified, 
       '\nCount misclassified with gini: ', count_gini_misclassified)



Count misclassified with entropy:  27 
Count misclassified with gini:  40


In [9]:
# Split the data so that we can see the performance on test set.
np.random.shuffle(X.values)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=0)

 
# use the model to make predictions with the test data
y_pred_gini = dtree_gini.predict(X_test)
y_pred_entropy = dtree_entropy.predict(X_test)

# how did our model perform?
count_test_gini_misclassified = (y_test != y_pred_gini).sum()
count_test_entropy_misclassified = (y_test != y_pred_entropy).sum()
print('Misclassified samples with gini: {}'.format(count_test_gini_misclassified))
print('Misclassified samples with entropy: {}'.format(count_test_entropy_misclassified))

accuracy_gini = metrics.accuracy_score(y_test, y_pred_gini)
accuracy_entropy = metrics.accuracy_score(y_test, y_pred_entropy)
print('Accuracy with gini: {:.2f}'.format(accuracy_gini))
print('Accuracy with entropy: {:.2f}'.format(accuracy_entropy))




Misclassified samples with gini: 12
Misclassified samples with entropy: 8
Accuracy with gini: 0.88
Accuracy with entropy: 0.92


In [10]:
cross_val_score(dtree_gini, X, y, cv =5)

array([0.82716049, 0.7375    , 0.7625    , 0.6       , 0.62025316])

In [11]:
cross_val_score(dtree_entropy, X, y, cv = 5)

array([0.83950617, 0.7375    , 0.775     , 0.6625    , 0.73417722])

In [12]:
import graphviz
feature_names = X.columns


dot_data_entropy = tree.export_graphviz(dtree_entropy, out_file=None, filled=True, rounded=True,
                                feature_names=feature_names  
                                )

dot_data_gini = tree.export_graphviz(dtree_gini, out_file=None, filled=True, rounded=True,
                                feature_names=feature_names , class_names = ['high', 'low']
                                )
graph_gini = graphviz.Source(dot_data_gini)  
graph_entropy = graphviz.Source(dot_data_entropy)  

graph_gini.render('Tree_gini')
graph_entropy.render('Tree_entropy')


'Tree_entropy.pdf'