# Day 53/60 Sickit-learn part 7
# Building a Decision Tree using Python and Sickit-learn

In [22]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [5]:
# let's create a decision tree on baseball data

df = pd.read_csv('500hits.csv', encoding = 'latin-1')

In [6]:
df.head(15)

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1
5,Carl Yastrzemski,23,3308,11988,1816,3419,646,59,452,1844,1845,1393,168,116,0.285,1
6,Paul Molitor,21,2683,10835,1782,3319,605,114,234,1307,1094,1244,504,131,0.306,1
7,Eddie Collins,25,2826,9949,1821,3315,438,187,47,520,1499,286,744,173,0.333,1
8,Willie Mays,22,2992,10881,2062,3283,523,140,660,1903,1464,1526,338,103,0.302,1
9,Eddie Murray,21,3026,11336,1627,3255,560,35,504,1917,1333,1516,110,43,0.287,1


In [9]:
# let's split the data into i/p and o/p for prediction

df = df.drop(columns = ["PLAYER", "CS"] )


In [36]:
X = df.iloc[:, 0:13]
X

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.310
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329
...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,15,1920,6653,1105,1665,285,39,291,964,1224,1427,225,0.250
461,17,1829,6092,900,1664,379,10,275,1065,936,1453,20,0.273
462,15,1834,6499,1062,1661,338,67,210,761,960,1190,315,0.256
463,16,1822,6309,714,1660,254,25,54,593,396,489,74,0.263


In [37]:
y = df.iloc[:, 13]
y

0      1
1      1
2      1
3      1
4      1
      ..
460    0
461    0
462    0
463    0
464    0
Name: HOF, Length: 465, dtype: int64

In [14]:
# splitting data into train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 17, test_size = 0.2)

In [15]:
X_train.shape

(372, 13)

In [16]:
X_test.shape

(93, 13)

In [17]:
y_train.shape

(372,)

In [18]:
y_test.shape

(93,)

In [25]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [26]:
dt.fit(X_train, y_train)

In [28]:
y_pred = dt.predict(X_test)

In [32]:
# confusion matrix of our prediction

cm = confusion_matrix(y_test,y_pred)
print(cm)

[[51 10]
 [ 9 23]]


In [35]:
# Classification report 

print(classification_report(y_test,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       1.00      1.00      1.00        32

    accuracy                           1.00        93
   macro avg       1.00      1.00      1.00        93
weighted avg       1.00      1.00      1.00        93



In [43]:
# lets see its features importances

dt.feature_importances_

array([0.00919003, 0.02528959, 0.02053095, 0.0659248 , 0.39351028,
       0.06589131, 0.00953164, 0.05504118, 0.05536963, 0.10262703,
       0.0373047 , 0.04594388, 0.113845  ])

In [44]:
X.columns

Index(['YRS', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SB',
       'BA'],
      dtype='object')

In [46]:
features  =pd.DataFrame(dt.feature_importances_, index  =X.columns)

In [47]:
features

Unnamed: 0,0
YRS,0.00919
G,0.02529
AB,0.020531
R,0.065925
H,0.39351
2B,0.065891
3B,0.009532
HR,0.055041
RBI,0.05537
BB,0.102627


In [49]:
features.head(15)

Unnamed: 0,0
YRS,0.00919
G,0.02529
AB,0.020531
R,0.065925
H,0.39351
2B,0.065891
3B,0.009532
HR,0.055041
RBI,0.05537
BB,0.102627


In [50]:
# Lets train our model one more time and see what difference happens

a = DecisionTreeClassifier(criterion = "entropy", ccp_alpha = 0.04)

In [51]:
a.fit(X_train, y_train)

In [52]:
y_pred2 = a.predict(X_test)

In [54]:
print(confusion_matrix(y_test, y_pred2))

[[50 11]
 [ 9 23]]


In [55]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83        61
           1       0.68      0.72      0.70        32

    accuracy                           0.78        93
   macro avg       0.76      0.77      0.77        93
weighted avg       0.79      0.78      0.79        93



In [56]:
# feature importance

feature2  = pd.DataFrame(a.feature_importances_, index = X.columns)

In [58]:
feature2.head()

Unnamed: 0,0
YRS,0.0
G,0.0
AB,0.0
R,0.0
H,0.837977
