# Classification with Decision Tree


In [3]:
import pandas as pd

from sklearn.metrics import  confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder


# Iris

## Import dataset

In [4]:
# The URL embeds a specific gist revision hash, so the file it points at is fixed.
IRIS_CSV_URL = "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"

iris = pd.read_csv(IRIS_CSV_URL)
iris.columns  # column labels exactly as loaded

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [5]:
# Show each column's dtype to see which columns are numeric and which hold strings.
iris.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

## Train model

In [6]:
# Features: every column except the species label. Target: the species label.
X = iris.drop(columns='species')
Y = iris['species']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Seed the tree as well: scikit-learn permutes features at each split and breaks
# ties between equally good splits at random, so an unseeded tree can differ
# between runs even on identical training data.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, Y_train)

# Label the printed rules with the names the model recorded during fit rather than
# a hand-typed list, so the labels always match the real columns and their order.
print(export_text(model, feature_names=model.feature_names_in_.tolist()))


|--- petal.width <= 0.75
|   |--- class: setosa
|--- petal.width >  0.75
|   |--- petal.length <= 4.95
|   |   |--- petal.width <= 1.65
|   |   |   |--- class: versicolor
|   |   |--- petal.width >  1.65
|   |   |   |--- sepal.width <= 3.10
|   |   |   |   |--- class: virginica
|   |   |   |--- sepal.width >  3.10
|   |   |   |   |--- class: versicolor
|   |--- petal.length >  4.95
|   |   |--- petal.length <= 5.05
|   |   |   |--- sepal.width <= 2.75
|   |   |   |   |--- class: virginica
|   |   |   |--- sepal.width >  2.75
|   |   |   |   |--- class: versicolor
|   |   |--- petal.length >  5.05
|   |   |   |--- class: virginica



## Model evaluation

In [7]:
# Distinct class labels present in the target column Y.
Y.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [None]:
# Predict the held-out rows, then tabulate true labels (rows) against
# predicted labels (columns).
Y_pred = model.predict(X_test)
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

In [15]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.9777777777777777

In [25]:
# Feature names the fitted model recorded from the training frame's column labels.
model.feature_names_in_.tolist()

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Bank account

Task: predict whether a client opens a bank account. The target column `y` takes the values `yes` or `no`.



## Import dataset

In [8]:
# The file is semicolon-delimited, hence the explicit separator below.
BANK_CSV_URL = "https://raw.githubusercontent.com/vishurudratrainer/notebooks/main/bank.csv"

bank = pd.read_csv(BANK_CSV_URL, sep=";")
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [9]:
# Show each column's dtype; the `object` columns hold strings and are not yet numeric.
bank.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

## Clean dataset

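 * Categorical (`object`) columns must be converted to numbers before training. The cell below uses label encoding (one integer code per category), not one-hot encoding.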

In [10]:
# Label encoding (NOT one-hot): each distinct string in an object column is
# replaced by an integer code, so the frame keeps the same columns but becomes
# fully numeric. The codes follow the sorted order of the category strings and
# carry no real ordinal meaning.
#
# One fitted encoder is kept per column so codes can be mapped back to the
# original strings later, e.g. label_encoders['job'].inverse_transform([4]).
# NOTE: this overwrites `bank` in place. Re-running the cell after encoding finds
# no object columns and leaves `label_encoders` empty; reload `bank` first.
label_encoders = {}

for column in bank.select_dtypes(include='object').columns:
    # A fresh encoder per column: refitting a shared one would discard the
    # previous column's mapping.
    le = LabelEncoder()
    bank[column] = le.fit_transform(bank[column])
    label_encoders[column] = le

# After the loop `le` is the encoder fitted on the last object column.
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10,1,0,0,1787,0,0,0,19,10,79,1,-1,0,3,0
1,33,7,1,1,0,4789,1,1,0,11,8,220,1,339,4,0,0
2,35,4,2,2,0,1350,1,0,0,16,0,185,1,330,1,0,0
3,30,4,1,2,0,1476,1,1,2,3,6,199,4,-1,0,3,0
4,59,1,1,1,0,0,1,0,2,5,8,226,1,-1,0,3,0


## Train model

In [11]:
# Every column except the target `y` is used as a feature.
X = bank.drop(columns='y')
Y = bank['y']

# Seed both the split and the tree. Without fixed seeds the held-out rows and the
# fitted tree change on every run, so the confusion matrix and accuracy reported
# further down could not be reproduced.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, Y_train)

# Label the printed rules with the column names the model recorded during fit.
_model_features_names = model.feature_names_in_.tolist()
print(export_text(model, feature_names=_model_features_names))

|--- duration <= 375.50
|   |--- pdays <= 0.00
|   |   |--- age <= 60.50
|   |   |   |--- age <= 20.50
|   |   |   |   |--- class: 1
|   |   |   |--- age >  20.50
|   |   |   |   |--- duration <= 223.50
|   |   |   |   |   |--- month <= 0.50
|   |   |   |   |   |   |--- balance <= 19.00
|   |   |   |   |   |   |   |--- day <= 20.50
|   |   |   |   |   |   |   |   |--- education <= 1.50
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |--- education >  1.50
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- day >  20.50
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- balance >  19.00
|   |   |   |   |   |   |   |--- day <= 15.50
|   |   |   |   |   |   |   |   |--- balance <= 175.50
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- balance >  175.50
|   |   |   |   |   |   |   |   |   |--- duration <= 133.50
|   |   |   |   |   |   |   |   |   |   |--- class: 

## Evaluate the model

In [48]:
# Predict the held-out rows, then tabulate true labels (rows) against
# predicted labels (columns).
Y_pred = model.predict(X_test)
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

array([[1083,  111],
       [  87,   76]])

In [49]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.8540899042004422