In [3]:
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore",category=UserWarning)
warnings.simplefilter(action="ignore",category=FutureWarning)

# Suppress valuewarning when fitting ARIMA model.
from statsmodels.tools.sm_exceptions import ValueWarning
warnings.simplefilter('ignore', ValueWarning)


# Interactive plots embedded within the notebook
#%matplotlib notebook 
# Static images of plots embedded within the notebook  
%config InlineBackend.figure_formats = {'png', 'retina'}

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from platform import python_version
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels as stm
import sklearn as skl


print('Python version', python_version())
print('Numpy version', np.__version__)
print('Scipy version', sp.__version__)
print('Pandas version', pd.__version__)
print('Matplotlib version', mpl.__version__)
print('Seaborn version', sns.__version__)
print('Statsmodels version', stm.__version__)
print('Scikit-learn version', skl.__version__)

Python version 3.12.5
Numpy version 2.0.2
Scipy version 1.14.1
Pandas version 2.2.3
Matplotlib version 3.9.2
Seaborn version 0.13.2
Statsmodels version 0.14.2
Scikit-learn version 1.5.2


# Decision Tree 

## Load data

In [4]:
import pandas as pd
enroll_df = pd.read_excel('data/supervised-learning.xlsx', sheet_name='ENROLL')

<font color='blue'>Look at the first few rows

<font color='blue'>Inspect basic information of dataset

<font color='blue'> Look at unique values of categorical columns to determine appropriate encoding


## Data preprocessing

In [5]:

# Rename columns
enroll_df.rename({'Jobsatisfaction': 'JobSat', 'Enrolls':'Target'}, axis=1, inplace=True)
enroll_df

Unnamed: 0,Age,Income,JobSat,Desire,Target
0,<=30,High,No,Fair,No
1,<=30,High,No,Excellent,No
2,31 to 40,High,No,Fair,Yes
3,>40,Medium,No,Fair,Yes
4,>40,Low,Yes,Fair,Yes
5,>40,Low,Yes,Excellent,No
6,31 to 40,Low,Yes,Excellent,Yes
7,<=30,Medium,No,Fair,No
8,<=30,Low,Yes,Fair,Yes
9,>40,Medium,Yes,Fair,Yes


In [6]:
df = enroll_df.copy()

<font color='blue'>One-hot encoding of columns `JobSat` and `Desire`

In [7]:
from sklearn.preprocessing import OneHotEncoder

enc_onehot = OneHotEncoder(handle_unknown='ignore', 
                           drop='if_binary', dtype=int)
onehot_columns = ['JobSat','Desire']

onehot_data = enc_onehot.fit_transform(df[onehot_columns])

onehot_df = pd.DataFrame(onehot_data.toarray(), 
                         columns=enc_onehot.get_feature_names_out())
onehot_df


Unnamed: 0,JobSat_Yes,Desire_Fair
0,0,1
1,0,0
2,0,1
3,0,1
4,1,1
5,1,0
6,1,0
7,0,1
8,1,1
9,1,1


Concatenate one-hot encoding columns to the input dataframe

In [8]:
df = df.drop(onehot_columns, axis=1); 
df = pd.concat([df, onehot_df], axis=1)
df

Unnamed: 0,Age,Income,Target,JobSat_Yes,Desire_Fair
0,<=30,High,No,0,1
1,<=30,High,No,0,0
2,31 to 40,High,Yes,0,1
3,>40,Medium,Yes,0,1
4,>40,Low,Yes,1,1
5,>40,Low,No,1,0
6,31 to 40,Low,Yes,1,0
7,<=30,Medium,No,0,1
8,<=30,Low,Yes,1,1
9,>40,Medium,Yes,1,1


Ordinal encoding of columns `Age`, `Income`, `Target`

In [9]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

ord_columns = ['Age', 'Income']

# Specify the label order to encode 
Age_cat = ['<=30', '31 to 40', '>40']
Income_cat = ['Low', 'Medium', 'High']

enc_ord = OrdinalEncoder(categories=[Age_cat, Income_cat], dtype=np.int64)

df[ord_columns] = enc_ord.fit_transform(df[ord_columns])
df

Unnamed: 0,Age,Income,Target,JobSat_Yes,Desire_Fair
0,0,2,No,0,1
1,0,2,No,0,0
2,1,2,Yes,0,1
3,2,1,Yes,0,1
4,2,0,Yes,1,1
5,2,0,No,1,0
6,1,0,Yes,1,0
7,0,1,No,0,1
8,0,0,Yes,1,1
9,2,1,Yes,1,1


Separate feature set and target variable

In [10]:
X = df.drop('Target', axis=1)
Y = df[['Target']]
X
Y

Unnamed: 0,Age,Income,JobSat_Yes,Desire_Fair
0,0,2,0,1
1,0,2,0,0
2,1,2,0,1
3,2,1,0,1
4,2,0,1,1
5,2,0,1,0
6,1,0,1,0
7,0,1,0,1
8,0,0,1,1
9,2,1,1,1


Unnamed: 0,Target
0,No
1,No
2,Yes
3,Yes
4,Yes
5,No
6,Yes
7,No
8,Yes
9,Yes


## Model fitting and prediction

<font color='blue'>Fit the decision tree model to the train data

### Use the fitted model to predict

In [11]:
# See the features used in the model and their label 
dt_clf.feature_names_in_
enc_ord.categories_
enc_onehot.categories_

NameError: name 'dt_clf' is not defined

Use the encoders created earlier to construct the input for the model

In [None]:
# Encode the input {Age > 40, Income='Medium', JobSat='No', Desire='Excellent'} 
Xnew = enc_ord.transform([['>40', 'Medium']]) 
Xnew = np.hstack((Xnew, enc_onehot.transform([['No', 'Excellent']]).toarray()))
Xnew

Predict the target label and its probability

In [None]:
# Make the prediction 
dt_clf.predict(Xnew) 
dt_clf.predict_proba(Xnew)

## Evaluate performance

<font color='blue'>Determine the classification report for the train data

## Visualizing rules and tree

In [None]:
dt_clf.feature_names_in_

In [None]:
from sklearn import tree

text_representation = tree.export_text(dt_clf, 
                                       feature_names=dt_clf.feature_names_in_.tolist())
print(text_representation)

(Optional) Install necessary packages by running the following commands in the terminal:

`conda install python-graphviz`  
`pip install pydotplus`

In [None]:
import graphviz
from sklearn import tree

dot_data = tree.export_graphviz(dt_clf, out_file=None,
                           feature_names=X.columns,
                           class_names=enroll_df['Target'].unique(), impurity=False,
                           filled=True, rounded=True, proportion=True)
# Draw graph
graph = graphviz.Source(dot_data, format='png')  
graph

# graph = graphviz.Source(dot_data)  
# graph.render('enroll', format='png', view=True)

## Feature Importance

<font color='blue'>Create a list of feature importance and sort.

In [None]:
feature_list = pd.DataFrame({'feature':X.columns, 'value':dt_clf.feature_importances_})
feature_list_sorted = feature_list.sort_values('value')
feature_list_sorted

Plot the feature importance

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(7,3))
plt.barh(range(0,len(feature_list_sorted.index)), 
         feature_list_sorted.value, tick_label=feature_list_sorted.feature)
plt.tight_layout();