# Machine Learning - Decision Tree and HyperParameters
A Decision Tree is a machine learning model that works like a flowchart.

It asks yes/no questions (or checks conditions), step by step, to make a decision or prediction.

In [77]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer


In [78]:
# Fetch seaborn's example Titanic passenger table by name and keep it in `df`.
df = sns.load_dataset(name='titanic')
# Bare last expression so the notebook renders the frame.
df


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## 1. Handling Null Values

In [79]:
# Count the missing values in every column, then list the worst columns first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)


Unnamed: 0,0
deck,688
age,177
embarked,2
embark_town,2
sex,0
pclass,0
survived,0
fare,0
parch,0
sibsp,0


In [80]:
# 'deck' is missing in 688 of the 891 rows, so the column is dropped instead of imputed.
# The frame is re-assigned (no inplace=True) and errors='ignore' is passed so that this
# cell is safe to re-run: a second execution no longer raises KeyError once 'deck' is
# already gone. `columns=` alone selects the axis, so no separate axis argument is needed.
df = df.drop(columns='deck', errors='ignore')

# Re-check the column list: 'deck' should no longer appear.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB


In [81]:
# Recount the missing values per column now that 'deck' has been removed.
df.isna().sum()


Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


## 2. Imputing

In [82]:
# 'age' is numeric and has 177 missing entries: fill them with the column median.
# (Only 'age' is handled here; 'embarked' and 'embark_town' are filled in the next cell.)
# NOTE(review): the median is computed over all rows, i.e. before the train/test split
# further down, so held-out rows contribute to the fill value.
se = SimpleImputer(strategy='median')
age_column = df[['age']]  # double brackets keep a 2-D selection for the imputer
df['age'] = se.fit_transform(age_column)

# Recount the missing values: 'age' should now report 0.
df.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [83]:
# 'embarked' and 'embark_town' hold text labels, so their two missing entries are
# filled with each column's most frequent value (a median is not defined for text).
se = SimpleImputer(strategy='most_frequent')
port_columns = ['embarked', 'embark_town']
df[port_columns] = se.fit_transform(df[port_columns])

# Recount the missing values: both columns should now report 0.
df.isnull().sum()


Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,0
class,0
who,0


## 3. Encoding

In [84]:
# Inspect the column dtypes to see which columns are still non-numeric
# (object / category) and therefore need encoding before model training.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  891 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB


In [85]:
# Replace every object / category column with integer codes using LabelEncoder.
# The same encoder object is re-fitted for each column, so after the loop `le`
# only holds the label mapping of the last column that was encoded.
le = LabelEncoder()
for col in df.select_dtypes(include=['category', 'object']).columns:
    df[col] = le.fit_transform(df[col])

# df.info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" line to the output.
df.info()

# All object and category columns are now integer-typed.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    int64  
 3   age          891 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     891 non-null    int64  
 8   class        891 non-null    int64  
 9   who          891 non-null    int64  
 10  adult_male   891 non-null    bool   
 11  embark_town  891 non-null    int64  
 12  alive        891 non-null    int64  
 13  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(10)
memory usage: 85.4 KB
None


## 4. Data Pre-Processing

In [86]:
# Preview the first ten rows of the encoded frame.
df.iloc[:10]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,2,0,True
5,0,3,1,28.0,0,0,8.4583,1,2,1,True,1,0,True
6,0,1,1,54.0,0,0,51.8625,2,0,1,True,2,0,True
7,0,3,1,2.0,3,1,21.075,2,2,0,False,2,0,False
8,1,3,0,27.0,0,2,11.1333,2,2,2,False,2,1,False
9,1,2,0,14.0,1,0,30.0708,0,1,0,False,0,1,False


In [87]:
# Target (y): the 'survived' flag.
# Features (x): every other column except 'alive', which in the rows shown is the
# yes/no form of the target and would leak the answer into the features.
target_column = 'survived'
y = df[target_column]
x = df.drop(columns=[target_column, 'alive'])


In [88]:
# Preview the first five rows of the feature matrix.
x.head(n=5)


Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alone
0,3,1,22.0,1,0,7.25,2,2,1,True,2,False
1,1,0,38.0,1,0,71.2833,0,0,2,False,0,False
2,3,0,26.0,0,0,7.925,2,2,2,False,2,True
3,1,0,35.0,1,0,53.1,2,0,2,False,2,False
4,3,1,35.0,0,0,8.05,2,2,1,True,2,True


In [89]:
# Preview the first five values of the target.
y.head(n=5)


Unnamed: 0,survived
0,0
1,1
2,1
3,1
4,0


## 5. Splitting Data ( Train and Test )

In [90]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## 6. Decision Tree Training

In [91]:
# Baseline tree: entropy split criterion, every other setting left at its default.
# The fixed seed keeps the fitted tree repeatable between runs.
dt = DecisionTreeClassifier(random_state=42, criterion='entropy')
dt.fit(X=X_train, y=y_train)


## 7. Prediction

In [92]:
# Predict survival for the held-out test rows with the baseline tree.
y_pred = dt.predict(X=X_test)

## 8. Evaluating Model

In [93]:
# Compare the baseline predictions with the true test labels:
# a confusion matrix plus a per-class precision / recall / F1 report.
conMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
classification = classification_report(y_true=y_test, y_pred=y_pred)


In [97]:
# Show the baseline model's confusion matrix and classification report.
# Each f-string puts the heading on its own line and the value on the next one.
print(f"Confusion Matrix:\n {conMatrix}")
print(f"Classification Report:\n {classification}")


Confusion Matrix:
 [[82 23]
 [21 53]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.78      0.79       105
           1       0.70      0.72      0.71        74

    accuracy                           0.75       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.76      0.75      0.75       179



## Accuracy of Training and Testing Data sets

In [95]:
# dt.score predicts on the given rows and returns the fraction that match the
# true labels (accuracy). It is measured on both the training and the test rows;
# a large gap between the two indicates overfitting.
test_accuracy = dt.score(X=X_test, y=y_test)
train_accuracy = dt.score(X=X_train, y=y_train)

# Report both accuracies, training first.
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Training accuracy is far above test accuracy, so the baseline tree overfits;
# the following sections try to reduce that.


Training Accuracy: 0.9789325842696629
Testing Accuracy: 0.7541899441340782


## 9. Saving the Decision Tree

In [111]:
# Export the fitted tree's structure as a Graphviz .dot file so it can be rendered
# as a diagram later. This writes a description of the tree for visualisation only;
# it is not a serialized model that can be loaded back for prediction.
# `os` is used to create the output folder.

from sklearn.tree import export_graphviz
import os

# Create the output directory if it does not exist yet.
os.makedirs('./saved_models', exist_ok=True)

# Write the decision-tree diagram.
# feature_names is passed as a plain list: some scikit-learn releases validate this
# argument as a list and reject a pandas Index such as x.columns.
export_graphviz(dt,
                out_file='./saved_models/Decision_tree.dot',
                feature_names=list(x.columns),
                rounded=True,
                filled=True)

# The tree diagram has been written to ./saved_models/Decision_tree.dot

## 10. Hyperparameters - Grid Search
Finding the best version of the Decision Tree model by testing different settings (called hyperparameters) using GridSearchCV.

In [119]:
# Fit a decision tree for every combination of the settings below and keep the
# combination with the best cross-validated score.
#   criterion         - how the quality of a split is measured (gini or entropy)
#   max_depth         - how deep the tree may grow (None = no limit)
#   min_samples_split - minimum number of samples a node needs before it may be split
#   min_samples_leaf  - minimum number of samples that must remain in a leaf

from sklearn.model_selection import GridSearchCV

param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator whose hyperparameters are searched; the fixed seed keeps
# results repeatable between runs.
Model = DecisionTreeClassifier(random_state=42)

# Exhaustive search over param_grid, scoring each combination with
# 5-fold cross-validation (cv=5) on the data passed to fit.
grid_search = GridSearchCV(estimator=Model, param_grid=param_grid, cv=5)

# Run the search on the training rows only; the test rows stay untouched.
grid_search.fit(X_train, y_train)


In [120]:
# Read the hyperparameter combination that scored best in the grid search
# (a dict mapping parameter name to the chosen value) and display it.
best_param = grid_search.best_params_
print(best_param)


{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}


In [122]:
# Build a new decision tree from the best hyperparameters found by the grid search.
# The seed matches the one used for the estimator inside the search (42), so the
# tree trained here is the same configuration that was cross-validated; a different
# seed could break ties between equally good splits differently.
tuned_tree = DecisionTreeClassifier(**best_param, random_state=42)


In [124]:
# Train the tuned decision tree on the training rows.
tuned_tree.fit(X=X_train, y=y_train)

In [125]:
# Predict survival for the held-out test rows with the tuned tree.
y_pred_tuned = tuned_tree.predict(X=X_test)


In [126]:
# Compare the tuned model's predictions with the true test labels:
# a confusion matrix plus a per-class precision / recall / F1 report.
conMat = confusion_matrix(y_true=y_test, y_pred=y_pred_tuned)
classificationn = classification_report(y_true=y_test, y_pred=y_pred_tuned)


In [127]:
# Show the tuned model's confusion matrix and classification report.
# Each f-string puts the heading on its own line and the value on the next one.
print(f"Confusion Matrix of Tuned Model:\n {conMat}")
print(f"Classification Report of Tuned Model:\n {classificationn}")


Confusion Matrix of Tuned Model:
 [[93 12]
 [16 58]]
Classification Report of Tuned Model:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       105
           1       0.83      0.78      0.81        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.84       179
weighted avg       0.84      0.84      0.84       179



In [133]:
# Side-by-side comparison of the baseline and the tuned tree:
# train / test accuracy for each, followed by both classification reports.

# --- Compute everything first ---
# Accuracy of the original (baseline) model
original_test_accuracy = dt.score(X=X_test, y=y_test)
original_train_accuracy = dt.score(X=X_train, y=y_train)

# Accuracy of the tuned model
tuned_test_accuracy = tuned_tree.score(X=X_test, y=y_test)
tuned_train_accuracy = tuned_tree.score(X=X_train, y=y_train)

# Classification reports built from the predictions made earlier
classification_tuned = classification_report(y_true=y_test, y_pred=y_pred_tuned)
classification_original = classification_report(y_true=y_test, y_pred=y_pred)

# --- Then print, accuracies before reports ---
print("Original Model - Training Accuracy:", original_train_accuracy)
print("Original Model - Testing Accuracy:", original_test_accuracy)
print()
print("Tuned Model - Training Accuracy:", tuned_train_accuracy)
print("Tuned Model - Testing Accuracy:", tuned_test_accuracy)
print()

print("Classification Report of Original Model :\n", classification_original)
print()
print("Classification Report of Tuned Model :\n", classification_tuned)


Original Model - Training Accuracy: 0.9789325842696629
Original Model - Testing Accuracy: 0.7541899441340782

Tuned Model - Training Accuracy: 0.8806179775280899
Tuned Model - Testing Accuracy: 0.8435754189944135

Classification Report of Original Model :
               precision    recall  f1-score   support

           0       0.80      0.78      0.79       105
           1       0.70      0.72      0.71        74

    accuracy                           0.75       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.76      0.75      0.75       179


Classification Report of Tuned Model :
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       105
           1       0.83      0.78      0.81        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.84       179
weighted avg       0.84      0.84      0.84       179



In [149]:
# Print a written summary of the tuning experiment. The accuracy figures are
# interpolated from the variables computed in the comparison cell; the surrounding
# sentences are fixed text and describe the results recorded in this notebook.
print("\n\t\t\t\t--- Summary of Hyperparameter Tuning Impact ---")
print()
print("A comparison between the original and tuned Decision Tree models:")
print(f"- The original model showed a noticeable gap between training ({original_train_accuracy:.3f}) and testing ({original_test_accuracy:.3f}) accuracy, which is a clear sign of overfitting.")
# The tuned model's training accuracy is lower than the original's, not improved;
# the narrower train/test gap comes from lower training and higher testing accuracy.
print(f"- The tuned model narrowed this gap, with lower training ({tuned_train_accuracy:.3f}) but higher testing ({tuned_test_accuracy:.3f}) accuracy, indicating better generalization.")
print("- The increase in test accuracy for the tuned model reflects a stronger ability to perform well on unseen data.")
print("- The classification report also reveals that the tuned model achieved better scores in precision, recall, and F1-score—especially when predicting the 'survived' class.")
print("\n\t\t\t\t\t\t--- Conclusion ---")
print()
print("- Through hyperparameter tuning, the Decision Tree Classifier became more balanced, reduced overfitting, and performed better on the test data.")



				--- Summary of Hyperparameter Tuning Impact ---

A comparison between the original and tuned Decision Tree models:
- The original model showed a noticeable gap between training (0.979) and testing (0.754) accuracy, which is a clear sign of overfitting.
- The tuned model narrowed this gap, with improved training (0.881) and testing (0.844) accuracy, indicating better generalization.
- The increase in test accuracy for the tuned model reflects a stronger ability to perform well on unseen data.
- The classification report also reveals that the tuned model achieved better scores in precision, recall, and F1-score—especially when predicting the 'survived' class.

						--- Conclusion ---

- Through hyperparameter tuning, the Decision Tree Classifier became more balanced, reduced overfitting, and performed better on the test data.
