A Decision Tree is a machine learning model (algorithm) used for both classification and regression tasks.

It works by splitting data into branches based on feature values — kind of like a flowchart that leads to a final decision or prediction.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [4]:
#Connecting to dataset from google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
#Using titanic dataset
df=pd.read_csv('/content/drive/MyDrive/Data Science_NG/Colab Notebooks/Datasets/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df = df.drop("PassengerId",axis=1)

In [7]:
df = df.drop("Name",axis=1)

In [8]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [10]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0
Cabin,687
Embarked,2


In [11]:
df["Age"].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [12]:
df["Age"].mean()

np.float64(29.69911764705882)

In [13]:
df["Age"].value_counts()

Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
24.00,30
22.00,27
18.00,26
28.00,25
30.00,25
...,...
24.50,1
0.67,1
0.42,1
34.50,1


In [14]:
#Replacing null values with mean value of that column
df["Age"].replace(np.nan,30, inplace=True)

In [15]:
df["Cabin"].value_counts()

Unnamed: 0_level_0,count
Cabin,Unnamed: 1_level_1
G6,4
C23 C25 C27,4
B96 B98,4
F2,3
D,3
...,...
E17,1
A24,1
C50,1
B42,1


In [20]:
#Replacing the category with O
df["Cabin"].replace(np.nan,0, inplace=True)

In [21]:
#Renaming the O to G6 category
df["Cabin"].replace(0,"G6", inplace=True)

In [22]:
df["Cabin"].value_counts()

Unnamed: 0_level_0,count
Cabin,Unnamed: 1_level_1
G6,691
C23 C25 C27,4
B96 B98,4
F2,3
D,3
...,...
E17,1
A24,1
C50,1
B42,1


In [23]:
df["Embarked"].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,644
C,168
Q,77


In [24]:
df["Embarked"].replace(np.nan,"S", inplace=True)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     891 non-null    object 
 9   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [26]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0
Cabin,0
Embarked,0


In [28]:
#Categorical columns
df_cat = df.select_dtypes(object)

In [40]:
#Numerical columns
df_num = df.select_dtypes(['int64','float64'])

In [41]:
#Using Labelencoder for Cateogrical columns to be all numerical columns dataset
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [42]:
for col in df_cat:
    le = LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])

In [43]:
df_cat.head()

Unnamed: 0,Sex,Ticket,Cabin,Embarked
0,1,523,145,2
1,0,596,81,0
2,0,669,145,2
3,0,49,55,2
4,1,472,145,2


In [44]:
df_new = pd.concat([df_num,df_cat],axis=1)

In [45]:
df_new

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex,Ticket,Cabin,Embarked
0,0,3,22.0,1,0,7.2500,1,523,145,2
1,1,1,38.0,1,0,71.2833,0,596,81,0
2,1,3,26.0,0,0,7.9250,0,669,145,2
3,1,1,35.0,1,0,53.1000,0,49,55,2
4,0,3,35.0,0,0,8.0500,1,472,145,2
...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,1,101,145,2
887,1,1,19.0,0,0,30.0000,0,14,30,2
888,0,3,30.0,1,2,23.4500,0,675,145,2
889,1,1,26.0,0,0,30.0000,1,8,60,0


In [46]:
y = df_new["Survived"]
X = df_new.drop("Survived",axis=1)

In [47]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [48]:
def train_model(model):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test,y_pred))

    return model

In [49]:
log = LogisticRegression()

In [50]:
train_model(log)

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       153
           1       0.77      0.63      0.69       115

    accuracy                           0.76       268
   macro avg       0.76      0.74      0.75       268
weighted avg       0.76      0.76      0.76       268



# **Building Baseline model**

In [51]:
dt1 = DecisionTreeClassifier()

In [52]:
dt1 = train_model(dt1)

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       153
           1       0.80      0.71      0.75       115

    accuracy                           0.80       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268



In [53]:
dt1.feature_importances_

array([0.08554154, 0.13248118, 0.03541505, 0.00808001, 0.15522881,
       0.32100818, 0.16261397, 0.06833278, 0.03129848])

In [54]:
print(df_new.columns.tolist())

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Ticket', 'Cabin', 'Embarked']


# **It returns the depth of the trained decision tree — i.e., the length of the longest path from the root node to a leaf node. How many decision levels it went upto!**

In [55]:
from sklearn import tree

In [56]:
#it went upto 18 decision levels
dt1.get_depth()

18

In [57]:
#Limiting the max_depth to 10 and rechecking the 4 scores
dt2 = DecisionTreeClassifier(max_depth=10)

In [58]:
train_model(dt2)

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       153
           1       0.82      0.72      0.77       115

    accuracy                           0.81       268
   macro avg       0.82      0.80      0.81       268
weighted avg       0.81      0.81      0.81       268



In [59]:
#The minimum number of samples required to be at a leaf node. A leaf can have just 1 sample, which might cause overfitting (the model learns noise).
dt3 = DecisionTreeClassifier(min_samples_leaf=40)

In [60]:
train_model(dt3)

              precision    recall  f1-score   support

           0       0.74      0.93      0.82       153
           1       0.86      0.57      0.68       115

    accuracy                           0.77       268
   macro avg       0.80      0.75      0.75       268
weighted avg       0.79      0.77      0.76       268



In [61]:
#the criterion parameter tells the model how to measure the “quality” of a split — in other words, how it decides which feature and threshold to split on at each node.
#When you set criterion='entropy',the tree uses Information Gain to decide where to split.
#At each node: It calculates the entropy (a measure of impurity or uncertainty). It tests different splits and picks the one with the highest information gain — i.e., the greatest reduction in entropy.


dt4 = DecisionTreeClassifier(criterion='entropy')

In [62]:
train_model(dt4)

              precision    recall  f1-score   support

           0       0.79      0.84      0.81       153
           1       0.76      0.70      0.73       115

    accuracy                           0.78       268
   macro avg       0.78      0.77      0.77       268
weighted avg       0.78      0.78      0.78       268



# **The END!**