# Decision Tree Model

In [104]:
# import necessary libraries

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV

In [105]:
df = pd.read_csv('C:/Users/ISHAN/OneDrive/Documents/GitHub/Projects/MachineLearning/daily_weather (1).xls')
df.head()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,high_humidity_3pm
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,1
1,917.347688,71.403843,101.935179,2.443009,140.471549,3.533324,0.0,0.0,24.328697,0
2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,0
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,0
4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,1


#### EDA (Exploratory Data Analysis)

In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   air_pressure_9am        1092 non-null   float64
 1   air_temp_9am            1090 non-null   float64
 2   avg_wind_direction_9am  1091 non-null   float64
 3   avg_wind_speed_9am      1092 non-null   float64
 4   max_wind_direction_9am  1092 non-null   float64
 5   max_wind_speed_9am      1091 non-null   float64
 6   rain_accumulation_9am   1089 non-null   float64
 7   rain_duration_9am       1092 non-null   float64
 8   relative_humidity_9am   1095 non-null   float64
 9   high_humidity_3pm       1095 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 85.7 KB


In [107]:
# Checking for null values
df.isnull().sum()

air_pressure_9am          3
air_temp_9am              5
avg_wind_direction_9am    4
avg_wind_speed_9am        3
max_wind_direction_9am    3
max_wind_speed_9am        4
rain_accumulation_9am     6
rain_duration_9am         3
relative_humidity_9am     0
high_humidity_3pm         0
dtype: int64

In [108]:
# Dropping null values
df.dropna(inplace=True)

In [109]:
# Checking for any duplicate rows
df.duplicated().sum()

1

In [110]:
# Droping duplicate rows
df.drop_duplicates(inplace=True)

In [111]:
# Checking if the our target variable is balanced
# Visualizing the target variable
plt.figure(figsize=(6, 5))
px.histogram(df, x='high_humidity_3pm', title='Distribution of High Humidity at 3 PM', color='high_humidity_3pm')

<Figure size 600x500 with 0 Axes>

As per the Data it Seems that the target column is Balanced and we have to predict the target column based on the other features.

### Machine Learning Model

In [112]:
X = df.drop('high_humidity_3pm',axis=1)
y = df['high_humidity_3pm']

In [113]:
# Splitting the Data
X_train, X_test, y_train, y_test =train_test_split(X,y, test_size=0.2, random_state=6)

In [114]:
# Initializing the Decision Tree Model 
dt_1 = DecisionTreeClassifier()

In [115]:
# Training the Model
dt_1.fit(X_train,y_train)

In [116]:
#Testing the Model
dt_1_y = dt_1.predict(X_test)

In [120]:
# Validating the Accuracy of the model how well it learning
print(f'Accuracy score :', accuracy_score(y_test,dt_1_y)*100)

Accuracy score : 87.79342723004694


In [118]:
dt_1_y_train = dt_1.predict(X_train)

In [119]:
print('Accuracy Score of Train',accuracy_score(y_train,dt_1_y_train)*100)
print('Accuracy Score of Test',accuracy_score(y_test,dt_1_y)*100)

Accuracy Score of Train 100.0
Accuracy Score of Test 87.79342723004694


Based on the Accuracy of the model for Train and Test Data, its clear that the model is overfitting. We can try to reduce the overfitting by using the following methods:
1. **Pruning**: This involves removing sections of the tree that provide little power to classify instances.
2. **Cross-Validation**: This technique involves dividing the data into multiple subsets and training the model on different combinations of these subsets to ensure it generalizes well to unseen data.
3. **Regularization**: This technique adds a penalty to the loss function to discourage overly complex models, helping to prevent overfitting.

In [48]:
#Get the depth of the tree
dt_1.get_depth()

13

In [76]:
dt_1.get_params()


{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

As the Accuracy score is 88% will try to improve the accuracy score by using HyperParameter Tuning using GridSearchCV.

In [None]:
# Creating parameters for Cross Validation
para_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, 6, 8, 10, 13],
    'min_samples_split': [2, 5, 8, 10],
    'splitter': ['best']  
}


In [85]:
grid_search=GridSearchCV(dt_1, para_grid)

In [86]:
grid_search.fit(X_train,y_train)

In [87]:
# Getting the Best Estimators
print('Get the Best Estimators', grid_search.best_params_)

Get the Best Estimators {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 2, 'splitter': 'best'}


In [88]:
# Will test the model with the best Estimators
best_model = grid_search.best_estimator_
# dt_2_y is goiing to store our predicted values as Decision Tree model 2 
dt_2_y = best_model.predict(X_test) 

In [89]:
accuracy_score(y_test,dt_2_y)

0.8685446009389671

In [91]:
# Lets check how accurately the model is learning and giving the results or if there is
# any case of overfitting taking place

dt_2_y_train = best_model.predict(X_train) 

In [92]:
print('Accuracy Score of Train',accuracy_score(y_train,dt_2_y_train))
print('Accuracy Score of Test',accuracy_score(y_test,dt_2_y))

Accuracy Score of Train 0.9141176470588235
Accuracy Score of Test 0.8685446009389671


After perfoming the HyperParameter Tuning using GridSearchCV, the accuracy score of the model is improved to 90% for Train Data and 88% for Test Data.
And the model is not overfitting.
We can say that the model is performing well and is ready for deployment.