## Video 8 - Building The Decision Tree Regression Model

### Importing the required libraries

In [1]:
#Importing libraries
import os
import pandas as pd
import numpy as np

### Loading the data and displaying the first five rows

In [2]:
# Load the preprocessed dataset. The path is relative, so the CSV file must be
# located in the notebook's working directory.
data = pd.read_csv('Synergix_data_preprocessed_new.csv')
# Preview the first five rows
data.head()

Unnamed: 0,Page_traffic,Unit_price,Units_sold,Segment,1_Star_Rating,2_Star_Rating,3_Star_Rating,4_Star_Rating,5_Star_Rating,Image_Count,...,Sum of Spend,Online_Clicks,Online_Cost,Online_Impressions,num_unique_campaigns,Clicks_video,Cost_video,Impressions_video,num_unique_campaigns_offline,Units_sold>1000
0,5835.0,22.214389,2071,Skincare,99.0,72.0,194.0,453.0,3272.0,6.0,...,0.0,1271.0,3102.448115,238075.0,6.0,3479.0,40244.28,3856872.0,2.0,1
1,1881.0,11.870778,681,Skincare,118.0,73.0,200.0,430.0,3084.0,9.0,...,0.0,494.0,1723.716359,75868.0,2.0,0.0,0.0,0.0,0.0,0
2,2477.0,27.645714,875,Makeup,108.0,85.0,232.0,511.0,3402.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,4087.0,16.896848,1396,Makeup,82.0,64.0,167.0,303.0,2058.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,1446.0,16.357664,822,Hair Care,29.0,24.0,88.0,200.0,1221.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6024.0,77040.81,7539723.0,2.0,0


<h3>Data Preprocessing</h3>

#### Adding a new column _Good_By_Bad_Rating_

In [3]:
# Good_By_Bad_Rating = (4-star + 5-star count) / (1-star + 2-star count), per row.
# The star columns are addressed by name (not by position in data.values), so a
# change in column order raises a KeyError instead of silently using the wrong
# columns. The counts are used as stored; no int() truncation is applied.
bad_ratings = data['1_Star_Rating'] + data['2_Star_Rating']
good_ratings = data['4_Star_Rating'] + data['5_Star_Rating']

no_bad = bad_ratings == 0
no_good = good_ratings == 0

# The ratio is undefined when the denominator (1- and 2-star count) is zero;
# those rows are left as NaN until they are filled in below.
Rating_ratio = good_ratings.div(bad_ratings).mask(no_bad)

# Largest well-defined ratio in the data. If no row has a defined ratio,
# fall back to 0.0 rather than leaving a placeholder value in the column.
max_rating = Rating_ratio.max()
if pd.isna(max_rating):
    max_rating = 0.0

# Zero denominator and zero numerator -> overall ratio is 0.0.
# Zero denominator but some 4/5-star ratings -> use the maximum observed ratio.
Rating_ratio = (
    Rating_ratio
    .mask(no_bad & no_good, 0.0)
    .mask(no_bad & ~no_good, max_rating)
)

# Adding the column 'Good_By_Bad_Rating' to the dataframe
data['Good_By_Bad_Rating'] = Rating_ratio

# The individual star-count columns are dropped once the ratio has been added
data = data.drop(
    columns=[
        '1_Star_Rating',
        '2_Star_Rating',
        '3_Star_Rating',
        '4_Star_Rating',
        '5_Star_Rating',
    ]
)

### Performing Label Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical 'Segment' column as integer codes. The fitted encoder
# is kept (instead of being created and discarded inline) so the codes can be
# mapped back to segment names later via segment_encoder.classes_ or
# segment_encoder.inverse_transform(...).
segment_encoder = LabelEncoder()
data['Segment'] = segment_encoder.fit_transform(data['Segment'])

### Separating the feature variable and the target variable

In [5]:
# Remove the 'Units_sold>1000' column so it is not used as a feature.
# NOTE(review): judging by its name and the preview above, this looks like a
# flag derived from the target 'Units_sold' - confirm.
data = data.drop(columns=['Units_sold>1000'])

In [6]:
# Target variable: the 'Units_sold' column
y = data['Units_sold']
# Feature matrix: every other column
X = data.drop(columns=['Units_sold'])

<h3>Performing train-test split</h3>

In [7]:
# Importing the train-test split from scikit-learn
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

<h3>Building the model</h3>

In [9]:
# Import DecisionTreeRegressor from sklearn.tree
from sklearn.tree import DecisionTreeRegressor

In [10]:
# Creating a decision tree regressor object called DT_model
# (random_state is fixed so repeated runs build the same tree)
DT_model = DecisionTreeRegressor(random_state = 42)

In [11]:
# Fitting the model on the training data only
DT_model.fit(X_train, y_train)

In [12]:
# Performing prediction on both the train and test data
y_pred_train = DT_model.predict(X_train)
y_pred_test = DT_model.predict(X_test)

<h3>Evaluating the model</h3>

In [13]:
from sklearn.metrics import r2_score

In [14]:
# R-squared score of the model on the training data
r2_train = r2_score(y_train, y_pred_train)
r2_train

1.0

In [15]:
# R-squared score of the model on the held-out test data
r2_test = r2_score(y_test, y_pred_test)
r2_test

0.5687419312062367

<h3>Calculating feature importance</h3>

In [16]:
# Retrieving the importance score of each feature from the fitted tree
feature_imp = DT_model.feature_importances_

# One row per feature, ordered from most to least important, with a running
# total of the importance scores stored in 'cum_imp'.
imp = (
    pd.DataFrame({'Col_names': X_train.columns, 'Importance': feature_imp})
    .sort_values(by='Importance', ascending=False)
    .assign(cum_imp=lambda ranked: ranked['Importance'].cumsum())
)
imp

Unnamed: 0,Col_names,Importance,cum_imp
0,Page_traffic,0.417963,0.417963
1,Unit_price,0.157714,0.575677
19,Good_By_Bad_Rating,0.075478,0.651155
5,Title_Count,0.073342,0.724497
3,Image_Count,0.050059,0.774556
6,Description_Length,0.049367,0.823923
8,Organic Search Rank,0.029333,0.853256
7,num_unique_Search_Terms,0.028471,0.881727
2,Segment,0.020882,0.90261
13,Online_Impressions,0.018842,0.921452


In [17]:
# Names of the features whose cumulative importance (in descending order of
# importance) is above 0.99; these are the candidates to drop.
drop_col = imp.loc[imp['cum_imp'] > 0.99, 'Col_names'].tolist()
drop_col

['Impressions_video',
 'Sum of Spend',
 'num_unique_campaigns_offline',
 'Num_of_Promotions']

In [18]:
# Dropping the low-importance columns from X_train.
# A new frame is assigned instead of using inplace=True: X_train comes out of
# train_test_split, and mutating it in place can raise SettingWithCopyWarning.
# columns= already selects the column axis, so axis=1 is not needed.
X_train = X_train.drop(columns=drop_col)

In [19]:
# Dropping the same low-importance columns from X_test.
# A new frame is assigned instead of using inplace=True: X_test comes out of
# train_test_split, and mutating it in place can raise SettingWithCopyWarning.
# columns= already selects the column axis, so axis=1 is not needed.
X_test = X_test.drop(columns=drop_col)

<h3>Retraining the model on the selected features and making predictions</h3>

In [20]:
# Fresh decision tree regressor, to be trained on the reduced feature set
DT_model = DecisionTreeRegressor(random_state = 42)

In [21]:
# Fitting the model on the training data after the column drop
DT_model.fit(X_train, y_train)

In [22]:
# Predictions on the test data
y_pred = DT_model.predict(X_test)

<h3>Evaluating the model</h3>

In [23]:
# Predictions on the training data (this overwrites the earlier y_pred_train)
y_pred_train = DT_model.predict(X_train)

In [24]:
# R-squared score of the retrained model on the training data
r2 = r2_score(y_train, y_pred_train)
r2

1.0

In [25]:
# R-squared score of the retrained model on the test data
# (the name r2 is reused, so the training score from the previous cell is replaced)
r2 = r2_score(y_test, y_pred)
r2

0.5741323313286454

### Hyperparameter tuning

In [26]:
# Storing the depth reached by the tree that was fitted without a max_depth limit
depth = DT_model.get_depth()

In [27]:
# Display the depth of the fitted tree
depth

35

In [28]:
# List of max_depth values to try: start at the depth of the unrestricted tree
# and step down by 3 for as long as the value stays above 0
max_depth_list = list(range(depth,0,-3))

In [29]:
# R-squared scores per candidate max_depth (key = max_depth value)
train_scores, test_scores = {}, {}

# Fit one tree per candidate max_depth and score it on both splits.
# NOTE: the loop variable reuses the name `depth` and each iteration rebinds
# `DT_model`, so after this cell both refer to the last candidate in the list.
for depth in max_depth_list:
    # Build and train a tree limited to the current max_depth
    DT_model = DecisionTreeRegressor(max_depth=depth, random_state=42).fit(X_train, y_train)

    # Predictions on the train and test datasets
    y_train_pred = DT_model.predict(X_train)
    y_test_pred = DT_model.predict(X_test)

    # Record the R-squared of each split under the current max_depth
    train_scores[depth] = r2_score(y_train, y_train_pred)
    test_scores[depth] = r2_score(y_test, y_test_pred)

# Report the train and test scores of every candidate, one per line,
# each followed by a separator line
for depth in max_depth_list:
    print(
        f"max_depth = {depth}|"
        f"    Train Score = {train_scores[depth]:.3f} |"
        f"    Test score = {test_scores[depth]:.3f}"
    )
    print('_' * 65)

max_depth = 35|    Train Score = 1.000 |    Test score = 0.574
_________________________________________________________________
max_depth = 32|    Train Score = 1.000 |    Test score = 0.565
_________________________________________________________________
max_depth = 29|    Train Score = 1.000 |    Test score = 0.562
_________________________________________________________________
max_depth = 26|    Train Score = 1.000 |    Test score = 0.562
_________________________________________________________________
max_depth = 23|    Train Score = 0.997 |    Test score = 0.565
_________________________________________________________________
max_depth = 20|    Train Score = 0.984 |    Test score = 0.575
_________________________________________________________________
max_depth = 17|    Train Score = 0.946 |    Test score = 0.595
_________________________________________________________________
max_depth = 14|    Train Score = 0.867 |    Test score = 0.605
__________________________________

In [31]:
# Candidate min_samples_leaf values: the odd numbers from 1 to 19
min_sample_leaf_list = list(range(1, 20, 2))

# max_depth is held fixed at 11 for this sweep.
# NOTE(review): presumably chosen from the max_depth sweep above - confirm.
depth = 11

# R-squared scores per candidate min_samples_leaf (key = min_samples_leaf value)
train_scores, test_scores = {}, {}

# Fit one tree per candidate min_samples_leaf and score it on both splits.
# Each iteration rebinds `DT_model`, so after this cell it refers to the tree
# built with the last candidate in the list.
for min_sample_leaf in min_sample_leaf_list:
    # Build and train a tree with the current min_samples_leaf value
    DT_model = DecisionTreeRegressor(
        min_samples_leaf=min_sample_leaf,
        max_depth=depth,
        random_state=42,
    ).fit(X_train, y_train)

    # Predictions on the train and test datasets
    y_train_pred = DT_model.predict(X_train)
    y_test_pred = DT_model.predict(X_test)

    # Record the R-squared of each split under the current min_samples_leaf
    train_scores[min_sample_leaf] = r2_score(y_train, y_train_pred)
    test_scores[min_sample_leaf] = r2_score(y_test, y_test_pred)

# Report the train and test scores of every candidate, one per line,
# each followed by a separator line
for min_sample_leaf in min_sample_leaf_list:
    print(
        f"min_sample_leaf = {min_sample_leaf}|"
        f"    Train Score = {train_scores[min_sample_leaf]:.3f} |"
        f"    Test Score = {test_scores[min_sample_leaf]:.3f}"
    )
    print('_' * 65)

min_sample_leaf = 1|    Train Score = 0.752 |    Test Score = 0.610
_________________________________________________________________
min_sample_leaf = 3|    Train Score = 0.745 |    Test Score = 0.615
_________________________________________________________________
min_sample_leaf = 5|    Train Score = 0.732 |    Test Score = 0.615
_________________________________________________________________
min_sample_leaf = 7|    Train Score = 0.720 |    Test Score = 0.609
_________________________________________________________________
min_sample_leaf = 9|    Train Score = 0.711 |    Test Score = 0.614
_________________________________________________________________
min_sample_leaf = 11|    Train Score = 0.705 |    Test Score = 0.614
_________________________________________________________________
min_sample_leaf = 13|    Train Score = 0.697 |    Test Score = 0.610
_________________________________________________________________
min_sample_leaf = 15|    Train Score = 0.693 |    Test Score