### 1 : Importing Libraries

In [1]:
# ignore the warnings in the output
import warnings
warnings.filterwarnings("ignore")

In [2]:
# import libraries and packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import mlflow
from pycaret.classification import *

 ### 2 Reading Data

In [3]:
##Copy the cleaned data obtained from the data_cleaning notebook into the Data folder before proceeding
%time
dataset = pd.read_csv('Data/cleaned_data.csv')

CPU times: user 2 μs, sys: 1 μs, total: 3 μs
Wall time: 6.44 μs


In [4]:
# Preview the first five rows to sanity-check the loaded data
dataset.head()

Unnamed: 0,created_date,city_tier,first_platform_c,first_utm_medium_c,first_utm_source_c,total_leads_droppped,referred_lead,app_complete_flag,assistance_interaction,career_interaction,payment_interaction,social_interaction,syllabus_interaction
0,2021-07-01 00:08:15,1.0,Level0,Level11,Level2,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
1,2021-07-01 00:16:43,2.0,Level3,Level0,others,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
2,2021-07-01 00:22:20,1.0,Level3,Level0,Level0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,2021-07-01 00:23:13,1.0,Level1,Level3,others,2.0,0.0,0,0.0,0.0,0.0,0.0,0.0
4,2021-07-01 00:28:38,1.0,Level3,Level0,Level0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Drop the date column as it is not needed for training.
# `errors='ignore'` keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising a KeyError.
dataset = dataset.drop(columns=['created_date'], errors='ignore')

In [6]:
# Summarise column dtypes and non-null counts of the training frame
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238964 entries, 0 to 238963
Data columns (total 12 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   city_tier               238964 non-null  float64
 1   first_platform_c        238964 non-null  object 
 2   first_utm_medium_c      238964 non-null  object 
 3   first_utm_source_c      238964 non-null  object 
 4   total_leads_droppped    238964 non-null  float64
 5   referred_lead           238964 non-null  float64
 6   app_complete_flag       238964 non-null  int64  
 7   assistance_interaction  238964 non-null  float64
 8   career_interaction      238964 non-null  float64
 9   payment_interaction     238964 non-null  float64
 10  social_interaction      238964 non-null  float64
 11  syllabus_interaction    238964 non-null  float64
dtypes: float64(8), int64(1), object(3)
memory usage: 21.9+ MB


### 3 Setting up Environment: 

In [None]:
# create a connection and setup a SQLite database with the name "lead_scoring_model_experimentation.db" in 
# 'Assignment/02_training_pipeline/notebooks/' location

In [None]:
import os
import sqlite3
from contextlib import closing

# Get current directory (where notebook is running)
db_dir = os.getcwd()  # this will be '02_training_pipeline/notebooks' if running from the notebook

# Define DB name and full path
db_name = 'lead_scoring_model_experimentation.db'
db_path = os.path.join(db_dir, db_name)

# Create a connection to the SQLite database. `closing` guarantees the handle is
# released even if something below raises; using the connection object itself as a
# context manager would only manage transactions and would NOT close it.
with closing(sqlite3.connect(db_path)) as conn:
    # Print confirmation
    print(f"Database created at: {db_path}")

In [None]:
'''
Now you need to start the MLflow server in a new terminal. 
Note: Before you start the MLflow server, create a folder named mlruns in the assignment directory.
Now you need to run the command to start MLflow server such that:
1. The lead_scoring_model_experimentation.db which you created above is used as the backend-store.
2. mlruns folder is used as an artifact directory. 
3. The server runs on the port 6006.

The steps to do so are as follows:
Open a new terminal.
Then go to the Assignment directory using the cd command. Type the command: cd Assignment/
Create a folder named mlruns here. You can create this folder using either the command line or GUI. To create this folder via the command line run the command: mkdir ./mlruns
Then, type the following command to start the MLflow server: 
mlflow server --backend-store-uri='sqlite:///./02_training_pipeline/notebooks/lead_scoring_model_experimentation.db' --default-artifact-root="./mlruns" --port=6006 --host=0.0.0.0
'''

In [None]:
# Once your server is running successfully, set the mlflow tracking URI to "http://0.0.0.0:6006"

My working mlflow command

```bash
mlflow ui   --backend-store-uri="sqlite:///02_training_pipeline/notebooks/lead_scoring_model_experimentation.db"   \
--default-artifact-root="file:///home/CodePro-Lead-Scoring2/02_training_pipeline/notebooks/mlruns"   \
--port=5001
```

```bash
mlflow ui --backend-store-uri="sqlite:///02_training_pipeline/notebooks/lead_scoring_model_experimentation.db" \
--default-artifact-root="file:///home/CodePro-Lead-Scoring2/02_training_pipeline/notebooks/mlruns" \
--port=5001 \
--host=0.0.0.0 \
--gunicorn-opts="--log-level=ERROR" \
2>/dev/null
```

My Mlflow url

https://76a7d32852700.notebooks.jarvislabs.net/proxy/5001/#/experiments/0

In [None]:
# setup pycaret

In [None]:
# Check mlflow URI
# import mlflow
# print("Tracking URI:", mlflow.get_tracking_uri())
# print("Artifact URI:", mlflow.get_artifact_uri())


In [7]:
# Send MLflow tracking calls to the locally running server. Port 5001 matches the
# `--port=5001` used in the "My working mlflow command" notes in this notebook
# (not the 6006 mentioned in the assignment instructions).
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5001")

In [9]:
from pycaret.classification import *

# Baseline experiment configuration (PyCaret 3.3.2 keyword names), gathered in one
# mapping so every option can be read at a glance before the experiment is created.
setup_options = dict(
    # data and target
    data=dataset,
    target='app_complete_flag',
    categorical_features=['first_platform_c', 'first_utm_medium_c', 'first_utm_source_c'],
    # preprocessing switches for the baseline run
    normalize=True,
    transformation=True,
    # reproducibility / hardware
    session_id=42,
    use_gpu=True,
    # experiment logging
    log_experiment=True,
    experiment_name='lead_scoring_baseline_AUC',
    log_plots=True,
    # `profile` is the PyCaret 3.x keyword used here (previously `log_profile`)
    profile=True,
)
clf1 = setup(**setup_options)

# Rank the candidate models by AUC, leaving out the estimators listed below
skipped_models = ['gbc', 'knn', 'qda', 'dummy', 'svm', 'ada']
best_model = compare_models(sort='AUC', exclude=skipped_models)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


Unnamed: 0,Description,Value
0,Session id,42
1,Target,app_complete_flag
2,Target type,Binary
3,Original data shape,"(238964, 12)"
4,Transformed data shape,"(238964, 44)"
5,Transformed train set shape,"(167274, 44)"
6,Transformed test set shape,"(71690, 44)"
7,Numeric features,8
8,Categorical features,3
9,Preprocess,True


Loading profile... Please Wait!


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.7389,0.8206,0.8309,0.703,0.7616,0.4773,0.4855,5.257
lightgbm,Light Gradient Boosting Machine,0.7385,0.8206,0.8347,0.7012,0.7622,0.4765,0.4855,5.576
rf,Random Forest Classifier,0.7379,0.8181,0.821,0.7053,0.7588,0.4754,0.482,6.657
et,Extra Trees Classifier,0.7371,0.8172,0.8196,0.7049,0.7579,0.4739,0.4804,6.968
dt,Decision Tree Classifier,0.7369,0.8165,0.8192,0.7047,0.7576,0.4734,0.4798,5.282
lr,Logistic Regression,0.7132,0.7898,0.8039,0.6819,0.7378,0.426,0.4331,5.348
ridge,Ridge Classifier,0.7093,0.7877,0.8182,0.6732,0.7387,0.4182,0.4283,5.09
lda,Linear Discriminant Analysis,0.7093,0.7877,0.8182,0.6732,0.7387,0.4182,0.4283,5.395
nb,Naive Bayes,0.6795,0.7377,0.8531,0.6345,0.7277,0.3581,0.3817,5.067


### 4 : Model Experimentation with pycaret

In [None]:
# create an experiment with pycaret and exclude ['gbc','knn','qda', 'dummy', 'svm', 'ada']

In [None]:
# create a model which gives the highest accuracy

In [None]:
# create feature importance plot

### 5 : Model Experimentation after dropping features

From the above feature tests we can clearly see that some of the features are not significant. We will now drop all the insignificant features and select only the significant ones.
The list of the significant features (together with the target column `app_complete_flag`) is 
['total_leads_droppped', 'city_tier', 'referred_lead', 'app_complete_flag', 'first_platform_c', 'first_utm_medium_c', 'first_utm_source_c'].
So now you will train your model with only these features.

Also note that in our previous experiments we saw that tree-based models are our top performers. Tree-based models do not require transformations such as normalization or scaling, so make sure that you set up pycaret accordingly, i.e., use normalize = False and transformation = False.

In [None]:
# 
# ['total_leads_droppped', 'city_tier', 'referred_lead', 'app_complete_flag', 'first_platform_c', 'first_utm_medium_c', 'first_utm_source_c']
#
# Train the model using the features listed above. Since we are using tree models we do not require any transformations 
# such as normalization, scaling etc. So make sure that you set up pycaret in the proper way, i.e., make sure that you use 
# normalize = False and transformation = False.

In [None]:
# create an experiment with pycaret and exclude ['gbc','knn','qda', 'dummy', 'svm', 'ada'].

In [None]:
# You should get lightgbm as the best performing model. So now we will train a lightGBM model manually using pycaret

In [None]:
# Tune the hyperparameters of the lightgbm model using optuna on 10 folds. AUC was our system metric, 
# hence we will optimise AUC

In [None]:
# Print the final model's configuration so that we can use it in the model retraining pipeline 