### 1 : Importing Libraries

In [1]:
# ignore the warnings in the output
import warnings
warnings.filterwarnings("ignore")

In [2]:
# import libraries and packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import mlflow
from pycaret.classification import *

 ### 2 Reading Data

In [3]:
##Copy the cleaned data obtained from the data_cleaning notebook into the Data folder before proceeding
%time
dataset = pd.read_csv('Data/cleaned_data.csv')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [4]:
# Preview the first rows to confirm the CSV loaded with the expected columns
dataset.head()

Unnamed: 0,created_date,city_tier,first_platform_c,first_utm_medium_c,first_utm_source_c,total_leads_droppped,referred_lead,app_complete_flag,assistance_interaction,career_interaction,payment_interaction,social_interaction,syllabus_interaction
0,2021-07-01 00:08:15,1.0,Level0,Level11,Level2,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
1,2021-07-01 00:16:43,2.0,Level3,Level0,others,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
2,2021-07-01 00:22:20,1.0,Level3,Level0,Level0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,2021-07-01 00:23:13,1.0,Level1,Level3,others,2.0,0.0,0,0.0,0.0,0.0,0.0,0.0
4,2021-07-01 00:28:38,1.0,Level3,Level0,Level0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Drop the date column as it is not needed for training.
# errors='ignore' keeps this cell idempotent: because `dataset` is reassigned in place,
# re-running the cell after the column is already gone would otherwise raise a KeyError.
dataset = dataset.drop(columns=['created_date'], errors='ignore')

In [6]:
# Check column dtypes and non-null counts before handing the frame to PyCaret
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238964 entries, 0 to 238963
Data columns (total 12 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   city_tier               238964 non-null  float64
 1   first_platform_c        238964 non-null  object 
 2   first_utm_medium_c      238964 non-null  object 
 3   first_utm_source_c      238964 non-null  object 
 4   total_leads_droppped    238964 non-null  float64
 5   referred_lead           238964 non-null  float64
 6   app_complete_flag       238964 non-null  int64  
 7   assistance_interaction  238964 non-null  float64
 8   career_interaction      238964 non-null  float64
 9   payment_interaction     238964 non-null  float64
 10  social_interaction      238964 non-null  float64
 11  syllabus_interaction    238964 non-null  float64
dtypes: float64(8), int64(1), object(3)
memory usage: 21.9+ MB


### 3 Setting up Environment: 

In [7]:
# create a connection and setup a SQLite database with the name "lead_scoring_model_experimentation.db" in 
# 'Assignment/02_training_pipeline/notebooks/' location

In [8]:
import os
import sqlite3

# The database is created in the current working directory, so this notebook is
# expected to be run from '02_training_pipeline/notebooks' (the location the MLflow
# backend-store URI points at).
db_dir = os.getcwd()

# Define DB name and full path
db_name = 'lead_scoring_model_experimentation.db'
db_path = os.path.join(db_dir, db_name)

# Opening a connection is what creates the SQLite database file on disk
conn = sqlite3.connect(db_path)
try:
    # Print confirmation
    print(f"Database created at: {db_path}")
finally:
    # Always release the handle, even if something above raises
    conn.close()

Database created at: /home/CodePro-Lead-Scoring2/02_training_pipeline/notebooks/lead_scoring_model_experimentation.db


In [None]:
'''
Now you need to start the MLflow server in a new terminal. 
Note: Before you start the MLflow server, create a folder named mlruns in the assignment directory.
Now you need to run the command to start MLflow server such that:
1. The lead_scoring_model_experimentation.db which you created above is used as the backend-store.
2. mlruns folder is used as an artifact directory. 
3. The server runs on the port 6006.

The steps to do so are as follows:
Open a new terminal.
Then go to the Assignment directory using the cd command. Type the command: cd Assignment/
Create a folder named mlruns here. You can create this folder using either the command line or GUI. To create this folder via the command line run the command: mkdir ./mlruns
Then, type the following command to start the MLflow server: 
mlflow server --backend-store-uri='sqlite:///./02_training_pipeline/notebooks/lead_scoring_model_experimentation.db' --default-artifact-root="./mlruns" --port=6006 --host=0.0.0.0
'''

In [None]:
# Once your server is running successfully, set the MLflow tracking URI to "http://0.0.0.0:6006"

My working mlflow command

mlflow ui \
  --backend-store-uri="sqlite:///02_training_pipeline/notebooks/lead_scoring_model_experimentation.db" \
  --default-artifact-root="file:///home/CodePro-Lead-Scoring2/02_training_pipeline/notebooks/mlruns" \
  --port=5001


My Mlflow url

https://76a7d32852700.notebooks.jarvislabs.net/proxy/5001/#/experiments/0

In [None]:
# setup pycaret

In [11]:
import mlflow

# Point the client at the running MLflow server before querying it. Without this, the
# cell only reports the server URI if a later cell has already been executed; on a
# fresh kernel run top-to-bottom it would report MLflow's default tracking URI instead.
mlflow.set_tracking_uri("http://127.0.0.1:5001")
print("Tracking URI:", mlflow.get_tracking_uri())

# get_artifact_uri() implicitly starts a run when none is active. Remember whether a
# run already existed so that a run opened only for this sanity check is closed again
# and does not stay active for the following cells.
had_active_run = mlflow.active_run() is not None
print("Artifact URI:", mlflow.get_artifact_uri())
if not had_active_run:
    mlflow.end_run()


Tracking URI: http://127.0.0.1:5001
Artifact URI: file:///home/CodePro-Lead-Scoring2/02_training_pipeline/notebooks/mlruns/1/8982638149a34a7c821bac668311d72a/artifacts


In [9]:
import mlflow

# Send all MLflow logging from this notebook to the tracking server on the loopback address
mlflow.set_tracking_uri(uri="http://127.0.0.1:5001")

In [12]:
from pycaret.classification import *

# Baseline experiment: all features, with normalization and transformation enabled.
clf1 = setup(
    # data and target
    data=dataset,
    target='app_complete_flag',
    categorical_features=[
        'first_platform_c',
        'first_utm_medium_c',
        'first_utm_source_c',
    ],
    # preprocessing
    normalize=True,
    transformation=True,
    # execution
    session_id=42,
    silent=True,
    use_gpu=True,
    # MLflow logging
    log_experiment=True,
    experiment_name='lead_scoring_baseline',
    log_plots=True,
    log_profile=True,
)

# Rank the remaining candidate models by accuracy and keep the top one
best_model = compare_models(
    exclude=['gbc', 'knn', 'qda', 'dummy', 'svm', 'ada'],
    sort='Accuracy',
)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7388,0.8211,0.8336,0.7025,0.7624,0.4771,0.4858,0.599
rf,Random Forest Classifier,0.7372,0.8188,0.8244,0.7037,0.7593,0.4738,0.4811,2.158
et,Extra Trees Classifier,0.7367,0.8179,0.8214,0.7042,0.7583,0.4729,0.4797,2.437
dt,Decision Tree Classifier,0.7364,0.8169,0.8209,0.704,0.758,0.4723,0.4791,0.496
lr,Logistic Regression,0.7165,0.7917,0.8017,0.6869,0.7398,0.4325,0.4387,3.928
ridge,Ridge Classifier,0.715,0.0,0.8134,0.6815,0.7416,0.4294,0.4378,0.212
lda,Linear Discriminant Analysis,0.715,0.7904,0.8134,0.6815,0.7416,0.4294,0.4378,1.065
nb,Naive Bayes,0.6785,0.7382,0.8535,0.6339,0.7275,0.3557,0.3796,0.152


### 4 : Model Experimentation with pycaret

In [None]:
# create an experiment with pycaret and exclude ['gbc','knn','qda', 'dummy', 'svm', 'ada']

In [None]:
# create a model which gives the highest accuracy

In [None]:
# create feature importance plot

### 5 : Model Experimentation after dropping features

From the above feature tests we can clearly see that some of the features are not significant. We will now drop all the insignificant features and select only the significant ones.
The list of the significant features is 
['total_leads_droppped', 'city_tier', 'referred_lead', 'app_complete_flag', 'first_platform_c', 'first_utm_medium_c', 'first_utm_source_c'].
So now you will train your model with only these features.

Also note that in our previous experiments we saw that tree based models are our top performers. In case of tree based models we do not require transformations such as normalization, scaling etc. So make sure that you set up pycaret in the proper way, i.e., make sure that you use normalize = False and transformation = False.

In [None]:
# 
# ['total_leads_droppped', 'city_tier', 'referred_lead', 'app_complete_flag', 'first_platform_c', 'first_utm_medium_c', 'first_utm_source_c']
#
# Train the model using the features listed above. Since we are using tree models we do not require any transformations 
# such as normalization, scaling etc. So make sure that you set up pycaret in the proper way, i.e., make sure that you use 
# normalize = False and transformation = False.

In [None]:
# create an experiment with pycaret and exclude ['gbc','knn','qda', 'dummy', 'svm', 'ada']. 

In [None]:
# You should get lightgbm as the best performing model. So now we will train a lightGBM model manually using pycaret

In [None]:
# Tune the hyper parameters of the lightgbm model using optuna on 10 folds and optimise AUC as that was our system metric, 
# hence we will optimise AUC

In [None]:
# Print the final model's configuration so that we can use it in the model retraining pipeline 