### 1 : Importing Libraries

In [2]:
# ignore the warnings in the output
import warnings
warnings.filterwarnings("ignore")

In [3]:
# import libraries and packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import mlflow
from pycaret.classification import *

 ### 2 Reading Data

In [4]:
##Copy the cleaned data obtained from the data_cleaning notebook into the Data folder before proceeding
%time
dataset = pd.read_csv('Data/cleaned_data.csv')

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 17.2 µs


In [5]:
dataset.head()

Unnamed: 0,created_date,city_tier,first_platform_c,first_utm_medium_c,first_utm_source_c,total_leads_droppped,referred_lead,app_complete_flag,assistance_interaction,career_interaction,payment_interaction,social_interaction,syllabus_interaction
0,2021-07-01 00:08:15,1.0,Level0,Level11,Level2,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
1,2021-07-01 00:16:43,2.0,Level3,Level0,others,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
2,2021-07-01 00:22:20,1.0,Level3,Level0,Level0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,2021-07-01 00:23:13,1.0,Level1,Level3,others,2.0,0.0,0,0.0,0.0,0.0,0.0,0.0
4,2021-07-01 00:28:38,1.0,Level3,Level0,Level0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [6]:
# drop the data column as it is not needed for training
dataset = dataset.drop(['created_date'], axis=1)

In [7]:
import utils 
data_for_model, data_unseen = utils.get_validation_unseen_set(dataset, validation_frac=0.05, sample=False, sample_frac=0.1)


### 3 Setting up Environment: 

In [8]:
root_folder = "/home/"

data_directory = root_folder+"data/raw/"
data_profile_path = root_folder+"/data/profile_report/"
intermediate_data_path = root_folder+"data/interim/"
final_processed_data_path = root_folder+"data/processed/"
database_path = root_folder+"database/"

In [9]:
import utils
# create a connection and setup a SQLite database with the name "lead_scoring_model_experimentation.db" in 
# 'Assignment/02_training_pipeline/notebooks/' location
utils.create_sqlit_connection(database_path,r"lead_scoring_model_experimentation.db")

2.6.0


In [None]:
'''
Now you need to start the MLflow server in a new terminal. 
Note: Before you start the MLflow server, create a folder named mlruns in the assignment directory.
Now you need to run the command to start MLflow server such that:
1. The lead_scoring_model_experimentation.db which you created above is used as the backend-store.
2. mlruns folder is used as an artifact directory. 
3. The server runs on the port 6006.

The steps to do so are as follows:
Open a new terminal.
Then go to the Assignment directory using the cd command. Type the command: cd Assignment/
Create a folder named mlruns here. You can create this folder using either the command line or GUI. To create this folder via the command line run the command: mkdir ./mlruns
Then, type the following command to start the MLflow server: 
mlflow server --backend-store-uri='sqlite:///./02_training_pipeline/notebooks/lead_scoring_model_experimentation.db' --default-artifact-root="./mlruns" --port=6006 --host=0.0.0.0
'''
mlflow server --backend-store-uri='sqlite:///lead_scoring_model_experimentation.db' --default-artifact-root="mlruns/" --port=6006 --host=0.0.0.0

In [10]:
# Once, your server is successfully running, create a mlflow tracking uri at "http://0.0.0.0:6006"
mlflow.set_tracking_uri("http://0.0.0.0:6006")

In [15]:
# setup pycaret 
from pycaret.classification import *
exp_clf102 = setup(data=data_for_model, target = 'app_complete_flag',  fold_shuffle=True, 
                   session_id = 42,
                   normalize = False, 
                   transformation = False, 
                   ignore_low_variance = True,
                   remove_multicollinearity = True, multicollinearity_threshold = 0.95,
                   #date_features=['registration_init_time','transaction_date_min','transaction_date_max','membership_expire_date_max','last_login'],
                   fix_imbalance=False ,#ignore_features=['msno'],
                   n_jobs=-1,use_gpu=False,
                   log_experiment=True,experiment_name='lead_scoring_model_experimentation',
                   log_plots=True, log_data=True,
                   silent=True, verbose=True,
                   log_profile=False)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,app_complete_flag
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(227016, 12)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


2023/04/16 06:17:40 INFO mlflow.tracking.fluent: Experiment with name 'lead_scoring_model_experimentation' does not exist. Creating a new experiment.


### 4 : Model Experimentation with pycaret

In [16]:
# create a experimentation with pycaret and exclude ['gbc','knn','qda', 'dummy', 'svm', 'ada']
best_model = compare_models(fold = 5,exclude=['gbc','knn','qda', 'dummy', 'svm', 'ada'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7363,0.8142,0.8328,0.6999,0.7606,0.4719,0.4808,1.304
xgboost,Extreme Gradient Boosting,0.7361,0.8139,0.8335,0.6995,0.7606,0.4715,0.4806,8.894
rf,Random Forest Classifier,0.735,0.8117,0.8227,0.7018,0.7575,0.4694,0.4766,2.552
et,Extra Trees Classifier,0.7343,0.8109,0.8202,0.7018,0.7564,0.468,0.4749,3.116
dt,Decision Tree Classifier,0.7341,0.8104,0.82,0.7017,0.7563,0.4677,0.4745,0.162
lr,Logistic Regression,0.7093,0.777,0.8195,0.6734,0.7393,0.4178,0.4281,1.154
ridge,Ridge Classifier,0.6993,0.0,0.7955,0.6692,0.7269,0.3979,0.4053,0.084
lda,Linear Discriminant Analysis,0.6993,0.7637,0.7955,0.6692,0.7269,0.398,0.4054,0.402
nb,Naive Bayes,0.66,0.7229,0.8742,0.6138,0.7212,0.3181,0.352,0.534


In [17]:
# create a model which gives the highest accuracy
lgbm  = create_model('lightgbm', fold = 5) 

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Val,0,0.7391,0.8172,0.8341,0.7027,0.7628,0.4775,0.4862
CV-Val,1,0.7335,0.8133,0.8302,0.6975,0.7581,0.4663,0.4751
CV-Val,2,0.7374,0.8141,0.8317,0.7016,0.7611,0.4742,0.4827
CV-Val,3,0.733,0.8119,0.8337,0.6958,0.7586,0.4654,0.4749
CV-Val,4,0.7385,0.8147,0.8345,0.7019,0.7625,0.4763,0.4852
CV-Val,Mean,0.7363,0.8142,0.8328,0.6999,0.7606,0.4719,0.4808
CV-Val,Std,0.0025,0.0018,0.0016,0.0027,0.002,0.0051,0.0049
Train,,0.7375,0.817,0.8356,0.7004,0.762,0.4743,0.4836


In [29]:
# create feature importance plot
#features = data_for_model.columns.tolist()
#pd.DataFrame({'Value':lgbm.feature_importances_,'Feature':features}).sort_values(by="Value",ascending=False)
lgbm.feature_importances_

array([404, 655,  83,  78,  95,  57, 103, 118,  93,  80,  11,  15,  81,
        11,  32,  59,  47,  68,  70,  34,  21,  54,  19,  30,  40,  52,
        14,  87,  63,  41,  28,  59,  35,  55, 102,  41,  65], dtype=int32)

### 5 : Model Experimentation after dropping features

From the above feature tests we can claerly see that some of the features are not significant. We will now drop all the insignificant features and select only the significant ones.
The list of the significant features is 
['total_leads_droppped', 'city_tier', 'referred_lead', 'app_complete_flag', 'first_platform_c', 'first_utm_medium_c', 'first_utm_source_c'].
So now you will train your model with onlly these features.

Also note that in our previous experiments we saw that tree based models are our top performers. In case of tree based models we do not require transformationss normalization, scaling etc. So make sure that you use setup pycaret in the proper way. i.e, make sure that you use normalize = False and transformation = False.

In [11]:
# 
# ['total_leads_droppped', 'city_tier', 'referred_lead', 'app_complete_flag', 'first_platform_c', 'first_utm_medium_c', 'first_utm_source_c']
#
# Train the model using the features listed above. Since we are using tree models we do not require any transformaions 
# such as normalization, scaling etc.So make sure that you use setup pycaret in the proper way. i.e, make sure that you use 
# normalize = False and transformation = False.
# setup pycaret 
from pycaret.classification import *
exp_clf102 = setup(data=data_for_model, target = 'app_complete_flag',  fold_shuffle=True, 
                   session_id = 42,
                   normalize = False, 
                   transformation = False, 
                   ignore_low_variance = True,
                   remove_multicollinearity = True, multicollinearity_threshold = 0.95,
                   #date_features=['registration_init_time','transaction_date_min','transaction_date_max','membership_expire_date_max','last_login'],
                   fix_imbalance=False ,
                   ignore_features=['assistance_interaction', 'career_interaction', 'payment_interaction', 'social_interaction', 'syllabus_interaction'],
                   n_jobs=-1,use_gpu=False,
                   log_experiment=True,experiment_name='lead_scoring_model_experimentation_significant',
                   log_plots=True, log_data=True,
                   silent=True, verbose=True,
                   log_profile=False)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,app_complete_flag
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(227016, 12)"
5,Missing Values,False
6,Numeric Features,2
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


2023/04/16 07:27:47 INFO mlflow.tracking.fluent: Experiment with name 'lead_scoring_model_experimentation_significant' does not exist. Creating a new experiment.


In [12]:
# create a experimentation with pycaret and exclude ['gbc','knn','qda', 'dummy', 'svm', 'ada']. 
best_model_significant = compare_models(fold = 5,exclude=['gbc','knn','qda', 'dummy', 'svm', 'ada'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7363,0.8142,0.8328,0.6999,0.7606,0.4719,0.4808,1.392
xgboost,Extreme Gradient Boosting,0.7361,0.8139,0.8335,0.6995,0.7606,0.4715,0.4806,7.366
rf,Random Forest Classifier,0.735,0.8117,0.8227,0.7018,0.7575,0.4694,0.4766,2.378
et,Extra Trees Classifier,0.7343,0.8109,0.8202,0.7018,0.7564,0.468,0.4749,2.908
dt,Decision Tree Classifier,0.7341,0.8104,0.82,0.7017,0.7563,0.4677,0.4745,0.16
lr,Logistic Regression,0.7093,0.777,0.8195,0.6734,0.7393,0.4178,0.4281,1.106
ridge,Ridge Classifier,0.6993,0.0,0.7955,0.6692,0.7269,0.3979,0.4053,0.072
lda,Linear Discriminant Analysis,0.6993,0.7637,0.7955,0.6692,0.7269,0.398,0.4054,0.278
nb,Naive Bayes,0.66,0.7229,0.8742,0.6138,0.7212,0.3181,0.352,0.452


In [23]:
# You should get lightgbm as the best performing model. So now we will train a lightGBM model manually using pycaret
lgbm  = create_model('lightgbm', fold = 10) 

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Val,0,0.7418,0.8178,0.8339,0.7061,0.7647,0.4831,0.4913
CV-Val,1,0.7374,0.8173,0.8359,0.7002,0.762,0.4742,0.4835
CV-Val,2,0.7339,0.8128,0.8287,0.6985,0.7581,0.4672,0.4757
CV-Val,3,0.7322,0.8133,0.8289,0.6965,0.757,0.4638,0.4726
CV-Val,4,0.739,0.8138,0.832,0.7034,0.7623,0.4773,0.4856
CV-Val,5,0.7354,0.8149,0.8297,0.7,0.7594,0.4703,0.4787
CV-Val,6,0.7323,0.8141,0.8341,0.6949,0.7582,0.4639,0.4737
CV-Val,7,0.732,0.8097,0.8317,0.6953,0.7574,0.4633,0.4726
CV-Val,8,0.7393,0.8163,0.8329,0.7034,0.7627,0.478,0.4864
CV-Val,9,0.737,0.8135,0.8389,0.6987,0.7624,0.4732,0.4832


In [16]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 2.1 MB/s eta 0:00:01
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Installing collected packages: cmaes, optuna
Successfully installed cmaes-0.9.1 optuna-3.1.1


In [33]:
from optuna.distributions import IntDistribution

# Define the search space for the hyperparameters
param_distributions = {
    'num_leaves': IntDistribution(low=10, high=100)
}

In [34]:
# Tune the hyper parameters of the lightgbm model using optuna on 10 folds and optimise AUC as that was our system metric, 
# hence we will optimise AUC
tuned_lgbm, tuner_0 = tune_model(lgbm, 
                            fold = 10,                                    
                            search_library='optuna',
                            search_algorithm='random',
                            optimize = 'AUC',
                            custom_grid = param_distributions,
                            choose_better = True, 
                            return_tuner=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Val,0,0.7419,0.8173,0.8353,0.7057,0.765,0.4832,0.4917
CV-Val,1,0.7372,0.8173,0.8325,0.7011,0.7612,0.4738,0.4825
CV-Val,2,0.7342,0.8129,0.8289,0.6988,0.7583,0.4678,0.4762
CV-Val,3,0.7323,0.8132,0.83,0.6962,0.7572,0.464,0.4729
CV-Val,4,0.7396,0.8137,0.8326,0.7039,0.7629,0.4786,0.4869
CV-Val,5,0.7352,0.8149,0.8282,0.7002,0.7589,0.4698,0.478
CV-Val,6,0.7331,0.8143,0.8335,0.6959,0.7585,0.4654,0.475
CV-Val,7,0.7327,0.8099,0.8311,0.6963,0.7578,0.4647,0.4738
CV-Val,8,0.74,0.8164,0.8336,0.7041,0.7634,0.4795,0.4879
CV-Val,9,0.7371,0.8133,0.8354,0.7,0.7617,0.4735,0.4828


In [38]:
# Print the final models configuration so that we can use it in the model retraining pipeline 
print(tuner_0)

OptunaSearchCV(callbacks=None,
               cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
               enable_pruning=False, error_score='raise',
               estimator=Pipeline(memory=None,
                                  steps=[('empty_step', 'passthrough'),
                                         ('actual_estimator',
                                          LGBMClassifier(boosting_type='gbdt',
                                                         class_weight=None,
                                                         colsample_bytree=1.0,
                                                         importance_type='split',
                                                         learning_rate=0.1,
                                                         max_depth=-1,
                                                         m...
               max_iter=10, n_jobs=-1, n_trials=10,
               param_distributions={'actual_estimator__num_leaves': IntDis