### 1 : Importing Libraries

In [None]:
# ignore the warnings in the output
import warnings
warnings.filterwarnings("ignore")

In [None]:
# import libraries and packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import mlflow
from pycaret.classification import *

### 2 : Reading Data

In [None]:
##Copy the cleaned data obtained from the data_cleaning notebook into the Data folder before proceeding
%time
data = pd.read_csv('data/cleaned_data.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Drop the date column as it is not needed for training.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running the cell
# again is a no-op instead of raising KeyError.
data = data.drop(columns=['created_date'], errors='ignore')

In [None]:
# Summarise the frame after dropping 'created_date'.
data.info()

### 3 : Setting up Environment

In [None]:
# create a connection and setup a SQLite database with the name "lead_scoring_model_experimentation.db" in 
# 'Assignment/02_training_pipeline/notebooks/' location

In [None]:
# import os
# import sqlite3

# # Get current directory (where notebook is running)
# db_dir = os.getcwd()  # this will be '02_training_pipeline/notebooks' if running from the notebook

# # Define DB name and full path
# db_name = 'lead_scoring_model_experimentation.db'
# db_path = os.path.join(db_dir, db_name)

# # Create a connection to the SQLite database
# conn = sqlite3.connect(db_path)

# # Print confirmation
# print(f"Database created at: {db_path}")

# # Close connection
# conn.close()

In [None]:
'''
Now you need to start the MLflow server in a new terminal. 
Note: Before you start the MLflow server, create a folder named mlruns in the assignment directory.
Now you need to run the command to start MLflow server such that:
1. The lead_scoring_model_experimentation.db which you created above is used as the backend-store.
2. mlruns folder is used as an artifact directory. 
3. The server runs on the port 6006.

The steps to do so are as follows:
Open a new terminal.
Then go to the Assignment directory using the cd command. Type the command: cd Assignment/
Create a folder named mlruns here. You can create this folder using either the command line or GUI. To create this folder via the command line run the command: mkdir ./mlruns
Then, type the following command to start the MLflow server: 
mlflow server --backend-store-uri='sqlite:///./02_training_pipeline/notebooks/lead_scoring_model_experimentation.db' --default-artifact-root="./mlruns" --port=6006 --host=0.0.0.0
'''

In [None]:
# Once, your server is successfully running, create a mlflow tracking uri at "http://0.0.0.0:6006"

**My working MLflow commands**

```bash
mlflow ui --backend-store-uri="sqlite:///02_training_pipeline/notebooks/lead_scoring_model_experimentation.db" \
  --default-artifact-root="file:///home/CodePro-Lead-Scoring2/02_training_pipeline/notebooks/mlruns" \
  --port=5001
```

```bash
mlflow ui --backend-store-uri="sqlite:///02_training_pipeline/notebooks/lead_scoring_model_experimentation.db" \
  --default-artifact-root="file:///home/CodePro-Lead-Scoring2/02_training_pipeline/notebooks/mlruns" \
  --port=5001 \
  --host=0.0.0.0 \
  --gunicorn-opts="--log-level=ERROR" \
  2>/dev/null
```

```bash
mlflow ui --port=5001 --host=0.0.0.0
```

**My MLflow URL**

https://76a7d32852700.notebooks.jarvislabs.net/proxy/5001/#/experiments/0

In [None]:
# setup pycaret

In [None]:
# # Check mlflow URI
# import mlflow
# print("Tracking URI:", mlflow.get_tracking_uri())
# print("Artifact URI:", mlflow.get_artifact_uri())


In [None]:
# Point the MLflow client at the tracking server. Port 5001 matches the `--port=5001`
# used in the `mlflow ui` commands recorded in the notes above.
# mlflow is already imported in the first import cell; it is repeated here so this cell
# can be run on its own.
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5001")

In [None]:
from pycaret.classification import *

# Initialise the first PyCaret experiment (PyCaret 3.3.2 keyword names) with
# normalisation, transformation and multicollinearity removal switched on.
exp = setup(
    # data and target
    data=data,
    target='app_complete_flag',
    # reproducibility and resources
    session_id=42,
    fold_shuffle=True,
    n_jobs=4,
    use_gpu=False,
    # preprocessing
    normalize=True,
    transformation=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    # experiment logging
    log_experiment=True,
    # experiment_name='Lead_Scoring_Model_Experimentation',
    log_plots=True,
    log_data=True,
    log_profile=False,
    verbose=True,
)

### 4 : Model Experimentation with pycaret

In [None]:
# Run a PyCaret model comparison ranked by AUC, excluding
# ['gbc', 'knn', 'qda', 'dummy', 'svm', 'ada'].
best_model = compare_models(
    exclude=['gbc', 'knn', 'qda', 'dummy', 'svm', 'ada'],
    sort='AUC',
)

In [None]:
# Create the model that compare_models ranked first above. Note that the ranking
# there is by AUC (sort='AUC'), not by accuracy. Uses 5 folds.
final_model = create_model(best_model, fold=5)

In [None]:
# Print the selected model's configuration.
print(final_model)

In [None]:
# Create the feature importance plot ('feature_all' is PyCaret's all-features variant).
plot_model(final_model, plot='feature_all')

In [None]:
# AUC plot for the selected model.
plot_model(final_model, plot = 'auc')


In [None]:
# Confusion matrix for the selected model; 'percent': True is forwarded to the plot
# (presumably shows cell values as percentages — confirm against the rendered output).
plot_model(final_model, plot = 'confusion_matrix', plot_kwargs = {'percent' : True})


### 5 : Model Experimentation after dropping features

From the above feature tests we can clearly see that some of the features are not significant. We will now drop all the insignificant features and keep only the significant ones.
The list of significant features is
['total_leads_droppped', 'city_tier', 'referred_lead', 'app_complete_flag', 'first_platform_c', 'first_utm_medium_c', 'first_utm_source_c'].
So now you will train your model with only these features.

Also note that in our previous experiments we saw that tree-based models are our top performers. Tree-based models do not require transformations such as normalization or scaling, so make sure that you set up PyCaret accordingly, i.e., use normalize = False and transformation = False.

In [None]:
# 
# ['total_leads_droppped', 'city_tier', 'referred_lead', 'app_complete_flag', 'first_platform_c', 'first_utm_medium_c', 'first_utm_source_c']
#
# Train the model using the features listed above. Since we are using tree models we do not require any transformations
# such as normalization, scaling, etc. So make sure that you set up PyCaret accordingly, i.e., make sure that you use
# normalize = False and transformation = False.

In [None]:
# Restrict the frame to the significant columns. 'app_complete_flag' stays in the list
# because it is the target column passed to setup().
significant_features = [
    'total_leads_droppped',
    'city_tier',
    'referred_lead',
    'app_complete_flag',
    'first_platform_c',
    'first_utm_medium_c',
    'first_utm_source_c',
]
data = data[significant_features]

In [None]:
# Initialise a second PyCaret experiment for tree-based models: normalisation and
# transformation are switched off, everything else mirrors the first setup.
exp_tree = setup(
    # data and target
    data=data,
    target='app_complete_flag',
    # reproducibility and resources
    session_id=42,
    fold_shuffle=True,
    n_jobs=5,
    use_gpu=False,
    # preprocessing (no scaling/transformation for tree models)
    normalize=False,
    transformation=False,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    # experiment logging
    log_experiment=True,
    log_plots=True,
    log_data=True,
    log_profile=False,
    verbose=True,
)

In [None]:
# Run a PyCaret model comparison, excluding ['gbc', 'knn', 'qda', 'dummy', 'svm', 'ada'].

In [None]:
# Compare the remaining models on the reduced feature set, ranked by AUC.
best_tree_model = compare_models(
    exclude=['gbc', 'knn', 'qda', 'dummy', 'svm', 'ada'],
    sort='AUC',
)

In [None]:
# You should get lightgbm as the best performing model. So now we will train a lightGBM model manually using pycaret

In [None]:
# Train LightGBM explicitly by its PyCaret model id; no fold or other overrides are passed.
lightgbm_model = create_model('lightgbm')

In [None]:
# Tune the hyper parameters of the lightgbm model using optuna on 10 folds and optimise AUC as that was our system metric, 
# hence we will optimise AUC

In [None]:
# Tune the LightGBM model with Optuna over 10 folds, optimising AUC.
#
# No `custom_grid` is passed. The call used to pass `custom_grid=param_grid`, but the only
# definition of `param_grid` was commented out, so the cell raised NameError on a fresh
# kernel. Leaving `custom_grid` at its default lets tune_model use its own search space.
# NOTE(review): to constrain the search, define a grid in a format supported by the
# chosen search_library and pass it via `custom_grid=` — confirm against the PyCaret docs.
tuned_lightgbm = tune_model(lightgbm_model, optimize='AUC', fold=10, search_library='optuna')

In [None]:
# Print the final models configuration so that we can use it in the model retraining pipeline 

In [None]:
# Print the tuned model's configuration for reuse in the model retraining pipeline.
print(tuned_lightgbm)