### 1. Importing Libraries

In [None]:
import warnings
# Silence every warning category for the rest of the kernel session
# (presumably to keep library deprecation noise out of the cell outputs).
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import mlflow
from pycaret.classification import *

### 2. Reading Data

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('data/cleaned_data.csv')

In [None]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

In [None]:
# Remove the 'created_date' column before modelling.
data = data.drop(columns=['created_date'])

In [None]:
# Summarise column dtypes and non-null counts after dropping 'created_date'.
data.info()

### 3. Starting the MLflow Server

Now you need to start the MLflow server in a new terminal.  
**Note:** Before you start the MLflow server, create the artifact folder `/home/mlruns` (the path used by the commands and code in this notebook).

You need to run the command to start the MLflow server such that:

1. The SQLite database `/home/mlflow/lead_scoring.db` (created in the *Setting up Environment* section of this notebook) is used as the backend store.
2. `mlruns` folder is used as an artifact directory.
3. The server runs on port **6006**

### 4. Steps to Follow

1. Open a **new terminal**

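2. Create the artifact folder `/home/mlruns` and the folder that holds the database file (`-p` makes the command safe to re-run)

   ```bash
   mkdir -p /home/mlruns /home/mlflow
   ```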

3. Start the MLflow server by running the following command:

   ```bash
   mlflow server \
      --backend-store-uri='sqlite:////home/mlflow/lead_scoring.db' \
      --default-artifact-root="/home/mlruns" \
      --port=6006 \
      --host=0.0.0.0
   ```

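4. In the notebook, point the MLflow tracking URI to the server. `0.0.0.0` is only the address the server binds to (all interfaces); clients should connect through the loopback address:

   ```
   http://127.0.0.1:6006
   ```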


### 5. Setting up Environment

In [None]:
import os
import sqlite3

# Locations used for the MLflow backend store (SQLite file) and the artifact folder.
mlflow_db = '/home/mlflow/lead_scoring.db'
mlruns_path = '/home/mlruns'

db_dir = os.path.dirname(mlflow_db)

# The folder that will hold the database file has to exist first.
if not os.path.exists(db_dir):
    os.makedirs(db_dir, exist_ok=True)

# Create the SQLite database file by opening and immediately closing a connection,
# unless a file is already present at that path.
if os.path.exists(mlflow_db):
    print(f"Database already exists at: {mlflow_db}")
else:
    sqlite3.connect(mlflow_db).close()
    print(f"Database created at: {mlflow_db}")

# Same check-then-create step for the artifact folder.
if os.path.exists(mlruns_path):
    print(f"Directory already exists: {mlruns_path}")
else:
    os.makedirs(mlruns_path, exist_ok=True)
    print(f"Created directory: {mlruns_path}")

#### Start MLflow from within the notebook

In [None]:
import subprocess
import socket
import os

def is_port_in_use(port, host="127.0.0.1", timeout=1.0):
    """Return True if something accepts TCP connections on ``host:port``.

    Args:
        port: TCP port number to probe.
        host: Address to connect to. Defaults to the IPv4 loopback address,
            which also reaches servers bound to all interfaces (``0.0.0.0``).
        timeout: Seconds to wait for the connection attempt; a timed-out
            attempt is reported as "not in use".

    Returns:
        bool: True when the connection attempt succeeds, False otherwise.
    """
    # One fresh socket per probe: calling connect again on a socket whose
    # previous connect failed is not portable, and 0.0.0.0 is a bind address
    # rather than a valid destination on every platform.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.settimeout(timeout)
        return probe.connect_ex((host, port)) == 0

mlflow_port = 6006

if not is_port_in_use(mlflow_port):
    print(f"Starting MLflow server on port {mlflow_port}...")
    # Launch the server as a background process; Popen returns immediately.
    mlflow_server = subprocess.Popen([
        "mlflow", "server",
        "--backend-store-uri", "sqlite:////home/mlflow/lead_scoring.db",
        "--default-artifact-root", "/home/mlruns",
        "--port", str(mlflow_port),
        "--host", "0.0.0.0"
    ])

    # Block until the server actually accepts connections, so that the cells
    # that log to MLflow do not race against its start-up.
    poll_interval_s = 0.5
    startup_timeout_s = 60
    for _ in range(int(startup_timeout_s / poll_interval_s)):
        if is_port_in_use(mlflow_port):
            print(f"MLflow server is accepting connections on port {mlflow_port}")
            break
        try:
            # Doubles as the sleep between polls and as a crash detector:
            # wait() only returns if the server process has already exited.
            exit_code = mlflow_server.wait(timeout=poll_interval_s)
        except subprocess.TimeoutExpired:
            continue  # still running, keep polling
        raise RuntimeError(
            f"MLflow server exited during start-up with code {exit_code}"
        )
    else:
        raise TimeoutError(
            f"MLflow server did not open port {mlflow_port} "
            f"within {startup_timeout_s} seconds"
        )
else:
    print(f"MLflow is already running on port {mlflow_port}")

In [None]:
import mlflow
# Connect through the loopback address: 0.0.0.0 is the server's bind address
# ("all interfaces") and is not a portable destination for client requests.
mlflow.set_tracking_uri("http://127.0.0.1:6006")

### 6. Pycaret experiment setup

In [None]:
from pycaret.classification import setup, compare_models, create_model, tune_model

# Experiment 1: PyCaret setup on the full frame, logged under
# 'Experiment_with_all_features'.
exp = setup(
    # data and target
    data=data,
    target='app_complete_flag',
    # reproducibility and resources
    session_id=42,
    fold_shuffle=True,
    n_jobs=4,
    use_gpu=False,
    # preprocessing: no scaling or power transform, drop highly collinear columns
    normalize=False,
    transformation=False,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    # experiment logging
    experiment_name='Experiment_with_all_features',
    log_experiment=True,
    log_plots=True,
    log_data=True,
    log_profile=False,
    # console / interaction
    verbose=True,
    silent=True,
)

### 7. Model Experimentation with pycaret

We are excluding the following models from the comparison:

```python
['gbc', 'knn', 'qda', 'dummy', 'svm', 'ada']
```

In [None]:

# Compare all candidate models except the excluded ones, using 5 folds and
# ranking the results by AUC.
best_model = compare_models(
    exclude=['gbc', 'knn', 'qda', 'dummy', 'svm', 'ada'],
    sort='AUC',
    fold=5,
)

In [None]:
# Re-create the top model returned by compare_models (which was ranked by AUC,
# not accuracy) using 5-fold cross-validation.
final_model = create_model(best_model, fold=5)

In [None]:
# Show the text representation of the selected model.
print(final_model)

In [None]:
# Feature-importance plot for the selected model ('feature_all' variant).
plot_model(final_model, plot='feature_all')

In [None]:
# AUC plot for the selected model.
plot_model(final_model, plot = 'auc')

In [None]:
# Confusion matrix for the selected model; plot_kwargs asks for percentages.
plot_model(final_model, plot = 'confusion_matrix', plot_kwargs = {'percent' : True})

### 8. Model Experimentation after dropping features

#### Model Training with Selected Features

We will train the model using only the following significant features, together with the target column `app_complete_flag`:

```
[
    'total_leads_droppped',
    'city_tier',
    'referred_lead',
    'app_complete_flag',
    'first_platform_c', 
    'first_utm_medium_c', 
    'first_utm_source_c'
]
```

Since we are using **tree-based models**, we do **not** require any transformations such as normalization or scaling.

> Make sure to set up PyCaret with the correct configuration:
- `normalize = False`
- `transformation = False`

This ensures the model leverages the raw structure of the data, which tree-based algorithms handle effectively.

In [None]:
# Columns kept for the reduced experiment. The target 'app_complete_flag' is part
# of the list so that it is still present in the frame passed to setup().
significant_features = [
    'total_leads_droppped',
    'city_tier',
    'referred_lead',
    'app_complete_flag',
    'first_platform_c',
    'first_utm_medium_c',
    'first_utm_source_c',
]

# Keep only those columns; `data` is rebound to the narrower frame.
data = data.loc[:, significant_features]

In [None]:
from pycaret.classification import setup, compare_models, create_model, tune_model, get_config

# Experiment 2: same configuration as the first setup, run on the reduced frame
# and logged under 'Experiment_with_reduced_features'.
exp_tree = setup(
    # data and target
    data=data,
    target='app_complete_flag',
    # reproducibility and resources
    session_id=42,
    fold_shuffle=True,
    n_jobs=4,
    use_gpu=False,
    # preprocessing: no scaling or power transform, drop highly collinear columns
    normalize=False,
    transformation=False,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    # experiment logging
    experiment_name='Experiment_with_reduced_features',
    log_experiment=True,
    log_plots=True,
    log_data=True,
    log_profile=False,
    # console / interaction
    verbose=True,
    silent=True,
)

In [None]:
# Limit the comparison to these tree-based estimators and rank them by AUC.
tree_models = [
    'rf',
    'et',
    'xgboost',
    'lightgbm',
    'dt',
]
best_tree_model = compare_models(sort='AUC', include=tree_models)

In [None]:
# Re-create the best tree-based model. Note: this rebinds `final_model`, replacing
# the model selected in the all-features experiment earlier in the notebook.
final_model = create_model(best_tree_model)

In [None]:
# Hyperparameter tuning of the reduced-feature model: Optuna as the search
# library, AUC as the metric to optimise, 10 folds.
tuned_final_model = tune_model(
    final_model,
    search_library='optuna',
    optimize='AUC',
    fold=10,
)

In [None]:
# Show the text representation of the tuned model.
print(tuned_final_model)

In [None]:
# Optional teardown, left disabled: the commented-out helper below uses psutil to
# find the process listening on a given port and terminate it. The last line would
# apply it to port 6006, the port used for the MLflow server in this notebook.
# Uncomment the whole cell to use it.

# import psutil

# def kill_process_on_port(port):
#     """Find and kill process using the specified port."""
#     for proc in psutil.process_iter(['pid', 'name', 'connections']):
#         try:
#             for conn in proc.info['connections']:
#                 if conn.status == psutil.CONN_LISTEN and conn.laddr.port == port:
#                     print(f"Killing process '{proc.info['name']}' with PID {proc.info['pid']} on port {port}")
#                     psutil.Process(proc.info['pid']).terminate()
#                     return
#         except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
#             continue
#     print(f"No process found on port {port}.")

# # Kill MLflow server running on port 6006
# kill_process_on_port(6006)