### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sys
#import gc
#import time
import mlflow
import sklearn
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
import warnings
import re
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import (train_test_split, GridSearchCV)
from mlflow.models.signature import infer_signature 
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
print('Libraries versions:')
print('Python                : ' + sys.version)
print('NumPy                 : ' + np.version.full_version)
print('Pandas                : ' + pd.__version__)
print('mlflow                : ' + mlflow.__version__)
print('matplotlib            : ' + matplotlib.__version__)
print('Seaborn               : ' + sns.__version__)

Libraries versions:
Python                : 3.10.7 (tags/v3.10.7:6cc6b13, Sep  5 2022, 14:08:36) [MSC v.1933 64 bit (AMD64)]
NumPy                 : 1.23.5
Pandas                : 1.4.2
mlflow                : 2.2.2
matplotlib            : 3.5.2
Seaborn               : 0.12.0


In [3]:
def main_inf(df):
    print("DataFrame shape: ", df.shape)
    print("Nan rate: ", df.isna().mean().mean())
    print("Doublons: ", df.duplicated().sum())  

### Import cleaned dataset

In [4]:
df =pd.read_csv('cleaned.csv').drop(columns='Unnamed: 0')
main_inf(df)
categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
df=df.drop(columns=categorical_columns)
df=df.dropna()

DataFrame shape:  (307507, 171)
Nan rate:  0.07917560456047813
Doublons:  0


In [5]:
df.shape

(35185, 161)

In [6]:
df.head()

Unnamed: 0,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT
13,0.0,0.0,1.0,1.0,1.0,225000.0,918468.0,28966.5,697500.0,0.016612,...,13508.9205,405267.615,11783.52,16967.295,13508.9205,405267.615,-21.0,-962.533333,-28876.0,30.0
22,0.0,1.0,0.0,1.0,1.0,450000.0,497520.0,32521.5,450000.0,0.020713,...,119294.595,954356.76,23531.175,770004.315,119294.595,954356.76,-425.0,-1105.875,-8847.0,8.0
25,0.0,1.0,0.0,0.0,0.0,90000.0,225000.0,11074.5,225000.0,0.028663,...,11820.645,70923.87,9035.685,17846.145,11820.645,70923.87,-233.0,-730.666667,-4384.0,6.0
32,0.0,1.0,0.0,1.0,0.0,90000.0,199008.0,20893.5,180000.0,0.010032,...,19217.569091,422786.52,5305.995,35172.135,19217.569091,422786.52,-235.0,-1247.409091,-27443.0,22.0
35,0.0,1.0,0.0,1.0,0.0,112500.0,450000.0,44509.5,450000.0,0.008575,...,6089.499,121789.98,130.365,7297.515,5359.29075,107185.815,-1101.0,-1722.7,-34454.0,20.0


### Pipeline

In [7]:
from sklearn import datasets, preprocessing, model_selection, ensemble, pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [8]:
num_columns = [col for col in df.columns if df[col].dtype != 'object']
num_columns.remove('TARGET')
df[num_columns]

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT
13,0.0,1.0,1.0,1.0,225000.0,918468.0,28966.5,697500.0,0.016612,-14086.0,...,13508.920500,405267.615,11783.520,16967.295,13508.920500,405267.615,-21.0,-962.533333,-28876.0,30.0
22,1.0,0.0,1.0,1.0,450000.0,497520.0,32521.5,450000.0,0.020713,-11146.0,...,119294.595000,954356.760,23531.175,770004.315,119294.595000,954356.760,-425.0,-1105.875000,-8847.0,8.0
25,1.0,0.0,0.0,0.0,90000.0,225000.0,11074.5,225000.0,0.028663,-19334.0,...,11820.645000,70923.870,9035.685,17846.145,11820.645000,70923.870,-233.0,-730.666667,-4384.0,6.0
32,1.0,0.0,1.0,0.0,90000.0,199008.0,20893.5,180000.0,0.010032,-12974.0,...,19217.569091,422786.520,5305.995,35172.135,19217.569091,422786.520,-235.0,-1247.409091,-27443.0,22.0
35,1.0,0.0,1.0,0.0,112500.0,450000.0,44509.5,450000.0,0.008575,-12158.0,...,6089.499000,121789.980,130.365,7297.515,5359.290750,107185.815,-1101.0,-1722.700000,-34454.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307454,1.0,0.0,0.0,0.0,112500.0,239850.0,23364.0,225000.0,0.031329,-15034.0,...,6535.671562,313712.235,21.195,24952.680,6310.604062,302908.995,-31.0,-1350.416667,-64820.0,48.0
307473,1.0,0.0,0.0,0.0,360000.0,796396.5,38443.5,643500.0,0.007020,-18463.0,...,10759.792119,2539310.940,1450.440,22045.095,10759.792119,2539310.940,-30.0,-1366.881356,-322584.0,236.0
307476,1.0,0.0,1.0,1.0,81000.0,1350000.0,39474.0,1350000.0,0.024610,-10567.0,...,34720.299545,1145769.885,0.450,1047495.375,34277.979545,1131173.325,-239.0,-2052.212121,-67723.0,33.0
307482,1.0,1.0,0.0,1.0,292500.0,355536.0,18283.5,270000.0,0.072508,-16010.0,...,35778.949773,787136.895,2789.235,53109.315,35778.949773,787136.895,-29.0,-1089.000000,-23958.0,22.0


In [9]:
#Categorical variables to use
cat_vars = [categorical_columns][0]
# Numerical Variables to use
num_vars = [num_columns][0]
num_vars

['CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'YEARS_BEGINEXPLUATATION_AVG',
 'FLOORSMAX_AVG',
 'YEARS_BEGINEXPLUATATION_MODE',
 'FLOORSMAX_MODE',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'FLOORSMAX_MEDI',
 'TOTALAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_

In [10]:
df.isna().mean().mean()

0.0

In [11]:
numeric_features = num_vars
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan,strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = cat_vars
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan,strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        #('cat', categorical_transformer, categorical_features)
    ])

In [12]:
clfP = Pipeline(steps=[('preprocessor', preprocessor), #LGBM classifier pipeline
                  ('classifier', LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        verbose=-1))])  

In [13]:
from sklearn.dummy import DummyClassifier
clfD=Pipeline(steps=[('preprocessor', preprocessor),#Dummy classifier pipeline
                  ('classifier', DummyClassifier(strategy="constant",random_state=None, constant=0))])

In [14]:
y=df['TARGET']
X = df.drop(columns=['TARGET'])

In [15]:
# apply SMOTE to the data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# reset the index of X_resampled and y_resampled
X_resampled = X_resampled.reset_index(drop=True)
y_resampled = y_resampled.reset_index(drop=True)


In [16]:
X_train_res,X_test_res,y_train_res,y_test_res = train_test_split(X_resampled,y_resampled,test_size=0.3)
X_train, X_test, y_train,y_test=train_test_split(X,y,test_size=0.3)

In [17]:
clfD.fit(X_train_res,y_train_res)

In [18]:
clfP.fit(X_train_res,y_train_res)

### Evaluating the models

In [19]:
ypred_testD=clfD.predict(X_test).tolist()

In [20]:
ypred_test=clfP.predict(X_test).tolist()

In [21]:
print(sklearn.metrics.classification_report(y_test, ypred_testD))#DuumyClassifier predicting 0 class metrics

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      9812
         1.0       0.00      0.00      0.00       744

    accuracy                           0.93     10556
   macro avg       0.46      0.50      0.48     10556
weighted avg       0.86      0.93      0.90     10556



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
print(sklearn.metrics.classification_report(y_test, ypred_test))#LGBM classifier metrics

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      9812
         1.0       0.96      0.72      0.82       744

    accuracy                           0.98     10556
   macro avg       0.97      0.86      0.90     10556
weighted avg       0.98      0.98      0.98     10556



In [23]:
lgbmMetrics=sklearn.metrics.classification_report(y_test, ypred_test,output_dict=True)
metrics=lgbmMetrics.get('1.0')#lgbm classifier metrics

In [24]:
y_pred_proba=clfP.named_steps['classifier'].predict_proba(X_test).tolist()#get probabilities from classifier

### Save and Log best model (LGBM) to Mlflow

In [25]:
from mlflow.models.signature import infer_signature 
signature = infer_signature(X_train, y_train)
#mlflow.sklearn.save_model(clf, 'mlflow_model', signature=signature)

In [26]:
if __name__ == "__main__": 
    import mlflow.sklearn
    with mlflow.start_run():
        #signature = infer_signature(X_train, y_train)
        mlflow.sklearn.save_model(clfP, 'mlflow_modelN', signature=signature)
        mlflow.sklearn.log_model(clfP, 'mlflow_modelN', signature=signature)
        mlflow.log_metrics(metrics)
        



### Request from the model deployed locally

In [31]:
import requests
DEPLOYED_MODEL = "http://127.0.0.1:5003/invocations"
headers = {"Content-Type": "application/json"}

inputs = (X_test).to_dict(orient="list")
prediction = requests.post(url=DEPLOYED_MODEL,
                            json={"inputs": inputs},
                           headers=headers)

# Extract the response data as a dictionary
response_data = prediction.json()

# Print the response data
print(response_data)


{'predictions': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [32]:
print(sklearn.metrics.classification_report(y_test, ypred_test))

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      9812
         1.0       0.96      0.72      0.82       744

    accuracy                           0.98     10556
   macro avg       0.97      0.86      0.90     10556
weighted avg       0.98      0.98      0.98     10556



In [34]:
print(sklearn.metrics.classification_report(y_test, response_data.get('predictions')))

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      9812
         1.0       0.96      0.72      0.82       744

    accuracy                           0.98     10556
   macro avg       0.97      0.86      0.90     10556
weighted avg       0.98      0.98      0.98     10556

