<a href="https://colab.research.google.com/github/nileshmalode11/Model-Monitoring-classification/blob/main/Model_monitoring_on_classification_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nannyml

Collecting nannyml
  Downloading nannyml-0.9.1-py3-none-any.whl (20.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m80.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting APScheduler<4.0.0,>=3.9.1 (from nannyml)
  Downloading APScheduler-3.10.3-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting FLAML<2.0.0,>=1.0.11 (from nannyml)
  Downloading FLAML-1.2.4-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.5/260.5 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Jinja2<3.1 (from nannyml)
  Downloading Jinja2-3.0.3-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.6/133.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting analytics-python<2.0.0,>=1.4.0 (from nannyml)
  Downloading analytics_python-1.4.post1-py2.py3-none-any.whl (23 kB)

In [None]:
import numpy as np
import pandas as pd
import random
import datetime as dt
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
import nannyml as nml

### **Reading Dataset**

In [None]:
df = pd.read_csv("/content/hmeq.csv")
df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


### **Data Understanding**

In [None]:
df = pd.get_dummies(df).dropna()

In [None]:
df.shape


(3515, 19)

In [None]:
df.isnull().sum()

BAD               0
LOAN              0
MORTDUE           0
VALUE             0
YOJ               0
DEROG             0
DELINQ            0
CLAGE             0
NINQ              0
CLNO              0
DEBTINC           0
REASON_DebtCon    0
REASON_HomeImp    0
JOB_Mgr           0
JOB_Office        0
JOB_Other         0
JOB_ProfExe       0
JOB_Sales         0
JOB_Self          0
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3515 entries, 5 to 5959
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   BAD             3515 non-null   int64  
 1   LOAN            3515 non-null   int64  
 2   MORTDUE         3515 non-null   float64
 3   VALUE           3515 non-null   float64
 4   YOJ             3515 non-null   float64
 5   DEROG           3515 non-null   float64
 6   DELINQ          3515 non-null   float64
 7   CLAGE           3515 non-null   float64
 8   NINQ            3515 non-null   float64
 9   CLNO            3515 non-null   float64
 10  DEBTINC         3515 non-null   float64
 11  REASON_DebtCon  3515 non-null   uint8  
 12  REASON_HomeImp  3515 non-null   uint8  
 13  JOB_Mgr         3515 non-null   uint8  
 14  JOB_Office      3515 non-null   uint8  
 15  JOB_Other       3515 non-null   uint8  
 16  JOB_ProfExe     3515 non-null   uint8  
 17  JOB_Sales       3515 non-null   u

In [None]:
df["BAD"].value_counts()

0    3206
1     309
Name: BAD, dtype: int64

### **Data Balancing**

In [None]:
over = RandomOverSampler(sampling_strategy=0.8)
df = pd.concat(over.fit_resample(df[df.columns[1:]], df["BAD"]), axis=1)
under = RandomUnderSampler(sampling_strategy=0.9)
df = pd.concat(under.fit_resample(df[df.columns[:-1]], df["BAD"]), axis=1)

In [None]:
df["BAD"].value_counts()

0    2848
1    2564
Name: BAD, dtype: int64

### **train test splitting**

In [None]:
X_train, X_rest, y_train, y_rest = train_test_split(df[df.columns[:-1]],
                                                    df["BAD"],
                                                    stratify=df["BAD"],
                                                    test_size=0.5)
X_test, X_prod, y_test, y_prod = train_test_split(X_rest,
                                                  y_rest,
                                                  stratify=y_rest,
                                                  test_size=0.2)

### **Model Fiting**

In [None]:
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense,Input
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD

In [None]:
model = Sequential()

# The Input Layer :
model.add(Dense(128, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))

# The Hidden Layers :
model.add(Dense(256, kernel_initializer='normal',activation='relu'))
model.add(Dense(256, kernel_initializer='normal',activation='relu'))
model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer :
model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

In [None]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split = 0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7cdddef2f490>

In [None]:
X_test['partition'], X_test['BAD'] = 'reference', y_test.values
X_prod['partition'], X_prod['BAD'] = 'analysis', y_prod.values
X_prod['YOJ'] = X_prod['YOJ'] * (random.uniform(0, 12) + 0.4)
X_prod['MORTDUE'] = X_prod['MORTDUE'] * random.uniform(0, 5)
X_prod['CLAGE'] = X_prod['CLAGE'] * 0.1
nanny_ml_df = pd.concat([X_test, X_prod])
nanny_ml_df['Time'] = [dt.datetime(2020,1,1) + dt.timedelta(hours=x+5)
                       for x in range(len(nanny_ml_df))]
nanny_ml_df['identifier'] = nanny_ml_df.index
nanny_ml_df['y_pred_proba'] = logreg.predict_proba(nanny_ml_df[nanny_ml_df.columns[:-4]])[:,1]
nanny_ml_df['y_pred'] = nanny_ml_df['y_pred_proba'].map(lambda p: int(p >= 0.75))

reference = nanny_ml_df[nanny_ml_df['partition']=='reference'].copy()
analysis = nanny_ml_df[nanny_ml_df['partition']=='analysis'].copy()



In [None]:
for partition_name, partition_data in nanny_ml_df.groupby('partition', sort=False):
    print(partition_name, recall_score(partition_data['BAD'], partition_data['y_pred']))

reference 0.004878048780487805
analysis 0.0


### **Model MOnetoring**

In [None]:
CBPE

In [None]:
metadata = nml.CBPE(reference,
                                exclude_columns=['identifier', 'y_pred_proba',
                                                 'y_pred', 'BAD'],
                                model_type='classification_binary')
metadata.target_column_name = 'BAD'
metadata.timestamp_column_name = 'Time'

TypeError: ignored

In [None]:
estimator = nml.CBPE(
    problem_type='classification_binary',
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='BAD',
    metrics=['roc_auc'],
)

In [None]:
estimator = estimator.fit(reference)
estimated_performance = estimator.estimate(analysis)

CalculatorException: ignored

In [None]:
figure = estimated_performance.plot()
figure.show()

In [None]:
df.columns

Index(['LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ',
       'CLNO', 'DEBTINC', 'REASON_DebtCon', 'REASON_HomeImp', 'JOB_Mgr',
       'JOB_Office', 'JOB_Other', 'JOB_ProfExe', 'JOB_Sales', 'JOB_Self',
       'BAD'],
      dtype='object')

In [None]:
features = ['LOAN','MORTDUE','VALUE','YOJ','DEROG','DELINQ','CLAGE','NINQ','CLNO','DEBTINC','REASON_DebtCon',
            'REASON_HomeImp','JOB_Mgr','JOB_Office','JOB_Other','JOB_ProfExe','JOB_Sales','JOB_Self']

univariate_calculator = nml.UnivariateDriftCalculator(
    column_names=features,
)

univariate_calculator.fit(reference)
univariate_drift = univariate_calculator.calculate(analysis)

In [None]:
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(univariate_drift)
display(alert_count_ranked_features.head())

Unnamed: 0,number_of_alerts,column_name,rank
0,20,YOJ,1
1,19,DEBTINC,2
2,17,LOAN,3
3,16,NINQ,4
4,16,CLNO,5


In [None]:
figure = univariate_drift.filter(column_names=['YOJ','DEBTINC', 'CLNO']).plot()
figure.show()

In [None]:
uni_drift_AGEP_analysis = univariate_drift.filter(column_names=['DEBTINC'], period='analysis')
figure = estimated_performance.compare(uni_drift_AGEP_analysis).plot()
figure.show()

In [None]:
figure = univariate_drift.filter(period='analysis', column_names=['YOJ','DEBTINC', 'CLNO']).plot(kind='distribution')
figure.show()

### **Comparing Estimated with Realized Performance when Targets Arrive**

In [None]:
_, _, analysis_targets = nml.load_us_census_ma_employment_data()

df_analysis_with_targets = pd.concat([analysis, analysis_targets], axis=1)
display(df_analysis_with_targets.head())

Unnamed: 0,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,...,JOB_ProfExe,JOB_Sales,JOB_Self,partition,BAD,Time,identifier,y_pred_proba,y_pred,employed
3574,28200.0,12112.013281,109762.0,55.286976,0.0,1.0,13.096957,0.0,21.0,60.295413,...,0.0,0.0,0.0,analysis,1.0,2020-03-31 09:00:00,3574.0,0.84,1.0,0
788,15600.0,10738.116542,92443.0,41.465232,0.0,0.0,12.80296,0.0,13.0,28.484859,...,1.0,0.0,0.0,analysis,0.0,2020-03-31 10:00:00,788.0,0.22,0.0,0
1443,18900.0,18856.79674,120119.0,48.376104,0.0,0.0,29.275174,1.0,19.0,30.681689,...,0.0,0.0,0.0,analysis,0.0,2020-03-31 11:00:00,1443.0,0.25,0.0,1
4615,13700.0,13557.402384,92427.0,48.376104,0.0,3.0,18.13058,0.0,33.0,39.272466,...,0.0,0.0,0.0,analysis,1.0,2020-03-31 12:00:00,4615.0,0.52,0.0,1
447,18400.0,8953.915799,69559.0,138.21744,0.0,0.0,17.801583,0.0,23.0,39.707988,...,1.0,0.0,0.0,analysis,0.0,2020-03-31 13:00:00,447.0,0.31,0.0,1


In [None]:
performance_calculator = nml.PerformanceCalculator(
    problem_type='classification_binary',
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='BAD',
    metrics=['roc_auc'])

performance_calculator.fit(reference)
calculated_performance = performance_calculator.calculate(df_analysis_with_targets)

figure = estimated_performance.filter(period='analysis').compare(calculated_performance).plot()
figure.show()



In [None]:
rcerror_calculator = (nml.DataReconstructionDriftCalculator(model_metadata=estimator)
                         .fit(reference_data=reference))
rcerror_results = rcerror_calculator.calculate(data=pd.concat([reference, analysis],
                                               ignore_index=True))
fig = rcerror_results.plot()
fig.show()

TypeError: ignored