In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide plot styling: seaborn "notebook" context, "whitegrid" style with the axes grid switched off.
sns.set(context='notebook')
sns.set_style("whitegrid", {'axes.grid' : False})
# NOTE: the former plt.tight_layout() call was removed from this cell. With no figure open it
# only created and displayed an empty "<Figure ... with 0 Axes>". Call it on a real figure
# after plotting instead.
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc, recall_score, fbeta_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn import preprocessing
from scipy.stats import chi2_contingency
import math
import re
import pickle

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

<Figure size 640x480 with 0 Axes>

# Loading the prediction data, the models and other important data

<span style="color: red">Replace the <code>predict_url</code> and <code>column_dictionary_url</code> variables with the paths/URLs of the data you want to make predictions with and of its column dictionary, respectively.</span>

In [55]:
# Location of the data to score: replace with the path/URL of the data you want predictions for.
predict_url = 'InputData.csv'
# Read the file and keep only the rows whose 'label' column equals 'oot'.
data_predict = pd.read_csv(predict_url).loc[lambda frame: frame['label'] == 'oot']

# Location of the column dictionary: replace with the path/URL of the dictionary that describes the data.
column_dictionary_url = 'data_dictionary_v1.xlsx'
# Index the sheet by its 'Rename' column and convert it to a nested dict
# ({sheet column -> {'Rename' value -> cell value}}).
dictionary = (
    pd.read_excel(column_dictionary_url)
    .set_index('Rename')
    .to_dict()
)

# Load the models and the supporting data (derived during the training process) that the predictions rely on.

# Load the pickled LR model (per the file name; produced during training).
# SECURITY: pickle.load can execute arbitrary code -- only load model files from a trusted source.
# The context manager guarantees the file handle is closed once the model is loaded.
with open('Default_Predict_LR_model.sav', 'rb') as lr_model_file:
    lr_model = pickle.load(lr_model_file)
# Probability cut-off for the LR model: rows whose second predict_proba column is below it are labelled 0, otherwise 1.
decision_threshold_lr=0.470235
# Bin edges per feature, keyed by (renamed) column name. woe_out_of_sample.transform passes each
# list as `bins` to pd.cut; values that fall outside every interval become NaN there and are then
# relabelled 'missing values'.
# With pd.cut's defaults the intervals are right-closed, e.g. (-0.01, 1], (1, 3], ...; the -0.01
# lower edge presumably exists so that a value of 0 lands in the first bin -- TODO confirm.
# NOTE(review): 'distance_from_pin_code' starts at 0, so a value of exactly 0 falls outside every
# bin and is treated as 'missing values' -- confirm this is intended.
binsDict={'# NEGATIVE EVENTS IN LAST 6 MONTHS (LOANS)': [-0.01, 1, 3, 5, float("inf")],
              'CUMULATIVE MINIMUM BALANCE LAST 90 DAYS': [-0.01, 150, 450, 18300, float("inf")],
              'gms_version': [-0.01, 203915, 204516, 204713, 204714, float("inf")],
              'gms_sub_version': [-0.01, 100406, 120408, float("inf")],
              'AVG. OF MINIMUM BALANCE PER MONTH LAST 360 DAYS': [-0.01, 70, 700, 1800, 3600, 13000, 50000, float("inf")],
              'count_debit_transactions_last_360_days': [-0.01, 70, 85, 100, 130, 250, float("inf")],
              'AVG. MONTHLY DEBIT LAST 30 DAYS BY AVG. MONTHLY DEBIT LAST 90 DAYS': [-0.01, 1.7, 1.88, 2.15, 3, float("inf")],
              '# NEFT/RTGS/IMPS TRANSACTIONS LAST 360 DAYS': [-0.01, 2, 5, 15, float("inf")],
             'Last closing balance amount (overall)': [-float("inf"), 100, 3700, 7500, 13000, 35000, float("inf")],
             '# ATM TRANSACTIONS LIFETIME': [-0.01, 3, 8, 13, 33, float("inf")],
             'CURRENT LOAN LIABILITY IN THE LAST 3 MONTHS': [-float("inf"), 40, 1600, 15000, 30000, 80000, float("inf")],
             'AVG. DAILY DEBIT LAST 30 DAYS BY AVG. DAILY DEBIT LAST 60-120 DAYS': [-0.01, 1.5, 2, 5, float("inf")],
             'AVG. MONTHLY CREDIT TRANSACTIONS AMOUNT LIFETIME': [-0.01, 7000, 13000, 20000, 32000, 125000, 200000, 375000, float("inf")],
             'AVG. DAILY DEBIT TRANSACTIONS COUNT LIFETIME': [-0.01, 0.27, 0.43, 0.65, 3.28, float("inf")],
             'TOTAL DEBIT AMOUNT : TOTAL CREDIT AMOUNT RATIO LAST 90 DAYS': [-0.01, 0.96, 1, 1.02, 1.36, float("inf")],
             'distance_from_pin_code': [0, 2000, 13500, 150000, float("inf")],
             '# Loan defaults in last 21 days': [-0.01, 1, 2, float("inf")],
             'AVG. MISSED PAYMENT AMOUNT LAST 360 DAYS': [-float("inf"), 1300, 7500, float("inf")]
                 }
# Lookup table read from binData.csv, indexed by the ('column', 'bins') pair so that
# woe_out_of_sample.transform can address it as binData.loc[column_name, bin_label].
binData = pd.read_csv('binData.csv').set_index(['column', 'bins'])


# Load the pickled RF model (per the file name; produced during training).
# SECURITY: pickle.load can execute arbitrary code -- only load model files from a trusted source.
# The context manager guarantees the file handle is closed once the model is loaded.
with open('Default_Predict_RF_model.sav', 'rb') as rf_model_file:
    rf_model = pickle.load(rf_model_file)
# Probability cut-off for the RF model: rows whose second predict_proba column is below it are labelled 0, otherwise 1.
decision_threshold_rf=0.571695
# Columns (in this exact order) selected from the preprocessed frame before it is handed to
# rf_model.predict_proba. Presumably the feature set and order the RF model was fitted with
# (per the variable name) -- TODO confirm against the training notebook before editing.
RF_fit_Columns=['AVG. DAILY DEBIT TRANSACTIONS COUNT LIFETIME',
 'AVG. DAILY DEBIT TRANSACTIONS AMOUNT LAST 180 DAYS',
 'AVG. OF MINIMUM BALANCE PER MONTH LAST 360 DAYS',
 'count_debit_transactions_last_360_days',
 'AVG. CREDIT PER TRANSACTION LAST 30 DAYS',
 'AVG. MONTHLY DEBIT CARD TRANSACTIONS AMOUNT LAST 180 DAYS',
 'AVG. MONTHLY DEBIT LAST 30 DAYS BY AVG. MONTHLY DEBIT LAST 90 DAYS',
 'gms_sub_version',
 'AVG. MONTHLY CREDIT TRANSACTIONS AMOUNT LAST 180 DAYS',
 'CURRENT LOAN LIABILITY IN THE LAST 3 MONTHS',
 'AVG. OF MAXIMUM BALANCE PER MONTH LIFETIME',
 'AVG. MISSED PAYMENT AMOUNT LAST 360 DAYS',
 'total_debit_transaction_amount_last_90_days',
 'AVG. DAILY DEBIT LAST 30 DAYS BY AVG. DAILY DEBIT LAST 60-120 DAYS',
 'CUMULATIVE MAXIMUM BALANCE LAST 30 DAYS',
 '# UTILITIES PAYING BILLS FOR',
 'AVG. MONTHLY CREDIT TRANSACTIONS AMOUNT LIFETIME',
 'gms_version']

# Defining the classes for the Preprocessing Pipelines

In [56]:
class Data_Cleaner(BaseEstimator, TransformerMixin):
    """Rename the raw columns and drop the columns that are not used downstream.

    Parameters
    ----------
    dictionary : dict
        Must contain a 'description' entry; that entry is used as the
        ``columns`` mapping of ``DataFrame.rename`` (current name -> new name).
    """

    def __init__(self, dictionary): # no *args or *kargs
        self.dictionary=dictionary

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return a renamed copy of ``X`` without the unused columns.

        The input frame is left untouched. A ``KeyError`` is raised by pandas
        if any of the columns to drop is absent after renaming.
        """
        # Use the dictionary given to the constructor rather than the notebook-level global,
        # so the transformer works with whatever mapping it was built with.
        renamed = X.rename(columns=self.dictionary['description'])
        unused_columns = ['label', 'brand', 'carrier', 'device_name', 'manufacturer',
                          'network_type', 'screen_height', 'screen_width', 'screen_dpi']
        return renamed.drop(columns=unused_columns)

In [57]:
class woe_out_of_sample(BaseEstimator, TransformerMixin):
    """Bin every feature with pre-computed edges and replace each bin by its entry in ``binData``.

    Parameters
    ----------
    binDict : dict
        Maps a column name to the list of bin edges passed to ``pd.cut`` for that column.
    binData : pandas.DataFrame
        Lookup table addressed as ``binData.loc[column_name, bin_label]``.
        Presumably holds the weight-of-evidence value of each bin (per the
        class name) -- TODO confirm against the training notebook.
    """

    def __init__(self, binDict, binData): # no *args or *kargs
        self.binDict=binDict
        self.binData=binData

    def fit(self, X, y=None):
        """Nothing to learn; the bins and the lookup table are supplied up front."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose feature columns hold the looked-up bin values.

        The 'default' column, when present, is passed through unchanged.
        A ``KeyError`` is raised if a column has no entry in ``binDict`` or a
        bin label has no entry in ``binData``.
        """
        # Work on a copy so the caller's frame is not overwritten column by column.
        X = X.copy()
        feature_cols = X.columns[~X.columns.isin(['default'])]
        for col in feature_cols:
            binned = pd.cut(X[col], bins=self.binDict[col])
            # Values outside every interval (and NaNs) get their own explicit category.
            binned = binned.cat.add_categories('missing values').fillna('missing values')
            # The string form of the bin is the second half of the lookup key.
            labels = binned.astype("str")
            X[col] = labels.apply(lambda label, col=col, binData=self.binData: binData.loc[col, label])
        return X

In [58]:
class columnFilter(BaseEstimator, TransformerMixin):
    """Derive numeric 'gms_version' / 'gms_sub_version' features and keep only the LR model columns."""

    def __init__(self): # no *args or *kargs
        pass

    def process_gms_sub_version(self, gms: str):
        """Extract the sub-version number from a raw gms version string.

        Takes the second whitespace-separated token, strips its first
        character and keeps what precedes the first '-'. For an input shaped
        like '20.39.15 (100406-335085812)' this gives 100406.0.
        Missing values (NaN / None / pd.NA) are returned unchanged.
        """
        if pd.isna(gms):
            return gms
        return float(gms.split()[1][1:].split('-')[0])

    def process_gms_version(self, gms: str):
        """Turn the leading version token of a raw gms string into a number.

        The first space-separated token is split on ',' or '.', every
        single-character part is left-padded with '0', and the parts are
        concatenated: '20.39.15 ...' -> 203915.0.
        Missing values (NaN / None / pd.NA) are returned unchanged.
        """
        if pd.isna(gms):
            return gms
        # Raw string: '\.' in a normal literal is an invalid escape sequence.
        parts = re.split(r',|\.', gms.split(' ')[0])
        padded = ['0' + part if len(part) == 1 else part for part in parts]
        return float(''.join(padded))

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return a new frame with the parsed gms features, restricted to the LR columns.

        The input frame is left untouched. Requires the 'default' column and
        every listed feature to be present (pandas raises ``KeyError`` otherwise).
        """
        raw_gms = X['gms_version']
        # Both features are computed from the raw string before 'gms_version' is replaced.
        with_gms = X.assign(
            gms_sub_version=raw_gms.apply(self.process_gms_sub_version),
            gms_version=raw_gms.apply(self.process_gms_version),
        )
        lr_columns = [
            'default',
            'gms_version',
            'gms_sub_version',
            '# NEGATIVE EVENTS IN LAST 6 MONTHS (LOANS)',
            'CUMULATIVE MINIMUM BALANCE LAST 90 DAYS',
            'AVG. OF MINIMUM BALANCE PER MONTH LAST 360 DAYS',
            'count_debit_transactions_last_360_days',
            'Last closing balance amount (overall)',
            '# ATM TRANSACTIONS LIFETIME',
            'AVG. MONTHLY DEBIT LAST 30 DAYS BY AVG. MONTHLY DEBIT LAST 90 DAYS',
            '# NEFT/RTGS/IMPS TRANSACTIONS LAST 360 DAYS',
            'CURRENT LOAN LIABILITY IN THE LAST 3 MONTHS',
            'AVG. DAILY DEBIT LAST 30 DAYS BY AVG. DAILY DEBIT LAST 60-120 DAYS',
            'AVG. MONTHLY CREDIT TRANSACTIONS AMOUNT LIFETIME',
            'AVG. DAILY DEBIT TRANSACTIONS COUNT LIFETIME',
            'TOTAL DEBIT AMOUNT : TOTAL CREDIT AMOUNT RATIO LAST 90 DAYS',
            'distance_from_pin_code',
            '# Loan defaults in last 21 days',
            'AVG. MISSED PAYMENT AMOUNT LAST 360 DAYS',
        ]
        return with_gms.loc[:, lr_columns]
        

In [59]:
# Feature-engineering transformer: creates the final numeric 'gms_version' and 'gms_sub_version' features and replaces missing values with -1

class RF_Preprocessing(BaseEstimator, TransformerMixin):
    """Derive numeric 'gms_version' / 'gms_sub_version' features and fill missing values with -1."""

    def __init__(self): # no *args or *kargs
        pass

    def process_gms_sub_version(self, gms: str):
        """Extract the sub-version number from a raw gms version string.

        Takes the second whitespace-separated token, strips its first
        character and keeps what precedes the first '-'. For an input shaped
        like '20.39.15 (100406-335085812)' this gives 100406.0.
        Missing values (NaN / None / pd.NA) are returned unchanged.
        """
        if pd.isna(gms):
            return gms
        return float(gms.split()[1][1:].split('-')[0])

    def process_gms_version(self, gms: str):
        """Turn the leading version token of a raw gms string into a number.

        The first space-separated token is split on ',' or '.', every
        single-character part is left-padded with '0', and the parts are
        concatenated: '20.39.15 ...' -> 203915.0.
        Missing values (NaN / None / pd.NA) are returned unchanged.
        """
        if pd.isna(gms):
            return gms
        # Raw string: '\.' in a normal literal is an invalid escape sequence.
        parts = re.split(r',|\.', gms.split(' ')[0])
        padded = ['0' + part if len(part) == 1 else part for part in parts]
        return float(''.join(padded))

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return a new frame with the parsed gms features and every missing value set to -1.

        The input frame is left untouched.
        """
        raw_gms = X['gms_version']
        # Both features are computed from the raw string before 'gms_version' is replaced.
        with_gms = X.assign(
            gms_sub_version=raw_gms.apply(self.process_gms_sub_version),
            gms_version=raw_gms.apply(self.process_gms_version),
        )
        return with_gms.fillna(-1)

# Transforming the prediction data using the pipelines and making the final predictions

In [71]:
# IDs of the rows being scored; copy only this column instead of the whole frame.
decision_ids=data_predict['decision_id'].copy()

# LR preprocessing: rename/drop columns -> parse gms features and keep the LR columns -> map bins to binData values.
woe_oos = woe_out_of_sample(binsDict, binData)
filterColumns=columnFilter()
dataCleaner=Data_Cleaner(dictionary)  # also reused by the RF pipeline in the next cell
pipeline_predict_lr = Pipeline(steps=[('cleanData', dataCleaner), ('filter', filterColumns), ('woe_predict', woe_oos)])
# Every step's fit() just returns self, so fit_transform learns nothing from the prediction data.
# The pipeline gets its own copy of the frame so data_predict itself is never modified.
transformed_predict_Data_LR=pipeline_predict_lr.fit_transform(data_predict.copy().drop(columns=['decision_id']))
# NOTE(review): the 'default' column must be present in the input, since columnFilter selects it; it is removed before scoring.
X_predict_LR=transformed_predict_Data_LR.drop(columns=['default'])
y_pred_proba_LR_predict=pd.DataFrame(lr_model.predict_proba(X_predict_LR), index=decision_ids.index)
# Vectorised thresholding (replaces a row-wise apply): 0 when the second probability column
# is below the LR cut-off, 1 otherwise.
lr_default_flags=pd.Series(
    np.where(y_pred_proba_LR_predict[1] < decision_threshold_lr, 0, 1),
    index=y_pred_proba_LR_predict.index,
    dtype='int64',
    name='default',
)
# Final LR result: one row per decision_id with its predicted 'default' flag.
y_pred_adjusted_LR_predict=pd.concat([decision_ids, lr_default_flags], axis=1)

In [72]:
# RF preprocessing: rename/drop columns (dataCleaner comes from the LR cell above) -> parse gms features and fill NaNs with -1.
rf_preprocessor=RF_Preprocessing()
testRFPipeline=Pipeline([('cleanData', dataCleaner), ('preprocessing', rf_preprocessor)])
# Every step's fit() just returns self, so fit_transform learns nothing from the prediction data.
# The pipeline gets its own copy of the frame so data_predict itself is never modified.
transformed_predict_Data_RF=testRFPipeline.fit_transform(data_predict.copy().drop(columns=['default', 'decision_id']))
# Keep only the RF feature columns, in the order given by RF_fit_Columns.
X_predict_RF=transformed_predict_Data_RF.loc[:,RF_fit_Columns]
y_pred_proba_RF_predict=pd.DataFrame(rf_model.predict_proba(X_predict_RF), index=decision_ids.index)
# Vectorised thresholding (replaces a row-wise apply): 0 when the second probability column
# is below the RF cut-off, 1 otherwise.
rf_default_flags=pd.Series(
    np.where(y_pred_proba_RF_predict[1] < decision_threshold_rf, 0, 1),
    index=y_pred_proba_RF_predict.index,
    dtype='int64',
    name='default',
)
# Final RF result: one row per decision_id with its predicted 'default' flag.
y_pred_adjusted_RF_predict=pd.concat([decision_ids, rf_default_flags], axis=1)

### The variables `y_pred_adjusted_LR_predict` and `y_pred_adjusted_RF_predict` contain the final predictions of the two models: one row per `decision_id`, with the predicted class (0 or 1) in the `default` column