# CFP Propensity Version 1.0

### LIBRARY IMPORTS

In [1]:
'''Importing Required Libraries'''
import copy
from collections import OrderedDict
import numpy as np
import pandas as pd
import lightgbm as lgb
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.4f}'.format
pd.options.mode.chained_assignment = None  # default='warn'
from scipy import stats
from scipy.stats import norm, skew
from sklearn import svm, tree
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier,SGDRegressor
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import make_pipeline
import xgboost
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

### DATA IMPORT

In [2]:
'''Importing the data for modelling'''
# cfp_data = pd.read_csv(r"D:\Dbeaver Out\CFP_metrics_Green_202302091523.csv", na_values=['',' ','NH', 'NA', '[]'], low_memory=False)
cfp_data = pd.read_csv(r"D:\Dbeaver Out\CFP_TrainData_Green_DecJan_202302131645.csv", na_values=['',' ','NH', 'NA', '[]'], low_memory=False)
# cfp_data = pd.read_csv(r"D:\Dbeaver Out\CFP_DataGreen_2301LTD_202301311058.csv", na_values=['',' ','NH', 'NA', '[]'], low_memory=False)
# cfp_data = pd.read_csv(r"D:\Dbeaver Out\CFP_DataRed_2301LTD_202301311058.csv", na_values=['',' ','NH', 'NA', '[]'], low_memory=False)

In [3]:
# cfp_data=cfp_data[['user_id','cfp_subscribed',
#                    'monthly_income',
#                    'credit_score',
#                    'ptp_last_three_months_flag',
#                    'positive_response_rate',
#                    'contactability',
#                    'cfp_interest',
#                    'credit_card_count',
#                    'consumer_loan_count',
#                    'open_bank_nbfc',
#                    'settled_accounts',
#                    'open_negative_status_count',
#                    'total_amt_sanctioned',
#                    'total_balance',
#                    'credit_limit',
#                    'salary_account',
# #                    'employment_type',
#                    'last_login_vintage',
#                   'latest_login_date','ltd','first_profile_date','latest_profile_date','pincode']]

In [4]:
cfp_data.shape

(52651, 90)

In [5]:
cfp_data.columns

Index(['user_id', 'ltd', 'customer_type', 'nsaleable', 'latest_login_date',
       'salary_account', 'cfp_subscribed', 'monthly_income', 'credit_score',
       'age', 'pincode', 'marital_status', 'employment_type',
       'first_profile_date', 'latest_profile_date', 'city_band',
       'ptp_last_three_months_flag', 'positive_response_rate',
       'contactability', 'email_login_flag', 'unique_days_logged_in_flag',
       'sms_login_flag', 'last_login_vintage', 'negative_status_flag',
       'settled_flag', 'open_total_ratio', 'totl_neg_ratio',
       'saleable_accounts_count', 'cfp_interest', 'auto_loan_count',
       'gold_loan_count', 'consumer_loan_count', 'housing_loan_count',
       'credit_card_count', 'personal_loan_count', 'two_wheeler_count',
       'open_accounts', 'open_bank_fintech', 'open_bank_nbfc',
       'closed_accounts', 'settled_accounts', 'written_off_accounts',
       'flows_accounts', 'secured', 'unsecured', 'open_negative_status_count',
       'individual_account

In [6]:
# cfp_data.drop(['nsaleable',''],axis=1,inplace=True)

In [7]:
cfp_data.head()

Unnamed: 0,user_id,ltd,customer_type,nsaleable,latest_login_date,salary_account,cfp_subscribed,monthly_income,credit_score,age,pincode,marital_status,employment_type,first_profile_date,latest_profile_date,city_band,ptp_last_three_months_flag,positive_response_rate,contactability,email_login_flag,unique_days_logged_in_flag,sms_login_flag,last_login_vintage,negative_status_flag,settled_flag,open_total_ratio,totl_neg_ratio,saleable_accounts_count,cfp_interest,auto_loan_count,gold_loan_count,consumer_loan_count,housing_loan_count,credit_card_count,personal_loan_count,two_wheeler_count,open_accounts,open_bank_fintech,open_bank_nbfc,closed_accounts,settled_accounts,written_off_accounts,flows_accounts,secured,unsecured,open_negative_status_count,individual_account,joint_account,total_amt_sanctioned,total_balance,credit_limit,repayment_tenure,total_emi_amount,hdfc_neg_flag,citi_neg_flag,sbi_neg_flag,icici_neg_flag,cc_neg_flag,pl_neg_flag,cl_neg_flag,auto_loan_close_count_last3mnths,gold_loan_close_count_last3mnths,consumer_loan_close_count_last3mnths,housing_loan_close_count_last3mnths,credit_card_close_count_last3mnths,personal_loan_close_count_last3mnths,two_wheeler_close_count_last3mnths,individual_close_account_last3mnths,closed_accounts_last3mnths,secured_close_last3mnths,unsecured_close_last3mnths,negative_status_close_count_last3mnths,joint_account_close_last3mnths,max_score,score_on_m9,score_on_m3,score_on_m12,score_on_m6,neg_acc_on_m9,neg_acc_on_m3,neg_acc_on_m12,neg_acc_on_m6,max_dpd_on_m9,max_dpd_on_m3,max_dpd_on_m12,max_dpd_on_m6,reff_88,reff_07,ref_390,ref_prod_disb
0,2326221,2212,Green,0,,,0,54176.168,750,41.0,400081.0,Married,Salaried,2017-08-07 18:31:21.000,2021-02-14 06:29:33.000,Mumbai,,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.5,0.0,0,,0.0,0.0,2.0,1.0,1.0,0.0,0.0,2.0,0,0,2.0,2.0,0.0,2.0,1.0,3.0,0.0,3.0,1.0,38.0,21.0,12.0,11.0,128728.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,775,,,,,,,,,,,,,0,0,0,0
1,45382,2301,Green,0,,,0,25382.268,775,48.0,500084.0,Single,Self Employed,2015-10-10 13:05:03.000,2021-01-15 15:30:04.000,Hyderabad,,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.25,0.0,0,,0.0,0.0,3.0,3.0,13.0,1.0,0.0,5.0,0,0,15.0,15.0,0.0,5.0,3.0,17.0,0.0,19.0,1.0,94.0,23.0,79.0,21.0,149732.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,775,,,,,,,,,,,,,0,0,0,0
2,1893213,2301,Green,0,,,0,23594.9866,750,34.0,110063.0,Single,Salaried,2017-05-19 01:18:34.000,2021-02-14 18:40:44.000,Delhi/NCR,,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.67,0.0,0,,1.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0,0,2.0,2.0,0.0,4.0,1.0,5.0,0.0,6.0,0.0,15.0,14.0,61.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,750,,,,,,,,,,,,,0,0,0,0
3,754443,2301,Green,0,,,0,15448.2885,775,50.0,517501.0,Married,Salaried,2016-09-04 16:02:46.000,2016-09-04 16:02:46.000,Missing/Others,,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.5,0.0,0,,1.0,2.0,0.0,2.0,1.0,0.0,0.0,3.0,0,0,3.0,3.0,0.0,3.0,5.0,1.0,0.0,4.0,2.0,79.0,47.0,10.0,22.0,49702.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,775,,,,,,,,,,,,,0,0,0,0
4,3203125,2301,Green,0,,,0,21913.9034,725,30.0,500051.0,Single,Salaried,2017-10-29 17:59:01.000,2021-02-18 17:52:25.000,Hyderabad,,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.75,0.0,0,,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,0,0,1.0,1.0,0.0,3.0,0.0,4.0,0.0,4.0,0.0,32.0,30.0,9.0,4.0,2424.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,750,,,,,,,,,,,,,0,0,0,0


## PRE-PROCESSING

#### Handling Null Values

In [8]:
cfp_data.isnull().sum()

user_id                                       0
ltd                                           0
customer_type                                 0
nsaleable                                     0
latest_login_date                         50000
salary_account                            44951
cfp_subscribed                                0
monthly_income                               53
credit_score                                  0
age                                          10
pincode                                       5
marital_status                             2368
employment_type                           10061
first_profile_date                            0
latest_profile_date                           0
city_band                                     0
ptp_last_three_months_flag                50033
positive_response_rate                        0
contactability                                0
email_login_flag                              0
unique_days_logged_in_flag              

In [9]:
''' Replacing with 0s and Harcoded Values'''
cfp_data.ptp_last_three_months_flag.fillna(0, inplace= True)
cfp_data.totl_neg_ratio.fillna(0, inplace= True)
cfp_data.max_score.fillna(cfp_data.max_score.mean(),inplace=True)
cfp_data.auto_loan_close_count_last3mnths.fillna(0, inplace= True)
cfp_data.gold_loan_close_count_last3mnths.fillna(0, inplace= True)
cfp_data.consumer_loan_close_count_last3mnths.fillna(0, inplace= True)
cfp_data.housing_loan_close_count_last3mnths.fillna(0, inplace= True)
cfp_data.credit_card_close_count_last3mnths.fillna(0, inplace= True)
cfp_data.personal_loan_close_count_last3mnths.fillna(0, inplace= True)
cfp_data.two_wheeler_close_count_last3mnths.fillna(0, inplace= True)
cfp_data.individual_close_account_last3mnths.fillna(0, inplace= True)
cfp_data.secured_close_last3mnths.fillna(0, inplace= True)
cfp_data.closed_accounts_last3mnths.fillna(0, inplace= True)
cfp_data.negative_status_close_count_last3mnths.fillna(0, inplace= True)
cfp_data.joint_account_close_last3mnths.fillna(0, inplace= True)
cfp_data.unsecured_close_last3mnths.fillna(0, inplace= True)

cfp_data.auto_loan_count.fillna(0, inplace= True)
cfp_data.gold_loan_count.fillna(0, inplace= True)

cfp_data.housing_loan_count.fillna(0, inplace= True)
cfp_data.credit_card_count.fillna(0, inplace= True)
cfp_data.personal_loan_count.fillna(0, inplace= True)
cfp_data.two_wheeler_count.fillna(0, inplace= True)
cfp_data.individual_account.fillna(0, inplace= True)
cfp_data.open_accounts.fillna(0, inplace= True)
cfp_data.settled_accounts.fillna(0, inplace= True)
cfp_data.secured.fillna(0, inplace= True)
cfp_data.unsecured.fillna(0, inplace= True)
cfp_data.open_negative_status_count.fillna(0, inplace= True)
cfp_data.joint_account.fillna(0, inplace= True)
cfp_data.total_balance.fillna(0, inplace= True)
cfp_data.written_off_accounts.fillna(0, inplace= True)
cfp_data.flows_accounts.fillna(0, inplace= True)
cfp_data.open_accounts.fillna(0, inplace= True)
cfp_data.closed_accounts.fillna(0, inplace= True)


cfp_data.hdfc_neg_flag.fillna(0, inplace= True)
cfp_data.citi_neg_flag.fillna(0, inplace= True)
cfp_data.sbi_neg_flag.fillna(0, inplace= True)        
cfp_data.icici_neg_flag.fillna(0, inplace= True)         
cfp_data.cc_neg_flag.fillna(0, inplace= True)       
cfp_data.pl_neg_flag.fillna(0, inplace= True)
cfp_data.cl_neg_flag.fillna(0, inplace= True)

cfp_data.total_balance.fillna(0,inplace=True)
cfp_data.total_emi_amount.fillna(0,inplace=True)
cfp_data.total_amt_sanctioned.fillna(0,inplace=True)

cfp_data.credit_limit.fillna(0, inplace= True)
cfp_data.repayment_tenure.fillna(0, inplace= True)


cfp_data.cfp_interest.fillna(0, inplace= True)

cfp_data.salary_account.fillna('Unknown', inplace=True)

In [10]:
'''Replacing based on median'''
median_marriage_age=36
def marital_na_filler(marital_status, age): 
    try: 
        if np.isnan(marital_status): 
            if age >= median_marriage_age: 
                return 'Married' 
            else: 
                return 'Single' 
        else: 
            return (marital_status) 
    except: 
        return (marital_status)
cfp_data.marital_status = cfp_data.apply(lambda x: marital_na_filler(x.marital_status, x.age), axis = 1)
cfp_data.monthly_income.fillna(40000, inplace= True)
cfp_data.score_on_m9.fillna(650, inplace= True)
cfp_data.score_on_m3.fillna(650, inplace= True)
cfp_data.score_on_m12.fillna(650, inplace= True)
cfp_data.score_on_m6.fillna(650, inplace= True)
cfp_data.neg_acc_on_m9.fillna(1, inplace= True)
cfp_data.neg_acc_on_m3.fillna(1, inplace= True)
cfp_data.neg_acc_on_m12.fillna(1, inplace= True)
cfp_data.neg_acc_on_m6.fillna(1, inplace= True)
cfp_data.max_dpd_on_m9.fillna(0, inplace= True)
cfp_data.max_dpd_on_m3.fillna(0, inplace= True)
cfp_data.max_dpd_on_m12.fillna(0, inplace= True)
cfp_data.max_dpd_on_m6.fillna(0, inplace= True)
cfp_data.age.fillna(32, inplace= True)

In [11]:
''' Replacing FLAG in city_band with 'Missing/Others' '''
cfp_data.city_band = cfp_data.city_band.apply(lambda x: 'Missing/Others' if x == 'FLAG' else x)

In [12]:
'''Replacing Salary Account Bank with Categories'''
def getFormattedSalaryAccount(Bank):
    try:
        formatted_employment_type = ""
        SBI = ['SBI GROUP','SBI']
        AXIS = ['AXIS','Axis']
        HDFC = ['HDFC','HDFC Bank']
        KOTAK = ['kotak mahindra bank','KOTAK','kotak']
        BOI = ['BANK OF INDIA']
        PNB = ['PUNJAB NATIONAL', 'PNB']
        if Bank in SBI:
            return "Public"
        elif Bank in AXIS:
            return "Private"
        elif Bank in HDFC:
            return "Private"
        elif Bank in KOTAK:
            return "Private"
        elif Bank in PNB:
            return "Private"
        elif Bank in BOI:
            return "Public"
        elif Bank == "YESBANK":
            return "Private"
        elif Bank == "ANDHRA":
            return "Public"
        elif Bank == "IDBI":
            return "Private"
        elif Bank == "CANARA":
            return "Public"
        elif Bank == "UNION":
            return "Public"
        elif Bank == "ICICI":
            return "Private"
        elif Bank == "receiveByCashOrCheque":
            return "I receive by cash"
        elif Bank == "CITI":
            return "Private"
        elif Bank == 'I receive by cash':
            return "I receive by cash"
        elif Bank == 'I receive by cheque':
            return "I receive by cash"
        elif Bank == 'Unknown':
            return "Unknown"
        else:
            return 'Other'
    except:
        return employment_type
cfp_data.salary_account = cfp_data.salary_account.apply(lambda x: getFormattedSalaryAccount(x))
cfp_data.salary_account.value_counts()

Unknown              44951
Private               3875
Other                 2188
Public                1384
I receive by cash      253
Name: salary_account, dtype: int64

In [13]:
'''Format Employment and handle null values'''
def getFormattedEmploymentType(employment_type):
    try:
        formatted_employment_type = ""
        salaried_list = ['Salaried','salaried','Salaried Doctor','Working Executive','Student','Salaried doctor', 'salaried', 'salariedDoctor']
        self_employed_list = ['Self Employed', 'Self employed', 'Self employed professional', 'Self-Employed', 'selfemployee', 'selfEmployedProfessional','Self employed business', 'selfEmployedBusiness', 'Self Employed Business']
        if employment_type in salaried_list:
            return "Salaried"
        elif employment_type in self_employed_list:
            return "Self Employed"
        else:
            return 'Salaried'
    except:
        return employment_type
cfp_data.employment_type = cfp_data.employment_type.apply(lambda x: getFormattedEmploymentType(x))

In [14]:
''' Income and Age to positive if negative '''
cfp_data.monthly_income = cfp_data.monthly_income.apply(lambda x: (x * -1) if x < 0 else x)
cfp_data.age = cfp_data.age.apply(lambda x: (x * -1) if x < 0 else x)

In [15]:
cfp_data.isnull().sum()

user_id                                       0
ltd                                           0
customer_type                                 0
nsaleable                                     0
latest_login_date                         50000
salary_account                                0
cfp_subscribed                                0
monthly_income                                0
credit_score                                  0
age                                           0
pincode                                       5
marital_status                                0
employment_type                               0
first_profile_date                            0
latest_profile_date                           0
city_band                                     0
ptp_last_three_months_flag                    0
positive_response_rate                        0
contactability                                0
email_login_flag                              0
unique_days_logged_in_flag              

### Pickling Data

In [16]:
joblib.dump(cfp_data, 'cfp_clean_data_Green.pkl')
# joblib.dump(cfp_data, 'cfp_clean_data_Red.pkl')
# joblib.dump(cfp_data, 'cfp_clean_data_Red_ltd.pkl')
# joblib.dump(cfp_data, 'cfp_clean_data_Green_ltd.pkl')

['cfp_clean_data_Green.pkl']

In [17]:
cfp_data=joblib.load('cfp_clean_data_Green.pkl')
# cfp_data=joblib.load('cfp_clean_data_Red.pkl')
# cfp_data=joblib.load('cfp_clean_data_Red_ltd.pkl')
# cfp_data=joblib.load('cfp_clean_data_Green_ltd.pkl')

In [18]:
cfp_backup=copy.deepcopy(cfp_data)
cfp_data.drop(['latest_login_date','ltd','first_profile_date','latest_profile_date','pincode'],axis=1,inplace=True) 

#### Probe and Handle Outliers

In [19]:
# '''Percentile values'''
# Q01=cfp_data.quantile(0.01)
# Q05=cfp_data.quantile(0.05)
# Q10=cfp_data.quantile(0.10)
# Q95=cfp_data.quantile(0.95)
# Q99=cfp_data.quantile(0.99)

In [20]:
# '''Capping Upper Limit'''
# cfp_data['monthly_income'] = np.where(cfp_data['monthly_income']>Q95.monthly_income,Q95.monthly_income,cfp_data['monthly_income'])
# cfp_data['nsaleable'] = np.where(cfp_data['nsaleable']>Q95.nsaleable,Q95.nsaleable,cfp_data['nsaleable'])
# for feature in numerical_columns:
#     if feature not in exclude:
#         if feature not in ('monthly_income','nsaleable','age','max_dpd_on_m9',
#        'max_dpd_on_m3', 'max_dpd_on_m12', 'max_dpd_on_m6'):# include dpd,  ''        
#             cfp_data[feature] = np.where(cfp_data[feature]>Q99[feature],Q99[feature],cfp_data[feature])


# '''Capping Upper and Lower Limit'''
# cfp_data['age'] = np.where(cfp_data['age']>Q95.age,Q95.age,np.where(cfp_data['age']<Q05.age,Q05.age,cfp_data['age']))

In [21]:
'''Capping'''
cfp_data['age'] = np.where(cfp_data['age']<15,15,cfp_data['age'])
cfp_data['age'] = np.where(cfp_data['age']>42,42,cfp_data['age'])
cfp_data['monthly_income'] = np.where(cfp_data['monthly_income']>33000,33000,cfp_data['monthly_income'])
cfp_data['credit_score'] = np.where(cfp_data['credit_score']>750,750,cfp_data['credit_score'])
cfp_data['auto_loan_count'] = np.where(cfp_data['auto_loan_count']>2,2,cfp_data['auto_loan_count'])
cfp_data['gold_loan_count'] = np.where(cfp_data['gold_loan_count']>9,9,cfp_data['gold_loan_count'])
cfp_data['consumer_loan_count'] = np.where(cfp_data['consumer_loan_count']>3,3,cfp_data['consumer_loan_count'])
cfp_data['housing_loan_count'] = np.where(cfp_data['housing_loan_count']>1,1,cfp_data['housing_loan_count'])
cfp_data['credit_card_count'] = np.where(cfp_data['credit_card_count']>4,4,cfp_data['credit_card_count'])
cfp_data['personal_loan_count'] = np.where(cfp_data['personal_loan_count']>2,2,cfp_data['personal_loan_count'])
cfp_data['two_wheeler_count'] = np.where(cfp_data['two_wheeler_count']>2,2,cfp_data['two_wheeler_count'])
cfp_data['open_accounts'] = np.where(cfp_data['open_accounts']>7,7,cfp_data['open_accounts'])
cfp_data['open_bank_fintech'] = np.where(cfp_data['open_bank_fintech']>1,1,cfp_data['open_bank_fintech'])
cfp_data['open_bank_nbfc'] = np.where(cfp_data['open_bank_nbfc']>2,2,cfp_data['open_bank_nbfc'])
cfp_data['closed_accounts'] = np.where(cfp_data['closed_accounts']>5,5,cfp_data['closed_accounts'])
cfp_data['settled_accounts'] = np.where(cfp_data['settled_accounts']>5,5,cfp_data['settled_accounts'])
cfp_data['written_off_accounts'] = np.where(cfp_data['written_off_accounts']>5,5,cfp_data['written_off_accounts'])
cfp_data['flows_accounts'] = np.where(cfp_data['flows_accounts']>5,5,cfp_data['flows_accounts'])
cfp_data['secured'] = np.where(cfp_data['secured']>7,7,cfp_data['secured'])
cfp_data['unsecured'] = np.where(cfp_data['unsecured']>7,7,cfp_data['unsecured'])
cfp_data['open_negative_status_count'] = np.where(cfp_data['open_negative_status_count']>5,5,cfp_data['open_negative_status_count'])
cfp_data['individual_account'] = np.where(cfp_data['individual_account']>5,5,cfp_data['individual_account'])
cfp_data['joint_account'] = np.where(cfp_data['joint_account']>5,5,cfp_data['joint_account'])
cfp_data['total_amt_sanctioned'] = np.where(cfp_data['total_amt_sanctioned']>200,200,cfp_data['total_amt_sanctioned'])
cfp_data['total_balance'] = np.where(cfp_data['total_balance']>50,50,cfp_data['total_balance'])
cfp_data['credit_limit'] = np.where(cfp_data['credit_limit']>20,20,cfp_data['credit_limit'])
cfp_data['repayment_tenure'] = np.where(cfp_data['repayment_tenure']>12,12,cfp_data['repayment_tenure'])
cfp_data['total_emi_amount'] = np.where(cfp_data['total_emi_amount']>20000,20000,cfp_data['total_emi_amount'])
cfp_data['hdfc_neg_flag'] = np.where(cfp_data['hdfc_neg_flag']>1,1,cfp_data['hdfc_neg_flag'])
cfp_data['citi_neg_flag'] = np.where(cfp_data['citi_neg_flag']>1,1,cfp_data['citi_neg_flag'])
cfp_data['sbi_neg_flag'] = np.where(cfp_data['sbi_neg_flag']>1,1,cfp_data['sbi_neg_flag'])
cfp_data['icici_neg_flag'] = np.where(cfp_data['icici_neg_flag']>1,1,cfp_data['icici_neg_flag'])
cfp_data['cc_neg_flag'] = np.where(cfp_data['cc_neg_flag']>1,1,cfp_data['cc_neg_flag'])
cfp_data['pl_neg_flag'] = np.where(cfp_data['pl_neg_flag']>1,1,cfp_data['pl_neg_flag'])
cfp_data['cl_neg_flag'] = np.where(cfp_data['cl_neg_flag']>1,1,cfp_data['cl_neg_flag'])
cfp_data['auto_loan_close_count_last3mnths'] = np.where(cfp_data['auto_loan_close_count_last3mnths']>1,1,cfp_data['auto_loan_close_count_last3mnths'])
cfp_data['gold_loan_close_count_last3mnths'] = np.where(cfp_data['gold_loan_close_count_last3mnths']>1,1,cfp_data['gold_loan_close_count_last3mnths'])
cfp_data['consumer_loan_close_count_last3mnths'] = np.where(cfp_data['consumer_loan_close_count_last3mnths']>1,1,cfp_data['consumer_loan_close_count_last3mnths'])
cfp_data['housing_loan_close_count_last3mnths'] = np.where(cfp_data['housing_loan_close_count_last3mnths']>1,1,cfp_data['housing_loan_close_count_last3mnths'])
cfp_data['credit_card_close_count_last3mnths'] = np.where(cfp_data['credit_card_close_count_last3mnths']>1,1,cfp_data['credit_card_close_count_last3mnths'])
cfp_data['personal_loan_close_count_last3mnths'] = np.where(cfp_data['personal_loan_close_count_last3mnths']>1,1,cfp_data['personal_loan_close_count_last3mnths'])
cfp_data['two_wheeler_close_count_last3mnths'] = np.where(cfp_data['two_wheeler_close_count_last3mnths']>1,1,cfp_data['two_wheeler_close_count_last3mnths'])
cfp_data['individual_close_account_last3mnths'] = np.where(cfp_data['individual_close_account_last3mnths']>1,1,cfp_data['individual_close_account_last3mnths'])
cfp_data['closed_accounts_last3mnths'] = np.where(cfp_data['closed_accounts_last3mnths']>1,1,cfp_data['closed_accounts_last3mnths'])
cfp_data['secured_close_last3mnths'] = np.where(cfp_data['secured_close_last3mnths']>1,1,cfp_data['secured_close_last3mnths'])
cfp_data['unsecured_close_last3mnths'] = np.where(cfp_data['unsecured_close_last3mnths']>1,1,cfp_data['unsecured_close_last3mnths'])
cfp_data['negative_status_close_count_last3mnths'] = np.where(cfp_data['negative_status_close_count_last3mnths']>1,1,cfp_data['negative_status_close_count_last3mnths'])
cfp_data['joint_account_close_last3mnths'] = np.where(cfp_data['joint_account_close_last3mnths']>1,1,cfp_data['joint_account_close_last3mnths'])
cfp_data['max_score'] = np.where(cfp_data['max_score']>750,750,cfp_data['max_score'])
cfp_data['score_on_m9'] = np.where(cfp_data['score_on_m9']>750,750,cfp_data['score_on_m9'])
cfp_data['score_on_m3'] = np.where(cfp_data['score_on_m3']>750,750,cfp_data['score_on_m3'])
cfp_data['score_on_m12'] = np.where(cfp_data['score_on_m12']>750,750,cfp_data['score_on_m12'])
cfp_data['score_on_m6'] = np.where(cfp_data['score_on_m6']>750,750,cfp_data['score_on_m6'])
cfp_data['neg_acc_on_m9'] = np.where(cfp_data['neg_acc_on_m9']>6,6,cfp_data['neg_acc_on_m9'])
cfp_data['neg_acc_on_m3'] = np.where(cfp_data['neg_acc_on_m3']>6,6,cfp_data['neg_acc_on_m3'])
cfp_data['neg_acc_on_m12'] = np.where(cfp_data['neg_acc_on_m12']>6,6,cfp_data['neg_acc_on_m12'])
cfp_data['neg_acc_on_m6'] = np.where(cfp_data['neg_acc_on_m6']>6,6,cfp_data['neg_acc_on_m6'])
cfp_data['reff_88'] = np.where(cfp_data['reff_88']>2,2,cfp_data['reff_88'])
cfp_data['reff_07'] = np.where(cfp_data['reff_07']>2,2,cfp_data['reff_07'])
cfp_data['ref_390'] = np.where(cfp_data['ref_390']>1,1,cfp_data['ref_390'])
cfp_data['ref_prod_disb'] = np.where(cfp_data['ref_prod_disb']>4,4,cfp_data['ref_prod_disb'])

In [22]:
cfp_data.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,user_id,nsaleable,cfp_subscribed,monthly_income,credit_score,age,ptp_last_three_months_flag,positive_response_rate,contactability,settled_flag,open_total_ratio,totl_neg_ratio,saleable_accounts_count,cfp_interest,auto_loan_count,gold_loan_count,consumer_loan_count,housing_loan_count,credit_card_count,personal_loan_count,two_wheeler_count,open_accounts,open_bank_fintech,open_bank_nbfc,closed_accounts,settled_accounts,written_off_accounts,flows_accounts,secured,unsecured,open_negative_status_count,individual_account,joint_account,total_amt_sanctioned,total_balance,credit_limit,repayment_tenure,total_emi_amount,hdfc_neg_flag,citi_neg_flag,sbi_neg_flag,icici_neg_flag,cc_neg_flag,pl_neg_flag,cl_neg_flag,auto_loan_close_count_last3mnths,gold_loan_close_count_last3mnths,consumer_loan_close_count_last3mnths,housing_loan_close_count_last3mnths,credit_card_close_count_last3mnths,personal_loan_close_count_last3mnths,two_wheeler_close_count_last3mnths,individual_close_account_last3mnths,closed_accounts_last3mnths,secured_close_last3mnths,unsecured_close_last3mnths,negative_status_close_count_last3mnths,joint_account_close_last3mnths,max_score,score_on_m9,score_on_m3,score_on_m12,score_on_m6,neg_acc_on_m9,neg_acc_on_m3,neg_acc_on_m12,neg_acc_on_m6,max_dpd_on_m9,max_dpd_on_m3,max_dpd_on_m12,max_dpd_on_m6,reff_88,reff_07,ref_390,ref_prod_disb
user_id,1.0,,0.85,-0.16,-0.11,-0.16,0.64,0.74,0.78,0.047,-0.069,,,0.85,-0.073,0.031,0.11,-0.12,-0.22,0.15,,0.015,0.014,0.1,0.048,0.052,,-0.0046,-0.013,0.033,,0.029,-0.052,0.055,-0.024,-0.21,0.046,0.057,,,,,,,,0.018,0.046,0.041,,0.028,0.11,,0.13,0.13,0.051,0.12,,,-0.094,-0.026,0.56,-0.033,0.09,-0.27,-0.76,-0.16,-0.44,0.047,0.057,0.029,0.064,0.33,0.79,0.41,0.25
nsaleable,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
cfp_subscribed,0.85,,1.0,-0.13,-0.17,-0.11,0.72,0.87,0.9,0.049,-0.059,,,1.0,-0.059,0.044,0.12,-0.092,-0.16,0.19,,0.077,0.019,0.12,0.081,0.083,,0.066,0.0095,0.081,,0.064,-0.031,0.084,0.025,-0.16,0.088,0.072,,,,,,,,0.019,0.05,0.054,,0.042,0.13,,0.16,0.16,0.057,0.15,,,-0.12,0.0078,0.61,-0.00074,0.14,-0.37,-0.87,-0.24,-0.56,0.073,0.07,0.065,0.096,0.4,0.9,0.47,0.28
monthly_income,-0.16,,-0.13,1.0,0.037,0.32,-0.11,-0.12,-0.12,0.14,-0.035,,,-0.13,0.2,0.0068,-0.048,0.28,0.36,0.075,,0.25,0.0047,-0.0048,0.2,0.2,,0.23,0.18,0.23,,0.23,0.18,0.21,0.27,0.33,0.22,0.26,,,,,,,,-0.002,-0.0036,-0.0027,,-0.0092,-0.015,,-0.015,-0.015,-0.0027,-0.015,,,0.05,0.032,-0.07,0.031,0.011,0.02,0.1,-0.0018,0.044,-0.012,-0.016,-0.0073,-0.016,-0.048,-0.12,-0.061,-0.029
credit_score,-0.11,,-0.17,0.037,1.0,0.0094,-0.14,-0.16,-0.13,-0.08,0.085,,,-0.17,-0.076,-0.14,-0.073,0.057,-0.027,-0.12,,-0.15,-0.0058,-0.02,-0.15,-0.15,,-0.11,-0.15,-0.15,,-0.13,-0.06,-0.2,-0.19,0.00015,-0.12,-0.087,,,,,,,,0.003,-0.024,0.0073,,-0.022,-0.016,,-0.027,-0.027,-0.023,-0.02,,,0.72,0.021,0.098,0.022,0.029,0.054,0.14,0.043,0.086,-0.031,-0.033,-0.028,-0.045,-0.076,-0.13,-0.051,-0.028
age,-0.16,,-0.11,0.32,0.0094,1.0,-0.094,-0.099,-0.1,0.15,-0.16,,,-0.11,0.26,0.082,-0.022,0.33,0.15,-0.058,,0.084,0.005,-0.0099,0.22,0.22,,0.065,0.27,0.11,,0.13,0.22,0.22,0.15,0.14,0.2,0.2,,,,,,,,-0.0024,0.0032,-0.00094,,0.002,-0.02,,-0.014,-0.014,0.0037,-0.016,,,-0.022,0.034,-0.069,0.03,0.008,0.023,0.094,0.0072,0.053,-0.0059,-0.0054,-0.005,-0.0057,-0.068,-0.11,-0.053,-0.025
ptp_last_three_months_flag,0.64,,0.72,-0.11,-0.14,-0.094,1.0,0.74,0.69,0.032,-0.041,,,0.72,-0.054,0.0096,0.083,-0.077,-0.14,0.13,,0.036,0.026,0.078,0.047,0.048,,0.032,-0.018,0.048,,0.038,-0.035,0.032,-0.0056,-0.13,0.047,0.035,,,,,,,,-0.00072,0.039,0.036,,0.035,0.086,,0.1,0.1,0.034,0.098,,,-0.11,-0.021,0.41,-0.02,0.07,-0.24,-0.64,-0.14,-0.4,0.048,0.045,0.045,0.057,0.24,0.68,0.35,0.22
positive_response_rate,0.74,,0.87,-0.12,-0.16,-0.099,0.74,1.0,0.85,0.042,-0.053,,,0.87,-0.052,0.039,0.11,-0.083,-0.15,0.16,,0.063,0.014,0.11,0.069,0.071,,0.055,0.0082,0.068,,0.054,-0.031,0.068,0.015,-0.14,0.075,0.06,,,,,,,,-0.00087,0.036,0.043,,0.037,0.12,,0.14,0.14,0.041,0.14,,,-0.12,0.0041,0.5,-0.01,0.12,-0.31,-0.75,-0.2,-0.48,0.061,0.062,0.062,0.07,0.35,0.8,0.42,0.24
contactability,0.78,,0.9,-0.12,-0.13,-0.1,0.69,0.85,1.0,0.043,-0.05,,,0.9,-0.056,0.036,0.11,-0.086,-0.15,0.17,,0.068,0.016,0.11,0.07,0.071,,0.059,0.0038,0.071,,0.058,-0.031,0.069,0.018,-0.15,0.075,0.06,,,,,,,,-0.00091,0.033,0.055,,0.041,0.11,,0.14,0.14,0.03,0.14,,,-0.091,0.0054,0.6,-0.0076,0.12,-0.34,-0.78,-0.23,-0.5,0.042,0.056,0.049,0.076,0.34,0.83,0.44,0.26
settled_flag,0.047,,0.049,0.14,-0.08,0.15,0.032,0.042,0.043,1.0,-0.7,,,0.049,0.17,0.14,0.3,0.15,0.24,0.27,,0.25,0.0019,0.0086,0.66,0.66,,0.23,0.27,0.5,,0.58,0.15,0.35,0.23,0.19,0.26,0.26,,,,,,,,0.0019,0.0051,0.0055,,0.0043,0.013,,0.016,0.016,0.0058,0.015,,,-0.0029,0.028,0.026,0.029,0.039,-0.045,-0.052,-0.046,-0.052,0.0047,0.0034,0.0032,0.0033,0.021,0.043,0.02,0.0079


In [23]:
exclude=['user_id','cfp_interest','positive_response_rate', 'contactability',
         'ptp_last_three_months_flag','cfp_subscribed'] #,'cfp_subscribed'

In [24]:
numerical_columns=list(cfp_data.select_dtypes(include='number').columns)
for feature in exclude:
    numerical_columns.remove(feature)
print(numerical_columns)

['nsaleable', 'monthly_income', 'credit_score', 'age', 'settled_flag', 'open_total_ratio', 'totl_neg_ratio', 'saleable_accounts_count', 'auto_loan_count', 'gold_loan_count', 'consumer_loan_count', 'housing_loan_count', 'credit_card_count', 'personal_loan_count', 'two_wheeler_count', 'open_accounts', 'open_bank_fintech', 'open_bank_nbfc', 'closed_accounts', 'settled_accounts', 'written_off_accounts', 'flows_accounts', 'secured', 'unsecured', 'open_negative_status_count', 'individual_account', 'joint_account', 'total_amt_sanctioned', 'total_balance', 'credit_limit', 'repayment_tenure', 'total_emi_amount', 'hdfc_neg_flag', 'citi_neg_flag', 'sbi_neg_flag', 'icici_neg_flag', 'cc_neg_flag', 'pl_neg_flag', 'cl_neg_flag', 'auto_loan_close_count_last3mnths', 'gold_loan_close_count_last3mnths', 'consumer_loan_close_count_last3mnths', 'housing_loan_close_count_last3mnths', 'credit_card_close_count_last3mnths', 'personal_loan_close_count_last3mnths', 'two_wheeler_close_count_last3mnths', 'individu

In [25]:
cfp_data.describe(percentiles=[0.01,0.05,0.10,0.25,0.5,0.75,0.9,0.95,0.99],datetime_is_numeric=True).transpose()

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
user_id,52651.0,3416996.0128,4492449.7717,266.0,78857.0,250467.0,441037.0,941390.5,2855882.0,3953947.5,4444795.0,5362251.0,26820790.5,28464881.0
nsaleable,52651.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cfp_subscribed,52651.0,0.0504,0.2187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
monthly_income,52651.0,22093.7195,6746.5489,0.0,4166.6667,10907.5275,14534.1543,17611.7638,21573.6011,26822.0757,33000.0,33000.0,33000.0,33000.0
credit_score,52651.0,724.6743,37.2412,425.0,575.0,650.0,675.0,725.0,750.0,750.0,750.0,750.0,750.0,750.0
age,52651.0,35.2007,5.9128,15.0,22.0,25.0,27.0,31.0,36.0,42.0,42.0,42.0,42.0,42.0
ptp_last_three_months_flag,52651.0,0.0265,0.1606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
positive_response_rate,52651.0,0.0288,0.1434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.83,1.0
contactability,52651.0,0.0329,0.1577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
settled_flag,52651.0,0.8347,0.3715,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
skewww=cfp_data.skew()
for feature in skewww.keys():
    if skewww[feature]<=-3 or skewww[feature]>=3:
        skewww[feature]='(Critical)                '+str(skewww[feature])
    else:
         pass
skewww

  skewww=cfp_data.skew()


user_id                                   (Critical)                3.9638253221104107
nsaleable                                                                       0.0000
cfp_subscribed                            (Critical)                4.1127595387888825
monthly_income                                                                 -0.1841
credit_score                                                                   -2.2135
age                                                                            -0.4149
ptp_last_three_months_flag                 (Critical)                5.896766055004825
positive_response_rate                      (Critical)                5.16183694983389
contactability                             (Critical)                4.941154115481259
settled_flag                                                                   -1.8018
open_total_ratio                                                                0.1011
totl_neg_ratio                             

In [27]:
kurt=cfp_data.kurtosis()
for feature in kurt.keys():
    if kurt[feature]<=-10 or kurt[feature]>=10:
        kurt[feature]='(Critical)                '+str(kurt[feature])
    else:
         pass
kurt

  kurt=cfp_data.kurtosis()


user_id                                   (Critical)                16.729748816390476
nsaleable                                                                       0.0000
cfp_subscribed                            (Critical)                14.915357596979074
monthly_income                                                                 -0.0494
credit_score                                                                    6.0882
age                                                                            -0.8774
ptp_last_three_months_flag                 (Critical)                32.77309482424362
positive_response_rate                    (Critical)                26.299622488025104
contactability                            (Critical)                23.751246919816072
settled_flag                                                                    1.2467
open_total_ratio                                                               -0.8088
totl_neg_ratio                             

In [28]:
# '''Finding Skewness'''
# skewed=[]
# def find_skew(data,numerical_columns):
#     skew_list={}
#     for feature in numerical_columns:
#         try:
#             skew_value=(3*(data[feature].mean()-data[feature].median()))/data[feature].std()
#             if ((round(skew_value,2)<-3) or (round(skew_value,2)>3)):
#                 skew_list[feature]='Skewwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww'
#                 skewed.append(feature)
#             else:
#                 skew_list[feature]=round(skew_value,2)
#         except ZeroDivisionError:
#             skew_list[feature]='Skewwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww'
#             skewed.append(feature)
#     return skew_list
# find_skew(cfp_data,numerical_columns)

In [29]:
# '''Transforming features - Adjusting for skewness'''
# qt = QuantileTransformer(n_quantiles=51090, output_distribution='normal')
# for i in numerical_columns:
#     if i in exclude:
#         pass
#     else:
# #         if i in skewed:
#         array = np.array(cfp_data[i]).reshape(-1, 1)
#         try:
#             cfp_data[i] = qt.fit_transform(array)
#         except:
#             pass
    

### Pickling Data

In [30]:
joblib.dump(cfp_data, 'cfp_transformed_data_Green.pkl')
# joblib.dump(cfp_data, 'cfp_transformed_data_Red.pkl')
# joblib.dump(cfp_data, 'cfp_transformed_data_Red_ltd.pkl')
# joblib.dump(cfp_data, 'cfp_transformed_data_Green_ltd.pkl')

['cfp_transformed_data_Green.pkl']

In [31]:
cfp_clean_data=joblib.load('cfp_transformed_data_Green.pkl')
# cfp_clean_data=joblib.load('cfp_transformed_data_Red.pkl')
# cfp_clean_data=joblib.load('cfp_transformed_data_Red_ltd.pkl')
# cfp_clean_data=joblib.load('cfp_transformed_data_Green_ltd.pkl')

In [32]:
cfp_clean_data.head()

Unnamed: 0,user_id,customer_type,nsaleable,salary_account,cfp_subscribed,monthly_income,credit_score,age,marital_status,employment_type,city_band,ptp_last_three_months_flag,positive_response_rate,contactability,email_login_flag,unique_days_logged_in_flag,sms_login_flag,last_login_vintage,negative_status_flag,settled_flag,open_total_ratio,totl_neg_ratio,saleable_accounts_count,cfp_interest,auto_loan_count,gold_loan_count,consumer_loan_count,housing_loan_count,credit_card_count,personal_loan_count,two_wheeler_count,open_accounts,open_bank_fintech,open_bank_nbfc,closed_accounts,settled_accounts,written_off_accounts,flows_accounts,secured,unsecured,open_negative_status_count,individual_account,joint_account,total_amt_sanctioned,total_balance,credit_limit,repayment_tenure,total_emi_amount,hdfc_neg_flag,citi_neg_flag,sbi_neg_flag,icici_neg_flag,cc_neg_flag,pl_neg_flag,cl_neg_flag,auto_loan_close_count_last3mnths,gold_loan_close_count_last3mnths,consumer_loan_close_count_last3mnths,housing_loan_close_count_last3mnths,credit_card_close_count_last3mnths,personal_loan_close_count_last3mnths,two_wheeler_close_count_last3mnths,individual_close_account_last3mnths,closed_accounts_last3mnths,secured_close_last3mnths,unsecured_close_last3mnths,negative_status_close_count_last3mnths,joint_account_close_last3mnths,max_score,score_on_m9,score_on_m3,score_on_m12,score_on_m6,neg_acc_on_m9,neg_acc_on_m3,neg_acc_on_m12,neg_acc_on_m6,max_dpd_on_m9,max_dpd_on_m3,max_dpd_on_m12,max_dpd_on_m6,reff_88,reff_07,ref_390,ref_prod_disb
0,2326221,Green,0,Unknown,0,33000.0,750,41.0,Married,Salaried,Mumbai,0.0,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.5,0.0,0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,2.0,0,0,2.0,2.0,0.0,2.0,1.0,3.0,0.0,3.0,1.0,38.0,21.0,12.0,11.0,20000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,750,650.0,650.0,650.0,650.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0
1,45382,Green,0,Unknown,0,25382.268,750,42.0,Single,Self Employed,Hyderabad,0.0,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.25,0.0,0,0.0,0.0,0.0,3.0,1.0,4.0,1.0,0.0,5.0,0,0,5.0,5.0,0.0,5.0,3.0,7.0,0.0,5.0,1.0,94.0,23.0,20.0,12.0,20000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,750,650.0,650.0,650.0,650.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0
2,1893213,Green,0,Unknown,0,23594.9866,750,34.0,Single,Salaried,Delhi/NCR,0.0,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.67,0.0,0,0.0,1.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0,0,2.0,2.0,0.0,4.0,1.0,5.0,0.0,5.0,0.0,15.0,14.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,750,650.0,650.0,650.0,650.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0
3,754443,Green,0,Unknown,0,15448.2885,750,42.0,Married,Salaried,Missing/Others,0.0,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.5,0.0,0,0.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,3.0,0,0,3.0,3.0,0.0,3.0,5.0,1.0,0.0,4.0,2.0,79.0,47.0,10.0,12.0,20000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,750,650.0,650.0,650.0,650.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0
4,3203125,Green,0,Unknown,0,21913.9034,725,30.0,Single,Salaried,Hyderabad,0.0,0.0,0.0,Zero,Zero,Zero,BeyondM12,MoreThanThree,1,0.75,0.0,0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,0,0,1.0,1.0,0.0,3.0,0.0,4.0,0.0,4.0,0.0,32.0,30.0,9.0,4.0,2424.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,750,650.0,650.0,650.0,650.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0,0


### Removing Unwanted Columns

In [33]:
user_ids=pd.concat([cfp_clean_data['user_id'].reset_index(drop = True)], axis = 1)
cfp_clean_data=cfp_clean_data.drop(['user_id'],axis=1)

In [34]:
user_ids.head()

Unnamed: 0,user_id
0,2326221
1,45382
2,1893213
3,754443
4,3203125


### Segregate Columns for Encoding and Scaling

In [35]:
exemption_features=['cfp_subscribed']
id_columns=['user_id']

In [36]:
categorical_features=list(cfp_clean_data.select_dtypes(exclude='number').columns)
categorical_features

['customer_type',
 'salary_account',
 'marital_status',
 'employment_type',
 'city_band',
 'email_login_flag',
 'unique_days_logged_in_flag',
 'sms_login_flag',
 'last_login_vintage',
 'negative_status_flag']

In [37]:
numeric_features=list(cfp_clean_data.select_dtypes(include='number').columns)
numeric_features
flag_features=[]
for feature in exemption_features:
    numeric_features.remove(feature)
for feature in exclude:
    if feature in numeric_features:
        flag_features.append(feature)
        numeric_features.remove(feature)
    

In [38]:
'''Encoding categorical features with dummies'''
dummies = pd.get_dummies(data = cfp_clean_data[categorical_features], drop_first=False)
# dummies.drop(['salary_account_Other'],axis=1,inplace=True)
# dummies.drop(['salary_account_CANARA'],axis=1,inplace=True) #Only for green ltd
dummies.columns

Index(['customer_type_Green', 'salary_account_I receive by cash',
       'salary_account_Other', 'salary_account_Private',
       'salary_account_Public', 'salary_account_Unknown',
       'marital_status_Married', 'marital_status_Single',
       'employment_type_Salaried', 'employment_type_Self Employed',
       'city_band_Bangalore', 'city_band_Chennai', 'city_band_Delhi/NCR',
       'city_band_Hyderabad', 'city_band_Kolkata', 'city_band_Missing/Others',
       'city_band_Mumbai', 'city_band_Pune', 'email_login_flag_MoreThanOnce',
       'email_login_flag_One', 'email_login_flag_Zero',
       'unique_days_logged_in_flag_MoreThanThree',
       'unique_days_logged_in_flag_One', 'unique_days_logged_in_flag_Three',
       'unique_days_logged_in_flag_Two', 'unique_days_logged_in_flag_Zero',
       'sms_login_flag_MoreThanOnce', 'sms_login_flag_One',
       'sms_login_flag_Zero', 'last_login_vintage_BeyondM12',
       'last_login_vintage_M1toM2', 'last_login_vintage_M3toM6',
       'last_lo

In [39]:
'''Scaling numeric features'''
standard_scalar = StandardScaler(with_mean=True,with_std=True)
standard_scalar.fit(cfp_clean_data[numeric_features])
numeric_data_scaled = standard_scalar.transform(cfp_clean_data[numeric_features])
numeric_data_scaled = pd.DataFrame(numeric_data_scaled,columns=numeric_features)
numeric_data_scaled.head()

Unnamed: 0,nsaleable,monthly_income,credit_score,age,settled_flag,open_total_ratio,totl_neg_ratio,saleable_accounts_count,auto_loan_count,gold_loan_count,consumer_loan_count,housing_loan_count,credit_card_count,personal_loan_count,two_wheeler_count,open_accounts,open_bank_fintech,open_bank_nbfc,closed_accounts,settled_accounts,written_off_accounts,flows_accounts,secured,unsecured,open_negative_status_count,individual_account,joint_account,total_amt_sanctioned,total_balance,credit_limit,repayment_tenure,total_emi_amount,hdfc_neg_flag,citi_neg_flag,sbi_neg_flag,icici_neg_flag,cc_neg_flag,pl_neg_flag,cl_neg_flag,auto_loan_close_count_last3mnths,gold_loan_close_count_last3mnths,consumer_loan_close_count_last3mnths,housing_loan_close_count_last3mnths,credit_card_close_count_last3mnths,personal_loan_close_count_last3mnths,two_wheeler_close_count_last3mnths,individual_close_account_last3mnths,closed_accounts_last3mnths,secured_close_last3mnths,unsecured_close_last3mnths,negative_status_close_count_last3mnths,joint_account_close_last3mnths,max_score,score_on_m9,score_on_m3,score_on_m12,score_on_m6,neg_acc_on_m9,neg_acc_on_m3,neg_acc_on_m12,neg_acc_on_m6,max_dpd_on_m9,max_dpd_on_m3,max_dpd_on_m12,max_dpd_on_m6,reff_88,reff_07,ref_390,ref_prod_disb
0,0.0,1.6166,0.6801,0.9808,0.4451,-0.1233,0.0,0.0,-0.5556,-0.328,0.7989,1.597,-0.7171,-0.9014,0.0,-0.7341,-0.0044,-0.0266,-0.4789,-0.4602,0.0,-0.5914,-0.2915,-0.7373,0.0,-0.7091,0.5401,-0.2739,-0.111,0.2086,0.9299,1.3529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0044,-0.0115,-0.0123,0.0,-0.0097,-0.0299,0.0,-0.0367,-0.0367,-0.0131,-0.0343,0.0,0.0,0.5071,-0.0226,-0.1598,-0.0325,-0.0514,0.1325,0.2235,0.125,0.1682,-0.0178,-0.0162,-0.0162,-0.0229,-0.0978,-0.2101,-0.11,-0.065
1,0.0,0.4874,0.6801,1.1499,0.4451,-0.9902,0.0,0.0,-0.5556,-0.328,1.616,1.597,1.1773,0.2415,0.0,0.5813,-0.0044,-0.0266,1.0581,1.0729,0.0,1.1329,0.716,0.9471,0.0,0.6657,0.5401,0.755,0.0008,1.1322,1.1232,1.3529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0044,-0.0115,-0.0123,0.0,-0.0097,-0.0299,0.0,-0.0367,-0.0367,-0.0131,-0.0343,0.0,0.0,0.5071,-0.0226,-0.1598,-0.0325,-0.0514,0.1325,0.2235,0.125,0.1682,-0.0178,-0.0162,-0.0162,-0.0229,-0.0978,-0.2101,-0.11,-0.065
2,0.0,0.2225,0.6801,-0.2031,0.4451,0.4662,0.0,0.0,1.0729,-0.328,-0.8353,-0.6262,1.1773,-0.9014,0.0,0.1428,-0.0044,-0.0266,-0.4789,-0.4602,0.0,0.5581,-0.2915,0.1049,0.0,0.6657,-0.4871,-0.6965,-0.5021,1.1322,-1.1965,-0.933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0044,-0.0115,-0.0123,0.0,-0.0097,-0.0299,0.0,-0.0367,-0.0367,-0.0131,-0.0343,0.0,0.0,0.5071,-0.0226,-0.1598,-0.0325,-0.0514,0.1325,0.2235,0.125,0.1682,-0.0178,-0.0162,-0.0162,-0.0229,-0.0978,-0.2101,-0.11,-0.065
3,0.0,-0.985,0.6801,1.1499,0.4451,-0.1233,0.0,0.0,1.0729,0.784,-0.8353,1.597,-0.7171,-0.9014,0.0,-0.2957,-0.0044,-0.0266,0.0335,0.0508,0.0,-0.0167,1.7235,-1.5795,0.0,-0.0217,1.5672,0.4794,1.3417,-0.0223,1.1232,1.3529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0044,-0.0115,-0.0123,0.0,-0.0097,-0.0299,0.0,-0.0367,-0.0367,-0.0131,-0.0343,0.0,0.0,0.5071,-0.0226,-0.1598,-0.0325,-0.0514,0.1325,0.2235,0.125,0.1682,-0.0178,-0.0162,-0.0162,-0.0229,-0.0978,-0.2101,-0.11,-0.065
4,0.0,-0.0267,0.0087,-0.8796,0.4451,0.7437,0.0,0.0,-0.5556,-0.328,-0.8353,-0.6262,-0.0856,1.3844,0.0,-0.2957,-0.0044,-0.0266,-0.9912,-0.9712,0.0,-0.0167,-0.7953,-0.3162,0.0,-0.0217,-0.4871,-0.3841,0.3919,-0.1378,-0.4233,-0.656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0044,-0.0115,-0.0123,0.0,-0.0097,-0.0299,0.0,-0.0367,-0.0367,-0.0131,-0.0343,0.0,0.0,0.5071,-0.0226,-0.1598,-0.0325,-0.0514,0.1325,0.2235,0.125,0.1682,-0.0178,-0.0162,-0.0162,-0.0229,-0.0978,-0.2101,-0.11,-0.065


In [40]:
'''Concatenating all features'''
processed_dataset = pd.concat([user_ids[id_columns].reset_index(drop = True), 
                               numeric_data_scaled.reset_index(drop = True),
                               cfp_clean_data[flag_features].reset_index(drop = True),
                               dummies.reset_index(drop = True)
                              ,cfp_clean_data[exemption_features].reset_index(drop = True)
                              ], axis=1)
processed_dataset.shape

(52651, 109)

In [41]:
'''Replace Infinite'''
processed_dataset = processed_dataset.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
processed_dataset.shape

(52650, 109)

In [42]:
'''Eliminating worst features to increase binding'''
'''red'''
# worst_features=['auto_loan_close_count_last3mnths',
#  'cc_neg_flag',
#  'citi_neg_flag',
#  'cl_neg_flag',
#  'consumer_loan_close_count_last3mnths',
#  'credit_card_close_count_last3mnths',
#  'gold_loan_close_count_last3mnths',
#  'hdfc_neg_flag',
#  'housing_loan_close_count_last3mnths',
#  'icici_neg_flag',
#  'joint_account_close_last3mnths',
#  'max_dpd_on_m12',
#  'max_dpd_on_m3',
#  'max_dpd_on_m6',
#  'max_dpd_on_m9',
#  'neg_acc_on_m12',
#  'neg_acc_on_m3',
#  'neg_acc_on_m6',
#  'neg_acc_on_m9',
#  'negative_status_close_count_last3mnths',
#  'negative_status_flag_One',
#  'nsaleable',
#  'open_negative_status_count',
#  'pl_neg_flag',
#  'saleable_accounts_count',
#  'sbi_neg_flag',
#  'secured_close_last3mnths',
#  'totl_neg_ratio',
#  'two_wheeler_close_count_last3mnths',
#  'two_wheeler_count',
#  'written_off_accounts']
# best_features=['age','auto_loan_count','cc_neg_flag','cfp_interest','city_band_Chennai','city_band_Delhi/NCR','city_band_Hyderabad','city_band_Kolkata','city_band_Missing/Others','city_band_Mumbai','city_band_Pune','cl_neg_flag','closed_accounts','closed_accounts_last3mnths','consumer_loan_count','contactability','credit_card_count','credit_limit','credit_score','email_login_flag_One','email_login_flag_Zero','employment_type_Self Employed','flows_accounts','gold_loan_count','hdfc_neg_flag','housing_loan_count','icici_neg_flag','individual_account','individual_close_account_last3mnths','joint_account','last_login_vintage_M1toM2','last_login_vintage_M3toM6','last_login_vintage_M7toM12','marital_status_Single','max_dpd_on_m3','max_score','monthly_income','neg_acc_on_m12','neg_acc_on_m3','neg_acc_on_m6','neg_acc_on_m9','negative_status_flag_One','negative_status_flag_Three','negative_status_flag_Two','nsaleable','open_accounts','open_negative_status_count','open_total_ratio','personal_loan_count','pl_neg_flag','positive_response_rate','ptp_last_three_months_flag','repayment_tenure','salary_account_AXIS','salary_account_CITI','salary_account_HDFC','salary_account_HDFC Bank','salary_account_I receive by cash','salary_account_ICICI','salary_account_JAMMU & KASHMIR','salary_account_SBI','salary_account_Unknown','salary_account_YESBANK','salary_account_andhra bank','salary_account_otherBank','saleable_accounts_count','sbi_neg_flag','score_on_m12','score_on_m3','score_on_m6','score_on_m9','secured','settled_accounts','settled_flag','sms_login_flag_One','sms_login_flag_Zero','total_amt_sanctioned','total_balance','total_emi_amount','totl_neg_ratio','unique_days_logged_in_flag_One','unique_days_logged_in_flag_Three','unique_days_logged_in_flag_Two','unsecured','unsecured_close_last3mnths','written_off_accounts']
'''green'''
# worst_features=['two_wheeler_count', 'citi_neg_flag', 'auto_loan_close_count_last3mnths', 'gold_loan_close_count_last3mnths', 'consumer_loan_close_count_last3mnths', 'housing_loan_close_count_last3mnths', 'credit_card_close_count_last3mnths', 'two_wheeler_close_count_last3mnths', 'secured_close_last3mnths', 'negative_status_close_count_last3mnths', 'joint_account_close_last3mnths', 'max_dpd_on_m9', 'max_dpd_on_m12', 'max_dpd_on_m6','nsaleable', 'monthly_income', 'age', 'positive_response_rate', 'settled_flag', 'totl_neg_ratio', 'saleable_accounts_count', 'gold_loan_count', 'housing_loan_count', 'credit_card_count', 'open_accounts', 'closed_accounts', 'settled_accounts', 'written_off_accounts', 'open_negative_status_count', 'joint_account', 'total_emi_amount', 'hdfc_neg_flag', 'sbi_neg_flag', 'icici_neg_flag', 'cc_neg_flag', 'pl_neg_flag', 'cl_neg_flag', 'individual_close_account_last3mnths', 'closed_accounts_last3mnths', 'score_on_m9', 'score_on_m12', 'score_on_m6', 'neg_acc_on_m9', 'neg_acc_on_m3', 'neg_acc_on_m12', 'neg_acc_on_m6', 'max_dpd_on_m3', 'salary_account_ACCOUNT', 'salary_account_ACCOUNT TRANSFER', 'salary_account_ALLAHABAD BANK', 'salary_account_ANDHRA', 'salary_account_ANDHRA BANK', 'salary_account_ANDRA BANK', 'salary_account_ANNAI SAI MAGAR BANK', 'salary_account_AU SMALL FINANCE', 'salary_account_AXIS', 'salary_account_Allahabad BAnk', 'salary_account_Allahabad bank', 'salary_account_Andhar Bank', 'salary_account_Andhara bank', 'salary_account_AndhraBank', 'salary_account_Axis', 'salary_account_BANK OF BARODA', 'salary_account_BANK OF INDIA', 'salary_account_BANK OF MAHARASHTRA', 'salary_account_BANK OF MAHARASTRA', 'salary_account_BANK OF MAHARASTRA/BANK OF BARODA', 'salary_account_BANK of BARODA', 'salary_account_BOB', 'salary_account_BOI', 'salary_account_BOM', 'salary_account_BY CASH', 'salary_account_Bandhan bank', 'salary_account_Bank of Baroda', 'salary_account_Bank of India', 'salary_account_Bank of Maharashtra', 'salary_account_Bank of baroda', 'salary_account_Bank of india', 'salary_account_Bank of maharashtra', 'salary_account_Bank or maharashtra', 'salary_account_CANA', 'salary_account_CANARA BANK', 'salary_account_CASH', 'salary_account_CBI', 'salary_account_CBQ', 'salary_account_CENTRAL BANK OF INDIA', 'salary_account_CITI', 'salary_account_CITI UNION BANK', 'salary_account_CITY UNION', 'salary_account_CORPORATIOM', 'salary_account_CORPORATION', 'salary_account_CORPORATION BANK', 'salary_account_COSMO BANK', 'salary_account_COSMOS BANK', 'salary_account_Canara', 'salary_account_Canara Bank', 'salary_account_Canara bank', 'salary_account_Cash', 'salary_account_Catholic Syrian bank', 'salary_account_Central Bank of India', 'salary_account_Central bank of india', 'salary_account_Co-operative Bank', 'salary_account_Corp Bank', 'salary_account_Corporation Bank', 'salary_account_Corporation bank', 'salary_account_DENA', 'salary_account_DEUTSCHE BANK', 'salary_account_DUTCH BANK', 'salary_account_Dena bank', 'salary_account_Deutsche Bank', 'salary_account_Dhanalaxmi Bank', 'salary_account_Direct Credit in union bank of india', 'salary_account_FEDERAL BANK', 'salary_account_FORD', 'salary_account_Federal Bank', 'salary_account_GDCC Bank Pvt. Ltd', 'salary_account_HAND CASH', 'salary_account_HDFC Bank', 'salary_account_HINDUSTHAN', 'salary_account_HSBC', 'salary_account_HSBC Electronic Data Processing Pvt Ltd', 'salary_account_HSBC bank', 'salary_account_Hsbc', 'salary_account_I receive by cash', 'salary_account_I receive by cheque', 'salary_account_I received in cash', 'salary_account_ICICI', 'salary_account_IDBI', 'salary_account_IDBI BANK', 'salary_account_IDBI BANK LTD.', 'salary_account_IDBI Bank', 'salary_account_IDFC', 'salary_account_INDIAN', 'salary_account_INDIAN BANK', 'salary_account_INDIAN OVERSEAS', 'salary_account_INDUSIND', 'salary_account_INDUSIND BANK', 'salary_account_ING', 'salary_account_ING Vysya', 'salary_account_ING is Now Kotak', 'salary_account_IOB', 'salary_account_IOB BANK', 'salary_account_IOB Bank', 'salary_account_Idbi', 'salary_account_Idbi bank', 'salary_account_Indian Bank', 'salary_account_Indian bank', 'salary_account_Indian overseas bank', 'salary_account_Indian oversies bank', 'salary_account_Indianbank', 'salary_account_Indusind', 'salary_account_Indusind Bank', 'salary_account_Indusind bank', 'salary_account_Indusind bank ltd', 'salary_account_Ing vysya bank', 'salary_account_JANATA SAHAKARI BANK', 'salary_account_Janata sahakari bank', 'salary_account_KALYAN JANATA SAHAKARI', 'salary_account_KARNATAKA BANK', 'salary_account_KARUR VYSYA', 'salary_account_KARYR VYSYA BANK', 'salary_account_KMB', 'salary_account_KOTAK', 'salary_account_KOTAK MAHINDAR BANK', 'salary_account_KOTAK MAHINDRA', 'salary_account_KOTAK MAHINDRA BANK', 'salary_account_KOTAK MAHINDRA BANL LTD', 'salary_account_KOTAK MAHINDRA bank', 'salary_account_KVB', 'salary_account_Karnataka bank Ltd', 'salary_account_Karur Vysya Bank', 'salary_account_Kotak', 'salary_account_Kotak Mahendra bank', 'salary_account_Kotak Mahindra', 'salary_account_Kotak Mahindra Bank', 'salary_account_Kotak Mahindra bank', 'salary_account_Kotak bank', 'salary_account_Kotak mahendra bank', 'salary_account_Kotak mahindra', 'salary_account_Kotak mahindra bank', 'salary_account_LVB', 'salary_account_Mahindra kotak', 'salary_account_NRI', 'salary_account_OBC', 'salary_account_ORIENTAL BANK OF COMMERCE', 'salary_account_Oriental Bank of Commerce', 'salary_account_PMC bank', 'salary_account_PNB', 'salary_account_POSTOFFICE      SALARY ACCOUNT', 'salary_account_PUNJAB NATIONAL BANK', 'salary_account_Punjab National Bank', 'salary_account_Punjab national bank', 'salary_account_RBL', 'salary_account_RBL BANK', 'salary_account_RBL Bank', 'salary_account_Ratnakar Bank', 'salary_account_Ratnakar Bank Limited', 'salary_account_SBBJ', 'salary_account_SBH', 'salary_account_SBI', 'salary_account_SBI GROUP', 'salary_account_SBM', 'salary_account_SBP', 'salary_account_SBT', 'salary_account_SCB', 'salary_account_SOUTH INDIA BANK', 'salary_account_SOUTH INDIAN', 'salary_account_SOUTH INDIAN BANK', 'salary_account_STANDARD', 'salary_account_STANDARD CHARTED', 'salary_account_STANDARD CHARTED BANK', 'salary_account_STANDARD CHARTERED', 'salary_account_STANDARD CHARTERED BANK', 'salary_account_STANDRED CHARTRED BANK', 'salary_account_STATE BANK OF BIKANER & JAIPUR', 'salary_account_STATE BANK OF HYDERABAD', 'salary_account_STATE BANK OF MYSORE', 'salary_account_STATEBANK OF TRAVANCORE', 'salary_account_STB', 'salary_account_SYNDICATE', 'salary_account_SYNDICATE BANK', 'salary_account_Saraswat bank', 'salary_account_Sarswath cooperative bank', 'salary_account_Sbbj', 'salary_account_South Indian Bank', 'salary_account_South Indian bank', 'salary_account_Standard Charted Bank', 'salary_account_Standard Charterd Bank', 'salary_account_Standard Chartered', 'salary_account_Standard Chartered Bank', 'salary_account_Standard Chartered bank', 'salary_account_Standard Chattered', 'salary_account_Standard chartered', 'salary_account_Standard chartered bank', 'salary_account_Standart chartered', 'salary_account_Standered charted bank', 'salary_account_State Bank Of Tranvakoor', 'salary_account_State Bank of Hyderabad', 'salary_account_State bank of Hyderabad', 'salary_account_State bank of hyderabad', 'salary_account_State bank of mysore', 'salary_account_Syndicate Bank', 'salary_account_Syndicate bank', 'salary_account_Syndicatebank', 'salary_account_TAMILNAD MERCANTILE BANK', 'salary_account_THE FEDERAL BANK LTD', 'salary_account_THROUGH BANK ACCOUNT', 'salary_account_TMB', 'salary_account_The Ahmedabad District Cooperative bank Ltd', 'salary_account_The Saraswat co-operative Bank', 'salary_account_Transfer through UAE', 'salary_account_UBI', 'salary_account_UCO', 'salary_account_UCO BANK', 'salary_account_UNION', 'salary_account_UNION BANK OF INDIA', 'salary_account_UNITED BANK OF INDIA', 'salary_account_Ubi', 'salary_account_Uco bank', 'salary_account_Union Bank of India', 'salary_account_Union bank', 'salary_account_Union bank of india', 'salary_account_Unknown', 'salary_account_Uttar banga kheyriya gramin bank', 'salary_account_VIJAYA BANK', 'salary_account_Vijaya Bank', 'salary_account_Vijaya bank', 'salary_account_YES', 'salary_account_YES BANK', 'salary_account_YESBANK', 'salary_account_Yes', 'salary_account_Yes Bank', 'salary_account_Yes bank', 'salary_account_andhra bank', 'salary_account_andhrabank', 'salary_account_axis bank', 'salary_account_bank', 'salary_account_bank of baroda', 'salary_account_bank of india', 'salary_account_bank of maharashtra', 'salary_account_bob', 'salary_account_boi', 'salary_account_bom', 'salary_account_by cash', 'salary_account_canara', 'salary_account_canara bank', 'salary_account_cash', 'salary_account_cbi', 'salary_account_central bank of india', 'salary_account_citibank', 'salary_account_city union bank', 'salary_account_co operative bank', 'salary_account_co-operative', 'salary_account_corparation bank', 'salary_account_corporation', 'salary_account_corporation Bank', 'salary_account_corporation bank', 'salary_account_creditted to bank', 'salary_account_deutsche bank', 'salary_account_dhanalakshmi bank', 'salary_account_federal bank', 'salary_account_hdfc', 'salary_account_hsbc', 'salary_account_idbi', 'salary_account_idbi bank', 'salary_account_img', 'salary_account_indian', 'salary_account_indian bank', 'salary_account_indian overseas bank', 'salary_account_indusind', 'salary_account_indusind bank', 'salary_account_indusinda bank', 'salary_account_indusinf', 'salary_account_ing', 'salary_account_ing vysa', 'salary_account_iob', 'salary_account_karur vysya bank', 'salary_account_kotak', 'salary_account_kotak BANK', 'salary_account_kotak Mahindra Bank', 'salary_account_kotak bank', 'salary_account_kotak mahindra', 'salary_account_kotak mahindra bank', 'salary_account_lakshmi vilas bank', 'salary_account_neft', 'salary_account_obc', 'salary_account_oriental bank of commerce', 'salary_account_other', 'salary_account_otherBank', 'salary_account_others', 'salary_account_pnb', 'salary_account_punjab national bank', 'salary_account_receiveByCashOrCheque', 'salary_account_saraswath co-operativa bank', 'salary_account_sbbj', 'salary_account_sbh', 'salary_account_sbi', 'salary_account_sbm', 'salary_account_south Indian Bank', 'salary_account_standard Charatered', 'salary_account_standard charted', 'salary_account_standard charted bank', 'salary_account_standard chartered', 'salary_account_standard chartered bank', 'salary_account_standard chatered', 'salary_account_standerd charted', 'salary_account_state Bank of travancore', 'salary_account_state bank of hyderabad', 'salary_account_state bank of mysore', 'salary_account_state bank of patila', 'salary_account_state bank of travancore', 'salary_account_statebankofhyderabad', 'salary_account_suh', 'salary_account_syndicate', 'salary_account_syndicate bank', 'salary_account_through bank', 'salary_account_ubi', 'salary_account_union bank of india', 'salary_account_yes bank', 'salary_account_yesbank', 'marital_status_Single', 'employment_type_Self Employed', 'city_band_Chennai', 'city_band_Kolkata', 'city_band_Pune', 'email_login_flag_One', 'unique_days_logged_in_flag_Three', 'negative_status_flag_One']
# try:
#     for feature in worst_features:
#         processed_dataset.drop([feature],axis=1,inplace=True)
# except:
#     pass
# processed_dataset=processed_dataset[best_features]
# processed_dataset.head()


'green'

In [43]:
list(processed_dataset.columns)

['user_id',
 'nsaleable',
 'monthly_income',
 'credit_score',
 'age',
 'settled_flag',
 'open_total_ratio',
 'totl_neg_ratio',
 'saleable_accounts_count',
 'auto_loan_count',
 'gold_loan_count',
 'consumer_loan_count',
 'housing_loan_count',
 'credit_card_count',
 'personal_loan_count',
 'two_wheeler_count',
 'open_accounts',
 'open_bank_fintech',
 'open_bank_nbfc',
 'closed_accounts',
 'settled_accounts',
 'written_off_accounts',
 'flows_accounts',
 'secured',
 'unsecured',
 'open_negative_status_count',
 'individual_account',
 'joint_account',
 'total_amt_sanctioned',
 'total_balance',
 'credit_limit',
 'repayment_tenure',
 'total_emi_amount',
 'hdfc_neg_flag',
 'citi_neg_flag',
 'sbi_neg_flag',
 'icici_neg_flag',
 'cc_neg_flag',
 'pl_neg_flag',
 'cl_neg_flag',
 'auto_loan_close_count_last3mnths',
 'gold_loan_close_count_last3mnths',
 'consumer_loan_close_count_last3mnths',
 'housing_loan_close_count_last3mnths',
 'credit_card_close_count_last3mnths',
 'personal_loan_close_count_last

In [44]:
processed_dataset.isnull().sum()

user_id                                     0
nsaleable                                   0
monthly_income                              0
credit_score                                0
age                                         0
settled_flag                                0
open_total_ratio                            0
totl_neg_ratio                              0
saleable_accounts_count                     0
auto_loan_count                             0
gold_loan_count                             0
consumer_loan_count                         0
housing_loan_count                          0
credit_card_count                           0
personal_loan_count                         0
two_wheeler_count                           0
open_accounts                               0
open_bank_fintech                           0
open_bank_nbfc                              0
closed_accounts                             0
settled_accounts                            0
written_off_accounts              

In [45]:
# processed_dataset.last_login_vintage_M1toM2.value_counts()

In [46]:
# processed_dataset.loc[processed_dataset['cfp_subscribed'] == 1].last_login_vintage_M1toM2.value_counts()

In [47]:
# processed_dataset.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

## Train Test Split

In [48]:
'''  Preparing test and train data '''
X = processed_dataset.loc[:,processed_dataset.columns != 'cfp_subscribed']
y = processed_dataset['cfp_subscribed']
train_X_u,test_X_u,train_y,test_y = train_test_split(X,y,random_state=31,test_size=0.1)
train_X = train_X_u.loc[:,train_X_u.columns != 'user_id']
test_X = test_X_u.loc[:,test_X_u.columns != 'user_id']

In [49]:
# train_y.head()

## Model Building

### Training

In [50]:
# #LGB
# ''' Tuning Grid '''

# gridParams = {
#     'learning_rate': np.arange(0.01,0.15,0.05),
#     'n_estimators': np.arange(100,500,100),
#     'num_leaves': np.arange(5,20,5),
#     'boosting_type' : ['gbdt'],
#     'task': ['train'],
#     'random_state' : [100], 
#     'colsample_bytree' : np.arange(0.7,0.2,-0.1),
#     'reg_alpha' : [1,1.2],
#     'reg_lambda' : [1,1.2,1.4],
#     'metric': ['mse'],
#     #'metric': ['logloss'],
#     'application' : ['regression']}  

# ''' Model Object '''

# Light_GBM = lgb.LGBMRegressor(boosting_type= 'gbdt', 
#           objective = 'binary', random_state=2734,
#           silent = True)

# grid = RandomizedSearchCV(estimator = Light_GBM , param_distributions= gridParams, verbose=1 , cv=5)

# ''' Fitting the model'''

# grid.fit(X=train_X,y=train_y)


##### Initializing Model, Hyper Parameter Tuning and Model Fitting

In [51]:
#SGD
''' Tuning Grid '''
gridParams = {'loss': ['modified_huber'],
                'penalty':['l1','l2','elasticnet'],
                'max_iter':[10000],
                'random_state' : [13,100],
                'warm_start':[False],
                'n_iter_no_change':[5,10],
                'early_stopping':[True],
                'learning_rate':['optimal']
            }

'''Model Object'''
model1 = SGDClassifier()

''' Grid Formation '''
grid1 = RandomizedSearchCV(estimator = model1 , param_distributions= gridParams, verbose = 2, cv = 5, n_iter=500)
# grid1.fit(train_X, train_y)

In [52]:
#RandomForest
''' Tuning Grid '''
gridParams = {'n_estimators': [100,300],
                'max_features':['sqrt','log2',None],
                'random_state' : [21],
                'criterion':['gini', 'entropy'],
                 'max_depth':[None,50],
                'min_samples_split':[2,3],
                'min_samples_leaf':[1,2]
             }

'''Model Object'''
model2 = RandomForestClassifier(warm_start=False)

''' Grid Formation '''
grid2 = RandomizedSearchCV(estimator = model2 , param_distributions= gridParams, verbose = 2, cv = 5, n_iter=500)
grid2.fit(train_X, train_y)
# ensembled=grid2



Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   2.4s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   2.0s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   1.4s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   1.6s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   1.6s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300, random_state=21; total time=   5.4s
[CV] END

[CV] END criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.5s
[CV] END criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.5s
[CV] END criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=   4.4s
[CV] END criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=   4.9s
[CV] END criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=   5.3s
[CV] END criterion=gini, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=   4.4s
[CV] END criterion=gini, max_depth=None, max_features=log2, min_sample

[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=  15.8s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=  15.3s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=  15.3s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=  10.7s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=3, n_estimators=100, random_state=21; total time=   4.8s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=3, n_estimators=100, random_state=21; total time=   5.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_sample

[CV] END criterion=gini, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   2.8s
[CV] END criterion=gini, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   1.8s
[CV] END criterion=gini, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   1.8s
[CV] END criterion=gini, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   1.5s
[CV] END criterion=gini, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   1.3s
[CV] END criterion=gini, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300, random_state=21; total time=   4.6s
[CV] END criterion=gini, max_depth=50, max_features=log2, min_samples_leaf=1, min_

[CV] END criterion=gini, max_depth=50, max_features=None, min_samples_leaf=1, min_samples_split=3, n_estimators=100, random_state=21; total time=   3.4s
[CV] END criterion=gini, max_depth=50, max_features=None, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=  15.2s
[CV] END criterion=gini, max_depth=50, max_features=None, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=  15.3s
[CV] END criterion=gini, max_depth=50, max_features=None, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=  15.2s
[CV] END criterion=gini, max_depth=50, max_features=None, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=  15.7s
[CV] END criterion=gini, max_depth=50, max_features=None, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=  11.3s
[CV] END criterion=gini, max_depth=50, max_features=None, min_samples_leaf=2, min_

[CV] END criterion=entropy, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=   4.8s
[CV] END criterion=entropy, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=   5.3s
[CV] END criterion=entropy, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=   5.2s
[CV] END criterion=entropy, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.6s
[CV] END criterion=entropy, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.5s
[CV] END criterion=entropy, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.5s
[CV] END criterion=entropy, max_depth=None, max_feat

[CV] END criterion=entropy, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=3, n_estimators=300, random_state=21; total time=   4.7s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   5.8s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   5.3s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   5.1s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   6.0s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21; total time=   3.7s
[CV] END criterion=entropy, max_depth=None, max_feat

[CV] END criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.6s
[CV] END criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.5s
[CV] END criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.6s
[CV] END criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=100, random_state=21; total time=   1.7s
[CV] END criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=   5.0s
[CV] END criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=300, random_state=21; total time=   5.7s
[CV] END criterion=entropy, max_depth=50, max_features=sqrt, min

[CV] END criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100, random_state=21; total time=   1.8s
[CV] END criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=   7.4s
[CV] END criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=   5.9s
[CV] END criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=   5.6s
[CV] END criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=   5.2s
[CV] END criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300, random_state=21; total time=   5.6s
[CV] END criterion=entropy, max_depth=50, max_features=log2, min

[CV] END criterion=entropy, max_depth=50, max_features=None, min_samples_leaf=2, min_samples_split=3, n_estimators=300, random_state=21; total time=  16.8s
[CV] END criterion=entropy, max_depth=50, max_features=None, min_samples_leaf=2, min_samples_split=3, n_estimators=300, random_state=21; total time=  16.4s
[CV] END criterion=entropy, max_depth=50, max_features=None, min_samples_leaf=2, min_samples_split=3, n_estimators=300, random_state=21; total time=  11.0s


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=500,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 50],
                                        'max_features': ['sqrt', 'log2', None],
                                        'min_samples_leaf': [1, 2],
                                        'min_samples_split': [2, 3],
                                        'n_estimators': [100, 300],
                                        'random_state': [21]},
                   verbose=2)

In [53]:
# ensembled = VotingClassifier(estimators=[('rf', grid2)], voting='soft') #('sgd', grid1),('rf', grid2)
# ensembled.fit(train_X, train_y)

In [54]:
# ''' Calibration '''
# calibrator = CalibratedClassifierCV(SGD, method="isotonic")

# ''' Training the model '''
# calibrator.fit(train_X, train_y)

##### Pickling Model

In [55]:
''' Saving model as pickle object '''
# joblib.dump(grid,"clf_model_basic.pkl")
# joblib.dump(best_features,"clf_model_best_features.pkl")
# joblib.dump(grid,"clf_model_basic_Green.pkl")
# joblib.dump(best_features,"clf_model_best_features_Green.pkl")
# joblib.dump(grid,"clf_model_basic_Red1.pkl")
# joblib.dump(best_features,"clf_model_best_features_Red1.pkl")
# joblib.dump(grid1,"clf_SGD_Red.pkl")
# joblib.dump(grid2,"clf_RF_Red.pkl")
# joblib.dump(grid1,"clf_SGD_Green.pkl")
# joblib.dump(grid2,"clf_RF_Green.pkl")
# joblib.dump(ensembled,"clf_Ensembled_Green.pkl")
# joblib.dump(grid1,"clf_SGD_Red.pkl")
# joblib.dump(grid2,"clf_RF_Red.pkl")
# joblib.dump(ensembled,"clf_Ensembled_Red.pkl")
joblib.dump(grid2,"clf_Ensembled_F_Green_Random.pkl")
# joblib.dump(grid,"clf_Ensembled_F_Green.pkl")

['clf_Ensembled_F_Green_Random.pkl']

In [None]:
# grid1=joblib.load('clf_SGD_Green.pkl')
# grid2=joblib.load('clf_RF_Green.pkl')
# grid=joblib.load('clf_Ensembled_Green.pkl')
# grid1=joblib.load('clf_SGD_Red.pkl')
# grid2=joblib.load('clf_RF_Red.pkl')
# grid=joblib.load('clf_Ensembled_Red.pkl')
grid2=joblib.load('clf_Ensembled_F_Green_Random.pkl')
# grid2=joblib.load('clf_Ensembled_F_Green.pkl')

In [None]:
# feature_imp = {}
# loop=1
# for est in grid2.estimators_:
#     if loop==1:
#         loop=2
#         temp={}
#         for i in range (0,len(list(est.best_estimator_.feature_names_in_))):
#             temp[list(est.best_estimator_.feature_names_in_)[i]]=str(list(est.best_estimator_.coef_[0])[i])+'y + '+str(est.best_estimator_.intercept_)
#         feature_imp['SGD'] = dict(sorted(temp.items(), key=lambda item: item[1]))
#     elif loop==2:
#         temp={}
#         for i in range (0,len(est.best_estimator_.feature_importances_)):
#             temp[est.best_estimator_.feature_names_in_[i]]=est.best_estimator_.feature_importances_[i]
#         feature_imp['RF'] = dict(sorted(temp.items(), key=lambda item: item[1]))
# feature_imp


In [None]:
# best_features = ensembled.best_estimator_
# print(best_features)

In [None]:
# best_features1 = grid1.best_estimator_
# print(best_features1)

In [None]:
best_features2 = {}
for i in range(0,len(grid2.best_estimator_.feature_names_in_)):
    best_features2[grid2.best_estimator_.feature_names_in_[i]]=grid2.best_estimator_.feature_importances_[i]
dict(sorted(best_features2.items(), key=lambda item: item[1]))

In [None]:
# feature_importances_dict1={}
# for i in range (0,len(list(best_features1.feature_names_in_))):
#     feature_importances_dict1[list(best_features1.feature_names_in_)[i]]=list(best_features1.coef_[0])[i]
# dict(sorted(feature_importances_dict1.items(), key=lambda item: item[1]))

In [None]:
# feature_importances_dict2={}
# for i in range (0,len(best_features2.feature_importances_)):
#     feature_importances_dict2[best_features2.feature_names_in_[i]]=best_features2.feature_importances_[i]
# dict(sorted(feature_importances_dict2.items(), key=lambda item: item[1]))

### Predicting

##### Predicting the test data

In [None]:
''' Predicting using the best fit parameters '''
# test_fit = best_features.predict_proba(test_X)
# y_pred = pd.DataFrame(test_fit, columns = ['predicted_value'])
# processed_dataset=test_X.loc[:,processed_dataset.columns != 'user_id']
# test_fit = grid.predict_proba(processed_dataset.loc[:,processed_dataset.columns != 'user_id'])
# test_fit = grid.predict_proba(test_X)
# test_fit1 = grid1.predict_proba(test_X)
test_fit2 = grid2.predict_proba(test_X)
y_pred=pd.DataFrame(test_fit2)
# y_pred1=pd.DataFrame(test_fit1)
# y_pred2=pd.DataFrame(test_fit2)

In [None]:
grid2.classes_

In [None]:
grid2.predict(test_X)

In [None]:
# # y_pred['predicted_value'].value_counts()
# y_pred_1=pd.DataFrame((y_pred1[1]))
# y_pred_2=pd.DataFrame((y_pred2[1]))
# y_pred_manual=pd.DataFrame(y_pred1[1]+(2*y_pred2[1]))
# # y_pred=pd.DataFrame(y_pred1[1]*y_pred2[1])
y_pred.head()

In [None]:
# predicted_y=y_pred['predicted_value'].apply(lambda x: 1.00 if x >= 1 else 0.00)
predicted_y=y_pred[1].apply(lambda x: 1 if x > 0.01 else 0)

### Metrics

##### Testing Accuracy

In [None]:
accuracy_score(test_y, predicted_y)

In [None]:
cm=ConfusionMatrixDisplay(confusion_matrix(test_y, predicted_y))
cm.plot()
plt.show()

In [None]:
false_positive_rate, true_positive_rate, threshold1 = roc_curve(test_y, predicted_y)
'roc='+str(roc_auc_score(test_y, predicted_y))

In [None]:
mae = mean_absolute_error(test_y, predicted_y)
mse = mean_squared_error(test_y, predicted_y)
rmse = np.sqrt(mse) # or mse**(0.5)  
r2 = r2_score(test_y, predicted_y)

print("Results of sklearn.metrics:")
print("MAE:",mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-Squared:", r2)

In [None]:
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

### Split to Deciles

In [None]:
# y_pred_deciles.drop(y_pred_deciles.index, inplace=True)
y_pred_deciles = pd.concat([y_pred[1].reset_index(drop = True), test_X_u['user_id'].reset_index(drop = True)], axis = 1)
y_pred_deciles.rename(columns = {1:'predicted_value'}, inplace = True)
y_pred_deciles.head()

In [None]:
# y_pred_deciles_manual = pd.concat([y_pred_manual[1].reset_index(drop = True), test_X_u['user_id'].reset_index(drop = True)], axis = 1)
# y_pred_deciles_manual.rename(columns = {1:'predicted_value'}, inplace = True)
# y_pred_deciles_manual.head()

In [None]:
# y_pred_deciles_1 = pd.concat([y_pred_1[1].reset_index(drop = True), test_X_u['user_id'].reset_index(drop = True)], axis = 1)
# y_pred_deciles_1.rename(columns = {1:'predicted_value'}, inplace = True)
# y_pred_deciles_1.head()

In [None]:
# y_pred_deciles_2 = pd.concat([y_pred_2[1].reset_index(drop = True), test_X_u['user_id'].reset_index(drop = True)], axis = 1)
# y_pred_deciles_2.rename(columns = {1:'predicted_value'}, inplace = True)
# y_pred_deciles_2.head()

In [None]:
y_pred_deciles = pd.merge(y_pred_deciles,cfp_backup, on = 'user_id', how = 'left')
y_pred_deciles.head()

In [None]:
# y_pred_deciles_manual = pd.merge(y_pred_deciles_manual,cfp_backup, on = 'user_id', how = 'left')
# y_pred_deciles_manual.head()

In [None]:
# y_pred_deciles_1 = pd.merge(y_pred_deciles_1,cfp_backup, on = 'user_id', how = 'left')
# y_pred_deciles_1.head()

In [None]:
# y_pred_deciles_2 = pd.merge(y_pred_deciles_2,cfp_backup, on = 'user_id', how = 'left')
# y_pred_deciles_2.head()

In [None]:
y_pred_deciles['decile'] = pd.qcut(y_pred_deciles['predicted_value'].rank(method='first'), 10, labels=False)
y_pred_deciles['decile'] = y_pred_deciles['decile'].apply(lambda x: int(10 - x))
y_pred_deciles['decile'].value_counts()

In [None]:
dummies1 = pd.get_dummies(data = y_pred_deciles['decile'], drop_first=False)
dummies1.head()
processed_dataset1 = pd.concat([processed_dataset.reset_index(drop = True),
                               dummies1.reset_index(drop = True)], axis=1)
processed_dataset1.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

In [None]:
# y_pred_deciles_manual['decile'] = pd.qcut(y_pred_deciles_manual['predicted_value'].rank(method='first'), 10, labels=False)
# y_pred_deciles_manual['decile'] = y_pred_deciles_manual['decile'].apply(lambda x: int(10 - x))
# y_pred_deciles_manual['decile'].value_counts()

In [None]:
# y_pred_deciles_1['decile'] = pd.qcut(y_pred_deciles_1['predicted_value'].rank(method='first'), 10, labels=False)
# y_pred_deciles_1['decile'] = y_pred_deciles_1['decile'].apply(lambda x: int(10 - x))
# y_pred_deciles_1['decile'].value_counts()

In [None]:
# y_pred_deciles_2['decile'] = pd.qcut(y_pred_deciles_2['predicted_value'].rank(method='first'), 10, labels=False)
# y_pred_deciles_2['decile'] = y_pred_deciles_2['decile'].apply(lambda x: int(10 - x))
# y_pred_deciles_2['decile'].value_counts()

#### Export to CSV

In [None]:
# y_pred_deciles.to_csv('CFP_deciled_data_Green.csv')
# y_pred_deciles.drop(['pincode'],axis=1,inplace=True)
# y_pred_deciles_manual.drop(['pincode'],axis=1,inplace=True)
# y_pred_deciles_1.drop(['pincode'],axis=1,inplace=True)
# y_pred_deciles_2.drop(['pincode'],axis=1,inplace=True)
# y_pred_deciles.to_csv('CFP_deciled_data_Red.csv')

# y_pred_deciles.to_csv('CFP_deciled_data_Green.csv')
# y_pred_deciles_manual.to_csv('CFP_deciled_data_Green_manual.csv')
# y_pred_deciles_1.to_csv('CFP_deciled_data_Green_SGB.csv')
# y_pred_deciles_2.to_csv('CFP_deciled_data_Green_RF.csv')

# y_pred_deciles.to_csv('CFP_deciled_data_Red.csv')
# y_pred_deciles_manual.to_csv('CFP_deciled_data_Red_manual.csv')
# y_pred_deciles_1.to_csv('CFP_deciled_data_Red_SGB.csv')
# y_pred_deciles_2.to_csv('CFP_deciled_data_Red_RF.csv')

y_pred_deciles.to_csv('CFP_deciled_data_Green_ltd_Randomized.csv')
# y_pred_deciles.to_csv('CFP_deciled_data_Green_ltd.csv')

##### -- The End