# CFP Propensity Version 1.0

### LIBRARY IMPORTS

In [None]:
'''Importing Required Libraries'''
import copy
from collections import OrderedDict
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.2f}'.format
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

### DATA IMPORT

In [None]:
'''Importing the data for modelling'''
# Tokens that should be parsed as missing values, in addition to the pandas
# defaults.
missing_value_tokens = ['', ' ', 'NH', 'NA', '[]']
# To model the Green extract instead, point read_csv at
# r"D:\Dbeaver Out\CFP_Data_Green_202301051222.csv".
cfp_data = pd.read_csv(
    r"D:\Dbeaver Out\CFP_Data_Red_202301051318.csv",
    na_values=missing_value_tokens,
)

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
cfp_data.shape

In [None]:
# List the column labels available for modelling.
cfp_data.columns

In [None]:
# Preview the first rows of the raw extract.
cfp_data.head()

## PRE-PROCESSING

#### Handling Null Values

In [None]:
# Count the missing values in every column before any imputation.
cfp_data.isnull().sum()

In [None]:
# Summary statistics with extra tail percentiles, one row per column
# (transposed for readability).
cfp_data.describe(percentiles=[0.05,0.10,0.25,0.5,0.75,0.9,0.95,0.99]).transpose()

In [None]:
# Replace missing values with zeros and other hard-coded defaults.
#
# Columns are filled by assigning back to the frame. The previous form,
# `cfp_data.<col>.fillna(..., inplace=True)`, mutates a Series pulled off the
# frame; with pandas Copy-on-Write enabled that change never reaches
# `cfp_data`.

# Columns whose missing values become 0.
zero_fill_columns = [
    'ptp_last_three_months_flag',
    'totl_neg_ratio',
    # *_last3mnths closure counters
    'auto_loan_close_count_last3mnths',
    'gold_loan_close_count_last3mnths',
    'consumer_loan_close_count_last3mnths',
    'housing_loan_close_count_last3mnths',
    'credit_card_close_count_last3mnths',
    'personal_loan_close_count_last3mnths',
    'two_wheeler_close_count_last3mnths',
    'individual_close_account_last3mnths',
    'secured_close_last3mnths',
    'closed_accounts_last3mnths',
    'negative_status_close_count_last3mnths',
    'joint_account_close_last3mnths',
    'unsecured_close_last3mnths',
    # account counters and balances
    'auto_loan_count',
    'gold_loan_count',
    'consumer_loan_count',
    'housing_loan_count',
    'credit_card_count',
    'personal_loan_count',
    'two_wheeler_count',
    'individual_account',
    'open_accounts',
    'settled_accounts',
    'secured',
    'unsecured',
    'open_negative_status_count',
    'joint_account',
    # NOTE(review): total_balance used to be zero-filled and then mean-filled;
    # the mean fill never had anything left to fill, so 0 is the value that
    # actually took effect and is kept here. Confirm that 0 (not the mean) is
    # the intended default.
    'total_balance',
    'written_off_accounts',
    'flows_accounts',
    'closed_accounts',
    # negative-status flags
    'hdfc_neg_flag',
    'citi_neg_flag',
    'sbi_neg_flag',
    'icici_neg_flag',
    'cc_neg_flag',
    'pl_neg_flag',
    'cl_neg_flag',
    # m3 / m6 / m9 / m12 history columns
    'score_on_m9',
    'score_on_m3',
    'score_on_m12',
    'score_on_m6',
    'neg_acc_on_m9',
    'neg_acc_on_m3',
    'neg_acc_on_m12',
    'neg_acc_on_m6',
    'max_dpd_on_m9',
    'max_dpd_on_m3',
    'max_dpd_on_m12',
    'max_dpd_on_m6',
    'cfp_interest',
]
# score_track, neg_acc_track and max_dpd_track are deliberately not filled
# here (their fills were already disabled).

# Columns whose missing values become the mean of the observed values.
mean_fill_columns = ['max_score', 'total_emi_amount', 'total_amt_sanctioned']

# DataFrame.fillna with a Series fills each column with the value stored
# under that column's label.
cfp_data[mean_fill_columns] = cfp_data[mean_fill_columns].fillna(
    cfp_data[mean_fill_columns].mean()
)
cfp_data[zero_fill_columns] = cfp_data[zero_fill_columns].fillna(0)
cfp_data['salary_account'] = cfp_data['salary_account'].fillna('Unknown')

In [None]:
'''Replacing based on median'''
# Median age of the rows already labelled 'Married'.
# Series.median() skips missing ages. np.median() returns NaN as soon as one
# age in the selection is NaN (ages are only imputed in a later cell), and a
# NaN median makes every `age >= median` test False, i.e. every missing
# marital status would be filled with 'Single'.
median_marriage_age = cfp_data.loc[cfp_data['marital_status'] == 'Married', 'age'].median()
def marital_na_filler(marital_status, age, median_age=None):
    """Return a marital status, inferring it from age when it is missing.

    Parameters
    ----------
    marital_status : scalar
        The recorded status; NaN/None means "missing".
    age : number
        Age used for the inference when the status is missing.
    median_age : number, optional
        Age at or above which a missing status becomes 'Married'. Defaults
        to the module-level ``median_marriage_age``.

    Returns
    -------
    The original ``marital_status`` when it is present, otherwise 'Married'
    if ``age >= median_age`` and 'Single' if not (a NaN age therefore yields
    'Single').
    """
    # pd.isna copes with strings and None; np.isnan raises TypeError on them,
    # which previously had to be hidden behind a bare `except`.
    if not pd.isna(marital_status):
        return marital_status
    if median_age is None:
        median_age = median_marriage_age
    return 'Married' if age >= median_age else 'Single'
# Fill the missing marital statuses row by row from the matching age.
cfp_data['marital_status'] = [
    marital_na_filler(status, age)
    for status, age in zip(cfp_data['marital_status'], cfp_data['age'])
]

In [None]:
# Replace missing ages and monthly incomes with the column mean (computed
# over the observed values only).
# Assigning back to the frame replaces the chained
# `cfp_data.<col>.fillna(..., inplace=True)` form, which does not update the
# frame when pandas Copy-on-Write is active.
# NOTE(review): the means are taken before the later cell that flips
# negative incomes/ages to positive, so negative entries lower these means.
mean_imputed_columns = ['age', 'monthly_income']
cfp_data[mean_imputed_columns] = cfp_data[mean_imputed_columns].fillna(
    cfp_data[mean_imputed_columns].mean()
)

In [None]:
# Replace missing credit limits and repayment tenures with the most frequent
# value, truncated to an integer.
# Series.mode() returns a Series holding every tied mode in sorted order, so
# the first entry is picked explicitly: int(<Series>) raises when several
# values tie and is deprecated even for a single-element Series.
mode_credit_limit = cfp_data['credit_limit'].mode()
cfp_data['credit_limit'] = cfp_data['credit_limit'].fillna(int(mode_credit_limit.iloc[0]))

mode_repayment_tenure = cfp_data['repayment_tenure'].mode()
cfp_data['repayment_tenure'] = cfp_data['repayment_tenure'].fillna(int(mode_repayment_tenure.iloc[0]))

In [None]:
''' Replacing FLAG in city_band with 'Missing/Others' '''
# Only entries equal to 'FLAG' are rewritten; every other value, including
# missing ones, is kept as it is.
cfp_data['city_band'] = cfp_data['city_band'].mask(
    cfp_data['city_band'] == 'FLAG', 'Missing/Others'
)

In [None]:
'''Format Employment and handle null values'''
def getFormattedEmploymentType(employment_type):
    """Collapse a raw employment label into 'Salaried' or 'Self Employed'.

    Parameters
    ----------
    employment_type : scalar
        Raw label; may be missing (NaN/None).

    Returns
    -------
    'Self Employed' when the label is one of the known self-employed
    spellings, otherwise 'Salaried'. 'Salaried' is therefore also the default
    for missing or unrecognised labels, as well as for the known salaried
    spellings ('Salaried', 'salaried', 'Salaried Doctor', 'Salaried doctor',
    'salariedDoctor', 'Working Executive', 'Student'). A value that cannot be
    compared with the labels at all is returned unchanged.
    """
    self_employed_labels = (
        'Self Employed', 'Self employed', 'Self employed professional',
        'Self-Employed', 'selfemployee', 'selfEmployedProfessional',
        'Self employed business', 'selfEmployedBusiness',
        'Self Employed Business',
    )
    try:
        if employment_type in self_employed_labels:
            return "Self Employed"
    except (TypeError, ValueError):
        # Best-effort pass-through for values whose equality test does not
        # produce a plain boolean (e.g. array-likes).
        return employment_type
    return "Salaried"
# Normalise every employment label element-wise.
cfp_data['employment_type'] = cfp_data['employment_type'].map(getFormattedEmploymentType)

In [None]:
# Income and age: turn negative entries positive.
# Series.abs() is the vectorised equivalent of flipping the sign of each
# negative value in a per-row lambda; missing values stay missing.
cfp_data['monthly_income'] = cfp_data['monthly_income'].abs()
cfp_data['age'] = cfp_data['age'].abs()

In [None]:
# Re-count the missing values per column after the imputation cells above.
cfp_data.isnull().sum()
# cfp_data.salary_account.value_counts()

In [None]:
# Keep an independent snapshot of the imputed frame (it is merged back onto
# the scored rows later), then remove the date and pincode columns from the
# working frame in place.
cfp_backup = cfp_data.copy(deep=True)
cfp_data.drop(
    columns=['ltd', 'latest_login_date', 'first_profile_date', 'latest_profile_date', 'pincode'],
    inplace=True,
)

#### Probe and Handle Outliers

In [None]:
# Numeric columns that go through capping and skew correction: every numeric
# column except the ones named in `exclude`.
exclude = [
    'user_id', 'cfp_subscribed', 'monthly_income', 'age', 'cfp_interest',
    'positive_response_rate', 'contactability', 'ptp_last_three_months_flag',
]
numerical_columns = cfp_data.select_dtypes(include='number').columns.tolist()
# list.remove raises ValueError when an excluded name is not a numeric
# column, which surfaces schema drift early.
for excluded_name in exclude:
    numerical_columns.remove(excluded_name)
print(numerical_columns)

In [None]:
# Plot the distribution of one numeric column; change the list position (20)
# to inspect a different column.
sns.displot(cfp_data[numerical_columns[20]])

In [None]:
# Percentile values (5th, 10th, 95th) of every numeric column, as Series
# indexed by column name.
# numeric_only=True is required because the frame still holds text columns:
# from pandas 2.0 on, DataFrame.quantile no longer drops them silently and
# raises instead.
Q05 = cfp_data.quantile(0.05, numeric_only=True)
Q10 = cfp_data.quantile(0.10, numeric_only=True)
Q95 = cfp_data.quantile(0.95, numeric_only=True)

In [None]:
'''Capping Upper Limit'''
# Values above a column's 95th percentile are replaced by that percentile.
for column in ['monthly_income', 'nsaleable', *numerical_columns]:
    ceiling = Q95[column]
    current_values = cfp_data[column]
    cfp_data[column] = np.where(current_values > ceiling, ceiling, current_values)


'''Capping Upper and Lower Limit'''
# Age is limited on both sides: to the 5th percentile from below and the
# 95th percentile from above.
age_values = cfp_data['age']
age_floored = np.where(age_values < Q05.age, Q05.age, age_values)
cfp_data['age'] = np.where(age_values > Q95.age, Q95.age, age_floored)

In [None]:
'''Finding Skewness'''
def find_skew(data, numerical_columns, threshold=3):
    """Report a median-based skewness coefficient for each listed column.

    The coefficient is ``3 * (mean - median) / std``, rounded to 2 decimals.

    Parameters
    ----------
    data : DataFrame (or any mapping of column name -> Series)
        Source of the columns.
    numerical_columns : iterable of str
        Names of the columns to evaluate.
    threshold : number, default 3
        A column whose rounded coefficient is above ``threshold`` in absolute
        value is reported with the marker string instead of the number.

    Returns
    -------
    dict
        Column name -> rounded coefficient, or the 'Skewww...' marker string
        when the coefficient exceeds the threshold or cannot be computed
        because the column's standard deviation is zero or undefined.

    Notes
    -----
    Dividing by a zero numpy standard deviation produces NaN rather than
    raising ZeroDivisionError, so constant columns are detected explicitly.
    NOTE(review): |mean - median| cannot exceed one standard deviation, so
    with the default threshold of 3 the marker can only come from the
    zero/undefined-spread case; pass a smaller threshold to flag skewed
    columns.
    """
    marker = 'Skewwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww'
    skew_list = {}
    for feature in numerical_columns:
        column = data[feature]
        spread = column.std()
        if pd.isna(spread) or spread == 0:
            skew_list[feature] = marker
            continue
        skew_value = round(3 * (column.mean() - column.median()) / spread, 2)
        skew_list[feature] = marker if abs(skew_value) > threshold else skew_value
    return skew_list
# Show the skewness report for the capped numeric columns.
find_skew(cfp_data,numerical_columns)

In [None]:
#Finding better transformation 
#def test_transformers(data,columns):
#     count=len(columns)
#     pt = PowerTransformer()
#     qt = QuantileTransformer(n_quantiles=500, output_distribution='normal')
#     fig = plt.figure(figsize=(20,300))
#     j = 1
#     for i in columns:
#         array = np.array(data[i]).reshape(-1, 1)
#         y = pt.fit_transform(array)
#         x = qt.fit_transform(array)
#         plt.subplot(count,3,j)
#         sns.histplot(array, bins = 50, kde = True)
#         plt.title(f"Original Distribution for {i}")
#         plt.subplot(count,3,j+1)
#         sns.histplot(x, bins = 50, kde = True)
#         plt.title(f"Quantile Transform for {i}")
#         plt.subplot(count,3,j+2)
#         sns.histplot(y, bins = 50, kde = True)
#         plt.title(f"Power Transform for {i}")
#         j += 3
# test_transformers(cfp_data,numerical_columns)

In [None]:
# Transforming features - adjusting for skewness.
# Each numeric column is refitted and mapped onto a normal distribution on
# its own. NOTE(review): the fitted transformer is overwritten on every
# iteration and never saved, so the same mapping cannot be re-applied to new
# data later.
qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal')
for i in numerical_columns:
    array = np.array(cfp_data[i]).reshape(-1, 1)
    try:
        # ravel(): fit_transform returns an (n, 1) array; a column is 1-D.
        cfp_data[i] = qt.fit_transform(array).ravel()
    except (ValueError, TypeError) as err:
        # The column keeps its untransformed values; say so instead of
        # hiding the failure.
        print(f"Skipping quantile transform for {i!r}: {err}")
find_skew(cfp_data,numerical_columns)

### Pickling Data

In [None]:
# Persist the cleaned and transformed frame to the working directory.
# The commented line is the Green-extract variant of the same step.
# joblib.dump(cfp_data, 'cfp_clean_data_Green.pkl')
joblib.dump(cfp_data, 'cfp_clean_data_Red.pkl')

In [None]:
# Reload the cleaned frame from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files produced by this notebook.
# cfp_clean_data=joblib.load('cfp_clean_data_Green.pkl')
cfp_clean_data=joblib.load('cfp_clean_data_Red.pkl')

### Removing Unwanted Columns

In [None]:
# The identifier is not a model feature. Re-running this cell raises
# KeyError because the column is already gone.
cfp_clean_data = cfp_clean_data.drop(columns=['user_id'])

In [None]:
# Preview the cleaned frame without the identifier column.
cfp_clean_data.head()

### Segregate Columns for Encoding and Scaling

In [None]:
# Columns kept out of scaling and appended unchanged to the model frame
# (the target).
exemption_features=['cfp_subscribed']
# Identifier column(s) carried alongside the features for later joins.
id_columns=['user_id']

In [None]:
# Every non-numeric column is treated as categorical.
categorical_features = cfp_clean_data.select_dtypes(exclude='number').columns.tolist()
categorical_features

In [None]:
# Numeric columns to scale: all numeric columns minus the exempted ones.
numeric_features = cfp_clean_data.select_dtypes(include='number').columns.tolist()
for exempted_name in exemption_features:
    numeric_features.remove(exempted_name)
numeric_features

In [None]:
'''Encoding categorical features with dummies'''
# One indicator column per category level; the first level of each feature
# is dropped.
categorical_frame = cfp_clean_data[categorical_features]
dummies = pd.get_dummies(categorical_frame, drop_first=True)
dummies.head()

In [None]:
'''Scaling numeric features'''
# Centre each numeric feature and scale it to unit variance. The fitted
# scaler stays available as `standard_scalar`.
standard_scalar = StandardScaler(with_mean=True, with_std=True)
numeric_data_scaled = pd.DataFrame(
    standard_scalar.fit_transform(cfp_clean_data[numeric_features]),
    columns=numeric_features,
)
numeric_data_scaled.head()

In [None]:
'''Concatenating all features'''
# Column order: identifier, scaled numerics, dummy columns, target.
# Indexes are reset so the four pieces line up by row position.
feature_blocks = [
    cfp_data[id_columns],
    numeric_data_scaled,
    dummies,
    cfp_clean_data[exemption_features],
]
processed_dataset = pd.concat(
    [block.reset_index(drop=True) for block in feature_blocks],
    axis=1,
)
processed_dataset.head()

In [None]:
'''Eliminating worst features to increase binding'''
worst_features=['two_wheeler_count', 'citi_neg_flag', 'auto_loan_close_count_last3mnths', 'gold_loan_close_count_last3mnths', 'consumer_loan_close_count_last3mnths', 'housing_loan_close_count_last3mnths', 'credit_card_close_count_last3mnths', 'personal_loan_close_count_last3mnths', 'two_wheeler_close_count_last3mnths', 'secured_close_last3mnths', 'negative_status_close_count_last3mnths', 'joint_account_close_last3mnths', 'max_dpd_on_m9', 'max_dpd_on_m12', 'max_dpd_on_m6', 'salary_account_ALMORA URBAN COOPERATIVE BANK', 'salary_account_ANDHRA BANK', 'salary_account_AP MAHESH', 'salary_account_Abhyudaya co operative bank', 'salary_account_Allahabad bank', 'salary_account_Anand co op Bank', 'salary_account_Andhra Bank', 'salary_account_Andhrabank', 'salary_account_Axis Bank', 'salary_account_BANK OF BARODA', 'salary_account_BANK OF MAHARAHSTRA', 'salary_account_BANK OF MAHARASHTRA', 'salary_account_BANK OF MAHARASTRA', 'salary_account_BBK', 'salary_account_Bank of India', 'salary_account_Bank of india', 'salary_account_By Cash', 'salary_account_By cash', 'salary_account_CITI UNION BANK', 'salary_account_CITY UNION', 'salary_account_CORPORATION', 'salary_account_CORPORATION BANK', 'salary_account_Canara', 'salary_account_Canara Bank', 'salary_account_Canara bank', 'salary_account_Canarabank', 'salary_account_Cash', 'salary_account_Cheque -20000 and cash 5000', 'salary_account_Corp', 'salary_account_Corporation Bank', 'salary_account_Corporation bank', 'salary_account_Cosmos bank', 'salary_account_Dcb bank', 'salary_account_Dena bank', 'salary_account_Dhanlaxmi Bank', 'salary_account_FEDERAL', 'salary_account_Federal', 'salary_account_HSBC', 'salary_account_IDBI', 'salary_account_IDBI BANK', 'salary_account_IDBI Bank', 'salary_account_IDFC', 'salary_account_INDIAN BANK', 'salary_account_INDIAN OVERSEAS BANK', 'salary_account_INDUSIND', 'salary_account_INDUSIND BANK', 'salary_account_ING', 'salary_account_ING VYSYA BANK', 'salary_account_IOB', 'salary_account_Idbi bank', 
'salary_account_Indian Bank', 'salary_account_Indian Overseas Bank', 'salary_account_Indian overseas bank', 'salary_account_Indusind', 'salary_account_Indusind bank', 'salary_account_Ing VYSYA bank', 'salary_account_JANATA SAHKARI BANK LIMITED PUNE', 'salary_account_KARNATAKA BANK', 'salary_account_KARNATAKA BANK LTD', 'salary_account_KARUR VYSYA', 'salary_account_KODAK MAHENDRA', 'salary_account_KOTAK MAHENDRA BANK', 'salary_account_KOTAK MAHINDAR BANK', 'salary_account_Karnataka bank', 'salary_account_Kaylan janta sahakari bank', 'salary_account_Kotak', 'salary_account_Kotak Mahendra Bank', 'salary_account_Kotak Mahindra', 'salary_account_Kotak Mahindra Bank', 'salary_account_Kotak Mahindra bank', 'salary_account_Kotak bank', 'salary_account_Kotak mahendhra', 'salary_account_Kotak mahendra bank', 'salary_account_MAY BANK', 'salary_account_ORIENTAL BANK OF COMMERCE', 'salary_account_Oriental Bank of Commerce', 'salary_account_PARSIK CO-OPERTAIVE BANK', 'salary_account_PAYTM', 'salary_account_PMC BANK', 'salary_account_PUNJAB AND SIND', 'salary_account_PUNJAB NATIONAL BANK', 'salary_account_Panjab national bank', 'salary_account_Punjab National Bank Pune', 'salary_account_Punjab national Bank', 'salary_account_Punjab national bank', 'salary_account_RBS', 'salary_account_RTGS', 'salary_account_Ratnakar bank', 'salary_account_SBH', 'salary_account_SBM', 'salary_account_SCB', 'salary_account_SOUTH INDIAN', 'salary_account_STANCHART', 'salary_account_STANDARD CHARTERED', 'salary_account_STATE BANK OF HYDERABAD', 'salary_account_SYNDICATE BANK', 'salary_account_Saptagiri Grameena Bank', 'salary_account_Saraswat Bank', 'salary_account_Standard Charted', 'salary_account_Standard Charterd Bank', 'salary_account_Standard Chartered', 'salary_account_Standard charter bank', 'salary_account_Standard chartered', 'salary_account_State Bank of Hyderabad', 'salary_account_State Bank of India', 'salary_account_Syndicate Bank', 'salary_account_UBI', 'salary_account_UCO bank', 
'salary_account_UNION BANK OF INDIA', 'salary_account_UNITED BANK OF INDIA', 'salary_account_Union bank', 'salary_account_United bank of india', 'salary_account_VIJAYA', 'salary_account_VIJAYA BANK', 'salary_account_VIJYA BANK', 'salary_account_Vijaya bank', 'salary_account_YES BANK', 'salary_account_Yes bank ltd', 'salary_account_Yes banl', 'salary_account_andha bank', 'salary_account_bank of Maharastra', 'salary_account_bank of baroda', 'salary_account_bank of borada', 'salary_account_bank of maharashtra', 'salary_account_bank transfer', 'salary_account_canara bank', 'salary_account_cash', 'salary_account_cenra bank', 'salary_account_cheque & cash', 'salary_account_corporation bank', 'salary_account_corprotion bank', 'salary_account_cub', 'salary_account_federal bank', 'salary_account_govt bank', 'salary_account_hdfc bank', 'salary_account_idbi bank', 'salary_account_idbibank', 'salary_account_indian bank', 'salary_account_indianbank', 'salary_account_indicate bank', 'salary_account_indusind', 'salary_account_indusind Bank Ltd.', 'salary_account_ing vysya', 'salary_account_j&k', 'salary_account_jan seva cooperative bank', 'salary_account_kOTAK', 'salary_account_karnataka bank', 'salary_account_kaveri brahmin bank', 'salary_account_kotak bank', 'salary_account_kotak mahiendra bank', 'salary_account_kotak mahindra', 'salary_account_kotak mahindra Bank', 'salary_account_kvb', 'salary_account_oriental bank of commerce', 'salary_account_pnb', 'salary_account_punjab national bank', 'salary_account_receiveByCashOrCheque', 'salary_account_saraswat bank', 'salary_account_sarswat bank', 'salary_account_sate bank of hyderabad', 'salary_account_standard charted', 'salary_account_standard chatered', 'salary_account_standart chatered bank', 'salary_account_standred charted bank', 'salary_account_state bank of hyderabad', 'salary_account_state bank of mysore', 'salary_account_state bank of patiala', 'salary_account_svcBANK', 'salary_account_syndicate bank', 
'salary_account_ubi', 'salary_account_uco', 'salary_account_yes','salary_account_ALLAHABAD', 'salary_account_Andhra bank', 'salary_account_BANK OF INDIA', 'salary_account_BOB', 'salary_account_BOI', 'salary_account_CANARA', 'salary_account_CANARA BANK', 'salary_account_CENTRAL BANK OF INDIA', 'salary_account_DBS', 'salary_account_FEDERAL BANK', 'salary_account_Federal Bank', 'salary_account_Federal bank', 'salary_account_Hsbc', 'salary_account_INDIAN', 'salary_account_INDIAN OVERSEAS', 'salary_account_Idbi', 'salary_account_Indian bank', 'salary_account_Indusind Bank', 'salary_account_J&K BANK', 'salary_account_KANARTAKA BANK', 'salary_account_KARNATAKA', 'salary_account_KOTAK', 'salary_account_KOTAK MAHINDRA', 'salary_account_KOTAK MAHINDRA BANK', 'salary_account_KOTAK MAHINDRA bank', 'salary_account_Karnataka bank Ltd', 'salary_account_Kotak mahindra bank', 'salary_account_OBC', 'salary_account_PNB', 'salary_account_PUNJAB NATIONAL', 'salary_account_SBBJ', 'salary_account_SBH, HYDERABAD.', 'salary_account_SBP', 'salary_account_SBT', 'salary_account_Syndicate bank', 'salary_account_The Federal Bank', 'salary_account_UCO BANK', 'salary_account_UNION', 'salary_account_United Bank of India', 'salary_account_Yes Bank', 'salary_account_Yes bank', 'salary_account_abdhra bank', 'salary_account_bank of india', 'salary_account_hsbc', 'salary_account_idbi', 'salary_account_indian overseas bank', 'salary_account_indusind bank', 'salary_account_karnataka', 'salary_account_kotak', 'salary_account_kotak Bank', 'salary_account_kotak mahindra bank', 'salary_account_standard chartered', 'salary_account_union bank', 'salary_account_union bank of india', 'salary_account_yes bank','salary_account_ANDHRA', 'salary_account_Axis', 'salary_account_I receive by cheque', 'salary_account_OBC BANK', 'salary_account_SBI GROUP', 'salary_account_central bank of india']
# Drop every listed feature that is present in the frame.
# errors='ignore' skips names that are absent. The earlier loop sat inside a
# single try/except, so the first absent name ended the loop and silently
# left all later features in place.
processed_dataset.drop(columns=worst_features, errors='ignore', inplace=True)
processed_dataset.head()

In [None]:
# Check that no missing values remain in the assembled model frame.
processed_dataset.isnull().sum()

In [None]:
# Pearson correlation matrix rendered as a heat map, coloured per row.
processed_dataset.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

## Train Test Split

In [None]:
'''  Preparing test and train data '''
# y is the target; X is everything else, still including user_id so the
# identifier travels through the split.
y = processed_dataset['cfp_subscribed']
X = processed_dataset.drop(columns='cfp_subscribed')
train_X_u, test_X_u, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=31
)
# The model itself never sees the identifier.
train_X = train_X_u.drop(columns='user_id')
test_X = test_X_u.drop(columns='user_id')

## Model Building

### Training

##### Initializing Model, Hyper Parameter Tuning and Model Fitting

In [None]:
# Tuning grid: 2 x 3 x 2 x 1 = 12 candidate settings.
gridParams = {'n_estimators': [100, 300],
              'max_features': ['sqrt', 'log2', None],
              'criterion': ['gini', 'entropy'],
              'max_depth': [20]}

# Model object.
random_forest = RandomForestClassifier(random_state=15, warm_start=False)

# Randomised search with 3-fold cross-validation. With the default n_iter
# only a sample of the 12 settings is tried; random_state fixes which ones,
# so a re-run selects the same candidates (the estimator and the train/test
# split are already seeded).
grid = RandomizedSearchCV(estimator=random_forest, param_distributions=gridParams,
                          verbose=4, cv=3, random_state=15)
grid.fit(train_X, train_y)


In [None]:
# Despite its name, `best_features` holds the best estimator found by the
# search (an estimator object), not a list of features.
best_features = grid.best_estimator_
print(best_features)

In [None]:
# Pair every training column with its importance, then display the pairs
# from least to most important.
feature_importances_dict = dict(
    zip(best_features.feature_names_in_, best_features.feature_importances_)
)
dict(sorted(feature_importances_dict.items(), key=lambda pair: pair[1]))


##### Pickling Model

In [None]:
''' Saving model as pickle object '''
# Two artefacts are written: the whole fitted search object and the best
# estimator on its own. The commented lines are the file names used for
# other runs (basic / Green).
# joblib.dump(grid,"clf_model_basic.pkl")
# joblib.dump(best_features,"clf_model_best_features.pkl")
# joblib.dump(grid,"clf_model_basic_Green.pkl")
# joblib.dump(best_features,"clf_model_best_features_Green.pkl")
joblib.dump(grid,"clf_model_basic_Red1.pkl")
joblib.dump(best_features,"clf_model_best_features_Red1.pkl")

### Predicting

##### Predicting the test data

In [None]:
# Predict class probabilities for the hold-out rows.
# `calibrator` is not created anywhere in the visible notebook, so calling
# it directly raises NameError on a fresh run. Use it when the session
# provides one; otherwise fall back, loudly, to the uncalibrated best
# estimator.
# NOTE(review): the decision threshold applied in the next cells was
# presumably tuned on calibrated probabilities - re-check it if the
# fallback is taken.
try:
    probability_model = calibrator
except NameError:
    print("WARNING: `calibrator` is not defined; "
          "scoring with the uncalibrated best estimator instead.")
    probability_model = best_features
test_fit = probability_model.predict_proba(test_X)
# One column per class, labelled 0, 1, ... by position.
y_pred = pd.DataFrame(test_fit)

In [None]:
# Largest predicted probability in the second predict_proba column (label 1).
# y_pred['predicted_value'].value_counts()
y_pred[1].max()

In [None]:
# predicted_y=y_pred['predicted_value'].apply(lambda x: 1.00 if x >= 1 else 0.00)
# Label a row 1 when its probability reaches the 0.0044 cut-off, else 0.
predicted_y = (y_pred[1] >= 0.0044).astype('int64')

### Metrics

##### Testing Accuracy

In [None]:
# Share of hold-out rows whose thresholded prediction matches the target.
accuracy_score(test_y, predicted_y)

In [None]:
# Confusion matrix of the thresholded predictions, labelled with the classes
# seen by the search.
matrix_counts = confusion_matrix(test_y, predicted_y)
cm = ConfusionMatrixDisplay(confusion_matrix=matrix_counts, display_labels=grid.classes_)
cm.plot()
plt.show()

In [None]:
# ROC curve and AUC must be computed from the predicted probabilities.
# Feeding the thresholded 0/1 labels collapses the curve to a single
# operating point and does not give the model's ranking AUC.
false_positive_rate, true_positive_rate, threshold1 = roc_curve(test_y, y_pred[1])
'roc='+str(roc_auc_score(test_y, y_pred[1]))

In [None]:
# Error metrics between the true labels and the thresholded predictions.
mse = mean_squared_error(test_y, predicted_y)
mae = mean_absolute_error(test_y, predicted_y)
r2 = r2_score(test_y, predicted_y)
rmse = np.sqrt(mse)

print("Results of sklearn.metrics:")
for metric_label, metric_value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R-Squared:", r2),
):
    print(metric_label, metric_value)

In [None]:
# Draw the ROC curve with a dashed diagonal and light-grey guide lines.
plt.subplots(1, figsize=(10, 10))
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

### Split to Deciles

In [None]:
# Put each hold-out probability next to its user_id. Both pieces are
# re-indexed from 0 so they pair up by row position.
predicted_scores = y_pred[1].reset_index(drop=True).rename('predicted_value')
scored_user_ids = test_X_u['user_id'].reset_index(drop=True)
y_pred_deciles = pd.concat([predicted_scores, scored_user_ids], axis=1)
y_pred_deciles.head()

In [None]:
# Attach the columns of the backup frame to every scored user, keeping all
# scored rows.
y_pred_deciles = y_pred_deciles.merge(cfp_backup, how='left', on='user_id')
y_pred_deciles.head()

In [None]:
# Rank the scores (ties broken by row order) and cut the ranks into ten
# equal-sized buckets numbered 0-9, then renumber them so the highest scores
# sit in decile 1 and the lowest in decile 10.
score_rank = y_pred_deciles['predicted_value'].rank(method='first')
zero_based_bucket = pd.qcut(score_rank, 10, labels=False)
y_pred_deciles['decile'] = (10 - zero_based_bucket).astype('int64')
y_pred_deciles['decile'].value_counts()

#### Export to CSV

In [None]:
# Write the deciled hold-out rows to the working directory. With the default
# arguments the row index is written as the first, unnamed column.
# y_pred_deciles.to_csv('CFP_deciled_data_Green.csv')
y_pred_deciles.to_csv('CFP_deciled_data_Red.csv')