Importing Libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import warnings

from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
warnings.filterwarnings("ignore")

Read Dataset

In [2]:
df = pd.read_csv('application_data.csv')
df1 = df.copy()
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


Information of dataframe

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB


Preprocessing of Data

In [4]:
df.isnull().sum().sort_values(ascending=False)

COMMONAREA_MEDI             214865
COMMONAREA_AVG              214865
COMMONAREA_MODE             214865
NONLIVINGAPARTMENTS_MODE    213514
NONLIVINGAPARTMENTS_AVG     213514
                             ...  
NAME_HOUSING_TYPE                0
NAME_FAMILY_STATUS               0
NAME_EDUCATION_TYPE              0
NAME_INCOME_TYPE                 0
SK_ID_CURR                       0
Length: 122, dtype: int64

In [66]:
null_counts = df.isnull().sum()

# If 1/3 one third values are null drop the columns 

columns_with_less_than_200000_null = null_counts[null_counts < 200000].index

columns_with_more_than_200000_null = null_counts[null_counts > 200000].index

print("Columns with less than 200000 null values:")
print(columns_with_less_than_200000_null)

print("\nColumns with more than 200000 null values:")
print(columns_with_more_than_200000_null)

Columns with less than 200000 null values:
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=105)

Columns with more than 200000 null values:
Index([], dtype='object')


In [67]:
df = df.drop(columns_with_more_than_200000_null, axis=1)

In [68]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,1,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,0,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,1,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,100007,0,0,1,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
df.isnull().sum().sort_values(ascending=False)


SK_ID_CURR                     0
BASEMENTAREA_MODE              0
DEF_60_CNT_SOCIAL_CIRCLE       0
OBS_60_CNT_SOCIAL_CIRCLE       0
DEF_30_CNT_SOCIAL_CIRCLE       0
                              ..
HOUR_APPR_PROCESS_START        0
WEEKDAY_APPR_PROCESS_START     0
REGION_RATING_CLIENT_W_CITY    0
REGION_RATING_CLIENT           0
AMT_REQ_CREDIT_BUREAU_YEAR     0
Length: 105, dtype: int64

Fill the null values 

In [70]:
for i in df.columns:
    if df[i].dtypes == 'object':
        df[i].fillna(df[i].mode()[0], inplace=True)
    else:
        df[i].fillna(df[i].median(), inplace=True)
print(df)

        SK_ID_CURR  TARGET  NAME_CONTRACT_TYPE  CODE_GENDER  FLAG_OWN_CAR  \
0           100002       1                   0            1             0   
1           100003       0                   0            0             0   
2           100004       0                   1            1             1   
3           100006       0                   0            0             0   
4           100007       0                   0            1             0   
...            ...     ...                 ...          ...           ...   
307506      456251       0                   0            1             0   
307507      456252       0                   0            0             0   
307508      456253       0                   0            0             0   
307509      456254       1                   0            0             0   
307510      456255       0                   0            0             0   

        FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  \
0   

In [71]:
df.isnull().sum()

SK_ID_CURR                    0
TARGET                        0
NAME_CONTRACT_TYPE            0
CODE_GENDER                   0
FLAG_OWN_CAR                  0
                             ..
AMT_REQ_CREDIT_BUREAU_DAY     0
AMT_REQ_CREDIT_BUREAU_WEEK    0
AMT_REQ_CREDIT_BUREAU_MON     0
AMT_REQ_CREDIT_BUREAU_QRT     0
AMT_REQ_CREDIT_BUREAU_YEAR    0
Length: 105, dtype: int64

Observing the Dataframe and columns

In [72]:
numerical= df.select_dtypes('number').columns

categorical = df.select_dtypes('object').columns

print(f'Numerical Columns:  {df[numerical].columns}')
print('\n')
print(f'Categorical Columns: {df[categorical].columns}')

Numerical Columns:  Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=105)


Categorical Columns: Index([], dtype='object')


Encoding the values using LabelEncoder() by Sk learn

In [73]:
# Import label encoder
from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'species'.
for name in df[categorical].columns:
    
    # # df[name]= label_encoder.fit(df[name])
    # df[name]= label_encoder.fit_transform(df[name])
    # print(df[name])


    LE = label_encoder.fit(df[name])
    # print(f'{name}.pickel')
    # print('\n')

    with open(name+'LE.pickle', 'wb') as f:
        pickle.dump(LE, f)
    
    df[name]= LE.transform(df[name])
    print(df[name])

In [74]:
df.select_dtypes('int').describe()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,278180.518577,0.080729,0.095213,0.341669,0.340108,0.693673,0.417052,5.22299,4.670288,3.188273,...,7e-06,0.003525,0.002936,0.00121,0.009928,0.000267,0.00813,0.000595,0.000507,0.000335
std,102790.175348,0.272419,0.293509,0.474297,0.473746,0.460968,0.722121,1.772626,2.544525,1.298753,...,0.00255,0.059268,0.05411,0.03476,0.099144,0.016327,0.089798,0.024387,0.022518,0.018299
min,100002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,7.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,367142.5,0.0,0.0,1.0,1.0,1.0,1.0,6.0,7.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,456255.0,1.0,1.0,2.0,1.0,1.0,19.0,6.0,7.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [75]:
def color_(value):
    if value < 0 :
        color = 'red'
    elif value == 1 :
        color = 'blue'
    else:
        color = 'green'
    return 'color: %s' % color
df.select_dtypes('int').corr().style.applymap(color_)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
SK_ID_CURR,1.0,-0.002108,0.001654,-0.000769,0.001216,0.000703,-0.001129,0.000954,-0.002112,-0.000529,0.002846,0.002191,-0.0015,0.001366,-0.000384,0.002804,-0.001337,-0.000415,0.002815,0.002753,0.000281,0.001439,-0.001075,-0.001138,0.002185,0.00035,-0.000283,0.001097,0.002903,-0.001885,-0.001582,6.7e-05,0.002269,0.001138,-0.001808,0.002549,0.0007,-0.003411,-0.004139,-0.001097,0.002121,-0.002694,0.001809,0.001505,-0.000815,-0.002012,-0.001045,0.000896,-0.001077,0.002604,-0.000724,0.00145,0.000509,0.000167,0.001073,0.000282
TARGET,-0.002108,1.0,-0.030896,0.054692,-0.021851,-0.006148,0.019187,0.008319,0.046829,0.054699,-0.004127,0.034489,0.078239,-0.044932,0.051457,0.000534,0.045982,0.028524,0.00037,-0.023806,-0.001758,0.017357,0.058899,0.060893,0.004002,-0.024166,0.005576,0.006942,0.002819,0.044395,0.050994,0.032518,-0.030765,0.003445,0.003654,0.004829,0.005417,0.044346,-0.002672,-0.000316,-0.028602,-0.00152,-0.00804,-0.004352,-0.001414,-0.004229,-0.000756,-0.011583,-0.009464,-0.006536,-0.011615,-0.003378,-0.007952,-0.001358,0.000215,0.003709
NAME_CONTRACT_TYPE,0.001654,-0.030896,1.0,-0.008613,0.004022,0.067177,0.029998,0.012007,0.005428,-0.065086,0.011272,0.02419,0.086364,-0.05445,0.052847,0.000585,0.054938,-0.033647,-0.097116,-0.021378,-0.010811,0.001657,-0.021593,-0.023116,-0.006925,0.035897,0.018561,0.015774,0.007941,0.013586,0.004487,-0.004999,-0.040187,3.7e-05,-0.001333,-0.000339,-0.002109,-0.479724,0.004448,0.029372,-0.098808,0.042703,-0.084961,-0.006592,0.014708,0.025995,-0.000827,-0.0049,5e-06,-0.007783,-0.023433,-0.000548,-0.00753,-0.00519,-0.007308,0.055821
CODE_GENDER,-0.000769,0.054692,-0.008613,1.0,0.345833,-0.04434,0.047403,0.029996,0.065089,0.009554,-0.099713,0.058767,0.14808,-0.156018,0.00013,-0.002503,0.157253,0.03403,-0.006017,-0.019429,0.018218,-0.038149,-0.017862,-0.017263,-0.000271,0.007177,0.023683,0.103437,0.10606,0.048237,0.137541,0.133239,-0.163879,-0.004761,-0.005906,-0.005174,-0.00152,-0.087338,-0.002693,0.003255,-0.100766,-0.002059,0.249767,-0.00807,0.000874,0.00483,0.000851,0.040447,0.000947,0.020888,6.1e-05,0.003772,0.022205,0.004632,0.003258,0.023908
FLAG_OWN_CAR,0.001216,-0.021851,0.004022,0.345833,1.0,-0.002817,0.102023,0.004256,0.033186,-0.093977,-0.131014,-0.004041,0.129879,-0.154441,0.013661,-0.002512,0.154659,0.011471,-0.006644,-0.007588,0.032105,-0.04372,-0.022668,-0.021405,0.002346,0.014369,-0.000668,0.038937,0.046352,0.003123,0.076081,0.088061,-0.114162,-0.007063,-0.012,-0.007584,-0.000445,-0.07,-0.001144,-0.013661,-0.106012,0.000958,0.228368,-0.011069,0.000891,-0.000456,0.003552,0.080647,0.00341,0.045909,0.001984,-0.002476,-9.7e-05,0.002184,0.007299,0.006741
FLAG_OWN_REALTY,0.000703,-0.006148,0.067177,-0.04434,-0.002817,1.0,-0.002366,-0.050326,-0.026539,0.02213,0.000521,-0.199722,-0.119146,0.069677,0.006664,-0.001198,-0.070021,-0.114043,0.008526,-0.041507,0.029247,0.010022,0.001289,0.001884,-0.003414,-0.102687,-0.036301,-0.032316,-0.018324,-0.061709,-0.061889,-0.035991,0.050389,-0.008986,0.002159,-0.008535,0.003236,-0.036839,0.00208,-0.011922,0.041567,0.003093,-0.035077,-0.003513,0.003171,-0.035201,-0.001071,-0.057484,-0.054678,-0.035729,-0.092272,-0.014207,-0.087687,-0.019364,-0.02513,-0.000173
CNT_CHILDREN,-0.001129,0.019187,0.029998,0.047403,0.102023,-0.002366,1.0,-0.013277,0.102547,-0.034128,-0.165475,0.021737,0.330938,-0.239818,-0.028019,0.001041,0.240714,0.05563,-0.000794,-0.029906,0.022619,-0.013482,0.025423,0.024781,-0.003419,-0.007292,-0.013319,0.008185,0.014835,0.020072,0.07065,0.069957,-0.144416,-0.002489,-0.003963,0.004525,0.001786,0.056837,-0.003709,-0.016737,-0.157024,-0.001498,0.051697,-0.001997,-0.002756,-0.005318,0.000293,0.003945,-0.005459,0.003609,0.010662,0.000773,0.004031,0.000864,0.000988,-0.00245
NAME_TYPE_SUITE,0.000954,0.008319,0.012007,0.029996,0.004256,-0.050326,-0.013277,1.0,0.007355,-0.036648,0.037044,0.032243,0.036691,-0.039598,0.013258,-0.00079,0.03986,0.02013,-0.005398,0.014181,0.010252,-0.003709,-0.004381,-0.005931,0.008118,-0.02717,0.019175,0.010042,0.002173,0.024278,0.011572,-0.001445,-0.029741,0.000913,0.00096,-0.001505,0.001439,-0.013575,-0.001948,0.004684,-0.01939,0.002761,0.020893,0.001704,0.001707,0.027,0.000399,0.001928,0.007481,0.00053,-0.000514,0.002777,0.013617,-0.000963,0.002462,0.003312
NAME_INCOME_TYPE,-0.002112,0.046829,0.005428,0.065089,0.033186,-0.026539,0.102547,0.007355,1.0,0.088298,-0.057381,0.035824,0.189904,-0.307731,0.074256,-0.001651,0.307389,0.137629,0.008401,-0.009941,-0.02556,-0.004418,0.130969,0.1214,-0.001877,-0.029674,-0.041006,-0.021898,-0.009141,0.010664,0.102258,0.102862,-0.176226,0.003719,0.021593,0.011214,0.001236,0.132288,-0.001808,0.008018,-0.182297,-0.003649,-0.012786,-0.014669,0.00169,-0.047771,-0.002175,-0.019269,-0.023838,-0.002072,-0.005393,-0.003676,-0.008464,-3.5e-05,-0.000543,0.003769
NAME_EDUCATION_TYPE,-0.000529,0.054699,-0.065086,0.009554,-0.093977,0.02213,-0.034128,-0.036648,0.088298,1.0,0.007279,-0.01707,-0.151752,0.114975,-0.041064,0.00165,-0.115273,-0.007286,0.021119,-0.030929,-0.086165,0.111235,0.068567,0.07161,-0.003683,-0.089428,-0.041738,-0.043688,-0.032684,-0.019046,0.018578,0.031023,0.092907,0.005326,0.005898,0.016284,0.000983,0.060846,0.000359,-0.009175,0.074893,-0.004178,-0.083166,-0.031333,-0.000167,-0.035478,-0.001352,-0.037012,-0.047987,-0.019452,-0.024922,-0.007275,-0.033619,-0.004975,-0.007603,0.000357


Correlation Matrix between columns and TARGET 

In [76]:
# Calculate the correlation matrix between columns and TARGET columns
correlation_matrix = df.corr()
correlation_with_target = correlation_matrix['TARGET'].abs().sort_values(ascending=False)



Selecting top 20 columns

In [54]:
# Get the 20 columns with the highest correlation including the TARGET column
top_20_correlated_columns = correlation_with_target[:20].index

# Create a new DataFrame from this column list
df_top_20_correlated = df[top_20_correlated_columns]

In [55]:
df_top_20_correlated.shape

(307511, 20)

In [56]:
df1 = df_top_20_correlated

In [57]:
df1.shape

(307511, 20)

In [58]:
df1.head()

Unnamed: 0,TARGET,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_1,DAYS_BIRTH,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_LAST_PHONE_CHANGE,NAME_EDUCATION_TYPE,CODE_GENDER,DAYS_ID_PUBLISH,REG_CITY_NOT_WORK_CITY,NAME_INCOME_TYPE,FLAG_EMP_PHONE,DAYS_EMPLOYED,REG_CITY_NOT_LIVE_CITY,FLAG_DOCUMENT_3,DAYS_REGISTRATION,AMT_GOODS_PRICE,FLOORSMAX_AVG
0,1,0.262949,0.139376,0.083037,-9461,2,2,-1134.0,4,1,-2120,0,7,1,-637,0,1,-3648.0,351000.0,0.0833
1,0,0.622246,0.535276,0.311267,-16765,1,1,-828.0,1,0,-291,0,4,1,-1188,0,1,-1186.0,1129500.0,0.2917
2,0,0.555912,0.729567,0.505998,-19046,2,2,-815.0,4,1,-2531,0,7,1,-225,0,0,-4260.0,135000.0,0.1667
3,0,0.650442,0.535276,0.505998,-19005,2,2,-617.0,4,0,-2437,0,7,1,-3039,0,1,-9833.0,297000.0,0.1667
4,0,0.322738,0.535276,0.505998,-19932,2,2,-1106.0,4,1,-3458,1,7,1,-3038,0,0,-4311.0,513000.0,0.1667


In [59]:
df1.head()

Unnamed: 0,TARGET,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_1,DAYS_BIRTH,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_LAST_PHONE_CHANGE,NAME_EDUCATION_TYPE,CODE_GENDER,DAYS_ID_PUBLISH,REG_CITY_NOT_WORK_CITY,NAME_INCOME_TYPE,FLAG_EMP_PHONE,DAYS_EMPLOYED,REG_CITY_NOT_LIVE_CITY,FLAG_DOCUMENT_3,DAYS_REGISTRATION,AMT_GOODS_PRICE,FLOORSMAX_AVG
0,1,0.262949,0.139376,0.083037,-9461,2,2,-1134.0,4,1,-2120,0,7,1,-637,0,1,-3648.0,351000.0,0.0833
1,0,0.622246,0.535276,0.311267,-16765,1,1,-828.0,1,0,-291,0,4,1,-1188,0,1,-1186.0,1129500.0,0.2917
2,0,0.555912,0.729567,0.505998,-19046,2,2,-815.0,4,1,-2531,0,7,1,-225,0,0,-4260.0,135000.0,0.1667
3,0,0.650442,0.535276,0.505998,-19005,2,2,-617.0,4,0,-2437,0,7,1,-3039,0,1,-9833.0,297000.0,0.1667
4,0,0.322738,0.535276,0.505998,-19932,2,2,-1106.0,4,1,-3458,1,7,1,-3038,0,0,-4311.0,513000.0,0.1667


In [60]:
desired_order = ['TARGET', 'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT',
                 'REG_CITY_NOT_WORK_CITY', 'NAME_EDUCATION_TYPE', 'CODE_GENDER',
                 'NAME_INCOME_TYPE', 'FLAG_EMP_PHONE', 'REG_CITY_NOT_LIVE_CITY',
                 'FLAG_DOCUMENT_3', 'NAME_HOUSING_TYPE', 'LIVE_CITY_NOT_WORK_CITY',
                 'NAME_CONTRACT_TYPE', 'ORGANIZATION_TYPE', 'REGION_POPULATION_RELATIVE',
                 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH',
                 'DAYS_LAST_PHONE_CHANGE', 'DAYS_BIRTH']



df1 = df1.reindex(columns=desired_order)

Normalizing the dataframe

In [61]:
# Normalize function according to Min-Max Scaling method
def min_max_scaling(column):
    return (column - column.min()) / (column.max() - column.min())

# Apply the normalize function to each column in the DataFrame
df_normalized = df1.apply(min_max_scaling)

# View data after normalization
print(df_normalized.head())

   TARGET  REGION_RATING_CLIENT_W_CITY  REGION_RATING_CLIENT  \
0     1.0                          0.5                   0.5   
1     0.0                          0.0                   0.0   
2     0.0                          0.5                   0.5   
3     0.0                          0.5                   0.5   
4     0.0                          0.5                   0.5   

   REG_CITY_NOT_WORK_CITY  NAME_EDUCATION_TYPE  CODE_GENDER  NAME_INCOME_TYPE  \
0                     0.0                 1.00          0.5          1.000000   
1                     0.0                 0.25          0.0          0.571429   
2                     0.0                 1.00          0.5          1.000000   
3                     0.0                 1.00          0.0          1.000000   
4                     1.0                 1.00          0.5          1.000000   

   FLAG_EMP_PHONE  REG_CITY_NOT_LIVE_CITY  FLAG_DOCUMENT_3  NAME_HOUSING_TYPE  \
0             1.0                     0.0      

In [62]:
df1.select_dtypes('int').cov().style.applymap(color_)

Unnamed: 0,TARGET,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,REG_CITY_NOT_WORK_CITY,NAME_EDUCATION_TYPE,CODE_GENDER,NAME_INCOME_TYPE,FLAG_EMP_PHONE,REG_CITY_NOT_LIVE_CITY,FLAG_DOCUMENT_3,DAYS_EMPLOYED,DAYS_ID_PUBLISH,DAYS_BIRTH
TARGET,0.074212,0.00834,0.008168,0.00585,0.019353,0.007067,0.032461,0.004814,0.003247,0.005482,-1729.246843,21.159315,93.013383
REGION_RATING_CLIENT_W_CITY,0.00834,0.252745,0.24333,0.005945,0.046756,-0.004116,0.155299,-0.006706,0.006163,0.015303,2459.141436,-5.871592,17.712735
REGION_RATING_CLIENT,0.008168,0.24333,0.259116,0.001792,0.045331,-0.004313,0.169638,-0.00643,0.004884,0.014442,2355.208443,-3.921112,20.793917
REG_CITY_NOT_WORK_CITY,0.00585,0.005945,0.001792,0.177345,0.010161,0.027472,0.109576,0.041498,0.049787,0.010753,-15144.888706,63.156146,445.480226
NAME_EDUCATION_TYPE,0.019353,0.046756,0.045331,0.010161,1.686761,0.005885,0.291799,-0.057531,-0.00664,0.035857,21095.884041,-80.502161,-860.094266
CODE_GENDER,0.007067,-0.004116,-0.004313,0.027472,0.005885,0.224958,0.078553,0.028661,0.006142,-0.018796,-10454.244961,0.093027,306.500272
NAME_INCOME_TYPE,0.032461,0.155299,0.169638,0.109576,0.291799,0.078553,6.474607,0.300568,0.007284,0.152737,-110623.07561,285.203396,2108.746158
FLAG_EMP_PHONE,0.004814,-0.006706,-0.00643,0.041498,-0.057531,0.028661,0.300568,0.147671,0.009508,0.043336,-54276.179362,158.708853,1039.548207
REG_CITY_NOT_LIVE_CITY,0.003247,0.006163,0.004884,0.049787,-0.00664,0.006142,0.007284,0.009508,0.072062,0.00038,-3432.789043,30.927488,211.315113
FLAG_DOCUMENT_3,0.005482,0.015303,0.014442,0.010753,0.035857,-0.018796,0.152737,0.043336,0.00038,0.205891,-15967.159919,34.457608,217.158095


In [63]:
def color_(value):
    if value < 0 :
        color = 'red'
    elif value == 1 :
        color = 'blue'
    else:
        color = 'green'
    return 'color: %s' % color
df1.select_dtypes('int').corr().style.applymap(color_)

Unnamed: 0,TARGET,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,REG_CITY_NOT_WORK_CITY,NAME_EDUCATION_TYPE,CODE_GENDER,NAME_INCOME_TYPE,FLAG_EMP_PHONE,REG_CITY_NOT_LIVE_CITY,FLAG_DOCUMENT_3,DAYS_EMPLOYED,DAYS_ID_PUBLISH,DAYS_BIRTH
TARGET,1.0,0.060893,0.058899,0.050994,0.054699,0.054692,0.046829,0.045982,0.044395,0.044346,-0.044932,0.051457,0.078239
REGION_RATING_CLIENT_W_CITY,0.060893,1.0,0.950842,0.028081,0.07161,-0.017263,0.1214,-0.034712,0.045669,0.067083,0.034624,-0.007737,0.008073
REGION_RATING_CLIENT,0.058899,0.950842,1.0,0.00836,0.068567,-0.017862,0.130969,-0.032871,0.035741,0.062528,0.03275,-0.005103,0.009361
REG_CITY_NOT_WORK_CITY,0.050994,0.028081,0.00836,1.0,0.018578,0.137541,0.102258,0.256427,0.440409,0.056275,-0.254559,0.099354,0.242401
NAME_EDUCATION_TYPE,0.054699,0.07161,0.068567,0.018578,1.0,0.009554,0.088298,-0.115273,-0.019046,0.060846,0.114975,-0.041064,-0.151752
CODE_GENDER,0.054692,-0.017263,-0.017862,0.137541,0.009554,1.0,0.065089,0.157253,0.048237,-0.087338,-0.156018,0.00013,0.14808
NAME_INCOME_TYPE,0.046829,0.1214,0.130969,0.102258,0.088298,0.065089,1.0,0.307389,0.010664,0.132288,-0.307731,0.074256,0.189904
FLAG_EMP_PHONE,0.045982,-0.034712,-0.032871,0.256427,-0.115273,0.157253,0.307389,1.0,0.092166,0.248534,-0.999755,0.273611,0.619888
REG_CITY_NOT_LIVE_CITY,0.044395,0.045669,0.035741,0.440409,-0.019046,0.048237,0.010664,0.092166,1.0,0.003117,-0.090516,0.076326,0.180382
FLAG_DOCUMENT_3,0.044346,0.067083,0.062528,0.056275,0.060846,-0.087338,0.132288,0.248534,0.003117,1.0,-0.249082,0.050309,0.109666


In [64]:
X = df1.drop(['TARGET'],axis = 1)
target = df1['TARGET']
X_train, X_test, Y_train, Y_test = train_test_split(X, target, test_size= 0.2, random_state = 42)

In [65]:
count_ones = sum(Y_test == 1)
print(count_ones)

4949


In [28]:
count_zero = sum(Y_test == 0)
print(count_zero)

56554


In [29]:
count_one = sum(Y_test == 1)
print(count_one)

4949


In [30]:
print(Y_test)

245895    0
98194     0
36463     0
249923    0
158389    0
         ..
256564    0
278889    0
221828    0
190245    0
253004    0
Name: TARGET, Length: 61503, dtype: int64


In [31]:
df1.shape

(307511, 20)

Sampling to overcome the unbalance nature of the dataset

In [32]:
print('before Oversampling:',Counter(Y_train))
oversample = RandomOverSampler(sampling_strategy='minority')
X_train1, Y_train1 = oversample.fit_resample(X_train, Y_train)
print('After Oversampling:',Counter(Y_train1))

before Oversampling: Counter({0: 226132, 1: 19876})
After Oversampling: Counter({0: 226132, 1: 226132})


Train the model

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Make sure df1 and df are your DataFrame
X = df1.drop(['TARGET'], axis=1)
target = df1['TARGET']
X_train, X_test, Y_train, Y_test = train_test_split(X, target, test_size=0.3, random_state=0)
X_train_o, X_test_o, Y_train_o, Y_test_o = X_train, X_test, Y_train, Y_test 



# Initialize and train the Random Forest model
model_rf = RandomForestClassifier(max_depth=None, max_features='sqrt',min_samples_leaf=1,n_estimators=500)
model_rf.fit(X_train1, Y_train1)

# We have used grid search for hyper parameter tuning
# max_depth=None, max_features='sqrt',min_samples_leaf=1,n_estimators=500


Model Testing and Metrics

In [42]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predict the probability on the test set
proba = model_rf.predict_proba(X_test)[:, 1]

# Set threshold
threshold = 0.3

# Convert probability to label prediction based on threshold
Y_pred = (proba > threshold).astype(int)


# Calculate measurements
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)


# Print measurements
print("Accuracy: {:.2f}%, Precision: {:.2f}%, Recall: {:.2f}%,f1: {:.2f}%".format(
    accuracy * 100, precision * 100, recall * 100,f1 *100))


# Confusion matrix
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(cm)



Accuracy: 98.37%, Precision: 98.98%, Recall: 80.31%,f1: 88.67%
Confusion Matrix:
[[84853    61]
 [ 1445  5895]]


To find one target data instance for default prediction

In [35]:
ones=[Y_pred==1]
print(ones)

[array([ True, False, False, ..., False, False, False])]


In [36]:
df_copy = pd.read_csv('application_data.csv')


In [37]:
# df_copy = df_copy[df_copy.index==42962]

In [38]:
# preddf=pd.DataFrame(df_copy,columns=columns_of_interest)

In [39]:
# preddf

Top 10 important by order of importance

In [40]:
# Print out the 10 most important properties of the Random Forest model
feature_importances = model_rf.feature_importances_

# Sort by importance in descending order
sorted_idx = feature_importances.argsort()[::-1]

# Select the 10 most important attributes
top_features = X.columns[sorted_idx][:10]

# Print out the results
print("Top 10 Important Features:")
for i, feature in enumerate(top_features, 1):
    print(f"{i}. {feature}: {feature_importances[sorted_idx[i-1]]:.4f}")


Top 10 Important Features:
1. DAYS_BIRTH: 0.1930
2. DAYS_ID_PUBLISH: 0.1837
3. DAYS_REGISTRATION: 0.1830
4. DAYS_LAST_PHONE_CHANGE: 0.1703
5. DAYS_EMPLOYED: 0.1594
6. NAME_EDUCATION_TYPE: 0.0235
7. NAME_INCOME_TYPE: 0.0183
8. FLAG_DOCUMENT_3: 0.0129
9. CODE_GENDER: 0.0128
10. REGION_RATING_CLIENT_W_CITY: 0.0114


Pickel model Dump

In [41]:
# Commented out the pick dump code as model is already loaded

# import pickle

# # Assuming you have a trained RandomForestClassifier model named 'model_rf'
# # Replace 'model_rf' with the name of your actual trained model

# # Save the trained model to a file
# with open('trained_model.pkl', 'wb') as file:
#     pickle.dump(model_rf, file)
