In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Loading and Cleaning

In [3]:
#Load data
df=pd.read_csv(r'C:\Users\Rajakanthan\Documents\Datastorm/credit_card_default_train.csv')

In [4]:
df.head()

Unnamed: 0,Client_ID,Balance_Limit_V1,Gender,EDUCATION_STATUS,MARITAL_STATUS,AGE,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,...,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,Average Paid,Average Due,NEXT_MONTH_DEFAULT,Total Paid,Total Due,Unnamed: 29
0,A2,1M,M,Graduate,Other,31-45,-1,-1,-1,-1,...,39418,162772,0,538165,149271.8,33474.833333,0,749796,200849,-548947.0
1,A3,1M,F,High School,Other,Less than 30,0,-1,-1,-1,...,43530,80811,942,33666,41029.8,130174.666667,0,356967,781048,424081.0
2,A4,100K,F,High School,Single,31-45,4,3,2,2,...,0,0,0,0,0.0,8040.0,1,0,48240,48240.0
3,A5,200K,F,Graduate,Single,31-45,2,0,0,0,...,3696,4620,4049,3918,4034.6,96483.333333,1,24028,578900,554872.0
4,A6,1M,F,Graduate,Other,31-45,2,2,0,0,...,16170,17325,16401,17325,17602.2,435555.166667,0,88011,2613331,2525320.0


In [5]:
#Check for null values
df.isnull().sum()

Client_ID                 0
Balance_Limit_V1          0
Gender                    0
EDUCATION_STATUS          0
MARITAL_STATUS            0
AGE                       0
PAY_JULY                  0
PAY_AUG                   0
PAY_SEP                   0
PAY_OCT                   0
PAY_NOV                   0
PAY_DEC                   0
DUE_AMT_JULY              0
DUE_AMT_AUG               0
DUE_AMT_SEP               0
DUE_AMT_OCT               0
DUE_AMT_NOV               0
DUE_AMT_DEC               0
PAID_AMT_JULY             0
PAID_AMT_AUG              0
PAID_AMT_SEP              0
PAID_AMT_OCT              0
PAID_AMT_NOV              0
PAID_AMT_DEC              0
Average Paid              0
Average Due               0
NEXT_MONTH_DEFAULT        0
Total Paid                0
Total Due                 0
Unnamed: 29           23909
dtype: int64

In [6]:
#changing credit limit to numeric
df['balance_unclean']=(df['Balance_Limit_V1'].str.strip().str.findall(r'[0-9]*')).apply(lambda x:''.join(x))
df['balance_thousand']=df['Balance_Limit_V1'].str.strip().str.replace(r'[0-9.]*','')
df['balance_limit_clean']=0
df.loc[df['balance_thousand']=='K','balance_limit_clean']=df['balance_unclean'].astype(int)*1000
df.loc[df['balance_thousand']=='M','balance_limit_clean']=df['balance_unclean'].astype(int)*100000

df=df.drop(['balance_thousand','balance_unclean'],axis=1)

In [7]:
#Change Age to more understandable categories
print(df.AGE.unique())
df['Age_int']=df['AGE']
df['Age_int']=df['Age_int'].str.replace('Less than 30','24')
df['Age_int']=df['Age_int'].str.replace('31-45','38')
df['Age_int']=df['Age_int'].str.replace('46-65','55')
df['Age_int']=df['Age_int'].str.replace('More than 65','75')
df['Age_int']=df['Age_int'].astype(int)

['31-45' 'Less than 30' '46-65' 'More than 65']


In [8]:
#Check late payment values
print(df.PAY_JULY.value_counts())


 0    11788
-1     4534
 1     2956
-2     2205
 2     2140
 3      258
 4       66
 5       23
 8       14
 6       10
 7        6
Name: PAY_JULY, dtype: int64


In [9]:
#Compute Average lateness
df['Avg_Late']=df[['PAY_JULY','PAY_AUG','PAY_SEP','PAY_OCT','PAY_NOV','PAY_DEC']].mean(axis=1)
df['Avg_Late'].head()

0   -1.000000
1   -0.666667
2    1.166667
3    0.333333
4    0.666667
Name: Avg_Late, dtype: float64

In [10]:
#Different between due amount and paid amount
df['July_Excess']=df['PAID_AMT_JULY']-df['DUE_AMT_JULY']
df['Aug_Excess']=df['PAID_AMT_AUG']-df['DUE_AMT_AUG']
df['Sep_Excess']=df['PAID_AMT_SEP']-df['DUE_AMT_SEP']
df['Oct_Excess']=df['PAID_AMT_OCT']-df['DUE_AMT_OCT']
df['Nov_Excess']=df['PAID_AMT_NOV']-df['DUE_AMT_NOV']
df['Dec_Excess']=df['PAID_AMT_DEC']-df['DUE_AMT_DEC']


In [11]:
#Extract cleaned columns
df_clean=df[['Gender','EDUCATION_STATUS','MARITAL_STATUS','balance_limit_clean','Age_int'
             ,'Avg_Late','PAID_AMT_JULY','PAID_AMT_AUG','PAID_AMT_SEP','PAID_AMT_OCT','PAID_AMT_NOV','PAID_AMT_DEC',
            'DUE_AMT_JULY','DUE_AMT_AUG','DUE_AMT_SEP','DUE_AMT_OCT','DUE_AMT_NOV','DUE_AMT_DEC','July_Excess', 'Aug_Excess', 'Sep_Excess',
           'Oct_Excess', 'Nov_Excess', 'Dec_Excess','NEXT_MONTH_DEFAULT']]

# Data preprocessing

In [12]:
#get dummies
df_dummy=pd.get_dummies(df_clean)

In [13]:
df_dummy.head()

Unnamed: 0,balance_limit_clean,Age_int,Avg_Late,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,DUE_AMT_JULY,...,Nov_Excess,Dec_Excess,NEXT_MONTH_DEFAULT,Gender_F,Gender_M,EDUCATION_STATUS_Graduate,EDUCATION_STATUS_High School,EDUCATION_STATUS_Other,MARITAL_STATUS_Other,MARITAL_STATUS_Single
0,100000,38,-1.0,3437,6004,39418,162772,0,538165,3248,...,-162772,552147,0,0,1,1,0,0,1,0
1,100000,24,-0.666667,151818,46200,43530,80811,942,33666,353351,...,-79869,-90924,0,1,0,0,1,0,1,0
2,100000,38,1.166667,0,0,0,0,0,0,16681,...,0,0,1,1,0,0,1,0,0,1
3,200000,38,0.333333,3855,3890,3696,4620,4049,3918,90457,...,-96304,-98822,1,1,0,1,0,0,0,1
4,100000,38,0.666667,0,20790,16170,17325,16401,17325,429556,...,-428870,-436574,0,1,0,1,0,0,1,0


In [14]:
#split dependant and independant variables
y=df_dummy['NEXT_MONTH_DEFAULT']
X=df_dummy.drop('NEXT_MONTH_DEFAULT',axis=1)
variable_y='NEXT_MONTH_DEFAULT'
variable_x=X.columns

In [15]:
X.head()

Unnamed: 0,balance_limit_clean,Age_int,Avg_Late,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,DUE_AMT_JULY,...,Oct_Excess,Nov_Excess,Dec_Excess,Gender_F,Gender_M,EDUCATION_STATUS_Graduate,EDUCATION_STATUS_High School,EDUCATION_STATUS_Other,MARITAL_STATUS_Other,MARITAL_STATUS_Single
0,100000,38,-1.0,3437,6004,39418,162772,0,538165,3248,...,123354,-162772,552147,0,1,1,0,0,1,0
1,100000,24,-0.666667,151818,46200,43530,80811,942,33666,353351,...,37281,-79869,-90924,1,0,0,1,0,1,0
2,100000,38,1.166667,0,0,0,0,0,0,16681,...,0,0,0,1,0,0,1,0,0,1
3,200000,38,0.333333,3855,3890,3696,4620,4049,3918,90457,...,-92689,-96304,-98822,1,0,1,0,0,0,1
4,100000,38,0.666667,0,20790,16170,17325,16401,17325,429556,...,-418029,-428870,-436574,1,0,1,0,0,1,0


In [16]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Fit Random Forest Model

In [None]:

from sklearn.ensemble import RandomForestClassifier
  
 # create regressor object 
rf_clf = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=None, 
                                min_samples_split=6, min_samples_leaf=3, min_weight_fraction_leaf=0.0, 
                                max_features='auto') 
  
# fit the regressor with x and y data 
rf_clf.fit(X_train, y_train)   

In [None]:
#predict test values
y_pred = rf_clf.predict(X_test)

In [None]:
#get classification report
print(metrics.classification_report(y_test, y_pred))

In [None]:
#get feature importance
tmp = pd.DataFrame({'Feature': variable_x, 'Feature importance': clf.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (16,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()
