In [1]:
!pip install pycaret

import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as mso
import seaborn as sns
import pycaret

from pycaret.classification import *

# Install a global 'ignore' filter so no Python warnings are displayed from here on.
# NOTE(review): this also hides genuine warnings (e.g. deprecations) raised by later cells.
warnings.filterwarnings('ignore')

# Data Loading

In [2]:
# Read the application records CSV from the Kaggle input directory into app_df.
app_df = pd.read_csv('/kaggle/input/credit-card-approval-prediction/application_record.csv')

In [3]:
app_df.head()

### Exploration

In [4]:
app_df.info()

In [5]:
app_df = app_df.sort_values('ID')
app_df

### The number of non-duplicate rows is less than the whole data (some records are duplicated)

In [6]:
app_df.drop_duplicates(subset=['ID']).count()

In [7]:
# Keep a single row per ID; when an ID repeats, the last occurrence wins.
app_df = app_df.drop_duplicates(subset=['ID'], keep='last')

# EDA

### Some values of CNT_CHILDREN and AMT_INCOME_TOTAL are far above the mean: 15 children and 6 million per year!

In [8]:
# Histograms of the numeric applicant columns, used to spot extreme values.
cols_to_plot = ["CNT_CHILDREN", "AMT_INCOME_TOTAL", "DAYS_BIRTH", "DAYS_EMPLOYED"]

# DataFrame.hist creates its own figure, so the size is passed through `figsize`.
# Opening a separate plt.figure() first only produced an extra, empty figure.
app_df[cols_to_plot].hist(edgecolor='black', linewidth=1.2, figsize=(12, 6))

# Kept so `fig` still refers to the histogram figure after this cell.
fig = plt.gcf()

In [9]:
# Side-by-side horizontal count plots: income type (left) and family status (right).
count_panels = [
    ("NAME_INCOME_TYPE", "Customer Distribution by Income Type"),
    ("NAME_FAMILY_STATUS", "Customer Distribution by Family Status"),
]

fig, axes = plt.subplots(1, 2)

for panel_ax, (column, panel_title) in zip(axes, count_panels):
    sns.countplot(y=app_df[column], linewidth=1.2, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Count")

fig.set_size_inches(14, 5)
plt.tight_layout()
plt.show()

In [10]:
# Side-by-side horizontal count plots: housing type (left) and education level (right).
fig, (housing_ax, education_ax) = plt.subplots(1, 2)

sns.countplot(y=app_df.NAME_HOUSING_TYPE, linewidth=1.2, ax=housing_ax)
housing_ax.set_title("Customer Distribution by Housing Type")
housing_ax.set_xlabel("Count")
housing_ax.set_ylabel("Housing Type")

# No explicit linewidth on this panel, matching the existing figure.
sns.countplot(y=app_df.NAME_EDUCATION_TYPE, ax=education_ax)
education_ax.set_title("Customer Distribution by Education")
education_ax.set_xlabel("Count")
education_ax.set_ylabel("Education Type")

fig.set_size_inches(14, 5)
plt.tight_layout()
plt.show()

In [11]:
# One pie chart per two-valued column: gender, car ownership, realty ownership.
pie_panels = [
    ('CODE_GENDER', "Customer Distribution by Gender"),
    ('FLAG_OWN_CAR', "Car Ownership"),
    ('FLAG_OWN_REALTY', "Realty Ownership"),
]

fig, axes = plt.subplots(1, 3)

for pie_ax, (column, pie_title) in zip(axes, pie_panels):
    # explode has two entries, so each column is expected to hold exactly two distinct values.
    app_df[column].value_counts().plot.pie(explode=[0.1, 0.1], ax=pie_ax)
    pie_ax.set_title(pie_title)

fig.set_size_inches(14, 5)
plt.tight_layout()
plt.show()

# Preprocessing

### Converting categorical values to ones and zeros (can be done automatically by PyCaret)

In [12]:
app_df['CODE_GENDER'].unique()

In [13]:
# Encode gender as an integer flag: 'M' -> 1, 'F' -> 0 (any other value is left untouched).
gender_codes = {'M': 1, 'F': 0}
app_df = app_df.assign(CODE_GENDER=app_df['CODE_GENDER'].replace(gender_codes))

In [14]:
app_df['FLAG_OWN_CAR'].unique()

In [15]:
# Encode car ownership as an integer flag: 'Y' -> 1, 'N' -> 0 (any other value is left untouched).
own_car_codes = {'Y': 1, 'N': 0}
app_df = app_df.assign(FLAG_OWN_CAR=app_df['FLAG_OWN_CAR'].replace(own_car_codes))

In [16]:
app_df['FLAG_OWN_REALTY'].unique()

In [17]:
# Encode realty ownership as an integer flag: 'Y' -> 1, 'N' -> 0 (any other value is left untouched).
own_realty_codes = {'Y': 1, 'N': 0}
app_df = app_df.assign(FLAG_OWN_REALTY=app_df['FLAG_OWN_REALTY'].replace(own_realty_codes))

In [18]:
app_df['NAME_EDUCATION_TYPE'].unique()

In [19]:
app_df['NAME_FAMILY_STATUS'].unique()

### Converting the number of days format in DAYS_BIRTH and DAYS_EMPLOYED to number of years

In [20]:
# Derive AGE in whole years from DAYS_BIRTH.
# The day counts are passed through a pandas timedelta and read back as whole days;
# dividing by -365.25 flips the sign and converts days to years, and ceil rounds up.
birth_offset_days = pd.to_timedelta(app_df['DAYS_BIRTH'], unit='D').dt.days
app_df['AGE'] = np.ceil(birth_offset_days / -365.25)

In [21]:
app_df['AGE'].unique()

In [22]:
# A value greater than zero means the applicant does not work; cap those at 0.
app_df['DAYS_EMPLOYED'] = app_df['DAYS_EMPLOYED'].clip(upper=0)

In [23]:
# Derive YEARS_EMPLOYED from DAYS_EMPLOYED the same way AGE is derived from DAYS_BIRTH:
# whole days divided by -365.25, rounded up.
employed_offset_days = pd.to_timedelta(app_df['DAYS_EMPLOYED'], unit='D').dt.days
app_df['YEARS_EMPLOYED'] = np.ceil(employed_offset_days / -365.25)

In [24]:
app_df['YEARS_EMPLOYED'].unique()

### Removing outliers: compute z-scores and drop records whose absolute z-score exceeds 3.5

In [25]:
def calculate_z_scores(df, cols):
    """Return a copy of ``df`` with a ``<col>_z_score`` column added per column in ``cols``.

    Each z-score is ``(value - column mean) / column standard deviation``, using
    pandas' default ``Series.std()``. The input frame is not modified, so the
    caller's data never silently gains helper columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols``.
    cols : iterable of str
        Names of the numeric columns to score.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus one ``<col>_z_score``
        column for each entry of ``cols``.

    Notes
    -----
    A column with zero standard deviation yields NaN/inf scores; this is not
    special-cased here.
    """
    scored = df.copy()
    for col in cols:
        scored[col + "_z_score"] = (df[col] - df[col].mean()) / df[col].std()
    return scored

# Rows are kept only when every scored column has |z| <= Z_SCORE_LIMIT.
Z_SCORE_LIMIT = 3.5
outlier_cols = ["CNT_CHILDREN", "AMT_INCOME_TOTAL", "YEARS_EMPLOYED"]
z_score_cols = [col + "_z_score" for col in outlier_cols]

# Score a copy so app_df itself never picks up the temporary *_z_score columns.
app_df_scored = calculate_z_scores(app_df.copy(), cols=outlier_cols)

# NaN z-scores compare False against the limit, so such rows are dropped too.
within_limit = (app_df_scored[z_score_cols].abs() <= Z_SCORE_LIMIT).all(axis=1)

# Build a new frame instead of calling drop(inplace=True) on a filtered slice,
# which triggers pandas' SettingWithCopy behaviour.
app_df_2 = app_df_scored.loc[within_limit].drop(columns=z_score_cols)

### Removing records with a missing OCCUPATION_TYPE

In [26]:
# Discard applicants whose OCCUPATION_TYPE is missing; other columns are not checked here.
app_clean_df = app_df_2.dropna(subset=['OCCUPATION_TYPE'])

### Renaming columns and dropping redundant features

In [28]:
# The raw day counts are superseded by the derived AGE and YEARS_EMPLOYED columns.
raw_day_columns = ['DAYS_BIRTH', 'DAYS_EMPLOYED']
app_clean_df = app_clean_df.drop(raw_day_columns, axis='columns')

In [29]:
# Positional rename: the i-th name below replaces the i-th existing column,
# so this list must match the current column order and count exactly.
readable_column_names = [
    'ID',
    'Gender',
    'Owned_Car',
    'Owned_Realty',
    'Total_Children',
    'Total_Income',
    'Income_Type',
    'Education_Type',
    'Family_Status',
    'Housing_Type',
    'Owned_Mobile_Phone',
    'Owned_Work_Phone',
    'Owned_Phone',
    'Owned_Email',
    'Job_Title',
    'Total_Family_Members',
    'Age',
    'Years_Experience',
]
app_clean_df.columns = readable_column_names

### Loading the 2nd csv file

In [30]:
# Read the credit records CSV from the Kaggle input directory into credit_df.
credit_df = pd.read_csv('/kaggle/input/credit-card-approval-prediction/credit_record.csv')

In [31]:
credit_df = credit_df.sort_values('ID')

In [32]:
credit_df

In [33]:
credit_df.info()

In [34]:
# Work on a copy of STATUS so the raw codes stay available next to the relabelled ones.
credit_df = credit_df.assign(STATUS2=credit_df['STATUS'])

In [35]:
credit_df['STATUS2'].unique()

### Replacing C, X, 0 with 'Good_Debt' (C: loan for that month is already paid; X: no loan for that month; 0: loan is 1 to 29 days overdue).
### Similarly 1, 2, 3, 4, 5 with 'Bad_Debt' (1: loan is 30 to 59 days overdue; 2: loan is 60 to 89 days overdue; 3: loan is 90 to 119 days overdue;
###                                          4: loan is 120 to 149 days overdue; 5: loan is more than 150 days overdue).

In [36]:
# Collapse the raw status codes into two labels:
#   'C', 'X', '0'  -> 'Good_Debt'
#   '1' .. '5'     -> 'Bad_Debt'
status_labels = {
    **dict.fromkeys(['C', 'X', '0'], 'Good_Debt'),
    **dict.fromkeys(['1', '2', '3', '4', '5'], 'Bad_Debt'),
}
credit_df = credit_df.replace({'STATUS2': status_labels})

### Counting the number of good and bad debts per client ID

In [37]:
credit_df.value_counts(subset=['ID', 'STATUS2']).unstack(fill_value=0)

In [38]:
# Count rows per (ID, STATUS2) pair, then pivot the labels into columns
# (0 where an ID has no rows for a label) and turn ID back into a regular column.
debt_counts = credit_df.value_counts(subset=['ID', 'STATUS2'])
result_df = debt_counts.unstack(fill_value=0).reset_index()

In [39]:
result_df

# Classifying clients with more good_debt than bad_debt records as eligible (1), and all others as not eligible (0)

In [40]:
# Status = 1 for IDs with strictly more Good_Debt than Bad_Debt records.
more_good_than_bad = result_df['Good_Debt'].gt(result_df['Bad_Debt'])
result_df.loc[more_good_than_bad, 'Status'] = 1

In [41]:
# Status = 0 for IDs whose Good_Debt count does not exceed their Bad_Debt count (ties included).
not_more_good = result_df['Good_Debt'].le(result_df['Bad_Debt'])
result_df.loc[not_more_good, 'Status'] = 0

In [42]:
# The .loc assignments leave Status as a float column; store it as integers.
status_as_int = result_df['Status'].astype(int)
result_df['Status'] = status_as_int

In [43]:
result_df

In [44]:
# Attach the target to the applicant features (inner join on ID).
# Only ID and Status are taken from result_df: the Good_Debt / Bad_Debt counts are the
# very values Status was computed from, so carrying them into the modelling frame
# would leak the target into the features.
df = app_clean_df.merge(result_df[['ID', 'Status']], how='inner', on=['ID'])

In [45]:
df.head()

In [46]:
df.info()

In [47]:
df['Status'].hist()

# Modeling

### Test data splitting for evaluation

In [48]:
# Hold back 10% of the rows as unseen data; the remaining 90% (seeded sample) is used for modelling.
modelling_rows = df.sample(frac=0.9, random_state=123)
unseen_rows = df.drop(modelling_rows.index)

# Give both parts a fresh 0..n-1 index.
df_train = modelling_rows.reset_index(drop=True)
df_test = unseen_rows.reset_index(drop=True)

print(f'Data for Modeling: {df_train.shape}')
print(f'Unseen Data For Predictions: {df_test.shape}')

### Setting-up PyCaret

In [50]:
# Columns passed to PyCaret as categorical features (including the 0/1 flag columns).
categorical_cols = [
    'Gender', 'Owned_Car', 'Owned_Realty',
    'Income_Type', 'Education_Type', 'Family_Status',
    'Housing_Type', 'Owned_Mobile_Phone', 'Owned_Work_Phone',
    'Owned_Phone', 'Owned_Email', 'Job_Title',
]

# Initialise the PyCaret experiment on the modelling data: 'Status' is the target,
# 'ID' is excluded from the features, and session_id fixes the random seed.
stp = setup(
    data=df_train,
    target='Status',
    train_size=0.8,
    categorical_features=categorical_cols,
    ignore_features=['ID'],
    fix_imbalance=True,
    session_id=123,
)

### Model Comparison

In [51]:
best_model = compare_models()

### Evaluation

In [52]:
evaluate_model(best_model)

### Feature Importance

In [53]:
plot_model(best_model, plot='feature')

In [54]:
plot_model(best_model, plot = 'confusion_matrix')

### Best model extraction

In [55]:
# Train the estimator registered under the PyCaret ID 'lr'.
# NOTE(review): the ID is hard-coded rather than taken from `best_model` returned by
# compare_models(); confirm 'lr' is still the top-ranked model when the data changes.
lr = create_model('lr')

### Further Tuning

In [56]:
tune_lr = tune_model(lr)

### Prediction on the hold-out set

In [57]:
predict_model(tune_lr)

### Finalizing model (training on whole dataset)

In [58]:
# Produce the final model from the tuned estimator and print its representation.
# NOTE(review): per the section heading this refits on the whole setup() dataset — confirm against the PyCaret docs.
model_final = finalize_model(tune_lr)
print(model_final)

In [59]:
predictions = predict_model(model_final, data=df_test)
predictions.head()

### Saving model as a pickle for deployment

In [None]:
# Persist the finalized model for deployment. Saving `model_final` (not `tune_lr`)
# keeps the result of finalize_model() in the saved artefact.
save_model(model_final, 'PyCaret_lr_280122')