In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#visualization libraries
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style as style
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib.pyplot as plt, numpy as np
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
from IPython.display import Image
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
from plotly import tools
import seaborn as sns

import missingno as msno #to visualize missing data

from imblearn.over_sampling import SMOTE
import itertools


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score,roc_auc_score,f1_score,plot_confusion_matrix,plot_roc_curve,roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelEncoder #label encoding for categorical columns

# Initialise plotly's offline notebook mode.
pyo.init_notebook_mode()

import os

# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

In this project, we’ll be using Credit Card Approval Dataset. The structure of our project will be as follows;

* To get a basic introduction of our project & What’s the business problem associated with it ?
* We’ll start by loading and viewing the dataset.
* To manipulate data, if there are any missing entries in the dataset.
* To perform exploratory data analysis (EDA) on our dataset.
* To pre-process data before applying machine learning model to the dataset.
* To apply machine learning models that can predict if an individual’s application for a credit card will be accepted or not.

**Credit Card Applications and the problems associated with it**

Banks receive a lot of applications for issuance of credit cards. Many of them are rejected for reasons such as high loan balances, low income levels, or too many inquiries on an individual’s credit report. Manually analyzing these applications is error-prone and time-consuming. This task can be automated with the power of machine learning. In this project, we will build an automatic credit card approval predictor using machine learning techniques, just like real banks do.

**Task**

Build a machine learning model to predict whether an applicant is a 'good' or 'bad' client. Unlike other tasks, the definition of 'good' or 'bad' is not given: you should use some technique, such as vintage analysis, to construct your label. Class imbalance is also a major problem in this task.

# 1. Importing Data & EDA

In [None]:
# Load the two input tables: `data` holds the application records and
# `record` holds the credit records (one row per ID and MONTHS_BALANCE).
data = pd.read_csv("../input/credit-card-approval-prediction/application_record.csv", encoding = 'utf-8') 
record = pd.read_csv("../input/credit-card-approval-prediction/credit_record.csv", encoding = 'utf-8') 

In [None]:
# Compare the row count with the number of distinct IDs to spot duplicated applicants.
application_rows, application_clients = len(data), len(data.ID.unique())
print("Number of datapoints for application records: {}".format(application_rows))
print("Number of unique clients in dataset: {}".format(application_clients))
data.head()

The number of unique clients is not equal to the number of rows, which means some client IDs are duplicated.

In [None]:
# Row count versus distinct IDs in the credit history table.
record_rows, record_clients = len(record), len(record.ID.unique())
print("Number of datapoints for credit records: {}".format(record_rows))
print("Number of unique clients in dataset: {}".format(record_clients))
record.head()

In [None]:
# Number of IDs that are present in both tables.
len(set(record['ID']) & set(data['ID']))

The number of unique IDs in the two datasets is not equal: the credit record dataset covers fewer customers than the application dataset. Only 36,457 customers appear in both datasets.

**Missing Values**

In [None]:
# Missing-value matrix for the application records; the returned object is
# kept only so that a title can be set on it.
plt_missing_1 = msno.matrix(data)

plt_missing_1.set_title("Missing Data for application records dataset",fontsize=20)

In [None]:
# Missing-value matrix for the credit records, titled the same way as the
# application plot.
plt_missing_2 = msno.matrix(record)

plt_missing_2.set_title("Missing Data for credit records dataset",fontsize=20)

We have checked the credit records data for null values, and there are none.

**Unique counts**

In [None]:
# Distinct-value count for every application column, least varied first.
unique_counts = (
    data.nunique()
    .rename_axis('Column_Name')
    .reset_index(name='Num_Unique')
    .sort_values(by=['Num_Unique'])
)
unique_counts

In [None]:
# Distinct-value count for every credit-record column, least varied first.
unique_counts = (
    record.nunique()
    .rename_axis('Column_Name')
    .reset_index(name='Num_Unique')
    .sort_values(by=['Num_Unique'])
)
unique_counts

# 2 - Data Visualization

Seaborn Plot Styling

In [None]:
# Global seaborn look for the plots that follow: "notebook" context with a
# reduced font scale, plus a custom six-colour palette.
# NOTE(review): seaborn's set_context() appears to apply only context-scaling
# rc keys (font sizes, line widths); confirm whether "axes.grid",
# "grid.linestyle" and "figure.autolayout" actually take effect here or need
# to go through sns.set_style() / plt.rcParams instead.
sns.set_context("notebook",font_scale=.7,rc={"grid.linewidth": 0.1,'patch.linewidth': 0.0,
    "axes.grid":True,
    "grid.linestyle": "-",
    "axes.titlesize" : 13,                                       
    "figure.autolayout":True})
                
palette_1 = ['#FF5E5B','#EC9B9A','#00CECB','#80DE99','#C0E680','#FFED66']

# The nested color_palette() calls are redundant but harmless.
sns.set_palette(sns.color_palette(sns.color_palette(palette_1)))

In [None]:
# Histograms of the main numeric application columns.
cols_to_plot = ["CNT_CHILDREN","AMT_INCOME_TOTAL","DAYS_BIRTH","DAYS_EMPLOYED"]

# DataFrame.hist creates its own figure, so the size is passed to it directly;
# opening a separate plt.figure() first would only leave an empty extra
# figure in the cell output.
data[cols_to_plot].hist(edgecolor='black', linewidth=1.2, figsize=(12, 6))
fig = plt.gcf()

There are outliers in 2 columns.

* CNT_CHILDREN
* AMT_INCOME_TOTAL

In [None]:
# Side-by-side bar charts: customers per income type and per family status.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

count_panels = [
    ("NAME_INCOME_TYPE", "Customer Distribution by Income Type"),
    ("NAME_FAMILY_STATUS", "Customer Distribution by Family Status"),
]
for panel_ax, (column, title) in zip(axes, count_panels):
    chart = sns.countplot(y=data[column], linewidth=1.2, ax=panel_ax)
    chart.set_title(title)
    chart.set_xlabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side bar charts: customers per housing type and per education level.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
housing_ax, education_ax = axes

sns.countplot(y=data["NAME_HOUSING_TYPE"], linewidth=1.2, ax=housing_ax)
housing_ax.set_title("Customer Distribution by Housing Type")
housing_ax.set_xlabel("Count")
housing_ax.set_ylabel("Housing Type")

sns.countplot(y=data["NAME_EDUCATION_TYPE"], ax=education_ax)
education_ax.set_title("Customer Distribution by Education")
education_ax.set_xlabel("Count")
education_ax.set_ylabel("Education Type")

plt.tight_layout()
plt.show()

In [None]:
# Pie charts for the three two-valued columns: gender, car and realty ownership.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))

pie_specs = [
    ('CODE_GENDER', "Customer Distribution by Gender", ["#76B5B3","#EC9B9A"]),
    ('FLAG_OWN_CAR', "Car Ownership", ["#80DE99","#00CECB"]),
    ('FLAG_OWN_REALTY', "Realty Ownership", ["#76B5B3","#00CECB"]),
]
for panel_ax, (column, title, wedge_colors) in zip(axes, pie_specs):
    pie_ax = data[column].value_counts().plot.pie(
        explode=[0.1,0.1],
        autopct='%1.1f%%',
        shadow=True,
        colors=wedge_colors,
        textprops={'fontsize':12},
        ax=panel_ax,
    )
    pie_ax.set_title(title)

plt.tight_layout()
plt.show()

# 3 - Data Preprocessing & Feature Engineering

In [None]:
# Keep only the last row of every repeated ID, then discard OCCUPATION_TYPE
# (the column with missing values).
data = (
    data
    .drop_duplicates('ID', keep='last')
    .drop('OCCUPATION_TYPE', axis=1)
)

In [None]:
# Names of the columns stored with dtype 'object'.
object_columns = data.columns[data.dtypes == 'object'].tolist()

# Distinct-value count for each of those columns, least varied first.
unique_counts = (
    data[object_columns]
    .nunique()
    .rename_axis('Column_Name')
    .reset_index(name='Num_Unique')
    .sort_values(by=['Num_Unique'])
)

unique_counts

We have filtered the columns that hold non-numeric values to see whether they are useful. We will convert them to numeric values.

In [None]:
#renaming columns

# Friendlier column names used throughout the rest of the notebook.
column_renames = {
    "CODE_GENDER": "Gender",
    "FLAG_OWN_CAR": "Own_Car",
    "FLAG_OWN_REALTY": "Own_Realty",
    "CNT_CHILDREN": "Children_Count",
    "AMT_INCOME_TOTAL": "Income",
    "NAME_EDUCATION_TYPE": "Education",
    "NAME_FAMILY_STATUS": "Family_Status",
    "NAME_HOUSING_TYPE": "Housing_Type",
    "DAYS_BIRTH": "Birthday",
    "DAYS_EMPLOYED": "Employment_Date",
    "FLAG_MOBIL": "Own_Mobile",
    "FLAG_WORK_PHONE": "Own_Work_Phone",
    "FLAG_PHONE": "Own_Phone",
    "FLAG_EMAIL": "Own_Email",
    "CNT_FAM_MEMBERS": "Family_Member_Count",
    "NAME_INCOME_TYPE": "Income_Type",
}
data = data.rename(columns=column_renames)

In [None]:
#all users account open month
# Account opening month per customer: the smallest MONTHS_BALANCE seen for the ID.
open_month = record.groupby("ID")["MONTHS_BALANCE"].min().to_frame("begin_month")
customer_data = pd.merge(data, open_month, how="left", on="ID")  # attach to the application data

# --- Convert categorical features into numeric flags -------------------------
# Explicit dictionaries with .map() produce numeric columns directly, instead of
# relying on replace() to infer a new dtype afterwards. A category that is not
# listed becomes NaN rather than silently staying a string.
customer_data["Gender"] = customer_data["Gender"].map({"F": 0, "M": 1})
customer_data["Own_Car"] = customer_data["Own_Car"].map({"Y": 1, "N": 0})
customer_data["Own_Realty"] = customer_data["Own_Realty"].map({"Y": 1, "N": 0})

# Both flags below are derived from the ORIGINAL category labels, so they must
# be computed before Income_Type / Family_Status are collapsed further down.
customer_data["Is_Working"] = customer_data["Income_Type"].map({
    "Working": 1,
    "Commercial associate": 1,
    "State servant": 1,
    "Pensioner": 0,
    "Student": 0,
})
customer_data["In_Relationship"] = customer_data["Family_Status"].map({
    "Civil marriage": 1,
    "Married": 1,
    "Single / not married": 0,
    "Separated": 0,
    "Widow": 0,
})

# --- Collapse rare categories into broader groups -----------------------------
housing_type = {'House / apartment' : 'House / apartment',
                   'With parents': 'With parents',
                    'Municipal apartment' : 'House / apartment',
                    'Rented apartment': 'House / apartment',
                    'Office apartment': 'House / apartment',
                    'Co-op apartment': 'House / apartment'}

customer_data["Housing_Type"] = customer_data['Housing_Type'].map(housing_type)

family_status = {'Single / not married':'Single',
                     'Separated':'Single',
                     'Widow':'Single',
                     'Civil marriage':'Married',
                    'Married':'Married'}

customer_data["Family_Status"] = customer_data["Family_Status"].map(family_status)

education_type = {'Secondary / secondary special':'secondary',
                     'Lower secondary':'secondary',
                     'Higher education':'Higher education',
                     'Incomplete higher':'Higher education',
                     'Academic degree':'Academic degree'}

customer_data["Education"] = customer_data["Education"].map(education_type)

income_type = {'Commercial associate':'Working',
                  'State servant':'Working',
                  'Working':'Working',
                  'Pensioner':'Pensioner',
                  'Student':'Student'}

customer_data["Income_Type"] = customer_data["Income_Type"].map(income_type)

# --- Derived numeric features (vectorised) -----------------------------------
# Household size = children + 2 adults when in a relationship, otherwise + 1.
adults = customer_data["In_Relationship"].eq(1).astype(int) + 1
customer_data["Household_Size"] = customer_data["Children_Count"] + adults

# Birthday is a day offset; dividing by 365 and flipping the sign gives years.
customer_data["Age"] = (customer_data["Birthday"] / 365 * -1).round()

# Whole years of employment. Non-negative Employment_Date values are mapped to
# 0 years of experience; negative ones are converted to truncated years.
customer_data["Experience"] = (
    (customer_data["Employment_Date"] / 365 * -1).clip(lower=0).astype(int)
)

customer_data = customer_data.drop(columns=['Employment_Date','Birthday','Children_Count'])

# One-hot encode the remaining (collapsed) categorical columns.
customer_data = pd.get_dummies(customer_data, columns=['Income_Type', 'Education','Family_Status',"Housing_Type"])

In [None]:
# Preview the engineered feature table.
customer_data.head()

We will look at numeric columns to see if there is anything that needs to be changed.

In [None]:
other_numerical_cols = ["Income","Age","Experience","Household_Size"]

# 2x2 grid of box plots (mean marker enabled) for the numeric features.
fig = make_subplots(rows=2, cols=2, start_cell="bottom-left",
                    subplot_titles=("Income", "Age", "Experience", "Family Member Count"))

# (column, trace name, grid row, grid column)
box_panels = [
    ("Income", 'Income', 1, 1),
    ("Age", 'Age', 1, 2),
    ("Experience", 'Experience', 2, 1),
    ("Household_Size", "Family Member Count", 2, 2),
]
for column, trace_name, grid_row, grid_col in box_panels:
    fig.add_trace(go.Box(x=customer_data[column], name=trace_name, boxmean=True),
                  row=grid_row, col=grid_col)

fig.show()

As seen above, there are outlier values in the income, work experience (employment) and household size (children / family member count) columns.

* We need to remove these outliers to make sure they do not affect our model results.
* We will now remove these outliers by using z scores.

In [None]:
def calculate_z_scores(df, cols):
    """Return a copy of ``df`` with a ``<col>_z_score`` column for each name in ``cols``.

    Each z-score is ``(value - column mean) / column standard deviation``,
    using pandas' default ``std()``.

    The input frame is not modified: the helper columns only exist on the
    returned copy, so the caller's original table stays free of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column listed in ``cols``.
    cols : iterable of str
        Names of the numeric columns to standardise.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the additional ``*_z_score`` columns appended.
    """
    scored = df.copy()
    for col in cols:
        scored[col + "_z_score"] = (df[col] - df[col].mean()) / df[col].std()
    return scored

df_2 = calculate_z_scores(df=customer_data, cols=["Income", "Experience", "Household_Size"])

# removing outliers: keep a row only when |z| <= 3.5 for all three features
filter_2 = df_2["Household_Size_z_score"].abs() <= 3.5
filter_3 = df_2["Experience_z_score"].abs() <= 3.5
filter_4 = df_2["Income_z_score"].abs() <= 3.5

# The helper z-score columns are dropped again once the rows are filtered.
customer_apps = df_2[filter_2 & filter_3 & filter_4].drop(
    columns=["Income_z_score", "Experience_z_score", "Household_Size_z_score"]
)

In [None]:
other_numerical_cols = ["Income","Age","Experience","Family_Member_Count"]

# Same 2x2 box-plot grid as before, now drawn from the outlier-filtered table.
fig = make_subplots(rows=2, cols=2, start_cell="bottom-left",
                    subplot_titles=("Income", "Age", "Experience", "Family Member Count"))

# (column, trace name, grid row, grid column)
box_panels = [
    ("Income", 'Income', 1, 1),
    ("Age", 'Age', 1, 2),
    ("Experience", 'Experience', 2, 1),
    ("Household_Size", "Family Member Count", 2, 2),
]
for column, trace_name, grid_row, grid_col in box_panels:
    fig.add_trace(go.Box(x=customer_apps[column], name=trace_name, boxmean=True),
                  row=grid_row, col=grid_col)

fig.show()

In [None]:
# Flag every monthly record whose STATUS is '2', '3', '4' or '5' with 'Yes';
# all other records keep None. A single .loc assignment is used instead of
# chained indexing (record['dep_value'][mask] = ...), which may write to a
# temporary copy and leave `record` unchanged.
overdue_statuses = ['2', '3', '4', '5']
record['dep_value'] = None
record.loc[record['STATUS'].isin(overdue_statuses), 'dep_value'] = 'Yes'

# Per customer: 'Yes' when at least one month was flagged, otherwise 'No'.
# count() only counts the non-null (flagged) entries.
flagged_months = record.groupby('ID')['dep_value'].count()
record_count = flagged_months.gt(0).map({True: 'Yes', False: 'No'}).to_frame('dep_value')

In [None]:
# Data frame to analyze length of time since initial approval of credit card
# Shows number of past dues, paid off and no loan status.
# Wide table with one row per ID and one column per MONTHS_BALANCE value,
# holding the STATUS code for that month.
grouped = record.groupby('ID')

pivot_tb = record.pivot(index='ID', columns='MONTHS_BALANCE', values='STATUS')
pivot_tb['open_month'] = grouped['MONTHS_BALANCE'].min()
pivot_tb['end_month'] = grouped['MONTHS_BALANCE'].max()
# +1 because the month count starts at 0.
pivot_tb['window'] = pivot_tb['end_month'] - pivot_tb['open_month'] + 1

# For every status code, count how often it occurs in the first 61 columns of
# the table; the insertion order below fixes the order of the new columns.
status_count_columns = {
    'paid_off': 'C',
    'pastdue_1-29': '0',
    'pastdue_30-59': '1',
    'pastdue_60-89': '2',
    'pastdue_90-119': '3',
    'pastdue_120-149': '4',
    'pastdue_over_150': '5',
    'no_loan': 'X',
}
for count_column, status_code in status_count_columns.items():
    has_status = pivot_tb.iloc[:, 0:61] == status_code
    pivot_tb[count_column] = pivot_tb[has_status].count(axis=1)

# Expose the index as a regular ID column.
pivot_tb['ID'] = pivot_tb.index

In [None]:
# Preview the per-customer status pivot with its derived count columns.
pivot_tb.head()

In [None]:
# Per-customer summary of the credit history: paid-off months, total past-due
# months (all past-due buckets added up) and no-loan months.
past_due_columns = ['pastdue_1-29', 'pastdue_30-59', 'pastdue_60-89',
                    'pastdue_90-119', 'pastdue_120-149', 'pastdue_over_150']
target = pd.DataFrame({
    'ID': pivot_tb.index,
    'paid_off': pivot_tb['paid_off'].values,
    '#_of_pastdues': pivot_tb[past_due_columns].sum(axis=1).values,
    'no_loan': pivot_tb['no_loan'].values,
})
customer_apps_1 = customer_apps.merge(target, how='inner', on='ID')

# Attach the Yes/No flag and turn it into the last column, named `target`.
customer_apps_2 = (
    customer_apps_1
    .merge(record_count, how='inner', on='ID')
    .rename(columns={'dep_value': 'target'})
)
for flag, code in (('Yes', 1), ('No', 0)):
    customer_apps_2.loc[customer_apps_2['target'] == flag, 'target'] = code

In [None]:
# Reset matplotlib to its default rc settings before drawing the heatmap.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)

f, ax = plt.subplots(figsize=(15,15))

# Correlations rounded to one decimal, computed without the Own_Mobile column.
corr = customer_apps_2.drop(columns=["Own_Mobile"]).corr().round(1)
# Hide the upper triangle (including the diagonal) of the matrix.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, ax=ax)

In [None]:
# Share of each target class among the merged customers.
target_counts = customer_apps_2['target'].value_counts()
target_ax = target_counts.plot.pie(
    explode=[0.1,0.1],
    autopct='%1.1f%%',
    shadow=True,
    colors=['#FF5E5B', '#C0E680'],
    textprops={'fontsize':7},
)
target_ax.set_title("Target distribution")

plt.show()

In [None]:
# Re-apply the seaborn context and palette (matplotlib's rc settings were
# reset to defaults before the correlation heatmap).
# NOTE(review): seaborn's set_context() appears to apply only context-scaling
# rc keys; confirm whether "axes.grid", "grid.linestyle" and 'figure.figsize'
# actually take effect here.
sns.set_context("notebook",font_scale=.7,rc={"grid.linewidth": 0.1,'patch.linewidth': 0.0,
    "axes.grid":True,
    "grid.linestyle": "-",
    "axes.titlesize" : 13,                                       
    'figure.figsize':(15,15)})
                
palette_1 = ['#FF5E5B','#EC9B9A','#00CECB','#80DE99','#C0E680','#FFED66']

# The nested color_palette() calls are redundant but harmless.
sns.set_palette(sns.color_palette(sns.color_palette(palette_1)))

In [None]:
# Distribution of three numeric features, split by target class.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))

boxen_panels = [
    ('Income', "Income-Target"),
    ('Age', "Age-Target"),
    ('Experience', "Work Experience-Target"),
]
for panel_ax, (feature_name, title) in zip(axes, boxen_panels):
    chart = sns.boxenplot(x='target', y=feature_name, data=customer_apps_2,
                          palette=['#FF5E5B', '#C0E680'], ax=panel_ax)
    chart.set_title(title)

plt.tight_layout()

In [None]:
# One KDE grid per feature: a column per target class, coloured by Is_Working.
for kde_feature in ('Income', 'Age', 'Experience', 'begin_month'):
    sns.displot(data=customer_apps_2, x=kde_feature, hue="Is_Working", col='target',
                kind="kde", height=4,
                facet_kws={'sharey': False, 'sharex': False},
                palette=['#C70039','#80DE99'])


In [None]:
# Same KDE grids for the three credit-history count features.
for kde_feature in ('no_loan', '#_of_pastdues', 'paid_off'):
    sns.displot(data=customer_apps_2, x=kde_feature, hue="Is_Working", col='target',
                kind="kde", height=4,
                facet_kws={'sharey': False, 'sharex': False},
                palette=['#C70039','#80DE99'])

In [None]:
# Preview the merged table that feeds feature selection and modelling.
customer_apps_2.head()

# 4. Feature Selection

In [None]:
# Features are selected by position: everything except the first column
# (presumably ID — confirm against the CSV layout) and the last column
# (`target`, which was appended last when the label was built).
X = customer_apps_2.iloc[:,1:-1]
# The label is kept as a one-column DataFrame rather than a Series.
y = customer_apps_2[["target"]]

In [None]:
from sklearn.model_selection import train_test_split

#splitting data into train-test

# 70 % train / 30 % test with a fixed seed.
# NOTE(review): no `stratify` argument is passed although the target is
# described as imbalanced — consider stratify=y so that both parts keep the
# same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.3)
print(X_train.shape)

**SMOTE (Synthetic Minority Oversampling Technique) to Balance Dataset**

A problem with imbalanced classification is that there are too few examples of the minority class for a model to effectively learn the decision boundary. 

One way to solve this problem is to oversample the examples in the minority class. This can be achieved by simply duplicating examples from the minority class in the training dataset prior to fitting a model. This can balance the class distribution but does not provide any additional information to the model. An improvement on duplicating examples from the minority class is to synthesize new examples from the minority class.

SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.

We use Synthetic Minority Over-Sampling Technique(SMOTE) to overcome sample imbalance problem.

It is crucial that SMOTE is applied only to the training dataset, so that the test set still reflects the original class distribution.

In [None]:
# SMOTE needs a numeric label column.
y_train = y_train.astype('int')

# A fixed random_state makes the synthetic minority samples — and every result
# computed from them — reproducible across runs.
# NOTE: a later cell rebinds X_train to the scaled, balanced matrix, so this
# cell must not be re-run after that one without re-running the split first.
X_balance, Y_balance = SMOTE(random_state=100).fit_resample(X_train, y_train)
X_balance = pd.DataFrame(X_balance, columns=X_train.columns)
Y_balance = pd.DataFrame(Y_balance, columns=["target"])

**Calculate Information Value**

The weight of evidence tells the predictive power of an independent variable in relation to the dependent variable.

Weight of evidence (WoE) is computed for each category/bin of a feature. If a bin holds a large proportion of events compared to its proportion of non-events, its WoE is large in magnitude, which indicates that this bin separates events from non-events well. The information value (IV) of a feature is the sum, over its bins, of the WoE multiplied by the difference between the two proportions.

In [None]:
# data size check
# data size check: resampled features and labels must have the same number of rows
len(X_balance) == len(Y_balance)

In [None]:
# Calculate information value
def calc_iv(df, feature, target, pr=False):
    """Compute the information value (IV) of ``feature`` with respect to ``target``.

    Every distinct value of ``feature`` is treated as one bin; missing values
    are grouped into a bin labelled ``"NULL"``. Rows with ``target == 0`` are
    counted as "Good" and rows with ``target == 1`` as "Bad".

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ``feature`` and ``target`` columns.
    feature : str
        Name of the column to evaluate.
    target : str
        Name of the 0/1 label column.
    pr : bool, default False
        When True, print the per-bin table and the total IV.

    Returns
    -------
    (iv, data) : tuple
        ``iv`` is the sum of the per-bin IV contributions; ``data`` is the
        per-bin table (counts, shares, WoE and IV), sorted by bin value.
    """
    # Work on a filled copy of the column so the caller's frame stays untouched.
    values = df[feature].fillna("NULL")
    is_good = df[target] == 0
    is_bad = df[target] == 1

    lst = []
    for val in values.unique():  # distinct bins are determined once
        in_bin = values == val
        lst.append([feature,                      # Variable
                    val,                          # Value
                    in_bin.sum(),                 # All
                    (in_bin & is_good).sum(),     # Good (target == 0)
                    (in_bin & is_bad).sum()])     # Bad  (target == 1)

    data = pd.DataFrame(lst, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])

    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    # "Good" distribution is defined as everything that is not bad.
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    # A bin without any good (or any bad) rows yields +/-inf; treat it as 0.
    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data.index = range(len(data.index))

    if pr:
        print(data)
        print('IV = ', data['IV'].sum())

    iv = data['IV'].sum()

    return iv, data

In [None]:
iv_df = X_balance.copy()
# The labels must come from the resampled Y_balance and be attached by
# position. y_train only holds the original training rows under their original
# index, so assigning it would align on index and give the balanced rows wrong
# or missing labels.
iv_df["target"] = Y_balance["target"].values

features = iv_df.columns[:-1].tolist()

# Information value per feature. Only the IV is kept; the per-bin table is
# discarded here, so no notebook-level variable named `data` gets overwritten.
iv_list = [round(calc_iv(iv_df, feature, 'target')[0], 4) for feature in features]

# Built from a dict so that `iv` stays numeric and sorts by value, not as text.
woe_df = pd.DataFrame({'Feature': features, 'iv': iv_list})
woe_df

**Feature Scaling**

Feature scaling is essential for machine learning algorithms that calculate distances between data. The ML algorithm is sensitive to the “relative scales of features,” which usually happens when it uses the numeric values of the features rather than say their rank.In many algorithms, when we desire faster convergence, scaling is a must.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the balanced training features.
scaler = StandardScaler()
scaler.fit(X_balance)

# Pass the column Index itself; wrapping it in a list would create MultiIndex
# columns.
# NOTE: X_train is rebound here to the scaled, SMOTE-balanced features and no
# longer refers to the raw training split.
X_train = pd.DataFrame(scaler.transform(X_balance), columns=X_balance.columns)

After SMOTE the two classes are balanced, so the class-imbalance problem is addressed.
We will now implement different models to see which one performs best.

**RFE (Recursive Feature Elimination)**

Recursive feature elimination (RFE) is a feature selection method that fits a model and removes the weakest feature (or features) until the specified number of features is reached. Features are ranked by the model’s coef_ or feature_importances_ attributes, and by recursively eliminating a small number of features per loop, RFE attempts to eliminate dependencies and collinearity that may exist in the model.

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Rank features by recursively eliminating the weakest ones of a logistic
# regression until 15 remain.
model = LogisticRegression(solver='liblinear')
# n_features_to_select must be passed by keyword in current scikit-learn.
rfe = RFE(model, n_features_to_select=15)
# The label is passed as a 1-D array, the shape scikit-learn estimators expect.
fit = rfe.fit(X_train, Y_balance.values.ravel())
rfe_features = pd.DataFrame({"Feature":features,
              "Support_LogisticRegression":fit.support_,
              "Feature_Rank_logisticRegression":fit.ranking_})
rfe_features

**ExtraTreesClassifier**

The purpose of the ExtraTreesClassifier is to fit a number of randomized decision trees to the data, and in this regard it is a form of ensemble learning. In particular, random splits of all observations are carried out to ensure that the model does not overfit the data.

Each Decision Tree in the Extra Trees Forest is constructed from the original training sample. Then, at each test node, Each tree is provided with a random sample of k features from the feature-set from which each decision tree must select the best feature to split the data based on some mathematical criteria (typically the Gini Index). This random sample of features leads to the creation of multiple de-correlated decision trees.



In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Tree-based feature importances on the balanced (unscaled) training data.
# A fixed random_state keeps the importance ranking reproducible between runs.
model = ExtraTreesClassifier(n_estimators=10, random_state=42)
# The label is passed as a 1-D array, the shape scikit-learn estimators expect.
model.fit(X_balance, Y_balance.values.ravel())
feature_importances = pd.DataFrame({"Feature":features,
              "Feature_Importance_ExtratreeClassifier":model.feature_importances_})

Now we will merge all importance scores from different feature selection methods

In [None]:
# One row per feature with the scores from all three selection methods,
# shown ordered by information value.
df1 = woe_df.merge(feature_importances, on=["Feature"])
feature_selection_df = df1.merge(rfe_features, on=["Feature"])
feature_selection_df.sort_values(by="iv", ascending=False)

The following attributes were selected according to the table above.

# 5. Modelling

The 'paid_off', '#_of_pastdues' and 'no_loan' features are not included in modelling. For example, if we know that #_of_pastdues=0 then we also know with complete certainty that target=0. These 3 features were excluded from the model because they are derived from the same credit history that was used to construct the target. The other features used in modelling were selected according to the common results of the different feature selection methods.

In [None]:
# Columns every classifier below is fitted and evaluated on. The counts
# derived from the credit history (paid_off, #_of_pastdues, no_loan) are
# deliberately absent.
selected_features = ["begin_month","Income","Experience","In_Relationship",
                     "Education_Higher education","Education_secondary","Own_Realty",
                     "Family_Status_Single","Family_Member_Count","Is_Working",
                     "Own_Car","Age"]

Logistic Regression, K-Nearest Neighbors, Decision Tree, Random Forest, XGBoost and CatBoost models are trained and compared below. (A Support Vector Machine is imported but is not part of the comparison.)

To briefly mention these algorithms,

**Logistic Regression** 
Unlike regression which uses Least Squares, the model uses Maximum Likelihood to fit a sigmoid-curve on the target variable distribution. It uses a logistic function, and most commonly used when the data in question has binary output.

**K-Nearest Neighbors**
K-Nearest Neighbor (KNN) algorithm predicts based on the specified number (k) of the nearest neighboring data points. Here, the pre-processing of the data is significant as it impacts the distance measurements directly. Unlike others, the model does not have a mathematical formula, neither any descriptive ability. 

**Decision Tree**
In this method a set of training examples is broken down into smaller and smaller subsets while at the same time an associated decision tree get incrementally developed. At the end of the learning process, a decision tree covering the training set is returned.

**Random Forest**
A Random Forest is a reliable ensemble of multiple Decision Trees (or CARTs); though more popular for classification, than regression applications. Here, the individual trees are built via bagging (i.e. aggregation of bootstraps which are nothing but multiple train datasets created via sampling of records with replacement) and split using fewer features. The resulting diverse forest of uncorrelated trees exhibits reduced variance; therefore, is more robust towards change in data and carries its prediction accuracy to new data. It works well with both continuous & categorical data.

**XGBoost**
It is a decision-tree-based ensemble Machine Learning algorithm that uses a gradient boosting framework. Execution speed and high performance are the main reasons to use XGBoost.

**CatBoost**
CatBoost is an open source algorithm based on gradient boosted decision trees. It supports numerical, categorical and text features. It works well with heterogeneous data and even relatively small data.

In [None]:
# Models to compare, keyed by the name shown in the result table.
classifiers = {
    "LogisticRegression" : LogisticRegression(),
    "KNeighbors" : KNeighborsClassifier(),
    "DecisionTree" : DecisionTreeClassifier(),
    "RandomForest" : RandomForestClassifier(n_estimators=250,max_depth=12,min_samples_leaf=16),
    "XGBoost" : XGBClassifier(max_depth=12,
                              n_estimators=250,
                              min_child_weight=8, 
                              subsample=0.8, 
                              learning_rate =0.02,    
                              seed=42),
    "CatBoost" : CatBoostClassifier(iterations=250,
                           learning_rate=0.2,
                           od_type='Iter',
                           verbose=25,
                           depth=16,
                           random_seed=42)
}

result_columns = ['classifiers','accuracy','presicion','recall','f1_score','fpr','tpr','auc']

y_test = y_test.astype(int)
# 1-D label array, the shape scikit-learn style estimators expect.
y_fit = Y_balance.values.ravel()

# Collect one record per model and build the table once at the end;
# DataFrame.append is not available in current pandas and copying the table on
# every iteration is unnecessary.
result_rows = []
for key, classifier in classifiers.items():
    # Fit on the SMOTE-balanced training data, evaluate on the untouched test split.
    classifier.fit(X_balance[selected_features], y_fit)
    y_predict = classifier.predict(X_test[selected_features])

    # Probability of the positive class, used for the ROC curve and AUC.
    yproba = classifier.predict_proba(X_test[selected_features])[:, 1]

    fpr, tpr, _ = roc_curve(y_test, yproba)

    result_rows.append({'classifiers':key,
                        'accuracy':accuracy_score(y_test, y_predict),
                        'presicion':precision_score(y_test, y_predict, average='weighted'),
                        'recall':recall_score(y_test, y_predict, average='weighted'),
                        'f1_score':f1_score(y_test, y_predict, average='weighted'),
                        'fpr':fpr,
                        'tpr':tpr,
                        'auc':roc_auc_score(y_test, yproba)})

result_table = pd.DataFrame(result_rows, columns=result_columns).set_index('classifiers')

# 6. Results

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# One confusion matrix per fitted classifier, evaluated on the test split.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15,10))

for cls, ax in zip(classifiers.values(), axes.flatten()):
    # ConfusionMatrixDisplay is used directly because the plot_confusion_matrix
    # helper is no longer available in current scikit-learn releases.
    predictions = cls.predict(X_test[selected_features])
    matrix = confusion_matrix(y_test, predictions)
    ConfusionMatrixDisplay(matrix, display_labels=cls.classes_).plot(ax=ax, cmap='Blues')
    ax.title.set_text(type(cls).__name__)
plt.tight_layout()
plt.show()

In [None]:
# ROC curves of all classifiers on one set of axes, with AUC in the legend.
fig = plt.figure(figsize=(8,6))

for i in result_table.index:
    plt.plot(result_table.loc[i, 'fpr'],
             result_table.loc[i, 'tpr'],
             label="{}, AUC={:.3f}".format(i, result_table.loc[i, 'auc']))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

In [None]:
# Show only the first four metric columns; the remaining columns hold the ROC
# arrays and AUC used for the plot above.
result_table.iloc[:,:4]