# Business Problem

A bank has requested a machine learning model that predicts whether loan applicants among its customers will be granted the loan.

Data Quality                                                                                
Descriptive Statistics
Modelling
Evaluation

1. Data Quality
The dataset contains information about the bank's customers. We will derive useful insights from it and predict whether a new customer will accept a loan offer.

1 - age: the age of the customer; must be at least 18 (numeric)

2 - job: the customer's profession (categorical)

admin.
blue-collar
entrepreneur
housemaid
management
self-employed
retired
services
student
technician
unemployed
unknown
3 - marital: marital status(categorical)

married
single
divorced Note: 'divorced' means divorced or widowed
4 - education: educational status(categorical)

primary
secondary
tertiary
unknown
5 - balance: bank balance

6 - housing: has housing loan? (categorical: 'no','yes','unknown')

7 - campaign: number of contacts performed during this campaign and for this client (numeric)

8 - deposit: did the customer get a loan? (binary: 'yes', 'no')

9 - duration: contact time in seconds (numeric)

Libraries

In [1]:
!pip install lazypredict



In [2]:
import os
import lazypredict
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
import matplotlib.pyplot as pyplot
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, auc, roc_curve

import warnings
from sklearn.exceptions import ConvergenceWarning
# Hide warning categories that only add noise to the cell outputs.
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', ConvergenceWarning)

# Display settings: no truncation of columns, rows or line width,
# and floats rendered with three decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None
pd.options.display.float_format = lambda x: '%.3f' % x

In [10]:
def load(path='data/bank.csv'):
    """
    Read the bank customer dataset from a CSV file.

    Parameters
    ------
        path: str or path-like, optional
                Location of the CSV file. Defaults to 'data/bank.csv',
                the path used by this notebook.

    Returns
    ------
        df: dataframe
                File contents as parsed by pd.read_csv
    """
    return pd.read_csv(path)

df = load()

# Keep only the fields described in the data dictionary above; every dropped
# column is listed exactly once.
df = df.drop(columns=['default', 'contact', 'day', 'month', 'pdays', 'previous', 'loan', 'poutcome'])

Overview

In [12]:
def check_df(dataframe, head=5):
    """
    Print a quick structural overview of a dataframe.

    Prints, in order: the shape, the column dtypes, the first rows, the number
    of missing values per column and selected quantiles of the numeric columns.

    Parameters
    ------
        dataframe: dataframe
                Dataframe to summarise
        head: int, optional
                Number of leading rows to print

    Returns
    ------
        None
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles only exist for numeric data. With pandas >= 2.0 (numeric_only
    # defaults to False) calling quantile on object columns raises TypeError,
    # so the numeric columns are selected explicitly.
    numeric_part = dataframe.select_dtypes(include="number")
    print(numeric_part.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

# Print the structural summary (shape, dtypes, head, NA counts, quantiles) of df.
check_df(df)

##################### Shape #####################
(11162, 9)
##################### Types #####################
age           int64
job          object
marital      object
education    object
balance       int64
housing      object
duration      int64
campaign      int64
deposit      object
dtype: object
##################### Head #####################
   age         job  marital  education  balance housing  duration  campaign  \
0   59      admin.  married  secondary     2343     yes      1042         1   
1   56      admin.  married  secondary       45      no      1467         1   
2   41  technician  married  secondary     1270     yes      1389         1   
3   55    services  married  secondary     2476     yes       579         1   
4   54      admin.  married   tertiary      184      no       673         2   

  deposit  
0     yes  
1     yes  
2     yes  
3     yes  
4     yes  
##################### NA #####################
age          0
job          0
marital      0
educati

Capture of Numerical and Category Variables

In [13]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Split the columns of a dataframe into categorical, numeric and
    categorical-but-cardinal groups, and print the size of each group.

    Parameters
    ------
        dataframe: dataframe
                Dataframe whose columns are classified
        cat_th: int, optional
                A non-object column with fewer than cat_th distinct values
                is treated as categorical
        car_th: int, optional
                An object column with more than car_th distinct values
                is treated as cardinal

    Returns
    ------
        cat_cols: list
                Categorical columns (object columns that are not cardinal,
                followed by the numeric-looking categorical columns)
        num_cols: list
                Numeric columns
        cat_but_car: list
                Object columns with high cardinality

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))

    Notes
    ------
        Every column lands in exactly one returned list, so
        len(cat_cols) + len(num_cols) + len(cat_but_car) equals the number
        of columns. num_but_cat is reported in the printout only; its
        members are part of cat_cols.

    """
    object_cols, num_but_cat, cat_but_car, num_cols = [], [], [], []

    # One pass over the columns, routing each into its bucket.
    for col in dataframe.columns:
        series = dataframe[col]
        distinct = series.nunique()
        if series.dtypes == "O":
            object_cols.append(col)
            if distinct > car_th:
                cat_but_car.append(col)
        elif distinct < cat_th:
            num_but_cat.append(col)
        else:
            num_cols.append(col)

    # Object columns first, then numeric-looking categoricals, minus cardinals.
    cat_cols = [col for col in object_cols + num_but_cat if col not in cat_but_car]

    summary = (
        f"observations: {dataframe.shape[0]}",
        f"variables: {dataframe.shape[1]}",
        f'cat_cols: {len(cat_cols)}',
        f'num_cols: {len(num_cols)}',
        f'cat_but_car: {len(cat_but_car)}',
        f'num_but_cat: {len(num_but_cat)}',
    )
    for line in summary:
        print(line)

    return cat_cols, num_cols, cat_but_car

# Classify the columns of df once; the resulting lists are inspected and reused below.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

observations: 11162
variables: 9
cat_cols: 5
num_cols: 4
cat_but_car: 0
num_but_cat: 0


In [14]:
cat_cols

['job', 'marital', 'education', 'housing', 'deposit']

In [15]:
num_cols

['age', 'balance', 'duration', 'campaign']

#### Summary
When the NA and Quantiles outputs of the check_df function are examined, there are no missing observations, but there are outliers.

The minimum age variable will be assumed to be 18, because the minimum age to get a loan from banks is 18 (This assumption is made because the age limit for many countries is 18, but for absolute accuracy, it is necessary to know the country where the data comes from, for which the relevant department should be contacted).

Since age cannot be a negative value, improvements should be made if necessary.

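Outliers need to be checked. Some age values were reported as unrealistically high (e.g. 416 and 338); note that the unique ages listed below range from 18 to 95, so this observation should be re-verified against the loaded data.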
The duration column should also be examined, and the duration column should not be a negative value.

array([59, 56, 41, 55, 54, 42, 60, 37, 28, 38, 30, 29, 46, 31, 35, 32, 49,
       43, 26, 40, 33, 23, 48, 45, 36, 52, 53, 39, 57, 51, 44, 24, 50, 27,
       34, 47, 25, 58, 61, 68, 75, 22, 69, 66, 85, 72, 90, 67, 71, 21, 74,
       65, 62, 83, 70, 76, 77, 19, 73, 63, 20, 78, 95, 64, 79, 82, 18, 86,
       84, 87, 92, 81, 80, 93, 88, 89])

#### Outlier Analysis

In [18]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    """
    Compute lower and upper outlier limits for one column.

    The limits sit 1.5 times the inter-quantile range (between the q1 and q3
    quantiles) below the q1 quantile and above the q3 quantile.

    Parameters
    ------
        dataframe: dataframe
                Dataframe holding the column
        col_name: str
                Name of the column to analyse
        q1: float, optional
                Lower quantile
        q3: float, optional
                Upper quantile

    Returns
    ------
        low_limit, up_limit: tuple
                Lower and upper limits
    """
    column = dataframe[col_name]
    lower_q, upper_q = column.quantile(q1), column.quantile(q3)
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin

In [19]:
# Outlier limits for "age" with the default 5% / 95% quantiles.
outlier_thresholds(df, "age")

(-26.5, 113.5)

In [20]:
def grab_outliers(dataframe, col_name, index=False):
    """
    Print the rows whose value in col_name lies outside the outlier limits.

    If more than 10 rows qualify, only the first rows (head()) are printed;
    otherwise all of them are printed.

    Parameters
    ------
        dataframe: dataframe
                Dataframe to inspect
        col_name: str
                Column checked against the limits from outlier_thresholds
        index: bool, optional
                When True, the index of the outlier rows is returned

    Returns
    ------
        The index of the outlier rows when index is True, otherwise None.
    """
    low, up = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    outlier_rows = dataframe[(values < low) | (values > up)]

    # Long result sets are previewed instead of dumped in full.
    preview = outlier_rows.head() if outlier_rows.shape[0] > 10 else outlier_rows
    print(preview)

    if index:
        return outlier_rows.index

In [21]:
# Print any rows whose age lies outside the limits from outlier_thresholds.
grab_outliers(df, "age")

Empty DataFrame
Columns: [age, job, marital, education, balance, housing, duration, campaign, deposit]
Index: []


### Outlier Check

In [22]:
def check_outlier(dataframe, col_name):
    """
    Report whether a column contains at least one outlier.

    Parameters
    ------
        dataframe: dataframe
                Dataframe to inspect
        col_name: str
                Column checked against the limits from outlier_thresholds

    Returns
    ------
        bool
                True if any value lies below the lower or above the upper
                limit, otherwise False
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    is_outlier = (values > up_limit) | (values < low_limit)
    # Test the mask itself. Calling .any() on the filtered rows would look at
    # the truthiness of their cell values and miss an outlier row that only
    # holds falsy values such as 0.
    return bool(is_outlier.any())

In [23]:
# Report, per numeric column, whether any value lies outside its outlier limits.
for col in num_cols:
    print(col, check_outlier(df, col))

age False
balance True
duration True
campaign True


In [24]:
# num_cols without 'balance'.
# NOTE(review): the reason for leaving 'balance' out is not stated here — confirm.
num_cols1 = ['age', 'duration', 'campaign']

### Delete Outlier

In [25]:
def remove_outlier(dataframe, col_name):
    """
    Return the rows whose value in col_name is not an outlier.

    Rows are dropped when the value is below the lower or above the upper
    limit from outlier_thresholds. The input dataframe is not modified.

    Parameters
    ------
        dataframe: dataframe
                Dataframe to filter
        col_name: str
                Column checked against the limits

    Returns
    ------
        df_without_outliers: dataframe
                The rows that are kept, with their original index
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    too_low = values < low_limit
    too_high = values > up_limit
    return dataframe[~(too_low | too_high)]