# Import Libraries 

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import itertools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score


# Import and Read Dataset

In [None]:

# Location of the raw application data. It is resolved from the current user's
# home directory instead of a hard-coded, machine-specific absolute path, so the
# notebook also runs for anyone who keeps the CSV in their own Downloads folder.
from pathlib import Path

DATA_PATH = Path.home() / 'Downloads' / 'application_data.csv'
df = pd.read_csv(DATA_PATH)


# First rows of Data
df.head(3)



## Data Dimensions

In [None]:

# Dimensions of the raw frame as a (rows, columns) tuple
df.shape

#df.dtypes
#df.describe()

#df['TARGET'].value_counts()


In [None]:
# Frequency of each NAME_TYPE_SUITE category in the raw frame (missing values are not counted)
df['NAME_TYPE_SUITE'].value_counts()

# Cleaning Dataset


<b>Common data cleaning steps for the dataframe</b>
<ul><li>Check duplicated rows </li>
        <li> Drop duplicated rows </li>
    <li> Check missing values  </li>
    <li> Drop unused columns  </li> </ul>
 
    

## Check duplicated rows

In [None]:

# Work on a copy so the raw frame `df` is left untouched by the cleaning steps below
df_copy = df.copy()


In [None]:
# Boolean mask that flags every row repeating an earlier row exactly
dup_mask = df_copy.duplicated()
Duplicate = df_copy.loc[dup_mask]

print("Duplicate Rows :")

# Leave the frame as the last expression so the notebook renders it
Duplicate


In [None]:

# Number of rows that are exact repeats of an earlier row
int(df_copy.duplicated().sum())


## Drop duplicated rows

In [None]:

# Keep the first occurrence of each row and discard exact repeats
df_copy = df_copy.drop_duplicates()


## Check missing values (Percentage)

In [None]:

# Lift pandas' display limits: the per-column table is longer than the default row cap
pd.set_option("display.max_rows", None, "display.max_columns", None)
# Percentage of missing values per column, measured on the de-duplicated working
# copy (the frame the sparse columns are dropped from) rather than on the raw `df`
display( round(100 * (df_copy.isnull().sum() / len(df_copy.index)),2) )



<b>Removing columns with missing values more than 40%</b>
<ul>  </ul>
 
    

In [None]:
# Building-related measurements come as one column per (stem, statistic) pair
_building_stems = ['APARTMENTS', 'BASEMENTAREA', 'YEARS_BEGINEXPLUATATION', 'YEARS_BUILD',
                   'COMMONAREA', 'ELEVATORS', 'ENTRANCES', 'FLOORSMAX', 'FLOORSMIN',
                   'LANDAREA', 'LIVINGAPARTMENTS', 'LIVINGAREA', 'NONLIVINGAPARTMENTS',
                   'NONLIVINGAREA']
_building_cols = ['{}_{}'.format(stem, stat)
                  for stat, stem in itertools.product(('AVG', 'MODE', 'MEDI'), _building_stems)]

# Stand-alone columns that are removed together with the building measurements
_other_cols = ['OWN_CAR_AGE', 'OCCUPATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
               'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
               'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

# NOTE(review): the list is hand-maintained; confirm every entry really exceeds the
# 40% missing-value threshold shown in the table above.
df_copy = df_copy.drop(columns=_other_cols + _building_cols)

In [None]:
# (rows, columns) of the working copy after removing the sparse columns
df_copy.shape

In [None]:

# Show the full per-column tables without truncation
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Count missing values once and reuse the result for both views
null_counts = df_copy.isnull().sum()

# Percentage of Null Values:
display(round(100 * (null_counts / len(df_copy.index)), 2))


# Count of Null Values:
print(null_counts)


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print(type(...)) only appended a stray "<class 'NoneType'>" line.
df_copy.info()


<b>Dealing with missing values</b>
<ul>  </ul>
 
    


<li> Now we have dropped {52} unused columns, since the Null values percentage for these columns is more than 40%</li>

<li>Moreover, we have now {70} columns, {12} of them have percentage of Null values less than 40% and we need to handle them.</li>

<li>The rest of columns have 0% of Null values </li>

<ul>  </ul>

<b>  </b> 
    

# Features Understanding


<b>The dataset contains all the information about the client at the time of the loan application. The data records whether or not a client has payment difficulties.</b>

<ul> Therefore, the "TARGET" column is the variable we want to predict, in order to minimise the risk of losing money when lending to customers.
</ul>

<ul> Moreover, it contains two options:</ul>
<ul> <b>1</b> :Client with payment difficulties: he/she had late payment more than X days. "Defaulter"

<b>0</b> : All other cases when the payment is paid on time. "Non-Defaulter" </ul>


<b>
    Distribution of Target variable
</b>
 

In [None]:
# Horizontal bar chart with one bar per TARGET class
target_counts = df_copy["TARGET"].value_counts()
T1 = target_counts.plot.barh()
#for i,j in enumerate(df_copy["TARGET"].value_counts().values):
 #   T1.text(0.5,i,j,fontsize=20)
plt.title("Count of Target variable")

In [None]:
# The labels and explode settings are positional, so pin the class order
# (0 = Non-Defaulter, 1 = Defaulter) instead of relying on value_counts()
# happening to return the majority class first.
target_share = df_copy["TARGET"].value_counts().reindex([0, 1], fill_value=0)
T2 = target_share.plot.pie(labels=["Non-Defaulter","Defaulter"],startangle = 50,label='',
                           wedgeprops={"linewidth":5},center=(0, 0),colors = ["DarkSeaGreen","SandyBrown"],
                           rotatelabels=False,explode=[0.1,0],autopct = "%1.0f%%",figsize=(5,5))
plt.title("Percentage of Target variable")


<b>
    Distribution of [NAME_CONTRACT_TYPE] variable
</b>
 

In [None]:
# Distinct contract types present in the raw frame
df['NAME_CONTRACT_TYPE'].unique()

In [None]:
# The wedge labels are positional, so request the counts in exactly the order the
# labels are written instead of relying on value_counts() frequency ordering.
contract_labels = ["Cash loans","Revolving loans"]
contract_share = df_copy["NAME_CONTRACT_TYPE"].value_counts().reindex(contract_labels, fill_value=0)
NCT = contract_share.plot.pie(labels=contract_labels,
                              startangle = 30,wedgeprops={"linewidth":5},
                              center=(0, 0),colors = ["Blue","Yellow"],
                              rotatelabels=False,explode=[0.1,0],
                              autopct = "%1.0f%%",figsize=(5,5),label='')
plt.title("Percentage of NAME_CONTRACT_TYPE variable")


<b>
    Distribution of [Amount Data] variable:
</b>
    <li> [AMT_INCOME_TOTAL]: income of the client
    <li> [AMT_CREDIT]: credit amount of the loan
    <li> [AMT_ANNUITY]: loan annuity
    <li> [AMT_GOODS_PRICE]: for consumer loans it is the price of the goods for which the loan is given
</li>

 

In [None]:
# Amount columns plus the label used to colour the scatter matrix
amount_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'TARGET']

# Keep only rows where both the goods price and the annuity are present
amount_data = df_copy[amount_cols].dropna(subset=["AMT_GOODS_PRICE", "AMT_ANNUITY"])

ax = sns.pairplot(amount_data, hue="TARGET", palette=["g", "r"])
ax.fig.legend(labels=['Non_Defaulter', 'Defaulter'])
plt.show()

In [None]:
# Sub-frame holding the four monetary columns; iterating over it yields the column names
continous_features = df_copy[[ 'AMT_INCOME_TOTAL','AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']]


In [None]:
import warnings
# Kept from the original cell: silences library warnings for the rest of the session
warnings.filterwarnings("ignore")

# One overlaid histogram per monetary column, split by TARGET class.
# Iterating over a DataFrame yields its column names.
for i in continous_features:
    Non_Defaulter = df_copy.loc[df_copy['TARGET'] == 0, i].dropna()
    Defaulter = df_copy.loc[df_copy['TARGET'] == 1, i].dropna()

    # Shared range so both classes use identical bins. (The original referenced
    # the undefined names `normal` / `disease` here and raised a NameError.)
    xmin = min(Non_Defaulter.min(), Defaulter.min())
    xmax = max(Non_Defaulter.max(), Defaulter.max())
    # 41 edges -> 40 equal-width bins that include xmax; np.arange(xmin, xmax, width)
    # stopped one bin short and dropped the largest values.
    bins = np.linspace(xmin, xmax, 41)

    fig, hist_ax = plt.subplots()
    hist_ax.hist(Non_Defaulter, bins=bins, color='g', alpha=0.4, label='Non-Defaulter')
    hist_ax.hist(Defaulter, bins=bins, color='r', alpha=0.4, label='Defaulter')
    hist_ax.legend()
    hist_ax.set_title('Overlaid histogram for {}'.format(i))
    plt.show()

## Features Correlation

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print(type(...)) only appended a stray "<class 'NoneType'>" line.
df_copy.info()

In [None]:
# Pairwise correlation of the numeric columns only. df_copy still holds text
# columns at this point, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it.
df_copy.select_dtypes(include=['number', 'bool']).corr()

In [None]:
def describe_cont_feature(feature, data=None):
    """Print per-class summary statistics and a Welch t-test for one column.

    Parameters
    ----------
    feature : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame containing `feature` and a 'TARGET' column. Defaults to the
        notebook-level `df_copy`.
    """
    frame = df_copy if data is None else data
    print('\n*** Results for {} ***'.format(feature))
    print(frame.groupby('TARGET')[feature].describe())
    # ttest() prints its own result; printing its return value as well used to
    # add a spurious "None" line after every feature.
    ttest(feature, frame)


def ttest(feature, data=None):
    """Welch t-test of `feature` between TARGET == 1 and TARGET == 0 rows.

    Parameters
    ----------
    feature : str
        Name of the column to test.
    data : pandas.DataFrame, optional
        Frame containing `feature` and a 'TARGET' column. Defaults to the
        notebook-level `df_copy`.

    Returns
    -------
    tuple of (float, float) or None
        (t-statistic, p-value), or None when the column is not numeric and the
        test is skipped.
    """
    frame = df_copy if data is None else data

    # A t-test is undefined for text columns; skip them instead of crashing the loop
    if not pd.api.types.is_numeric_dtype(frame[feature]):
        print('t-test skipped: {} is not numeric'.format(feature))
        return None

    # Drop missing values per group: a single NaN would turn the statistic into NaN
    Defaulter = frame.loc[frame['TARGET'] == 1, feature].dropna().astype(float)
    Non_Defaulter = frame.loc[frame['TARGET'] == 0, feature].dropna().astype(float)

    # equal_var=False selects Welch's test (no equal-variance assumption)
    tstat, pval = stats.ttest_ind(Defaulter, Non_Defaulter, equal_var=False)
    print('t-statistic: {:.1f}, p-value: {:.3}'.format(tstat, pval))
    return float(tstat), float(pval)

In [None]:
# Every column of the working copy except the label
features = df_copy.columns.tolist()
features.remove('TARGET')
features

In [None]:
# Look at the distribution of each feature at each level of the target variable
  
    
# Print the per-class summary and t-test report for every column in `features`
for feature in features:
    describe_cont_feature(feature)


In [None]:
fig, ax = plt.subplots(figsize=(12,12))
# Correlate numeric columns only: DataFrame.corr() raises on the remaining text
# columns in pandas >= 2.0.
numeric_corr = df_copy.select_dtypes(include=['number', 'bool']).corr()
# The 15 columns most positively correlated with TARGET (TARGET itself included)
top_target_corr = numeric_corr[['TARGET']].sort_values('TARGET').tail(15)
sns.heatmap(top_target_corr,
 vmax=1, vmin=-1, cmap='YlGnBu', annot=True, ax=ax);
# Put the strongest correlation at the top of the chart
ax.invert_yaxis()

In [None]:
#  plotting correlation heatmap

#plt.figure(figsize = (15,8))
# Numeric columns only: DataFrame.corr() raises on text columns in pandas >= 2.0
Dataplot = sns.heatmap(df_copy.select_dtypes(include=['number', 'bool']).corr())

#  Displaying heatmap
plt.show()



<b>
    Checking the correlation between the 'Flag_Document' columns with 'TARGET' column:
</b>
   

 

In [None]:
# FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21 followed by the label
doc_flag_cols = ['FLAG_DOCUMENT_{}'.format(n) for n in range(2, 22)]
col_Doc = df_copy[doc_flag_cols + ['TARGET']]


Doc_corr = col_Doc.corr()
plt.figure(figsize = (25,15))
# Annotated correlation matrix, labelled on both axes with the column names
heatmap_options = dict(xticklabels=Doc_corr.columns,
                       yticklabels=Doc_corr.columns,
                       annot = True,
                       cmap ="RdYlGn")
ax = sns.heatmap(Doc_corr, **heatmap_options)


<b>
    Based on the above heatmap, we can see there is almost no correlation between the 'Flag_Document' features and the 'TARGET' column, thus we can drop these columns. 
</b>
   

 

In [None]:
# Remove FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21 from the working copy
doc_flags_to_drop = ['FLAG_DOCUMENT_{}'.format(n) for n in range(2, 22)]
df_copy = df_copy.drop(columns=doc_flags_to_drop)

In [None]:
# (rows, columns) of the working copy after removing the document flags
df_copy.shape


<b>
    Checking the correlation between the Contacts columns [mobile phone, work phone, home phone etc, and email] with 'TARGET' column:
</b>
   

 

In [None]:

# Contact-availability flags followed by the label
contact_flag_cols = ['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
                     'FLAG_PHONE', 'FLAG_EMAIL']
contact_col = df_copy[contact_flag_cols + ['TARGET']]
Contact_corr = contact_col.corr()
fig = plt.figure(figsize=(8,8))
# Annotated correlation matrix with thin separators between the cells
heatmap_options = dict(xticklabels=Contact_corr.columns,
                       yticklabels=Contact_corr.columns,
                       annot = True,
                       cmap ="RdYlGn",
                       linewidth=1)
ax = sns.heatmap(Contact_corr, **heatmap_options)


<b>
    Based on the above heatmap, we can see there is almost no correlation between the contact columns and the 'TARGET' column, thus we can drop these columns. 
</b>
   

 

In [None]:
# Remove the contact-availability flags from the working copy
contact_flags_to_drop = ['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
                         'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL']
df_copy = df_copy.drop(columns=contact_flags_to_drop)

In [None]:
# (rows, columns) of the working copy after removing the contact flags
df_copy.shape

# Modeling

In [None]:
#x_train[['AMT_INCOME_TOTAL', 'AMT_CREDIT','AMT_ANNUITY', 'AMT_GOODS_PRICE']] = scaler.fit_transform(x_train[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY', 'AMT_GOODS_PRICE']])
#x_test[['AMT_INCOME_TOTAL', 'AMT_CREDIT','AMT_ANNUITY', 'AMT_GOODS_PRICE']] = scaler.fit_transform(x_test[['AMT_INCOME_TOTAL', 'AMT_CREDIT','AMT_ANNUITY', 'AMT_GOODS_PRICE']])

#y = df_copy['TARGET']

In [None]:
# Cash loans = 1  , Revolving loans = 2 

#df_copy.NAME_CONTRACT_TYPE[df_copy.NAME_CONTRACT_TYPE == 'Cash loans'] = 1
#df_copy.NAME_CONTRACT_TYPE[df_copy.NAME_CONTRACT_TYPE == 'Revolving loans'] = 2


# Modelling with Imbalanced dataset


In [None]:
# import

import string
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report 
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict


from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import GradientBoostingClassifier



In [None]:
## pd.get_dummies(df_copy)

In [None]:
# Columns expanded into one indicator column per category
categorical_cols = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
]
df_new = pd.get_dummies(df_copy, columns=categorical_cols)

In [None]:
## df_new = pd.get_dummies(df_copy, columns=['CODE_GENDER'])

In [None]:
# Preview the first two rows of the one-hot encoded frame
df_new.head(2)
#df_new.shape

In [None]:
# Every column of the encoded frame except the label
features = df_new.columns.tolist()
features.remove('TARGET')
features

In [None]:
# divide the dataset into train and test data, We used 80/20
from sklearn.model_selection import train_test_split

# Clean the data BEFORE splitting. The notebook only replaces NaN/inf in a later
# cell, which never reaches x_train/x_test, so the estimators were handed missing
# values. The same fill value (0) as that later cell is used here.
model_frame = df_new.replace([np.inf, -np.inf], np.nan).fillna(0)

x_train, x_test, y_train, y_test = train_test_split(model_frame[features],
                                                    model_frame['TARGET'],
                                                    test_size=0.2,
                                                    random_state=42)



x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns only. The original scaled the whole frame,
# which also turned the 0/1 TARGET labels into z-scores.
scaled_features = pd.DataFrame(StandardScaler().fit_transform(df_new[features]),
                               columns=features, index=df_new.index)
# Re-attach the untouched label and restore the original column order
new_df = pd.concat([scaled_features, df_new['TARGET']], axis=1)[df_new.columns]

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the mean / standard deviation from the training data only ...
x_train_scaled = scaler.fit_transform(x_train)
# ... and apply those same statistics to the test data. Calling fit_transform on
# the test set re-fitted the scaler and put the two sets on different scales.
x_test_scaled = scaler.transform(x_test)



In [None]:
# Convert to plain arrays WITHOUT collapsing the feature axis. The original
# reshape(-1, 1) flattened the (rows, features) matrix into a single column, so
# X and y no longer had the same number of rows and every model.fit() call failed.
# dtype=float assumes every remaining column is numeric after one-hot encoding.
x_train = np.asarray(x_train, dtype=float)
x_test = np.asarray(x_test, dtype=float)
# scikit-learn expects a 1-D label vector
y_train = np.asarray(y_train).ravel()

In [None]:
# (rows, columns) of the one-hot encoded frame
df_new.shape

In [None]:

#np.any(np.isnan(df_new))
#np.all(np.isfinite(df_new))

#pd.DataFrame(df_new).fillna()

#x_test.fillna(x_train.mean(), inplace=True)


# Replace +/-inf and NaN with 0 across the encoded frame and renumber the rows.
# NOTE(review): x_train / x_test were split from df_new in an earlier cell, so this
# replacement does not reach them.
df_new = df_new.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

#df_new.drop(df_new.columns[np.isnan(df_new).any()], axis=1)



<b>
    Decision Tree Model
</b>
   

 

In [None]:

# Baseline decision tree. A fixed seed makes the reported scores reproducible:
# tie-breaking between equally good splits is otherwise random.
model = DecisionTreeClassifier(random_state=42)

classifier = model.fit(x_train,y_train)

prediction = model.predict(x_test)

acc = accuracy_score(y_test, prediction)

# Accuracy as a percentage
print(acc*100)

#print(accuracy_score(y_test,prediction))
# F1 of the positive (defaulter) class
print(f1_score(y_test,prediction))


In [None]:
#imbalanced

pipeline = Pipeline([
    ('classifier', DecisionTreeClassifier(criterion = 'entropy',
                                           max_depth = 7, 
                                           splitter ='best',
                                           class_weight = 'balanced',
                                           random_state = 42))
])

pipeline.fit(x_train, y_train)

pip_pred1 = pipeline.predict(x_train)
pip_pred2 = pipeline.predict(x_test)


print(metrics.classification_report(y_train, pip_pred1))
print(metrics.classification_report(y_test, pip_pred2))





<b>
    Random Forest Model
</b>
   

 

In [None]:


# Class-weighted random forest. The seed fixes the bootstrap samples and feature
# subsets so the classification report is reproducible between runs.
pipeline = Pipeline([
    ('classifier',RandomForestClassifier(n_estimators=250,
                                         criterion="entropy", class_weight = 'balanced',
                                         random_state=42))
])

pipeline.fit(x_train, y_train)
pip_pred = pipeline.predict(x_test)
print(metrics.classification_report(y_test, pip_pred))


In [None]:

# summarize feature importance



# Seeded so that both the scores and the importance ranking are reproducible
model = RandomForestClassifier(random_state=42)

classifier = model.fit(x_train,y_train)

prediction = model.predict(x_test)

acc = accuracy_score(y_test, prediction)

# Accuracy as a percentage
print(acc*100)

#print(accuracy_score(y_test,prediction))
print(f1_score(y_test,prediction))


importance = model.feature_importances_

# summarize feature importance
# Report importances by column name (falling back to the positional index if the
# name list does not line up with the fitted model), most important first.
if len(features) == len(importance):
    importance_labels = list(features)
else:
    importance_labels = list(range(len(importance)))

ranked_importance = sorted(zip(importance_labels, importance),
                           key=lambda pair: pair[1], reverse=True)
for i,v in ranked_importance:
    print('Feature: %s, Score: %.5f' % (i,v))

In [None]:
# Default random forest, seeded so the printed scores are reproducible
model = RandomForestClassifier(random_state=42)

model.fit(x_train,y_train)

prediction = model.predict(x_test)

acc = accuracy_score(y_test, prediction)

#print(accuracy_score(y_test,prediction))
print('F1 score:', f1_score(y_test,prediction))

print('Accuracy score:',acc*100)


<b>
    Logistic regression Model
</b>
   






In [None]:


# Logistic regression is sensitive to feature scale and needs enough iterations to
# converge. The original used raw features with max_iter=10, which stops the
# solver long before convergence. Scaling inside the pipeline means the scaler is
# fitted on the training data only.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000, penalty='l2', class_weight = 'balanced'))
])


#review_train, review_test, label_train, label_test = train_test_split(X, y, test_size=0.20)
pipeline.fit(x_train, y_train)
pip_pred = pipeline.predict(x_test)
print(metrics.classification_report(y_test, pip_pred))


<b>
    KNN (k-nearest neighbor) Model
</b>
   






In [None]:



# k-NN is distance based: on raw features the large monetary columns would
# dominate every distance, so standardise inside the pipeline (the scaler is
# fitted on the training data only).
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])


#review_train, review_test, label_train, label_test = train_test_split(X, y, test_size=0.20)
pipeline.fit(x_train, y_train)
pip_pred = pipeline.predict(x_test)
print(metrics.classification_report(y_test, pip_pred))


<b>
    SVM Model
</b>
   






In [None]:

# Linear SVM trained by stochastic gradient descent (hinge loss). SGD is very
# sensitive to feature scale, so standardise inside the pipeline (the scaler is
# fitted on the training data only).
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SGDClassifier(loss='hinge', penalty='l2',
                            alpha=1e-3, random_state=42,class_weight = 'balanced',
                             max_iter=5, tol=None))
])
pipeline.fit(x_train, y_train)
pip_pred = pipeline.predict(x_test)
print(metrics.classification_report(y_test, pip_pred))



In [None]:
# Default SGD classifier, with two fixes: a seed (SGD shuffles the data, so the
# score otherwise changes between runs) and feature standardisation (SGD is
# sensitive to feature scale). `model` keeps the usual fit / predict interface.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SGDClassifier(random_state=42))
])

model.fit(x_train,y_train)

prediction = model.predict(x_test)

acc = accuracy_score(y_test, prediction)

# Accuracy as a percentage
print(acc*100)

#print(accuracy_score(y_test,prediction))
#print(f1_score(y_test,prediction))



In [None]:
# Kernel SVM. The kernel is computed from distances between samples, so the
# features are standardised first (the scaler is fitted on the training data only).
# NOTE(review): kernel SVC training time grows faster than linearly with the number
# of rows; consider fitting on a subsample if this cell does not finish.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=1))
])

model.fit(x_train,y_train)

prediction = model.predict(x_test)

acc = accuracy_score(y_test, prediction)

# Accuracy as a percentage
print(acc*100)

#print(accuracy_score(y_test,prediction))
#print(f1_score(y_test,prediction))




<b>
    Naive Bayes Model
</b>
   






In [None]:

# Gaussian Naive Bayes wrapped in a single-step pipeline
nb_step = GaussianNB()
pipeline = Pipeline(steps=[('classifier', nb_step)])

# Fit on the training split, then score the held-out split
pipeline.fit(x_train, y_train)
pip_pred = pipeline.predict(x_test)
nb_report = metrics.classification_report(y_test, pip_pred)
print(nb_report)

In [None]:
# Plain Gaussian Naive Bayes; fit() returns the fitted estimator itself
model = GaussianNB().fit(x_train, y_train)

prediction = model.predict(x_test)
acc = accuracy_score(y_test, prediction)

# Accuracy as a percentage
print(acc*100)





<b>
    Gradient Boosting Classifier Model (scikit-learn)
</b>
   






In [None]:



# Gradient-boosted trees wrapped in a single-step pipeline
boosting_step = GradientBoostingClassifier(random_state=42,
                                           learning_rate=0.1,
                                           max_depth=16,
                                           n_estimators=250)
pipeline = Pipeline(steps=[('classifier', boosting_step)])

# Fit on the training split, then score the held-out split
pipeline.fit(x_train, y_train)
pip_pred = pipeline.predict(x_test)
boosting_report = metrics.classification_report(y_test, pip_pred)
print(boosting_report)