# Import Libraries 

In [None]:

import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


# Import and Read Dataset

In [None]:

# Location of the application data. Set the APPLICATION_DATA_CSV environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded local path is used, so existing runs behave as before.
DATA_PATH = os.environ.get(
    'APPLICATION_DATA_CSV',
    '/Users/mahaalatifi/Downloads/application_data.csv',
)
df = pd.read_csv(DATA_PATH)

# First rows of Data
df.head(3)



## Data Dimensions

In [None]:

# A bare `df.shape` in the middle of a cell is evaluated and then discarded:
# only the last expression of a cell is echoed. Print it so the dimensions
# announced by the section heading are actually shown.
print(df.shape)

# Optional deeper inspection: df.dtypes, df.describe()

# Number of rows per value of the label column.
df['TARGET'].value_counts()


# Cleaning Dataset


<b>Common data cleaning steps for the dataframe</b>
<ul><li>Check duplicated rows </li>
        <li> Drop duplicated rows </li>
    <li> Check missing values  </li>
    <li> Drop unused columns  </li> </ul>
 
    

## Check duplicated rows

In [None]:

# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_copy = df.copy(deep=True)


In [None]:
# Boolean mask: True for every row that repeats an earlier row in all columns.
is_repeated = df_copy.duplicated()
Duplicate = df_copy[is_repeated]

print("Duplicate Rows :")

# Echo the duplicated rows as the cell output.
Duplicate


In [None]:

# Number of duplicated rows: each True in the mask marks one such row.
int(df_copy.duplicated().sum())


## Drop duplicated rows

In [None]:

# Keep the first occurrence of every duplicated row and discard the repeats.
df_copy = df_copy.drop_duplicates(keep="first")


## Check missing values (Percentage)

In [None]:

# Share of missing values per column, in percent. Measured on `df_copy` (the
# de-duplicated frame that the following cleaning steps modify) rather than on
# the raw `df`, so the numbers describe the data that is actually being cleaned.
missing_pct = df_copy.isnull().mean().mul(100).round(2)

# Lift the row/column display limits for this output only instead of changing
# the pandas options for every later cell.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(missing_pct)



<b>Removing columns with more than 40% missing values</b>
<ul>  </ul>
 
    

In [None]:
# Fourteen housing measurements, each present in three variants
# (<name>_AVG, <name>_MODE, <name>_MEDI); all 42 resulting columns are dropped.
housing_features = [
    'APARTMENTS', 'BASEMENTAREA', 'YEARS_BEGINEXPLUATATION', 'YEARS_BUILD',
    'COMMONAREA', 'ELEVATORS', 'ENTRANCES', 'FLOORSMAX', 'FLOORSMIN',
    'LANDAREA', 'LIVINGAPARTMENTS', 'LIVINGAREA', 'NONLIVINGAPARTMENTS',
    'NONLIVINGAREA',
]
housing_columns = [
    feature + '_' + suffix
    for suffix in ('AVG', 'MODE', 'MEDI')
    for feature in housing_features
]

# The ten remaining columns to drop, which do not follow the pattern above.
other_columns = [
    'OWN_CAR_AGE', 'OCCUPATION_TYPE',
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
    'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
]

df_copy = df_copy.drop(columns=other_columns + housing_columns)

In [None]:
# (rows, columns) of the working frame after the column drop.
df_copy.shape

In [None]:

# Show the summaries below in full, without pandas truncating rows or columns.
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Per-column null count, computed once and reused for both views.
null_counts = df_copy.isnull().sum()

# Percentage of null values per column.
display(round(100 * (null_counts / len(df_copy.index)), 2))

# Count of null values per column.
print(null_counts)


In [None]:
# `DataFrame.info()` prints its summary itself and returns None, so wrapping it
# in print(type(...)) only appended a stray "<class 'NoneType'>" line.
df_copy.info()


<b>Dealing with missing values</b>
<ul>  </ul>
 
    


<ul>
<li>We have now dropped 52 unused columns, since more than 40% of their values are null.</li>

<li>We are left with 70 columns; 12 of them still contain null values (less than 40% each) that we need to handle.</li>

<li>The remaining columns have no null values.</li>
</ul>

<ul>  </ul>

<b>  </b> 
    

# Features Understanding


<b>The dataset contains all the information about the client at the time of the loan application. It records whether or not a client has payment difficulties.</b>

<ul> The column "TARGET" is therefore the variable we want to predict, in order to minimise the risk of losing money when lending to customers.
</ul>

<ul> It takes one of two values:</ul>
<ul> <b>1</b>: Client with payment difficulties: he/she had a late payment of more than X days ("Defaulter").

<b>0</b>: All other cases, when the payment was made on time ("Non-Defaulter"). </ul>


<b>
    Distribution of Target variable
</b>
 

In [None]:
# Horizontal bar chart with one bar per TARGET value, sized by its row count.
target_counts = df_copy["TARGET"].value_counts()
T1 = target_counts.plot.barh()
T1.set_title("Count of Target variable")

In [None]:
# `value_counts()` orders classes by frequency, so positional labels/colours
# would be swapped if class 1 ever outnumbered class 0. Pin the wedge order to
# (0, 1) so "Non-Defaulter"/green always belongs to 0 and "Defaulter"/red to 1.
target_counts = df_copy["TARGET"].value_counts().reindex([0, 1], fill_value=0)

T2 = target_counts.plot.pie(
    labels=["Non-Defaulter", "Defaulter"],
    colors=["Green", "Red"],
    explode=[0.1, 0],          # pull the first (class 0) wedge out slightly
    startangle=50,
    wedgeprops={"linewidth": 5},
    center=(0, 0),
    rotatelabels=False,
    autopct="%1.0f%%",
    figsize=(5, 5),
    label='',                  # suppress the default y-axis label
)
plt.title("Percentage of Target variable")


<b>
    Distribution of [NAME_CONTRACT_TYPE] variable
</b>
 

In [None]:
# Distinct contract types present in the raw frame, in order of first appearance.
pd.unique(df['NAME_CONTRACT_TYPE'])

In [None]:
# Take the wedge labels from the counted categories themselves: hard-coded
# positional labels silently mislabel the chart whenever the frequency order
# (or the set of categories) differs from what was assumed.
contract_counts = df_copy["NAME_CONTRACT_TYPE"].value_counts()

NCT = contract_counts.plot.pie(
    labels=contract_counts.index.tolist(),
    colors=["Blue", "Yellow"],
    # Pull out only the first (most frequent) wedge; sized to the number of
    # categories so the call does not fail if there are not exactly two.
    explode=[0.1] + [0] * (len(contract_counts) - 1),
    startangle=30,
    wedgeprops={"linewidth": 5},
    center=(0, 0),
    rotatelabels=False,
    autopct="%1.0f%%",
    figsize=(5, 5),
    label='',                  # suppress the default y-axis label
)
plt.title("Percentage of NAME_CONTRACT_TYPE variable")


<b>
    Distribution of [Amount Data] variable:
</b>
<ul>
    <li> [AMT_INCOME_TOTAL]: income of the client</li>
    <li> [AMT_CREDIT]: credit amount of the loan</li>
    <li> [AMT_ANNUITY]: loan annuity</li>
    <li> [AMT_GOODS_PRICE]: for consumer loans, the price of the goods for which the loan is given</li>
</ul>

 

In [None]:
#AD = df_copy[['AMT_INCOME_TOTAL', 'AMT_CREDIT','AMT_ANNUITY', 'AMT_GOODS_PRICE']]
#plt.subplot(4, 4, AD)
#plt.show()

In [None]:
# `value_counts()` on four continuous columns yields one bar per distinct
# (income, credit, annuity, price) combination, which is unreadable and very
# slow to draw. Plot one histogram per amount column instead, and title the
# figure for what it shows (the old title was copied from the TARGET chart).
AMOUNT_COLUMNS = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']

# DataFrame.hist draws a grid with one subplot per column and returns the axes.
AD = df_copy[AMOUNT_COLUMNS].hist(bins=50, figsize=(12, 8))
plt.suptitle("Distribution of amount variables")
plt.tight_layout()

## Features Correlation

In [None]:
# `DataFrame.info()` prints its summary and returns None; print(type(...)) only
# added a stray "<class 'NoneType'>" line, so call it directly.
df_copy.info()

# Pearson correlation of every numeric feature with the label, sorted so the
# most negatively / positively correlated features sit at the two ends.
target_corr = (
    df_copy.select_dtypes(include="number")
    .corr()["TARGET"]
    .drop("TARGET")
    .sort_values()
)
target_corr

# Modeling

In [None]:
# divide the dataset into train and test data, We used 80/20
# The label must not be part of the feature matrix: passing the whole frame as
# X leaks TARGET into the features and lets any model read the answer directly.
# (`train_test_split` is already imported in the notebook's first cell.)
features = df_copy.drop(columns=['TARGET'])
target = df_copy['TARGET']

x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=.2, random_state=42
)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

# Modeling with the Imbalanced Dataset


In [None]:
# import

import string
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report 


from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier




In [None]:
#dataset before sampling

# MultinomialNB needs a purely numeric, non-negative matrix without NaNs, but
# the frame still holds text columns (e.g. NAME_CONTRACT_TYPE) and columns with
# missing values, so fitting it on the raw frame raises. Prepare both kinds of
# columns inside the pipeline so the same steps are applied to train and test.
numeric_columns = x_train.select_dtypes(include="number").columns
categorical_columns = x_train.select_dtypes(exclude="number").columns

numeric_steps = Pipeline([
    ('impute', SimpleImputer(strategy="median")),
    # Scale to [0, 1]; clip=True keeps unseen test values inside that range,
    # which preserves the non-negativity MultinomialNB relies on.
    ('scale', MinMaxScaler(clip=True)),
])
categorical_steps = Pipeline([
    ('impute', SimpleImputer(strategy="most_frequent")),
    # Categories that appear only in the test split are encoded as all zeros.
    ('onehot', OneHotEncoder(handle_unknown="ignore")),
])

pipeline = Pipeline([
    ('preprocess', ColumnTransformer([
        ('numeric', numeric_steps, numeric_columns),
        ('categorical', categorical_steps, categorical_columns),
    ])),
    ('classifier', MultinomialNB())
])

pipeline.fit(x_train, y_train)
pip_pred = pipeline.predict(x_test)
print(metrics.classification_report(y_test, pip_pred))
