In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import seaborn as sns
from matplotlib import pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline
df = pd.read_csv("/kaggle/input/loan-data-for-dummy-bank/loan_final313.csv")
df.head(5)

## Objectives

We would like to use all available numerical variables to show how they determine whether a loan is good or bad through linear regression analysis. We expect to use data cleansing to reorganize our data set and visualization tools to identify trends and patterns in the relationships between the independent variables. Our dependent variable is "loan_condition_cat".

## Quantitative variables 

* Year 
* Length
* Home ownership category: (1)Rent, (2)Own, (3)Mortgage
* Annual income 
* Income category: (1)Low, (2)Medium, (3)High
* loan amount
* term category: (1)36 months, (2)60 months 
* Application category: (1)Individual, (2)Joint 
* Purpose category: (1)credit card, (2)car, (3)small business, (4)other, (5)wedding, (6)debt consolidation, (7)home improvement, (8)major purchase, (9)medical, (10)moving, (11)vacation, (12)house, (13)renewable energy, (14)educational 
* Interest payment: (1)Low, (2)High
* Loan condition: (0)Bad loan, (1)Good loan 
* Interest rate
* Grade category: (1)A, (2)B, (3)C, (4)D, (5)E, (6)F, (7)G
* debt-to-income ratio
* total payment 
* total principal 
* recoveries 
* installment 

In [None]:
# Build the modelling frame by removing the listed columns from `df`
# (presumably identifiers, dates and text-valued fields — confirm against the CSV).
loan = df.drop(
    columns=[
        'id',
        'issue_d',
        'final_d',
        'home_ownership',
        'income_category',
        'term',
        'application_type',
        'purpose',
        'loan_condition',
        'interest_payments',
        'grade',
        'region',
    ]
)
loan.head()

In [None]:
# Interest rate per year, coloured by loan purpose and sized by loan amount,
# with an OLS trendline drawn by plotly.
fig = px.scatter(
    data_frame=df,
    x='year',
    y='interest_rate',
    color='purpose',
    size='loan_amount',
    trendline="ols",
    template="simple_white",
)
fig.show()

In [None]:
# Number of rows recorded for each region value.
df.region.value_counts()

In [None]:
# Loan amount distribution per year, with one grouped box per region.
fig = px.box(
    data_frame=df,
    x="year",
    y="loan_amount",
    color='region',
    boxmode='group',
)
fig.show()

In [None]:
# Single-feature linear model: interest rate explained by annual income.
# Selecting with a list of columns yields the (n_rows, 1) array the estimator expects.
X = df[['annual_inc']].values

model = LinearRegression().fit(X, df['interest_rate'])

# Evaluate the fitted line on 100 evenly spaced incomes across the observed range.
x_range = np.linspace(X.min(), X.max(), num=100)
y_range = model.predict(x_range[:, np.newaxis])

# Scatter the raw points and overlay the fitted line as an extra trace.
fig = px.scatter(
    df,
    x='annual_inc',
    y='interest_rate',
    color='income_cat',
    opacity=0.4,
)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit'))
fig.show()

### Explore meaningful quantitative data using seaborn

In [None]:
# numerical attributes visualization
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" draws the same histogram-plus-density-curve view.
ax = sns.histplot(df["loan_amount"], kde=True, stat="density")
ax.set_title("Distribution of loan_amount")
plt.show()

In [None]:
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" draws the same histogram-plus-density-curve view.
ax = sns.histplot(df["annual_inc"], kde=True, stat="density")
ax.set_title("Distribution of annual_inc")
plt.show()

In [None]:
# purpose_cat holds category codes, so a smoothed density between codes has
# no meaning; show one bar per category with its row count instead.
ax = sns.countplot(x=df['purpose_cat'])
ax.set_title("Number of loans per purpose category")
plt.show()

A majority of the loans are taken to cope with debt consolidation (it is pretty sad).

### Log Transformation
* Reduce the skewness of the applicants' annual income and loan amount. Instead of overwriting the original column, we store the transformed values in a new column whose name is the original column name followed by 'log'.

In [None]:
# apply log transformation to annual income
# np.log1p(x) computes log(x + 1) and stays accurate for values close to zero.
loan['annual_inclog'] = np.log1p(df['annual_inc'])

# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" draws the same histogram-plus-density-curve view.
ax = sns.histplot(loan["annual_inclog"], kde=True, stat="density")
ax.set_title("Distribution of log(annual_inc + 1)")
plt.show()

It is hardly normally distributed but it is better than the first results. 

In [None]:
# apply log transformation to loan amount
# np.log1p(x) computes log(x + 1) and stays accurate for values close to zero.
loan['loan_amountlog'] = np.log1p(df['loan_amount'])

# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" draws the same histogram-plus-density-curve view.
ax = sns.histplot(loan["loan_amountlog"], kde=True, stat="density")
ax.set_title("Distribution of log(loan_amount + 1)")
plt.show()

### Correlation Matrix

For this project, the correlation matrix shows the pairwise correlations between the numerical attributes.

In [None]:
# Pairwise correlations between the columns of `loan`, drawn as an
# annotated heatmap on a 15x10 inch figure.
corr = loan.corr()
_, heatmap_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, annot=True, cmap="BuPu", ax=heatmap_ax)


### Splitting the data for Training and Testing


In [None]:
# specify input and output attributes
# y is the loan-condition label; X is every other column of `loan`.
# NOTE(review): X keeps all remaining columns, including the log-transformed
# copies added earlier and, presumably, outcome-related fields such as
# payments/recoveries — confirm this is intended (possible target leakage).
y = loan['loan_condition_cat']
X = loan.drop(columns='loan_condition_cat')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# classify function
from sklearn.model_selection import cross_val_score

def classify(model, x, y):
    """Fit ``model`` on a hold-out split of ``x``/``y`` and print its accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score`` (scikit-learn style).
    x : feature matrix to evaluate the model on.
    y : target labels aligned with ``x``.

    Prints the accuracy on a 25% hold-out split (``random_state=42``) and the
    mean 5-fold cross-validation score, both as percentages. Returns ``None``.
    """
    # Split the arguments that were passed in rather than the notebook-level X,
    # so the hold-out accuracy reflects exactly the data given by the caller.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
    model.fit(x_train, y_train)
    print("Accuracy is", model.score(x_test, y_test)*100)
    # cross validation - it is used for better validation of model
    # eg: cv-5, train-4, test-1 (each fold is held out once)
    score = cross_val_score(model, x, y, cv=5)
    print("Cross validation is",np.mean(score)*100)

* Cross-validation splits the data into several parts (folds). **cv=5** means that the data is split into 5 parts: the model is trained on 4 of them and tested on the remaining one, and this is repeated so that each part serves as the test set once.
* Since cross-validation evaluates the model on several different splits, we focus on the cross-validation percentage, which is the mean accuracy of the model over the 5 folds.

### Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with the library's default settings.
model = LogisticRegression()
classify(model=model, x=X, y=y)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so the reported scores are the same on every run.
model = DecisionTreeClassifier(random_state=42)
classify(model, X, y)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
# Seed the forest so the reported scores are the same on every run.
model = RandomForestClassifier(random_state=42)
classify(model, X, y)

### Extra Trees

In [None]:
# Seed the ensemble so the reported scores are the same on every run.
model = ExtraTreesClassifier(random_state=42)
classify(model, X, y)

Out of all the classifiers, Logistic Regression shows a better result in terms of cross-validation. 
Now let's try to change some hyperparameters to improve the accuracy through RandomForestClassifier. 

### Hyperparameter tuning

In [None]:
# Random forest with hand-picked hyperparameters; the seed makes the
# comparison with the other models repeatable across runs.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=25,
    max_depth=7,
    max_features=1,
    random_state=42,
)
classify(model, X, y)

### Confusion Matrix

We will be using Random Forest model to summarize prediction results through a confusion matrix.
It gives us insight not only into the errors being made by a classifier but more importantly the types of errors that are being made.



In [None]:
# Fit a default random forest on the training split; the seed keeps the
# resulting confusion matrix identical between runs.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict the held-out rows, then tabulate actual labels (rows) against
# predicted labels (columns).
y_pred = model.predict(x_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

* y_test contains the actual values from the dataset.
* y_pred contains the predicted values from the model.

In [None]:
# fmt="d" prints the integer counts in full; the default annotation format
# rounds large counts to two significant digits in scientific notation.
ax = sns.heatmap(cm, annot=True, fmt="d")
ax.set_xlabel("Predicted label")
ax.set_ylabel("Actual label")
plt.show()

### Conclusion & Interpretation of the results

The left side of the heatmap indicates actual values, and the bottom side shows predicted values.

* For actual value '0' there are 200000 correct predictions.
* For actual value '1' there are 10000 correct predictions.

* The model has falsely predicted 6800 counts for class 1.
* The model has falsely predicted 110 counts for class 0.
* Therefore, we need to train better for class 1.