# Dataset Lending ETL

In [None]:
# Import dependencies
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
def _format_float(value):
    """Format a number with five digits after the decimal point."""
    return '%.5f' % value


# Show floats in pandas output with a fixed five-decimal format.
pd.set_option('display.float_format', _format_float)

# Activate seaborn's default plot theme.
sns.set()

In [None]:
# Load the two raw CSV exports from the Archive folder.
# Accepted applications:
raw_accepted = pd.read_csv(filepath_or_buffer='../Archive/accepted_2007_to_2018Q4.csv')
# Rejected applications:
raw_rejected = pd.read_csv(filepath_or_buffer='../Archive/rejected_2007_to_2018Q4.csv')

In [None]:
# Work on a deep copy so the raw rejected frame stays untouched.
file_rejected = raw_rejected.copy(deep=True)
file_rejected.head(5)

In [None]:
# Work on a deep copy so the raw accepted frame stays untouched.
file_accepted = raw_accepted.copy(deep=True)
file_accepted.head(5)

# Analysis 1. Binary classification model to accept or reject a loan application

In [None]:
# Keep only the rejected-application columns that are used for the analysis.
#
# Columns left out, with the rationale for each:
#   Application Date - the date is not expected to play a role in whether an
#                      application is accepted or rejected.
#   Loan Title       - left out of the initial model because it did not add
#                      much value to the rejection decision.
#   Zip Code         - applicants from the same zip codes were both accepted
#                      and rejected, so it is assumed to have little impact.
#   Policy Code      - not selected here; the target label is created as a
#                      separate constant column in a later cell.
file_rejected = file_rejected.loc[
    :,
    ['Amount Requested', 'Risk_Score', 'Debt-To-Income Ratio', 'Employment Length'],
]

In [None]:
# Every row in the rejected frame gets the target label 0.
file_rejected.loc[:, 'Label_target'] = 0
file_rejected.head(5)

In [None]:
# Extract the matching columns from the accepted-applicant frame. The two
# frames are combined in a later cell into the single data frame used to
# train the classification model.
file_accepted = file_accepted.loc[
    :,
    ['loan_amnt', 'fico_range_low', 'fico_range_high', 'dti', 'emp_length'],
]
file_accepted.head(5)

In [None]:
# Collapse the FICO range into one score: the midpoint of the low and high bounds.
fico_bounds = ['fico_range_low', 'fico_range_high']
file_accepted['average'] = (
    file_accepted[fico_bounds[0]] + file_accepted[fico_bounds[1]]
) / 2

# The two range columns are no longer needed once the midpoint exists.
file_accepted = file_accepted.drop(columns=fico_bounds)

In [None]:
# Every row in the accepted frame gets the target label 1.
file_accepted.loc[:, 'label_target'] = 1
file_accepted.head(5)

In [None]:
# Give both frames the same column names so they can be stacked later.
file_accepted.rename(
    columns={
        'loan_amnt': 'Loan_Amount',
        'dti': 'Debt_income_ratio',
        'average': 'Credit_Score',
        'label_target': 'Loan_Status',
        'emp_length': 'Emp_Length',
    },
    inplace=True,
)

file_rejected.rename(
    columns={
        'Amount Requested': 'Loan_Amount',
        'Debt-To-Income Ratio': 'Debt_income_ratio',
        'Risk_Score': 'Credit_Score',
        'Label_target': 'Loan_Status',
        'Employment Length': 'Emp_Length',
    },
    inplace=True,
)

In [None]:
# Put the rejected columns in the order used for concatenation with the
# accepted frame.
file_rejected = file_rejected.loc[
    :,
    ['Loan_Amount', 'Debt_income_ratio', 'Emp_Length', 'Credit_Score', 'Loan_Status'],
]
file_rejected.head(5)

In [None]:
# Remove the percent sign from Debt_income_ratio and convert it to a number.
# regex=False states explicitly that '%' is a literal: the default for
# `regex` differs between pandas versions, and pandas 1.x emits a
# FutureWarning for single-character patterns when it is left unspecified.
file_rejected['Debt_income_ratio'] = pd.to_numeric(
    file_rejected['Debt_income_ratio'].str.replace('%', '', regex=False)
)
file_rejected.head()



In [None]:
# Per-column count of missing values in the rejected frame.
file_rejected.isna().sum()

In [None]:
# Per-column count of missing values in the accepted frame.
file_accepted.isna().sum()

In [None]:
# Remove every row that has a missing value in any column, in both frames,
# then report how many applications remain in each.
file_accepted = file_accepted.dropna(axis=0, how='any')
file_rejected = file_rejected.dropna(axis=0, how='any')
print(f'There are {len(file_accepted)} succesfull applications and {len(file_rejected)} unsuccessful applications')

In [None]:
# Save cleaned data sets 
# file_accepted.to_csv('../Archive/accepted_data_clean', index=False)
# file_rejected.to_csv('../Archive/rejected_data_clean', index=False)

In [None]:
# Stack the accepted and rejected frames into one frame with a fresh
# 0..n-1 index.
consolidated = pd.concat([file_accepted, file_rejected], ignore_index=True)

In [None]:
# Frequency of each distinct value in the Emp_Length column.
consolidated.loc[:, 'Emp_Length'].value_counts()

In [None]:
# Turn the employment-length labels into plain numbers:
#   '10+ years' -> 10, '< 1 year' -> 0, '1 year' -> 1, 'N years' -> N.
consolidated['Emp_Length'] = pd.to_numeric(
    consolidated['Emp_Length']
    # Whole-value replacements for the labels that do not end in ' years'
    # or that carry extra symbols.
    .replace({'10+ years': '10', '< 1 year': '0', '1 year': '1'})
    # Strip the unit from the remaining 'N years' labels.
    .str.replace(' years', '')
    # Series.replace matches whole values, so this only rewrites entries
    # that are exactly a single space.
    .replace(' ', '')
)
consolidated['Emp_Length'].value_counts()

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# consolidated frame; the min/max rows help spot implausible values
# (outliers).
consolidated.describe()

In [None]:
# Count the rows treated as outliers: a debt-to-income ratio outside
# [0, 200] or a credit score outside [0, 850]. The upper DTI bound is 200
# (not 300) so that this count matches the rows that the clean-up cell
# actually drops.
consolidated[
    (consolidated['Debt_income_ratio'] > 200)
    | (consolidated['Debt_income_ratio'] < 0)
    | (consolidated['Credit_Score'] > 850)
    | (consolidated['Credit_Score'] < 0)
].count()

In [None]:
# Drop outlier rows in place: debt-to-income ratio outside [0, 200] or
# credit score outside [0, 850].
outlier_mask = (
    (consolidated['Debt_income_ratio'] > 200)
    | (consolidated['Debt_income_ratio'] < 0)
    | (consolidated['Credit_Score'] < 0)
    | (consolidated['Credit_Score'] > 850)
)
consolidated.drop(index=consolidated.index[outlier_mask], inplace=True)
consolidated.shape

In [None]:
# Initial run: keep a reproducible 8% random sample (drawn without
# replacement) to make sure the code runs end to end.
consolidated = consolidated.sample(frac=0.08, random_state=1)
consolidated.shape

### Data Visualization

In [None]:
# Since the dataset is very large, we extract a small sample from it and plot that sample.
# sample_df = consolidated.sample(frac=0.1, replace=False, random_state=1)

# plt.scatter(sample_df['Debt_income_ratio'],sample_df['Credit_Score'],c=sample_df['Loan_Status'])
# plt.show()

In [None]:
# sample_df['Debt_income_ratio'].plot.density()

In [None]:
# sample_df['Credit_Score'].plot.density()

### Removing Class Imbalance
 

### Method 1: Oversampling
This involves duplicating samples from the minority class. It can lead to overfitting, or bias the model towards outliers in the minority class.

In [None]:
# Import dependencies
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [None]:
# Separate the features from the target and make a stratified train/test split.
X = consolidated.drop(columns='Loan_Status')
y = consolidated['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
print(f' The count for y_train before oversample is:{Counter(y_train)}')

In [None]:
# Randomly oversample the minority class of the scaled training data.
# 'minority' can be replaced by a float < 1.
# random_state is fixed (same seed as the train/test split) so the resampled
# set, and every model score derived from it, is reproducible between runs.
over = RandomOverSampler(sampling_strategy='minority', random_state=1)
X_train_over, y_train_over = over.fit_resample(X_train_scaled, y_train)
print(f' The count for y_train after oversample is:{Counter(y_train_over)}')

### Method 2: Undersampling
This involves removing samples from the majority class. It can lead to losing some of the important information in the dataset.

In [None]:
# Randomly undersample the majority class of the scaled training data.
# 'majority' can be replaced by a float < 1.
# random_state is fixed (same seed as the train/test split) so the resampled
# set, and every model score derived from it, is reproducible between runs.
under = RandomUnderSampler(sampling_strategy='majority', random_state=1)
X_train_under, y_train_under = under.fit_resample(X_train_scaled, y_train)
print(f' The count for y_train after undersample is:{Counter(y_train_under)}')

### Method 3: Oversampling with SMOTE
The synthetic minority oversampling technique (SMOTE) is another oversampling approach, in which new instances of the minority class are interpolated.

In [None]:
# Resample the scaled training data with SMOTE.
# random_state is fixed (same seed as the train/test split) so the generated
# samples, and every model score derived from them, are reproducible between
# runs.
smote = SMOTE(sampling_strategy='auto', random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print(f' The count for y_train after SMOTE oversample is:{Counter(y_train_smote)}')

### Method 4: Cluster Centroid Undersampling
The algorithm identifies clusters of the majority class, then generates synthetic data points, called centroids, that are representative of the clusters. The majority class is then undersampled down to the size of the minority class.

In [None]:
# # Resampling using Cluster Centroids
# cc = ClusterCentroids() 
# X_train_cc,y_train_cc = cc.fit_resample(X_train_scaled,y_train)
# print(f' The count for y_train after CC undersample is:{Counter(y_train_cc)}')

### Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Parallel lists: position i in each list describes one training variant
# (display name, feature matrix, target vector).
# Cluster Centroids (CC) is left out because it is not running right now.
session_names = [
    'Imbalanced',
    'Oversampling',
    'Undersampling',
    'SMOTE',
]
X_train_sets = [
    X_train_scaled,
    X_train_over,
    X_train_under,
    X_train_smote,
]
y_train_sets = [
    y_train,
    y_train_over,
    y_train_under,
    y_train_smote,
]

In [None]:
# Fit a logistic regression on each training-set variant and evaluate every
# fit on the same scaled test split, so the resampling methods can be compared.

lr = LogisticRegression(solver='lbfgs',max_iter=100)

# Iterate the three parallel lists together instead of indexing by position.
for session_name, X_fit, y_fit in zip(session_names, X_train_sets, y_train_sets):

    lr.fit(X_fit, y_fit)
    y_pred = lr.predict(X_test_scaled)

    print(f'This is the result from {session_name} method')
    print(f'accuracy: {accuracy_score(y_test, y_pred):.3f}')
    # Optional: print the confusion matrix. Do not assign the result to a
    # variable named `confusion_matrix`; that would shadow the imported function.
    # print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('------------------------------------')
    print('------------------------------------')




### Naive Bayes Classification Modeling

In [None]:
# Fit a Gaussian Naive Bayes model on each training-set variant and evaluate
# every fit on the same scaled test split.
from sklearn.naive_bayes import GaussianNB

mnb = GaussianNB()

# Iterate the three parallel lists together instead of indexing by position.
for session_name, X_fit, y_fit in zip(session_names, X_train_sets, y_train_sets):

    mnb.fit(X_fit, y_fit)
    y_pred = mnb.predict(X_test_scaled)

    print(f'This is the result from {session_name} method')
    print(f'accuracy: {accuracy_score(y_test, y_pred):.3f}')
    # Optional: print the confusion matrix. Do not assign the result to a
    # variable named `confusion_matrix`; that would shadow the imported function.
    # print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('------------------------------------')
    print('------------------------------------')


### Support Vector Machine

In [None]:
# Fit a support vector classifier on each training-set variant and evaluate
# every fit on the same scaled test split.
from sklearn.svm import SVC

svm = SVC(kernel='poly') # other kernels : linear, rbf, sigmoid , Larger C more penalty for wrong classification

# Iterate the three parallel lists together instead of indexing by position.
for session_name, X_fit, y_fit in zip(session_names, X_train_sets, y_train_sets):

    svm.fit(X_fit, y_fit)
    y_pred = svm.predict(X_test_scaled)

    print(f'This is the result from {session_name} method')
    print(f'accuracy: {accuracy_score(y_test, y_pred):.3f}')
    # Optional: print the confusion matrix. Do not assign the result to a
    # variable named `confusion_matrix`; that would shadow the imported function.
    # print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('------------------------------------')
    print('------------------------------------')




### Decision Tree Algorithm

In [None]:
# Fit a decision tree on each training-set variant and evaluate every fit on
# the same scaled test split.

from sklearn.tree import DecisionTreeClassifier

# random_state is fixed (same seed as the train/test split) so the fitted
# trees, and therefore the reported scores, are reproducible between runs.
dtc = DecisionTreeClassifier(random_state=1)

# Iterate the three parallel lists together instead of indexing by position.
for session_name, X_fit, y_fit in zip(session_names, X_train_sets, y_train_sets):
    dtc.fit(X_fit, y_fit)
    y_pred = dtc.predict(X_test_scaled)

    print(f'This is the result from {session_name} method')
    print(f'accuracy: {accuracy_score(y_test, y_pred):.3f}')
    # Optional: print the confusion matrix. Do not assign the result to a
    # variable named `confusion_matrix`; that would shadow the imported function.
    # print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('------------------------------------')
    print('------------------------------------')
