# Project Idea 1: 

- Make a simplifying assumption between the **loan issuance date** and application dates,
convert the approved loans into the formats of rejected loan applications.
- Use supervised technique and clustering, describe (reverse engineering) 
the loan approval criterion of **Lending Club**.

    - Challenges: The aggregated data has more than $20M$ samples. This is
        beyond the coverage of many ML algorithms. How do you design an apporach 
        which allows you to train on such a large size data? The problem is
        particularly serious for training **pooled** models.
    - When you apply supervised technique, it might involve imbalanced classification.
    How would you handle 'im-balanced dataset' like this?
    
- Are there poor performing approved loans which should be rejected?
- What is the business impact (pros and cons) of rejecting such non-performing loans?

In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# from sklearn.preprocessing import OrdinalEncoder
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import LabelEncoder

# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression 
# from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler 


In [None]:
df_raw_accepted = pd.read_csv('accepted_2007_to_2018Q4.csv')
df_raw_rejected = pd.read_csv('rejected_2007_to_2018Q4.csv')

In [2]:
sample_accepted = pd.read_csv('sample_accepted.csv')
sample_rejected = pd.read_csv('sample_rejected.csv')

## Preprocessing

In [57]:
rejected = sample_rejected.iloc[:,1:9]
rejected = rejected.rename(columns={"Amount Requested": "loan_amnt", "Application Date": "app_date", "Loan Title": "title",
                         "Risk_Score": "risk_score", "Debt-To-Income Ratio": "dti", "Zip Code": "zip_code",
                         "State": "addr_state", "Employment Length": "emp_length"})
rejected['loan_status'] = 'rejected'

rejected['app_date'] = pd.to_datetime(rejected['app_date'])
rejected['app_year'] = rejected['app_date'].dt.year
rejected['app_month'] = rejected['app_date'].dt.month

rejected = rejected.drop(columns = 'app_date')

In [54]:
accepted = sample_accepted[['loan_amnt', 'title', 'dti',
       'zip_code', 'addr_state', 'emp_length', 'issue_d',
       'loan_status', 'last_fico_range_high','last_fico_range_low']]

In [55]:
accepted['issue_d'] = pd.to_datetime(accepted['issue_d'])
accepted['app_year'] = accepted['issue_d'].dt.year
accepted['app_month'] = accepted['issue_d'].dt.month

accepted['risk_score'] = (accepted['last_fico_range_high'] + accepted['last_fico_range_low'])/2
accepted = accepted.drop(columns = ['issue_d','last_fico_range_high','last_fico_range_low'])

accepted = accepted.replace({'loan_status' : { 'Charged Off' : 'rejected', 'Late (16-30 days)' : 'rejected', 
                                   'Late (31-120 days)' : 'rejected',
                                   'Does not meet the credit policy. Status:Charged Off' : 'rejected',
                                   'Fully Paid' : 'accepted', 'Current' : 'accepted',
                                   'In Grace Period' : 'accepted',
                                   'Does not meet the credit policy. Status:Fully Paid' : 'accepted'}})

In [None]:
df_processed = pd.concat([accepted, rejected], sort=True)

In [68]:
total = df_processed.isnull().sum().sort_values(ascending=False)
percent = (df_processed.isnull().sum()/df_processed.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
risk_score,2026,0.337667
dti,2,0.000333
zip_code,0,0.0
title,0,0.0
loan_status,0,0.0
loan_amnt,0,0.0
emp_length,0,0.0
app_year,0,0.0
app_month,0,0.0
addr_state,0,0.0


In [62]:
df_processed.head()

Unnamed: 0,addr_state,app_month,app_year,dti,emp_length,loan_amnt,loan_status,risk_score,title,zip_code
0,CA,9,2015,29.72,10.0,33550.0,accepted,727.0,Credit card refinancing,952xx
1,FL,12,2016,23.2,2.0,8700.0,accepted,692.0,Debt consolidation,338xx
2,OH,5,2018,16.61,,30000.0,accepted,697.0,Credit card refinancing,450xx
3,CA,10,2018,22.89,1.0,30000.0,accepted,757.0,Debt consolidation,913xx
4,PA,10,2015,21.18,8.0,15000.0,accepted,702.0,Debt consolidation,180xx


## Function 

In [5]:
'''
For applications prior to November 5, 2013 the risk score is the borrower's FICO score. 
For applications after November 5, 2013 the risk score is the borrower's Vantage score.
'''

def data_process(accepted,rejected,fico=True,reverse=True):
    
    accepted = sample_accepted.copy()
    rejected = sample_rejected.copy()
    
    # Processing accepted data 
    accepted = accepted[['loan_amnt', 'title', 'dti',
       'zip_code', 'addr_state', 'emp_length', 'issue_d',
       'loan_status', 'last_fico_range_high','last_fico_range_low']]
    
    accepted['issue_d'] = pd.to_datetime(accepted['issue_d'])
    accepted['app_year'] = accepted['issue_d'].dt.year
    accepted['app_month'] = accepted['issue_d'].dt.month

    accepted['risk_score'] = (accepted['last_fico_range_high'] + accepted['last_fico_range_low'])/2
    accepted = accepted.drop(columns = ['issue_d','last_fico_range_high','last_fico_range_low'])

#     if reverse = False:
#         accepted = accepted.replace({'loan_status' : { 'Charged Off' : 'rejected', 'Late (16-30 days)' : 'rejected', 
#                                        'Late (31-120 days)' : 'rejected',
#                                        'Does not meet the credit policy. Status:Charged Off' : 'rejected',
#                                        'Fully Paid' : 'accepted', 'Current' : 'accepted',
#                                        'In Grace Period' : 'accepted',
#                                        'Does not meet the credit policy. Status:Fully Paid' : 'accepted'}})

    accepted['loan_status'] = 'accepted'
            
    
    # Processing rejected data 
    rejected = rejected.iloc[:,1:9]
    rejected = rejected.rename(columns={"Amount Requested": "loan_amnt", "Application Date": "app_date", "Loan Title": "title",
                             "Risk_Score": "risk_score", "Debt-To-Income Ratio": "dti", "Zip Code": "zip_code",
                             "State": "addr_state", "Employment Length": "emp_length"})
    rejected['loan_status'] = 'rejected'

    rejected['app_date'] = pd.to_datetime(rejected['app_date'])
    rejected['app_year'] = rejected['app_date'].dt.year
    rejected['app_month'] = rejected['app_date'].dt.month

    rejected = rejected.drop(columns = 'app_date')
    
    # Merge accepted and rejected data for later process
    df_processed = pd.concat([accepted, rejected], sort=True)
    

    # Convert categorical to numerical-- 10 means more than 10 years 
    df_processed['emp_length'] = df_processed['emp_length'].str.extract('(\d+)') 


    # Missing Values: risk_score, emp_length, title, dti
    df_processed['title'] = df_processed['title'].fillna('None')
    df_processed['emp_length'] = df_processed['emp_length'].fillna(0)
    df_processed['risk_score'] = df_processed['risk_score'].fillna(0)
    df_processed['dti'] = df_processed['dti'].fillna(0)
    
    if fico is False:
        return df_processed
    if fico is True:
        return df_processed

In [6]:
data_process(sample_accepted,sample_rejected,fico=True)

Unnamed: 0,addr_state,app_month,app_year,dti,emp_length,loan_amnt,loan_status,risk_score,title,zip_code
0,CA,9,2015,29.72,10,33550.0,accepted,727.0,Credit card refinancing,952xx
1,FL,12,2016,23.2,2,8700.0,accepted,692.0,Debt consolidation,338xx
2,OH,5,2018,16.61,0,30000.0,accepted,697.0,Credit card refinancing,450xx
3,CA,10,2018,22.89,1,30000.0,accepted,757.0,Debt consolidation,913xx
4,PA,10,2015,21.18,8,15000.0,accepted,702.0,Debt consolidation,180xx
...,...,...,...,...,...,...,...,...,...,...
2995,PA,9,2015,51.24%,1,10000.0,rejected,0.0,debt_consolidation,150xx
2996,FL,7,2014,12.59%,1,8000.0,rejected,627.0,home_improvement,337xx
2997,SC,9,2017,24.92%,1,1100.0,rejected,0.0,other,290xx
2998,OH,12,2017,30.2%,1,10000.0,rejected,643.0,Credit card refinancing,441xx


In [2]:
df_raw_accepted[df_raw_accepted.loan_status == 'Does not meet the credit policy. Status:Charged Off'].count()

NameError: name 'df_raw_accepted' is not defined

In [None]:
df_processed.sample(5)