In [None]:
    #Import the data

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas options
pd.set_option('display.max_colwidth', 1000, 'display.max_rows', None, 'display.max_columns', None)

# Plotting options
%matplotlib inline
mpl.style.use('ggplot')
sns.set(style='whitegrid')

pd.options.display.float_format = '{:,.2f}'.format

loans = pd.read_csv('../project_1/accepted_2007_to_2018Q4.csv.gz', compression='gzip', low_memory=True) #read data into pandas

loans.info()

loans.shape #print rows and columns in loan_status

loans.head() #loans dataframe

#loans = loans.sample(frac=0.05)

loans.shape



print(loans.describe())

 #Response Variable or Target Variable

# Distribution of every loan_status value, including missing ones.
loans['loan_status'].value_counts(dropna=False)

# Keep only the loans whose outcome is final. The explicit .copy() makes
# `loans` an independent frame, so the in-place drops and column assignments
# further down do not raise SettingWithCopyWarning.
final_statuses = ['Fully Paid', 'Charged Off']
loans = loans.loc[loans['loan_status'].isin(final_statuses)].copy()
loans['loan_status']

loans.shape  # rows and columns remaining after the filter

# Share of each remaining loan_status value (fractions, not counts).
loans['loan_status'].value_counts(normalize=True, dropna=False)

# Drop features missing more than 30% of their values

# Columns whose fraction of null entries exceeds this value are removed.
MAX_MISSING_FRACTION = 0.3

# Fraction of null entries per column, most incomplete first.
missing_fractions = loans.isnull().mean().sort_values(ascending=False)

missing_fractions.head(10)  # the 10 most incomplete features

# Histogram of the per-feature missing fractions.
plt.figure(figsize=(6,3), dpi=90)
missing_fractions.plot.hist(bins=20)
plt.title('Histogram of Feature Incompleteness')
plt.xlabel('Fraction of data missing')
plt.ylabel('Feature count')

# Alphabetical list of the features above the threshold.
drop_list = sorted(missing_fractions[missing_fractions > MAX_MISSING_FRACTION].index)
print(drop_list)

len(drop_list)  # number of features that will be dropped

loans = loans.drop(columns=drop_list)

loans.shape  # columns left after dropping the incomplete features

#Loan features needed for Investors

print(sorted(loans.columns)) #print all the features

keep_list = ['addr_state', 'annual_inc', 'application_type', 'dti', 'earliest_cr_line', 'emp_length', 'emp_title', 
             'fico_range_high', 'fico_range_low', 'grade', 'home_ownership', 'id', 'initial_list_status', 'installment', 
             'int_rate', 'issue_d', 'loan_amnt', 'loan_status', 'mort_acc', 'open_acc', 'pub_rec', 'pub_rec_bankruptcies',
             'purpose', 'revol_bal', 'revol_util', 'sub_grade', 'term', 'title', 'total_acc', 'verification_status',
             'zip_code'] #make a list of features useful for  investors

len(keep_list) #out of 93 columns we use 31 columns for prediction of loan_status variable

drop_list = [col for col in loans.columns if col not in keep_list]
print(drop_list)  #drop remaining 62 columns

len(drop_list)

loans.drop(labels=drop_list, axis=1, inplace=True) #drop drop_list containing list of features not needed

loans.shape #columns after dropping features not needed

loans.dtypes  # data type of every remaining column

# np.object and np.float were plain aliases of the builtins `object` and
# `float` and have been removed from NumPy; the builtins select the same
# columns on every NumPy version.
loans.select_dtypes(include=object).columns.tolist()  # object-dtype (string) columns

loans.select_dtypes(include=float).columns.tolist()  # floating-point columns

# Pearson correlation between the numeric features. The frame is restricted
# to numeric columns first because corr() raises on string columns under
# pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(12,10))
cor = loans.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

def plot_var(col_name, full_name, continuous):
    """
    Visualize a variable with and without faceting on the loan status.

    Reads the module-level `loans` dataframe and draws two side-by-side axes:
    the left one shows the distribution of the variable, the right one relates
    it to `loan_status`.

    - col_name is the variable name in the dataframe
    - full_name is the full variable name, used for titles and axis labels
    - continuous is True if the variable is continuous, False otherwise
    """
    _, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)

    # Plot without loan status
    if continuous:
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # switch to histplot once the minimum seaborn version is confirmed.
        sns.distplot(loans[col_name].dropna(), kde=False, ax=ax1)
    else:
        # Missing values are left out of the category order: NaN cannot be
        # compared with strings when sorting.
        categories = sorted(loans[col_name].dropna().unique())
        # The column is passed as x= by keyword; newer seaborn versions
        # interpret the first positional argument as `data`.
        sns.countplot(x=loans[col_name], order=categories, color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(full_name)
    ax1.set_ylabel('Count')
    ax1.set_title(full_name)

    # Plot with loan status
    if continuous:
        sns.boxplot(x=col_name, y='loan_status', data=loans, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(full_name + ' by Loan Status')
    else:
        # Fraction of 'Charged Off' loans within each category of the variable.
        charge_off_rates = loans.groupby(col_name)['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']
        sns.barplot(x=charge_off_rates.index, y=charge_off_rates.values, color='#5975A4', saturation=1, ax=ax2)
        ax2.set_ylabel('Fraction of Loans Charged-off')
        ax2.set_title('Charge-off Rate by ' + full_name)
    ax2.set_xlabel(full_name)

    plt.tight_layout()

#Analyzing categorical data ['id','term','grade','sub_grade','emp_title','emp_length','home_ownership','verification_status',
#'issue_d','loan_status','purpose','title', 'zip_code','addr_state', 'earliest_cr_line','initial_list_status',
#'application_type']

loans['id'].describe() #Usually ID's are unique and independent keys

loans.drop('id', axis=1, inplace=True) #drop id



loans['term'].value_counts(dropna=False)

loans['term'] = loans['term'].apply(lambda s: np.int8(s.split()[0])) #convert them into numericals

loans['term'].value_counts(normalize=True)

loans.groupby('term')['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']

print(sorted(loans['grade'].unique()))

print(sorted(loans['sub_grade'].unique()))

loans.drop('grade', axis=1, inplace=True) #Drop grade

plot_var('sub_grade', 'Subgrade', continuous=False)



loans['emp_title'].describe()

loans.drop(labels='emp_title', axis=1, inplace=True) #too many unique values



loans['emp_length'].value_counts(dropna=False).sort_index()

# Rewrite the two open-ended labels so that every value reads "<n> years".
# The result is assigned back to the column: calling replace(inplace=True) on
# a selected column is not guaranteed to update `loans` (it has no effect
# under pandas copy-on-write and warns on pandas 2.x).
loans['emp_length'] = loans['emp_length'].replace({'< 1 year': '0 years', '10+ years': '10 years'})

def emp_length_to_int(s):
    """
    Convert an employment-length label to its leading number.

    - s is a label whose first whitespace-separated token is an integer
      (e.g. '10 years'), or a missing value
    Returns the first token as np.int8; a missing value is returned unchanged.
    """
    return s if pd.isnull(s) else np.int8(s.split()[0])

loans['emp_length'] = loans['emp_length'].apply(emp_length_to_int)

loans['emp_length'] = loans['emp_length'].fillna(loans.emp_length.median())

loans['emp_length'].value_counts(dropna=False).sort_index()

loans.groupby('emp_length')['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']

plot_var('emp_length', 'Employment Length', continuous=False)

loans.groupby('emp_length')['loan_status'].describe()



loans['home_ownership'].value_counts(dropna=False)

# Fold the 'NONE' and 'ANY' categories into 'OTHER'. The result is assigned
# back to the column because replace(inplace=True) on a selected column is not
# guaranteed to update `loans`.
loans['home_ownership'] = loans['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')

loans['home_ownership'].value_counts(dropna=False)

# Loan counts per (home_ownership, loan_status) pair.
home_ownership_xt = pd.crosstab(loans['home_ownership'], loans['loan_status'])
home_ownership_xt

# Normalize the cross tab so that every row sums to 1:
home_ownership_xt_pct = home_ownership_xt.div(home_ownership_xt.sum(axis=1).astype(float), axis=0)

# Stacked bars: one bar per ownership category, split by loan status.
home_ownership_xt_pct.plot(kind='bar',
                   stacked=True,
                   title='Loan status by Home Ownership')
plt.xlabel('Home Ownership')
# The bar heights are row-normalized fractions, so the axis is labelled as such.
plt.ylabel('Fraction of Loans')

plot_var('home_ownership', 'Home Ownership', continuous=False)

# Charge-off rate within each ownership category.
loans.groupby('home_ownership')['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']



loans['verification_status'].value_counts(dropna=False)

loans.groupby('verification_status')['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']

plot_var('verification_status', 'Verification Status', continuous=False)




loans['purpose'].value_counts()

loans.groupby('purpose')['loan_status'].value_counts(normalize=True).loc[:,'Charged Off'].sort_values()



loans['title'].describe()

# Most frequent titles. The original referenced `.head` without calling it,
# which shows the bound method instead of the first rows.
loans['title'].value_counts().head()

loans.drop('title', axis=1, inplace=True)



loans['zip_code'].describe()

loans['addr_state'].sample(5)

loans['addr_state'].nunique()

loans.drop(labels='zip_code', axis=1, inplace=True)



loans['earliest_cr_line'].describe()

loans['earliest_cr_line'].isnull().any()



loans['initial_list_status'].describe()

plot_var('initial_list_status', 'Initial List Status', continuous=False)

loans.groupby('initial_list_status')['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']



loans['application_type'].value_counts()

loans['application_type'].value_counts(normalize=True)

loans.groupby('application_type')['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']



loans['loan_amnt'].describe()

# Replace the loan amount by log10(amount + 1). The expression is vectorized
# over the whole column instead of calling a Python lambda per row; the +1
# keeps a zero amount finite. From here on `loan_amnt` holds the log value.
loans['loan_amnt'] = np.log10(loans['loan_amnt'] + 1)

loans['loan_amnt'].describe()

# The label states that the plotted values are on the log scale.
plot_var('loan_amnt', 'Log Loan Amount', continuous=True)

loans.groupby('loan_status')['loan_amnt'].describe()



loans['int_rate'].describe()

plot_var('int_rate', 'Interest Rate', continuous=True)

loans.groupby('loan_status')['int_rate'].describe()

loans.groupby('purpose')['int_rate'].describe()

loans.groupby('sub_grade')['int_rate'].describe()

loans.groupby('sub_grade')['purpose'].describe()



loans['installment'].describe()

plot_var('installment', 'Installment', continuous=True)

loans.groupby('loan_status')['installment'].describe()



loans['annual_inc'].describe()

# log10(income + 1), computed on the whole column at once instead of through
# a per-row lambda; the +1 keeps a zero income finite.
loans['log_annual_inc'] = np.log10(loans['annual_inc'] + 1)

# Only the log-transformed income is kept as a feature.
loans.drop('annual_inc', axis=1, inplace=True)

loans['log_annual_inc'].describe()

plot_var('log_annual_inc', 'Log Annual Income', continuous=True)

loans.groupby('loan_status')['log_annual_inc'].describe()



loans['dti'].describe()

plt.figure(figsize=(8,3), dpi=90)
sns.distplot(loans.loc[loans['dti'].notnull() & (loans['dti']<60), 'dti'], kde=False)
plt.xlabel('Debt-to-income Ratio')
plt.ylabel('Count')
plt.title('Debt-to-income Ratio')

(loans['dti']>=60).sum()



loans.groupby('loan_status')['dti'].describe()



loans[['fico_range_low', 'fico_range_high']].describe()

loans[['fico_range_low','fico_range_high']].corr()

loans['fico_score'] = 0.5*loans['fico_range_low'] + 0.5*loans['fico_range_high']

loans.drop(['fico_range_high', 'fico_range_low'], axis=1, inplace=True)

plot_var('fico_score', 'FICO Score', continuous=True)

loans.groupby('loan_status')['fico_score'].describe()



plt.figure(figsize=(10,3), dpi=90)
# Bar order: the distinct open_acc values in ascending order (missing values excluded).
open_acc_values = sorted(loans['open_acc'].dropna().unique())
# The column is passed as x= by keyword; newer seaborn versions interpret the
# first positional argument as `data`.
sns.countplot(x=loans['open_acc'], order=open_acc_values, color='#5975A4', saturation=1)
# Show a tick every 5 bars.
# NOTE(review): the tick labels equal the bar values only if open_acc covers
# every integer starting at 0 - confirm against the data.
_, _ = plt.xticks(np.arange(0, 90, 5), np.arange(0, 90, 5))
plt.title('Number of Open Credit Lines')

loans[['open_acc','total_acc']].corr()

loans.groupby('loan_status')['open_acc'].describe()



loans['pub_rec'].value_counts().sort_index()

loans[['pub_rec','pub_rec_bankruptcies']].corr()

loans.groupby('loan_status')['pub_rec'].describe()



loans['revol_bal'].describe()

# log10(balance + 1), computed on the whole column at once instead of through
# a per-row lambda; the +1 keeps a zero balance finite.
loans['log_revol_bal'] = np.log10(loans['revol_bal'] + 1)

# Only the log-transformed balance is kept as a feature.
loans.drop('revol_bal', axis=1, inplace=True)

plot_var('log_revol_bal', 'Log Revolving Credit Balance', continuous=True)

loans.groupby('loan_status')['log_revol_bal'].describe()



loans['revol_util'].describe()

plot_var('revol_util', 'Revolving Line Utilization', continuous=True)

loans.groupby('loan_status')['revol_util'].describe()

loans[['revol_util','fico_score']].corr()



loans.groupby('loan_status')['total_acc'].describe()



loans['mort_acc'].describe()

loans['mort_acc'].value_counts().head(10)

loans.groupby('loan_status')['mort_acc'].describe()



loans['pub_rec_bankruptcies'].value_counts().sort_index()

plot_var('pub_rec_bankruptcies', 'Public Record Bankruptcies', continuous=False)



loans['issue_d'].sample(5)

loans['issue_d'].isnull().any()

loans['issue_d'] = pd.to_datetime(loans['issue_d'])

loans['issue_d'].sample(5)


loans['issue_d'].describe()

plt.figure(figsize=(6,3), dpi=90)
loans['issue_d'].dt.year.value_counts().sort_index().plot.bar(color='darkblue')
plt.xlabel('Year')
plt.ylabel('Number of Loans Funded')
plt.title('Loans Funded per Year')

loans['earliest_cr_line'].sample(5)

loans['earliest_cr_line'].isnull().any()

loans['earliest_cr_line'] = pd.to_datetime(loans['earliest_cr_line'])

loans['earliest_cr_line'].sample(5)



# Whole days between the earliest credit line and the loan issue date. Both
# columns are datetimes at this point, so the difference is a timedelta
# column and `.dt.days` extracts the day count without a per-row lambda.
loans['days_from_issue_to_earliest_cr'] = (loans['issue_d'] - loans['earliest_cr_line']).dt.days

loans['days_from_issue_to_earliest_cr'].sample(5)

loans.groupby('days_from_issue_to_earliest_cr')['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']

loans['target'] = (loans['loan_status'] == 'Charged Off').astype(int)
loans['target'].describe()

_df = loans.groupby('days_from_issue_to_earliest_cr')['target'].mean().reset_index()

sns.scatterplot(x='days_from_issue_to_earliest_cr', y='target', data=_df)

_df.corr()

# Data Analysis

* What are you doing?
* What have you found?
* What action did you take?

# Modelling

# Evaluation

# Remove the credit-history columns and the temporary `target` column.
loans.drop(['earliest_cr_line', 'days_from_issue_to_earliest_cr', 'target'], axis=1, inplace=True)

# Ratio of open credit lines to total credit lines. A total of 0 is turned
# into NaN before dividing so the ratio can never be +/-inf, which
# scikit-learn estimators reject; the resulting NaN is imputed later.
loans['acc_ratio'] = loans['open_acc'] / loans['total_acc'].replace(0, np.nan)

loans.head(4)

# The two raw counts are replaced by their ratio.
loans.drop(['total_acc', 'open_acc'], axis=1, inplace=True)

loans.head(20)

loans.drop('pub_rec', axis=1, inplace=True)

# Binary response: 1 for a charged-off loan, 0 otherwise. astype converts the
# whole boolean column at once instead of calling np.uint8 per row.
loans['charged_off'] = (loans['loan_status'] == 'Charged Off').astype(np.uint8)
loans.drop('loan_status', axis=1, inplace=True)



# Pearson correlation between the numeric columns of the cleaned frame.
# Non-numeric columns (strings, dates) are excluded explicitly because corr()
# raises on them under pandas >= 2.0 instead of skipping them.
plt.figure(figsize=(12,10))
cor = loans.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

loans.shape


missing_fractions = loans.isnull().mean().sort_values(ascending=False) # Fraction of data missing for each variable


print(missing_fractions[missing_fractions > 0]) # Print variables that are missing data


print(loans.columns)

loans = pd.get_dummies(loans, columns=['sub_grade', 'home_ownership', 'verification_status', 'purpose', 'addr_state', 'initial_list_status','application_type'], drop_first=True)



loans.shape


loans.sample(5)




plt.figure(figsize=(6,3), dpi=90)
loans['issue_d'].dt.year.value_counts().sort_index().plot.bar(color='darkblue')
plt.xlabel('Year')
plt.ylabel('Number of Loans Funded')
plt.title('Loans Funded per Year')





# Chronological split: loans issued on or after the 0.9 quantile of issue_d
# form the test set, all earlier loans form the training set. The cutoff is
# computed once and shared by both comparisons.
split_date = loans['issue_d'].quantile(0.9)
loans_train = loans.loc[loans['issue_d'] < split_date]
loans_test = loans.loc[loans['issue_d'] >= split_date]

# Sanity check: the two partitions together must contain every loan.
print('Number of loans in the partition:   ', loans_train.shape[0] + loans_test.shape[0])
print('Number of loans in the full dataset:', loans.shape[0])

loans_test.shape[0] / loans.shape[0]  # fraction of loans in the test set

del loans  # the full frame is no longer needed

loans_train['issue_d'].describe()

loans_test['issue_d'].describe()

# issue_d was only needed for the split. drop() returns new frames here
# instead of mutating slices of `loans` in place, which would raise
# SettingWithCopyWarning.
loans_train = loans_train.drop('issue_d', axis=1)
loans_test = loans_test.drop('issue_d', axis=1)

# Response vectors.
y_train = loans_train['charged_off']
y_test = loans_test['charged_off']

# Feature matrices: every column except the response.
X_train = loans_train.drop('charged_off', axis=1)
X_test = loans_test.drop('charged_off', axis=1)

del loans_train, loans_test



from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV


from sklearn.ensemble import RandomForestClassifier

pipeline_rfc = Pipeline([
    ('imputer', SimpleImputer(copy=False)),
    ('model', RandomForestClassifier(n_jobs=-1, random_state=1))
])

param_grid_rfc = {
    'model__n_estimators': [50] # The number of randomized trees to build
}

grid_rfc = GridSearchCV(estimator=pipeline_rfc, param_grid=param_grid_rfc, scoring='roc_auc', n_jobs=1, pre_dispatch=1, cv=5, verbose=1, return_train_score=False)

grid_rfc.fit(X_train, y_train)

grid_rfc.best_score_

np.any(np.isnan(X_train))

np.any(np.isnan(y_train))

np.all(np.isfinite(X_train))

np.all(np.isfinite(y_train))

np.any(np.isnan(X_test))

np.any(np.isnan(y_test))

# Fill missing values with the column means of the TRAINING set in both
# partitions. Imputing the test set with its own means would use statistics
# of the evaluation data and apply a different transformation than the one
# the model was trained on.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

np.where(y_train.values >= np.finfo(np.float64).max)

np.all(np.isfinite(X_test))

np.all(np.isfinite(y_test))

np.where(y_test.values >= np.finfo(np.float64).max)

np.where(X_test.values >= np.finfo(np.float64).max)





from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# See the initial model performance.
# All three metrics are computed from a single 5-fold cross-validation run;
# calling cross_val_score once per metric would refit the same forest on the
# same folds three times.
clf = RandomForestClassifier(random_state=10)
baseline_scores = cross_validate(clf, X_train, y_train,
                                 cv=StratifiedKFold(n_splits=5),
                                 scoring=['accuracy', 'f1', 'roc_auc'])
print('Acc:', baseline_scores['test_accuracy'].mean())
print('F1:', baseline_scores['test_f1'].mean())
print('ROC AUC:', baseline_scores['test_roc_auc'].mean())



from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# random forest model creation
# The model is created and fitted BEFORE it is used for scoring; the original
# order called rfc.predict_proba before `rfc` existed.
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

# ROC AUC on the held-out test set, from the predicted probability of class 1.
y_score = rfc.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_score)

# predictions
rfc_predict = rfc.predict(X_test)

# 10-fold cross-validated ROC AUC on the training set.
rfc_cv_score =cross_val_score(rfc, X_train, y_train, cv=10, scoring='roc_auc')

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())



import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Predicted probability of the positive class (charged off) on the test set.
y_pred_proba = rfc.predict_proba(X_test)[:,1]
# roc_curve is taken from the `metrics` module imported above; the bare name
# was never imported and raised a NameError.
fpr,tpr,thresholds = metrics.roc_curve(y_test,y_pred_proba)

roc_auc = metrics.auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

metrics.roc_auc_score(y_test,y_pred_proba)



from sklearn.metrics import auc, precision_recall_curve
import matplotlib.pyplot as plt

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# calculate precision-recall AUC
# This is done before plotting because the legend label uses it; `auc` is
# imported above (the original used both names before they were defined).
auc_prc = auc(recall, precision)

# Reference line: the precision of a classifier that flags every loan, i.e.
# the share of positives in the test set (not 0.5 as on a ROC plot).
no_skill = y_test.mean()

plt.figure(figsize = (10,8))
plt.plot([0, 1], [no_skill, no_skill],'k--')
plt.plot(recall, precision,'b', label = 'Precision Recall Curve = %0.2f' % auc_prc)
plt.legend(loc = 'lower right')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PRC curve')
plt.show()

print(auc_prc)

