# Credit Risk Modelling

Building a risk assessment model that eliminates social biases like age, sex and race. Improve the fairness, transparency and inclusitivity of access to credit

# Importing tools

We tested out a few different algorithms and XGBoost Classifier was so far the most accurate (75% prediction rate)

In [83]:
# Defining tools 

import xgboost
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score



# Loading data

We uses a public dataset given by Lending Club, with more than 2 million different accounts and 44 billion dollars in lending.


In [89]:
# Loading data

original_data_path = "accepted_2007_to_2018Q3.csv"

original_data = pd.read_csv(original_data_path, nrows=100000, engine="python")

features = ["loan_amnt", "int_rate", "emp_length", "home_ownership", "annual_inc",
           "verification_status", "loan_status", "title", "delinq_2yrs", "fico_range_low", "fico_range_high",
            "pub_rec", "revol_bal", "revol_util", "application_type", "tot_cur_bal", "chargeoff_within_12_mths", "pub_rec_bankruptcies",
           "total_il_high_credit_limit"]

# Features that are not numbers:
# One Hot encoding: home_ownership, verification_status, title ==== done
# Direct change:
# - emp_length: a years (2 <= a <= 10, 10+ years, <1 year)  ===== done
# - loan_status: only takes Charged Off and Fully Paid  ===== done
# - application_type: deals only with individual   ====== done

# Direct change first:

chosen_data = original_data[features]
chosen_data.drop(chosen_data[(chosen_data.loan_status != "Fully Paid") & (chosen_data.loan_status != "Charged Off")].index, inplace = True)

# Choose only individuals
chosen_data.drop(chosen_data[(chosen_data.application_type != "Individual")].index, inplace = True)
chosen_data = chosen_data.drop("application_type", axis = 1)

# Reformatting emp_length:
chosen_data["emp_length"].fillna(0, inplace = True)
emp_length_di = {"< 1 year": 0.5, "1 year": 1, "2 years": 2, "3 years": 3, "4 years": 4, "5 years": 5, "6 years": 6, "7 years": 7,
                "8 years": 8, "9 years": 9, "10+ years": 10}
chosen_data = chosen_data.replace({"emp_length": emp_length_di})

# Fill known empty columns
chosen_data['title'].fillna("Other", inplace = True)
util_median = chosen_data['revol_util'].median()
chosen_data['revol_util'].fillna(util_median, inplace = True)

# Double checking for empty cells
cols_with_missing = [col for col in chosen_data.columns if chosen_data[col].isnull().any()]
chosen_data = chosen_data.replace({"loan_status": {"Charged Off": 0, "Fully Paid": 1}})

chosen_data.to_csv("clean_data.csv")

# Training data

We get the clean set of data in and use it to train our data. 

In [104]:
# Loading clean data

clean_data_path = "clean_data.csv"
clean_data = pd.read_csv(clean_data_path, engine="python")

X = clean_data.drop(["loan_status", "int_rate"], axis = 1)

hot_encoded_X = pd.get_dummies(X)
y = clean_data.loan_status


# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(hot_encoded_X, y, test_size=0.3, random_state=1)

# fit model no training data
model = XGBClassifier(n_estimators = 150, learning_rate = 0.05)
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)

# evaluate accuracy of predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



Accuracy: 75.98%


In [105]:
# evaluate increase in profit and interest rates
default_risk = model.predict_proba(hot_encoded_X)[:, 0] * 100
clean_data['default_risk'] = default_risk

# if default_risk > 25% we are not lending
mask = (clean_data['default_risk'] <= 25)
clean_data_not_default = clean_data[mask]

clean_data['new_loan_amnt'] = 0
clean_data.loc[mask, 'new_loan_amnt'] = clean_data_not_default['loan_amnt']

# total lost
# we have profit = interest_revenue - default_lost

clean_data['default_lost_predicted'] = clean_data.default_risk * clean_data.new_loan_amnt / 100
total_lost_predicted = clean_data['default_lost_predicted'].sum()

clean_data['default_lost_actual'] = clean_data.loan_amnt * abs(clean_data.loan_status - 1)
total_lost_actual = clean_data['default_lost_actual'].sum()
clean_data['interest_collected_actual'] = clean_data.loan_amnt * clean_data.int_rate / 100
total_interest_actual = clean_data['interest_collected_actual'].sum()
total_profit_actual = total_interest_actual - total_lost_actual

print(total_lost_predicted)
print(total_lost_actual)
print(total_interest_actual)
print(total_profit_actual)


# interest

clean_data

61282074.150980055
174754225.0
99877677.35499999
-74876547.64500001


Unnamed: 0.1,Unnamed: 0,loan_amnt,int_rate,emp_length,home_ownership,annual_inc,verification_status,loan_status,title,delinq_2yrs,...,revol_util,tot_cur_bal,chargeoff_within_12_mths,pub_rec_bankruptcies,total_il_high_credit_limit,default_risk,new_loan_amnt,default_lost_predicted,default_lost_actual,interest_collected_actual
0,2,12000.0,16.99,0.0,RENT,30000.0,Not Verified,1,Credit card refinancing,0.0,...,38.2,8862.0,0.0,0.0,0.0,34.059151,0.0,0.000000,0.0,2038.8000
1,7,25900.0,25.49,10.0,MORTGAGE,54000.0,Verified,0,Credit card refinancing,0.0,...,64.5,166140.0,0.0,0.0,7665.0,32.788891,0.0,0.000000,25900.0,6601.9100
2,9,8000.0,7.99,10.0,MORTGAGE,57000.0,Source Verified,1,Credit card refinancing,3.0,...,36.7,146207.0,0.0,0.0,88422.0,15.151757,8000.0,1212.140579,0.0,639.2000
3,12,13000.0,5.32,10.0,MORTGAGE,101800.0,Not Verified,0,Credit card refinancing,0.0,...,57.5,168092.0,0.0,0.0,52184.0,10.611701,13000.0,1379.521132,13000.0,691.6000
4,16,5000.0,11.44,10.0,RENT,60000.0,Source Verified,1,Credit card refinancing,0.0,...,27.8,24036.0,0.0,0.0,19519.0,20.401371,5000.0,1020.068550,0.0,572.0000
5,17,25000.0,28.69,10.0,RENT,83807.4,Source Verified,1,Debt consolidation,0.0,...,38.6,65044.0,0.0,0.0,63196.0,34.860569,0.0,0.000000,0.0,7172.5000
6,19,4500.0,12.74,6.0,RENT,32000.0,Not Verified,1,Credit card refinancing,0.0,...,64.0,25255.0,0.0,0.0,29681.0,28.119183,0.0,0.000000,0.0,573.3000
7,22,11100.0,5.32,3.0,RENT,60000.0,Not Verified,1,Credit card refinancing,0.0,...,34.4,67344.0,0.0,0.0,39360.0,18.224770,11100.0,2022.949425,0.0,590.5200
8,23,12000.0,14.99,10.0,MORTGAGE,68000.0,Source Verified,1,Debt consolidation,0.0,...,85.3,273093.0,0.0,0.0,16343.0,19.175488,12000.0,2301.058502,0.0,1798.8000
9,25,20000.0,14.99,2.0,RENT,59200.0,Not Verified,1,Major purchase,1.0,...,12.8,263533.0,0.0,0.0,238242.0,35.875076,0.0,0.000000,0.0,2998.0000
