## Project Goal
- Predict the likelihood of a customer defaulting on their load

## 
1 Status 	credit status
2 Seniority 	job seniority (years)
3 Home 	type of home ownership
4 Time 	time of requested loan
5 Age 	client's age
6 Marital 	marital status
7 Records 	existance of records
8 Job 	type of job
9 Expenses 	amount of expenses
10 Income 	amount of income
11 Assets 	amount of assets
12 Debt 	amount of debt
13 Amount 	amount requested of loan
14 Price 	price of good

## Importing Libraries

In [1]:
## loading and preprocessing data
import pandas as pd
import numpy as np

## visualizing data
import matplotlib.pyplot as plt
import seaborn as sns

## 
from sklearn.model_selection import train_test_split

## Dowloading And Loading The Datasets

In [2]:
#url = "https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv"

#!wget $url 

In [3]:
df = pd.read_csv('CreditScoring.csv')

In [4]:
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [5]:
## normalize the columns names to lowercase
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [6]:
df.status.value_counts()

1    3200
2    1254
0       1
Name: status, dtype: int64

In [7]:
## lets replace numeric categorical values with strings
status = {1: "default", 2: "ok", 0: "unk"}

df.status = df.status.map(status)

In [8]:
df.home.value_counts()

2    2107
1     973
5     783
6     319
3     247
4      20
0       6
Name: home, dtype: int64

In [9]:
home = {1: "rent", 2: "owner", 3: "priv", 4: "ignore", 5: "parents", 0: "other"}

df.home = df.home.map(home)

In [10]:
df.marital.value_counts()

2    3241
1     978
4     130
3      67
5      38
0       1
Name: marital, dtype: int64

In [11]:
marital = {1: "single", 2: "married", 3: "widow", 4: "separated", 5: "divorced", 0: "unk"}

df.marital = df.marital.map(marital)

In [12]:
df.records.value_counts()

1    3682
2     773
Name: records, dtype: int64

In [13]:
records = {1: "no_rec", 2: "yes_rec"}

df.records = df.records.map(records)

In [14]:
df.job.value_counts()

1    2806
3    1024
2     452
4     171
0       2
Name: job, dtype: int64

In [15]:
jobs = {1: "fixed", 2: "partime", 3: "freelance", 4: "others", 0: "unk"}
df.job = df.job.map(jobs)

In [16]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,default,9,rent,60,30,married,no_rec,freelance,73,129,0,0,800,846
1,default,17,rent,60,58,widow,no_rec,fixed,48,131,0,0,1000,1658
2,ok,10,owner,36,46,married,yes_rec,freelance,90,200,3000,0,2000,2985
3,default,0,rent,60,24,single,no_rec,fixed,63,182,2500,0,900,1325
4,default,0,rent,36,26,single,no_rec,fixed,46,107,0,0,310,910


In [17]:
## summary stats of the dataframe
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [18]:
for c in ["income", "assets", "debt"]:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

In [19]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [20]:
## lets remove the un]known records in our dataset
[df.status != "unk"]

[0       True
 1       True
 2       True
 3       True
 4       True
         ... 
 4450    True
 4451    True
 4452    True
 4453    True
 4454    True
 Name: status, Length: 4455, dtype: bool]

## Build A Validation Framework


In [21]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=42)

print(f"Training size {len(df_train)}")
print(f"Validation size {len(df_valid)}")
print(f"Testing size {len(df_test)}")

Training size 2673
Validation size 891
Testing size 891


In [22]:
y_train = (df_train.status == "default").values
y_valid = (df_valid.status == "default").values
y_test = (df_test.status == "default").values

In [23]:
del df_train['status']
del df_valid['status']
del df_test['status']

## Build A Decision Tree Model 

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [25]:
df_train.isnull().sum()

seniority      0
home         204
time           0
age            0
marital        0
records        0
job            0
expenses       0
income        21
assets        26
debt           7
amount         0
price          0
dtype: int64

In [26]:
## lets fill in missing values with zeros
df_train = df_train.fillna(0)
df_valid = df_valid.fillna(0)

## convert categorical data to numerical 
dict_train = df_train.to_dict(orient='records')
dict_valid = df_valid.to_dict(orient='records')

## 
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(dict_train)
X_valid = dv.transform(dict_valid)

In [27]:
len(X_train), len(y_train)

(2673, 2673)

In [28]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(X_train,y_train)

In [37]:
y_pred_train = clf_tree.predict_proba(X_train)[:,1]
y_pred_train


array([1., 1., 1., ..., 1., 1., 1.])

In [38]:
roc_auc_score(y_train, y_pred_train)

0.9999996450431698

In [39]:
## generate predictions on the validation set
y_pred_valid = clf_tree.predict_proba(X_valid)[:,1]

In [40]:
roc_auc_score(y_valid, y_pred_valid)

0.6645045056181164

In [41]:
clf_tree = DecisionTreeClassifier(max_depth=3)
clf_tree.fit(X_train, y_train)

In [43]:
y_pred_train = clf_tree.predict_proba(X_train)[:,1]

roc_auc_score(y_train, y_pred_train)

0.7783638903836516

In [42]:
y_pred_valid = clf_tree.predict_proba(X_valid)[:,1]

auc = roc_auc_score(y_valid, y_pred_valid)

print(f"Validation Accuracy: {auc}")

Validation Accuracy: 0.7359701603233662
