<a href="https://colab.research.google.com/github/martharegina/machinelearning/blob/main/lendingclub_oc_pb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.utils import resample
from scipy.stats import f_oneway

# Preprocessing

In [33]:
# Load dataset

object_cols = [
    'annual_inc',
    'dti',
    'fico_range_low',
    'fico_range_high',
    'emp_length',
    'revol_util',
    'total_acc',
    'open_acc',
    'delinq_2yrs',
    'loan_amnt',
    'installment',
    'term',
    'int_rate',
    'purpose',
    'application_type',
    'home_ownership',
    'issue_d'
]

data = pd.read_csv(
    '/content/accepted_2007_to_2018Q4.csv.gz',
    compression='gzip',
    usecols=object_cols,
    low_memory=False)
data.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,issue_d,purpose,dti,delinq_2yrs,fico_range_low,fico_range_high,open_acc,revol_util,total_acc,application_type
0,3600.0,36 months,13.99,123.03,10+ years,MORTGAGE,55000.0,Dec-2015,debt_consolidation,5.91,0.0,675.0,679.0,7.0,29.7,13.0,Individual
1,24700.0,36 months,11.99,820.28,10+ years,MORTGAGE,65000.0,Dec-2015,small_business,16.06,1.0,715.0,719.0,22.0,19.2,38.0,Individual
2,20000.0,60 months,10.78,432.66,10+ years,MORTGAGE,63000.0,Dec-2015,home_improvement,10.78,0.0,695.0,699.0,6.0,56.2,18.0,Joint App
3,35000.0,60 months,14.85,829.9,10+ years,MORTGAGE,110000.0,Dec-2015,debt_consolidation,17.06,0.0,785.0,789.0,13.0,11.6,17.0,Individual
4,10400.0,60 months,22.45,289.91,3 years,MORTGAGE,104433.0,Dec-2015,major_purchase,25.37,1.0,695.0,699.0,12.0,64.5,35.0,Individual


In [34]:
# Drop baris yang punya missing values
data = data.dropna(axis=0)
data.shape

(2111997, 17)

In [35]:
# Pilih data di tahun 2018 saja
data['issue_d'] = pd.to_datetime(data['issue_d'])
data = data[data['issue_d'].dt.year == 2018]

  data['issue_d'] = pd.to_datetime(data['issue_d'])


In [36]:
# Ordinal encoding emp_length
emp_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10
}
data['emp_length'] = data['emp_length'].map(emp_map)

In [37]:
# Binary encoding term
data['term'] = data['term'].map({' 36 months': 0, ' 60 months': 1})

In [91]:
# Behavioral grouping purpose
consumption = [
    'credit_card', 'vacation', 'wedding', 'moving',
    'medical', 'other', 'car'
]

productive = [
    'house', 'home_improvement', 'renewable_energy',
    'debt_consolidation', 'small_business', 'major_purchase'
]

def map_purpose(x):
    if x in consumption:
        return 'consumption'
    elif x in productive:
        return 'productive'

data['purpose_group'] = data['purpose'].apply(map_purpose)

In [39]:
# Binary encoding application_type
data['joint_app'] = (data['application_type'] == 'Joint App').astype(int)
data = data.drop(columns=['application_type'], axis=1)

In [40]:
# Ordinal encoding home_ownership
home_map = {
    'RENT': 0,
    'MORTGAGE': 1,
    'OWN': 2,
    'ANY': 0
}

data['home_ownership'] = data['home_ownership'].map(home_map)

In [41]:
# Tambah kolom loan_to_income and installment_to_income
data['loan_to_income'] = data['loan_amnt'] / data['annual_inc']
data['installment_to_income'] = data['installment'] / data['annual_inc']

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 452656 entries, 421097 to 1611876
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   loan_amnt              452656 non-null  float64       
 1   term                   452656 non-null  int64         
 2   int_rate               452656 non-null  float64       
 3   installment            452656 non-null  float64       
 4   emp_length             452656 non-null  int64         
 5   home_ownership         452656 non-null  int64         
 6   annual_inc             452656 non-null  float64       
 7   issue_d                452656 non-null  datetime64[ns]
 8   purpose                452656 non-null  object        
 9   dti                    452656 non-null  float64       
 10  delinq_2yrs            452656 non-null  float64       
 11  fico_range_low         452656 non-null  float64       
 12  fico_range_high        452656 non-null  flo

# Overconfidence

In [43]:
features_oc = [
    'loan_to_income',
    'installment_to_income',
    'dti',
    'fico_range_low',
    'revol_util',
    'emp_length',
    'delinq_2yrs'
]

In [44]:
np.isinf(data[features_oc]).sum()

Unnamed: 0,0
loan_to_income,1
installment_to_income,1
dti,0
fico_range_low,0
revol_util,0
emp_length,0
delinq_2yrs,0


In [45]:
data[features_oc] = data[features_oc].replace([np.inf, -np.inf], np.nan)

In [46]:
data_oc = data[features_oc].dropna()

In [47]:
X_oc = StandardScaler().fit_transform(data_oc)

In [57]:
# Pertimbangan jumlah cluster
X_sample = resample(X_oc, n_samples=50000, random_state=42)

sil = []
for k in range(2,7):
    km = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=10000)
    labels = km.fit_predict(X_sample)
    sil.append(silhouette_score(X_sample, labels))

sil

[np.float64(0.23066879440483096),
 np.float64(0.24517169096146693),
 np.float64(0.2454923032154201),
 np.float64(0.2220594340807672),
 np.float64(0.2174725008257129)]

In [84]:
# Clustering k=3
kmeans = KMeans(n_clusters=3, random_state=42)
data_oc['OC_cluster'] = kmeans.fit_predict(X_oc)

In [85]:
# Buat cluster profile
cluster_profile = data_oc.groupby('OC_cluster')[features_oc].mean()
cluster_profile

Unnamed: 0_level_0,loan_to_income,installment_to_income,dti,fico_range_low,revol_util,emp_length,delinq_2yrs
OC_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.241837,0.007318,18.943412,689.245762,51.315087,2.32543,0.238962
1,0.243044,0.006754,17.133798,750.555001,19.351624,6.114045,0.076123
2,0.264236,0.0079,21.102408,690.090418,55.752518,9.309386,0.346536


In [63]:
data_oc['overconfident'] = (data_oc['OC_cluster'] == 2).astype(int)

In [68]:
from scipy.stats import f_oneway

f_oneway(
    data_oc[data_oc.OC_cluster==0]['delinq_2yrs'],
    data_oc[data_oc.OC_cluster==1]['delinq_2yrs'],
    data_oc[data_oc.OC_cluster==2]['delinq_2yrs']
)

F_onewayResult(statistic=np.float64(4563.13930653073), pvalue=np.float64(0.0))

# Present Bias

In [95]:
features_pb = [
    'term',
    'installment_to_income',
    'int_rate',
    'loan_to_income',
    'delinq_2yrs'
]

In [96]:
np.isinf(data[features_pb]).sum()

Unnamed: 0,0
term,0
installment_to_income,0
int_rate,0
loan_to_income,0
delinq_2yrs,0


In [97]:
data[features_pb] = data[features_pb].replace([np.inf, -np.inf], np.nan)

In [98]:
data_pb = data[features_pb].dropna()

In [99]:
X_pb = StandardScaler().fit_transform(data_pb)

In [100]:
# Pertimbangan jumlah cluster
X_sample = resample(X_pb, n_samples=50000, random_state=42)

sil = []
for k in range(2,7):
    km = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=10000)
    labels = km.fit_predict(X_sample)
    sil.append(silhouette_score(X_sample, labels))

sil

[np.float64(0.5553044539941017),
 np.float64(0.5377778834493265),
 np.float64(0.5329255776063355),
 np.float64(0.49797730363432413),
 np.float64(0.45513883340661365)]

In [101]:
# Clustering k=3
kmeans = KMeans(n_clusters=2, random_state=42)
data_pb['PB_cluster'] = kmeans.fit_predict(X_pb)

In [102]:
# Buat cluster profile
cluster_profile = data_pb.groupby('PB_cluster')[features_pb].mean()
cluster_profile

Unnamed: 0_level_0,term,installment_to_income,int_rate,loan_to_income,delinq_2yrs
PB_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.998921,0.008753,15.15495,0.352625,0.209908
1,0.0,0.006751,11.601902,0.204098,0.242942


In [103]:
data_pb['present_biased'] = (data_pb['PB_cluster'] == 0).astype(int)

In [104]:
pd.crosstab(data_pb['PB_cluster'], data['purpose_group'], normalize='index')

purpose_group,consumption,productive
PB_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.301804,0.698196
1,0.395829,0.604171


In [106]:
from scipy.stats import f_oneway

f_oneway(
    data_pb[data_pb.PB_cluster==0]['delinq_2yrs'],
    data_pb[data_pb.PB_cluster==1]['delinq_2yrs']
)

F_onewayResult(statistic=np.float64(186.64058397651104), pvalue=np.float64(1.754264583403895e-42))