In [317]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

import lightgbm as lgb

In [318]:
from scipy.stats import pearsonr

In [319]:
df = pd.read_csv('../data/german_credit_data.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df.columns = [x.lower().replace(' ', '_') for x in df.columns]

In [320]:
job_dic = {
    0: 'unskilled non res',
    1: 'unskilled resident',
    2: 'skilled',
    3: 'highly skilled'
}
df.job = df.job.map(job_dic)

In [321]:
numerical_features = ['age', 'credit_amount', 'duration']
categorical_features = ['sex', 'job', 'housing', 'saving_accounts', 'checking_account', 'purpose']

In [322]:
df = df.fillna('undefined')

In [323]:
df['target'] = (df.risk == 'bad').astype(int)
df.sample(3)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,risk,target
476,24,male,skilled,own,quite rich,undefined,2569,39,car,good,0
126,40,male,unskilled resident,own,little,little,701,12,radio/TV,good,0
649,40,male,unskilled resident,rent,little,little,684,12,education,bad,1


In [324]:
top_quantiles_space = np.linspace(.1, 0.9, 9)

numerical_frame = pd.DataFrame()
for feature in numerical_features:
    for q in top_quantiles_space:
        value = df[feature].quantile(q)

        evaluation_series = (df[feature] >= value)
        r, p = pearsonr(df.target, evaluation_series)
        numerical_frame = pd.concat([numerical_frame, pd.DataFrame({'feature': feature, 'operation':'bigger than', 'quantile': q, 'pearsonr': r, 'value':value}, index=[0])])
        
        evaluation_series = (df[feature] <= value)
        r, p = pearsonr(df.target, evaluation_series)
        numerical_frame = pd.concat([numerical_frame, pd.DataFrame({'feature': feature, 'operation':'smaller than', 'quantile': q, 'pearsonr': r, 'value':value}, index=[0])])

categorical_frame = pd.DataFrame()
for feature in categorical_features:
    for category in df[feature].unique():
        evaluation_series = (df[feature] == category)
        r, p = pearsonr(df.target, evaluation_series)
        categorical_frame = pd.concat([categorical_frame, pd.DataFrame({'feature': feature, 'value': category, 'pearsonr': r}, index=[0])])
    

In [325]:
## good_signs selection
print('The condition for a selection to be considered a bad sign is that the correlation is greater than 0.1')
numerical_bad_signs = numerical_frame[numerical_frame.pearsonr > 0.1]
display(numerical_bad_signs.sample(5))
print('For each feature I will take the broadest quantile that fits the condition:')
numerical_bad_signs = numerical_bad_signs.sort_values('quantile').drop_duplicates(['feature', 'operation'], keep='first')
display(numerical_bad_signs.head(5))

The condition for a selection to be considered a bad sign is that the correlation is greater than 0.1


Unnamed: 0,feature,operation,quantile,pearsonr,value
0,age,smaller than,0.3,0.100864,28.0
0,duration,bigger than,0.6,0.149747,24.0
0,age,smaller than,0.5,0.114405,33.0
0,duration,bigger than,0.5,0.175723,18.0
0,duration,bigger than,0.2,0.15336,12.0


For each feature I will take the broadest quantile that fits the condition:


Unnamed: 0,feature,operation,quantile,pearsonr,value
0,duration,bigger than,0.1,0.136092,9.0
0,age,smaller than,0.2,0.112409,26.0
0,credit_amount,bigger than,0.7,0.11275,3590.0


In [326]:
print('for categorical variables, I just select r > 0.1')
categorical_bad_signs = categorical_frame[categorical_frame.pearsonr > 0.1]
display(categorical_bad_signs.head(5))
bad_signs = pd.concat([numerical_bad_signs, categorical_bad_signs])

for categorical variables, I just select r > 0.1


Unnamed: 0,feature,value,pearsonr
0,saving_accounts,little,0.161007
0,checking_account,little,0.258333
0,checking_account,moderate,0.119581


In [327]:
numerical_good_signs = numerical_frame[numerical_frame.pearsonr < -0.1]
numerical_good_signs = numerical_good_signs.sort_values('quantile').drop_duplicates(['feature', 'operation'], keep='first')
categorical_good_signs = categorical_frame[categorical_frame.pearsonr < -0.1]
good_signs = pd.concat([numerical_good_signs, categorical_good_signs])

In [328]:
good_signs['type'] = 'good_sign'
bad_signs['type'] = 'bad_sign'

signs_conditions = pd.concat([good_signs, bad_signs])
signs_conditions = signs_conditions[['feature','operation','value', 'type']]
signs_conditions.operation = signs_conditions.operation.fillna('equal to')

In [329]:
df['number_good_signs'] = 0
df['number_bad_signs'] = 0
for condition in signs_conditions.iloc:
    sign_type = condition.type ### good or bad
    if condition.operation == 'bigger than':
        df[f'number_{sign_type}s'] += (df[condition.feature] >= condition.value).astype(int)
    elif condition.operation == 'smaller than':
        df[f'number_{sign_type}s'] += (df[condition.feature] <= condition.value).astype(int)
    else:
        df[f'number_{sign_type}s'] += (df[condition.feature] == condition.value).astype(int)
        
df['good_bad_balance'] = df.number_good_signs - df.number_bad_signs

In [331]:
print(pearsonr(df.number_good_signs, df.target))
print(pearsonr(df.number_bad_signs, df.target))
print(pearsonr(df.good_bad_balance, df.target))

PearsonRResult(statistic=-0.3484071684948514, pvalue=6.485868863202711e-30)
PearsonRResult(statistic=0.3511348318683366, pvalue=2.175101419068473e-30)
PearsonRResult(statistic=-0.37371518243864044, pvalue=1.6854356618569274e-34)


In [332]:
categorical_encoder_frame = pd.DataFrame()
for feature in categorical_features:
    for value in df[feature].unique():
            evaluation_series = (df[feature] == value)
            r, p = pearsonr(df.target, evaluation_series)
            categorical_encoder_frame = pd.concat(
                [categorical_encoder_frame, pd.DataFrame(
                    {'feature': feature, 'value': value, 'pearsonr': r}, index=[0])])
                    
categorical_encoder_frame = categorical_encoder_frame.sort_values(['feature','pearsonr']
    ).drop('pearsonr', axis=1).reset_index(drop=True)
categorical_encoder_frame['encoded'] = 1
categorical_encoder_frame.encoded = categorical_encoder_frame.groupby('feature').encoded.cumsum()

In [333]:
for feature in categorical_features:
    encoding_dic = categorical_encoder_frame[categorical_encoder_frame.feature == feature
        ][['value', 'encoded']].set_index('value').to_dict()['encoded']
    df[feature] = df[feature].map(encoding_dic)

## encode categorical features ordinally
for feature in categorical_features:
    df[feature] = df[feature].astype('category').cat.codes

In [334]:
## train lgbm

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['risk', 'target'], axis=1), df.target, test_size=0.2, stratify=df.target, random_state=42)

lgbm = lgb.LGBMClassifier(verbose = -1)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred))

## estimate precision and recall at different thresholds

precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
f1_scores = [f1_score(y_test, y_pred > threshold) for threshold in thresholds]

0.7707142857142858


In [335]:
## calculate recall at 50% precision
recall_at_50_precision = recall[np.argmin(np.abs(np.array(precision) - 0.5))]
print(recall_at_50_precision) 

0.65


In [336]:
## feature importances
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': lgbm.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)

In [337]:
feature_importances

Unnamed: 0,feature,importance
6,credit_amount,1056
0,age,589
7,duration,377
8,purpose,204
5,checking_account,152
11,good_bad_balance,137
9,number_good_signs,117
4,saving_accounts,92
2,job,89
1,sex,53
