In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

import lightgbm as lgb

In [65]:
from scipy.stats import pearsonr

In [66]:
df = pd.read_csv('../data/german_credit_data.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df.columns = [x.lower().replace(' ', '_') for x in df.columns]

In [67]:
job_dic = {
    0: 'unskilled non res',
    1: 'unskilled resident',
    2: 'skilled',
    3: 'highly skilled'
}
df.job = df.job.map(job_dic)

In [68]:
numerical_features = ['age', 'credit_amount', 'duration']
categorical_features = ['sex', 'job', 'housing', 'saving_accounts', 'checking_account', 'purpose']
df = df.fillna('undefined') ## we are treating indefinitions are new categories as they have strong relations with target
df['target'] = (df.risk == 'bad').astype(int)
df.sample(3)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,risk,target
395,32,male,skilled,rent,moderate,moderate,11760,39,education,good,0
26,39,male,unskilled resident,own,little,undefined,426,6,radio/TV,good,0
199,40,male,highly skilled,own,little,moderate,4297,18,furniture/equipment,bad,1


## Feature creation

The main idea here is to find "good indicators" and "bad indicators".
- A customer having a bad indicator means "being in a group that is an evidence of the customer being bad. For instance, having high credit duration as we saw on eda

- Good indicator is the opposite of above

- The indicators will be found by calculating the correlation between the group and target

- Examples of group are: "rich saving_accounts" or "90th percentile credit_amount"

- I decided to use pearson correlation above 0.1 here, though this can be tuned

In [69]:
top_quantiles_space = np.linspace(.1, 0.9, 9)

numerical_frame = pd.DataFrame()
for feature in numerical_features:
    for q in top_quantiles_space:
        value = df[feature].quantile(q)

        evaluation_series = (df[feature] >= value)
        r, p = pearsonr(df.target, evaluation_series)
        numerical_frame = pd.concat([numerical_frame, pd.DataFrame({
            'feature': feature, 'operation':'bigger than', 'quantile': q, 'pearsonr': r, 'value':value}, index=[0])])
        
        evaluation_series = (df[feature] <= value)
        r, p = pearsonr(df.target, evaluation_series)
        numerical_frame = pd.concat([numerical_frame, pd.DataFrame({
            'feature': feature, 'operation':'smaller than', 'quantile': q, 'pearsonr': r, 'value':value}, index=[0])])

categorical_frame = pd.DataFrame()
for feature in categorical_features:
    for category in df[feature].unique():
        evaluation_series = (df[feature] == category)
        r, p = pearsonr(df.target, evaluation_series)
        categorical_frame = pd.concat([categorical_frame, pd.DataFrame({
            'feature': feature, 'value': category, 'pearsonr': r}, index=[0])])

In [70]:
display(categorical_frame.head(5))
display(numerical_frame.head(5))
print("Above are examples of potential indicators. Most of them don't cut it (<10% correlation), but we'll filter them below")

Unnamed: 0,feature,value,pearsonr
0,sex,male,-0.075493
0,sex,female,0.075493
0,job,skilled,-0.013559
0,job,unskilled resident,-0.021822
0,job,highly skilled,0.040559


Unnamed: 0,feature,operation,quantile,pearsonr,value
0,age,bigger than,0.1,-0.04612,23.0
0,age,smaller than,0.1,0.074744,23.0
0,age,bigger than,0.2,-0.127938,26.0
0,age,smaller than,0.2,0.112409,26.0
0,age,bigger than,0.3,-0.094643,28.0


Above are examples of potential indicators. Most of them don't cut it (<10% correlation), but we'll filter them below


In [71]:
print('5 examples of negative indicators:')
numerical_negative_indicators = numerical_frame[numerical_frame.pearsonr > 0.1]
display(numerical_negative_indicators.head(5))
print('For each feature I will take the broadest quantile that fits the condition:')
numerical_negative_indicators = numerical_negative_indicators.sort_values('quantile').drop_duplicates(['feature', 'operation'], keep='first')
display(numerical_negative_indicators.head(5))

5 examples of negative indicators:


Unnamed: 0,feature,operation,quantile,pearsonr,value
0,age,smaller than,0.2,0.112409,26.0
0,age,smaller than,0.3,0.100864,28.0
0,age,smaller than,0.4,0.109549,30.0
0,age,smaller than,0.5,0.114405,33.0
0,credit_amount,bigger than,0.7,0.11275,3590.0


For each feature I will take the broadest quantile that fits the condition:


Unnamed: 0,feature,operation,quantile,pearsonr,value
0,duration,bigger than,0.1,0.136092,9.0
0,age,smaller than,0.2,0.112409,26.0
0,credit_amount,bigger than,0.7,0.11275,3590.0


In [72]:
print('For categorical variables, I just select r > 0.1')
categorical_negative_indicators = categorical_frame[categorical_frame.pearsonr > 0.1]
display(categorical_negative_indicators.head(5))
negative_indicators = pd.concat([numerical_negative_indicators, categorical_negative_indicators])

For categorical variables, I just select r > 0.1


Unnamed: 0,feature,value,pearsonr
0,saving_accounts,little,0.161007
0,checking_account,little,0.258333
0,checking_account,moderate,0.119581


In [73]:
numerical_positive_indicators = numerical_frame[numerical_frame.pearsonr < -0.1]
numerical_positive_indicators = numerical_positive_indicators.sort_values('quantile').drop_duplicates(['feature', 'operation'], keep='first')
categorical_positive_indicators = categorical_frame[categorical_frame.pearsonr < -0.1]
positive_indicators = pd.concat([numerical_positive_indicators, categorical_positive_indicators])

In [74]:
positive_indicators['type'] = 'positive_indicator'
negative_indicators['type'] = 'negative_indicator'

signs_conditions = pd.concat([positive_indicators, negative_indicators])
signs_conditions = signs_conditions[['feature','operation','value', 'type']]
signs_conditions.operation = signs_conditions.operation.fillna('equal to')

#### Final table with positive and negative indicators, from numerical and categorical features:

In [75]:
signs_conditions.sample(5)

Unnamed: 0,feature,operation,value,type
0,checking_account,equal to,little,negative_indicator
0,credit_amount,bigger than,3590.0,negative_indicator
0,checking_account,equal to,moderate,negative_indicator
0,saving_accounts,equal to,little,negative_indicator
0,purpose,equal to,radio/TV,positive_indicator


In [76]:
signs_conditions.type.value_counts()

positive_indicator    7
negative_indicator    6
Name: type, dtype: int64

In [77]:
signs_conditions.operation.value_counts()

equal to        7
smaller than    3
bigger than     3
Name: operation, dtype: int64

Now I'll count how many positive and negative indicators customers have:

In [78]:
df['number_positive_indicators'] = 0
df['number_negative_indicators'] = 0
for condition in signs_conditions.iloc:
    sign_type = condition.type ### good or bad
    if condition.operation == 'bigger than':
        df[f'number_{sign_type}s'] += (df[condition.feature] >= condition.value).astype(int)
    elif condition.operation == 'smaller than':
        df[f'number_{sign_type}s'] += (df[condition.feature] <= condition.value).astype(int)
    else:
        df[f'number_{sign_type}s'] += (df[condition.feature] == condition.value).astype(int)
        
df['good_bad_balance'] = df.number_positive_indicators - df.number_negative_indicators

In [79]:
print(pearsonr(df.number_positive_indicators, df.target))
print(pearsonr(df.number_negative_indicators, df.target))
print(pearsonr(df.good_bad_balance, df.target))

PearsonRResult(statistic=-0.3484071684948514, pvalue=6.485868863202711e-30)
PearsonRResult(statistic=0.3511348318683366, pvalue=2.175101419068473e-30)
PearsonRResult(statistic=-0.37371518243864044, pvalue=1.6854356618569274e-34)


- Those are the biggest correlations I found on this dataset yet. But this doesn't mean much as these features are highly overfitted to this metric

- In order to avoid data overfitting due to data leaking, this data processing will be done separately on training and testing sets

- For this reason I put the code above on a .py file that will be used on modeling so we can add it to the ML pipelines

### Categorical feature encoding

- One-hot encoding creates many features, which can negatively affect performance in a small dataset like this one, and which is harder to tune

- I won't use ordinal encoder and instead:

- I will ordinally encode the classes but in a specific order: I will order categories by correlation to the target

- For instance, if men are more risky than women, women will be 1 and men will be 2

- This makes it easier for the dataset to be linearly separable and makes it easier for the model to learn

- This is also prone to data leakage, so it will also be done separately on train/test set by adding it to the pipeline

In [80]:
categorical_encoder_frame = pd.DataFrame()
for feature in categorical_features:
    for value in df[feature].unique():
            evaluation_series = (df[feature] == value)
            r, p = pearsonr(df.target, evaluation_series)
            categorical_encoder_frame = pd.concat(
                [categorical_encoder_frame, pd.DataFrame(
                    {'feature': feature, 'value': value, 'pearsonr': r}, index=[0])])
                    
categorical_encoder_frame = categorical_encoder_frame.sort_values(['feature','pearsonr']
    ).drop('pearsonr', axis=1).reset_index(drop=True)
categorical_encoder_frame['encoded'] = 1
categorical_encoder_frame.encoded = categorical_encoder_frame.groupby('feature').encoded.cumsum()

In [81]:
for feature in categorical_features:
    encoding_dic = categorical_encoder_frame[categorical_encoder_frame.feature == feature
        ][['value', 'encoded']].set_index('value').to_dict()['encoded']
    df[feature] = df[feature].map(encoding_dic)

## End of processing
Below I'll quickly train a model to get a preview of how the feature importances look
- Note that the test below has data leakage so it's not a very accurate depiction of the actual importances

In [82]:
## train lgbm

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['risk', 'target'], axis=1), df.target, test_size=0.2, stratify=df.target, random_state=42)

lgbm = lgb.LGBMClassifier(verbose = -1)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred))

## estimate precision and recall at different thresholds

precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
f1_scores = [f1_score(y_test, y_pred > threshold) for threshold in thresholds]

0.7707142857142858


In [83]:
## calculate recall at 50% precision
recall_at_50_precision = recall[np.argmin(np.abs(np.array(precision) - 0.5))]
print(recall_at_50_precision) 

0.65


In [84]:
## feature importances
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': lgbm.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)

In [85]:
feature_importances

Unnamed: 0,feature,importance
6,credit_amount,1056
0,age,589
7,duration,377
8,purpose,204
5,checking_account,152
11,good_bad_balance,137
9,number_positive_indicators,117
4,saving_accounts,92
2,job,89
1,sex,53


In [86]:
## correlation between every feature and the target
correlations = []
for feature in X_train.columns:
    r, p = pearsonr(X_train[feature], y_train)
    correlations.append({'feature': feature, 'pearsonr': r, 'p_value': p})

correlations = pd.DataFrame(correlations)
correlations = correlations.sort_values('pearsonr', ascending=False)


In [87]:
correlations

Unnamed: 0,feature,pearsonr,p_value
10,number_negative_indicators,0.337728,8.558942000000001e-23
5,checking_account,0.336676,1.181434e-22
7,duration,0.206301,3.875571e-09
4,saving_accounts,0.19044,5.706069e-08
6,credit_amount,0.138776,8.212145e-05
3,housing,0.128635,0.0002644675
8,purpose,0.118568,0.0007787614
1,sex,0.084246,0.0171555
2,job,0.041993,0.2354573
0,age,-0.05775,0.1026316
