In [248]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

!pip install numpy==1.23.4

In [249]:
from scipy.stats import pearsonr
from sklearn.svm import SVC

In [250]:
df = pd.read_csv('../data/german_credit_data.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df.columns = [x.lower().replace(' ', '_') for x in df.columns]

In [251]:
job_dic = {
    0: 'unskilled non res',
    1: 'unskilled resident',
    2: 'skilled',
    3: 'highly skilled'
}
df.job = df.job.map(job_dic)

In [252]:
numerical_features = ['age', 'credit_amount', 'duration']
categorical_features = ['sex', 'job', 'housing', 'saving_accounts', 'checking_account', 'purpose']

In [253]:
df = df.fillna('undefined')

In [254]:
df['target'] = (df.risk == 'bad').astype(int)
df.sample(3)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,risk,target
816,62,male,skilled,own,quite rich,undefined,1338,6,domestic appliances,good,0
35,25,male,unskilled resident,own,little,moderate,4746,45,radio/TV,bad,1
52,25,male,skilled,own,little,undefined,1262,12,radio/TV,good,0


In [255]:
import pandas as pd

def add_columns(df):
    df['new_column'] = df['age'] + df['sex']
    return df


In [256]:
def find_good_and_bad_signs(df, y):
    df['target'] = y
    top_quantiles_space = np.linspace(.1, 0.9, 9)

    numerical_frame = pd.DataFrame()
    for feature in numerical_features:
        for q in top_quantiles_space:
            value = df[feature].quantile(q)

            evaluation_series = (df[feature] >= value)
            r, p = pearsonr(df.target, evaluation_series)
            numerical_frame = pd.concat([numerical_frame, pd.DataFrame({'feature': feature, 'operation':'bigger than', 'quantile': q, 'pearsonr': r, 'value':value}, index=[0])])
            
            evaluation_series = (df[feature] <= value)
            r, p = pearsonr(df.target, evaluation_series)
            numerical_frame = pd.concat([numerical_frame, pd.DataFrame({'feature': feature, 'operation':'smaller than', 'quantile': q, 'pearsonr': r, 'value':value}, index=[0])])

    categorical_frame = pd.DataFrame()
    for feature in categorical_features:
        for category in df[feature].unique():
            evaluation_series = (df[feature] == category)
            r, p = pearsonr(df.target, evaluation_series)
            categorical_frame = pd.concat([categorical_frame, pd.DataFrame({'feature': feature, 'value': category, 'pearsonr': r}, index=[0])])

    ## good_signs selection
    numerical_bad_signs = numerical_frame[numerical_frame.pearsonr > 0.1]
    numerical_bad_signs = numerical_bad_signs.sort_values('quantile').drop_duplicates(['feature', 'operation'], keep='first')
    categorical_bad_signs = categorical_frame[categorical_frame.pearsonr > 0.1]
    bad_signs = pd.concat([numerical_bad_signs, categorical_bad_signs])

    numerical_good_signs = numerical_frame[numerical_frame.pearsonr < -0.1]
    numerical_good_signs = numerical_good_signs.sort_values('quantile').drop_duplicates(['feature', 'operation'], keep='first')
    categorical_good_signs = categorical_frame[categorical_frame.pearsonr < -0.1]
    good_signs = pd.concat([numerical_good_signs, categorical_good_signs])

    good_signs['type'] = 'good_sign'
    bad_signs['type'] = 'bad_sign'

    signs_conditions = pd.concat([good_signs, bad_signs])
    signs_conditions = signs_conditions[['feature','operation','value', 'type']]
    signs_conditions.operation = signs_conditions.operation.fillna('equal to')
        
    return signs_conditions

def apply_signs_conditions(df, signs_conditions):
    df['number_good_signs'] = 0
    df['number_bad_signs'] = 0
    for condition in signs_conditions.iloc:
        sign_type = condition.type ### good or bad
        if condition.operation == 'bigger than':
            df[f'number_{sign_type}s'] += (df[condition.feature] >= condition.value).astype(int)
        elif condition.operation == 'smaller than':
            df[f'number_{sign_type}s'] += (df[condition.feature] <= condition.value).astype(int)
        else:
            df[f'number_{sign_type}s'] += (df[condition.feature] == condition.value).astype(int)
            
    df['good_bad_balance'] = df.number_good_signs - df.number_bad_signs
    return df

In [257]:
def sort_categorical_values_by_correlation(df, y):
    df['target'] = y
    categorical_encoder_frame = pd.DataFrame()
    for feature in categorical_features:
        for value in df[feature].unique():
                evaluation_series = (df[feature] == value)
                r, p = pearsonr(df.target, evaluation_series)
                categorical_encoder_frame = pd.concat(
                    [categorical_encoder_frame, pd.DataFrame(
                        {'feature': feature, 'value': value, 'pearsonr': r}, index=[0])])
                        
    categorical_encoder_frame = categorical_encoder_frame.sort_values(['feature','pearsonr']
        ).drop('pearsonr', axis=1).reset_index(drop=True)
    categorical_encoder_frame['encoded'] = 1
    categorical_encoder_frame.encoded = categorical_encoder_frame.groupby('feature').encoded.cumsum()
    return categorical_encoder_frame

def encode_categorical_values(df, categorical_encoder_frame):
    for feature in categorical_features:
        encoding_dic = categorical_encoder_frame[categorical_encoder_frame.feature == feature
            ][['value', 'encoded']].set_index('value').to_dict()['encoded']
        df[feature] = df[feature].map(encoding_dic)
    return df

In [345]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreprocessingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, dropped_features = []):
        self.dropped_features = dropped_features
        pass

    def fit(self, X, y=None):
        self.categorical_features_order = sort_categorical_values_by_correlation(X.copy(), y)
        self.good_and_bad_signs_frme = find_good_and_bad_signs(X.copy(), y)
        return self

    def transform(self, X):
        X = encode_categorical_values(X.copy(), self.categorical_features_order)
        X = apply_signs_conditions(X.copy(), self.good_and_bad_signs_frme)
        X = X.drop(self.dropped_features, axis = 1)
        self.features = X.columns
        return X


In [259]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [260]:
## bayes optimization

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


In [311]:
lgbm = lgb.LGBMClassifier(verbose = -1, n_estimators=30)

pipeline = Pipeline([
    ('preprocess', PreprocessingTransformer()), ## standard scaler below
    ('model', lgb.LGBMClassifier(verbose = -1, optimizer='dart'))
])

bayes_search = BayesSearchCV(
    pipeline,
    {
        'model__num_leaves': Integer(10, 100),
        'model__feature_fraction': Real(0.1, 1),
        'model__learning_rate': Real(0.01, 0.1),
    },
    cv=5,
    verbose=-1,
    n_jobs=-1,
    n_iter=3,
    return_train_score=False
)
bayes_search.fit(X_train, y_train);

In [262]:
def warn(model_params, best_params):
    for param in model_params:
        prior = model_params[param].prior
        if 'prior' not in ['uniform', 'log-uniform']:
            continue

        low = model_params[param].low
        high = model_params[param].high

        if prior == 'log-uniform':
            space = np.logspace(np.log10(low), np.log10(high), 10)
        elif prior == 'uniform':
            space = np.linspace(low, high, 10)
        value_found = best_params[param]

        if (value_found > space[-2]) or (value_found < space[1]):
            print('warning:', param, 'is at the edge of the search space (value:', value_found, ')')

In [317]:
def test_pipeline(model, model_params, cv, n_iter, print_scores=True, dropped_features = []):

    pipeline = Pipeline([
        ('preprocess', PreprocessingTransformer(dropped_features=dropped_features)), ## standard scaler below
        ('model', model)
        ])

    bayes_search = BayesSearchCV(
        pipeline,
        model_params,
        cv=cv,
        verbose=-1,
        n_jobs=-1,
        return_train_score=True,
        n_iter = n_iter
    )

    bayes_search.fit(X_train, y_train)

    best_estimator_test_score = bayes_search.best_score_
    best_estimator_train_score = bayes_search.cv_results_['mean_train_score'][bayes_search.best_index_]

    output = {
        'best_estimator_test_score': best_estimator_test_score,
        'best_estimator_train_score': best_estimator_train_score,
        'best_estimator': bayes_search.best_estimator_,
        'best_params': bayes_search.best_params_
    }
    warn(model_params, bayes_search.best_params_)

    if print_scores == True:
        print('best estimator test score:', best_estimator_test_score)
        print('best estimator train score:', best_estimator_train_score)

    return output


## model selection

In [264]:
cv, n_iter = 5, 30

## lgbm (gradient boosting)

In [294]:
model = lgb.LGBMClassifier(verbose = -1, n_estimators=300)

model_params = {
    'model__num_leaves': Integer(10, 100, 'uniform'),
    'model__feature_fraction': Real(0.1, 1, 'uniform'),
    'model__learning_rate': Real(0.001, 0.1, 'log-uniform'),
    'model__min_child_samples': Integer(5, 200, 'uniform'),
    'model__boosting_type': Categorical(['gbdt', 'dart', 'goss']),
    'model__n_estimators': Integer(30, 300, 'uniform'),
}

lgbm_bo = test_pipeline(model, model_params, cv, n_iter)

best estimator test score: 0.7487499999999999
best estimator train score: 0.8034375


## lgbm (random forest)

In [346]:
model = lgb.LGBMClassifier(verbose = -1, optimizer='rf', n_estimators=300)

model_params = {
    'model__num_leaves': Integer(10, 100, 'uniform'),
    'model__feature_fraction': Real(0.1, 1, 'uniform'),
    'model__min_child_samples': Integer(5, 200, 'uniform'),
    'model__bagging_freq': Integer(1, 10, 'uniform'),
    'model__bagging_fraction': Real(0.1, 1, 'uniform'),
}

lgbm_rf = test_pipeline(model, model_params, cv, n_iter)

best estimator test score: 0.75125
best estimator train score: 0.8371875


In [355]:
lgbm_rf

{'best_estimator_test_score': 0.75125,
 'best_estimator_train_score': 0.8371875,
 'best_estimator': Pipeline(steps=[('preprocess', PreprocessingTransformer()),
                 ('model',
                  LGBMClassifier(bagging_fraction=1.0, bagging_freq=1,
                                 feature_fraction=0.847960675218702,
                                 min_child_samples=105, n_estimators=300,
                                 num_leaves=43, optimizer='rf', verbose=-1))]),
 'best_params': OrderedDict([('model__bagging_fraction', 1.0),
              ('model__bagging_freq', 1),
              ('model__feature_fraction', 0.847960675218702),
              ('model__min_child_samples', 105),
              ('model__num_leaves', 43)])}

In [353]:
#feature importances
importances = pd.DataFrame({'feature': lgbm_rf['best_estimator']['preprocess'].features, 
                            'importance': lgbm_rf['best_estimator']['model'].feature_importances_}
                            ).sort_values('importance', ascending=False).reset_index(drop=True)

## feature selection

In [324]:
todas_features = numerical_features + categorical_features + ['good_bad_balance', 'number_good_signs', 'number_bad_signs']

In [385]:
def selection_round(n_evaluations, model, model_params, todas_features, ja_removidos):
    rows = []

    todas_testadas = [x for x in todas_features if x not in ja_removidos]

    for i in range(n_evaluations):
        for tested_feature in (todas_testadas + [[]])[::-1]:
            res = test_pipeline(model, model_params, 5, 5, dropped_features=tested_feature, print_scores=False)
            res = (pd.DataFrame({
                    'removed_feature': [tested_feature] + ja_removidos,
                    'test_score': res['best_estimator_test_score']})
                    )
            rows.append(res)

    results = pd.concat(rows).reset_index(drop=True)

    results.removed_feature = results.removed_feature.astype(str)
    results ['avg_test_score'] = results.groupby('removed_feature').test_score.transform('mean')
    results ['std_test_score'] = results.groupby('removed_feature').test_score.transform('std')
    results = results.sort_values('avg_test_score', ascending=False).drop_duplicates('removed_feature').reset_index(drop=True)
    worse = results.iloc[0]
    worse_feature = worse.removed_feature
    worse_score = worse.avg_test_score
    worse_std = worse.std_test_score

    if worse_feature == '[]':
        print('acabou')
        return '[]', results

    control = results[results.removed_feature == '[]'].iloc[0]
    control_score = control.avg_test_score
    control_std = control.std_test_score

    combined_std = np.sqrt(worse_std**2 + control_std**2)

    if (worse_score - control_score) < 1*combined_std:
        print('acabou')
        return '[]', results
        
    else:
        print('removing', worse_feature)
    return worse_feature, results

In [392]:
n_evaluations_per_round = 4

model_params = {
    'model__feature_fraction': Real(0.8, 1, 'uniform')
}
model = lgb.LGBMClassifier(verbose = -1, optimizer='rf', n_estimators=100)


ja_removidos = []
removed = ''

result = pd.DataFrame()
while removed != '[]':
    removed, round_res = selection_round(n_evaluations_per_round, model, model_params, todas_features[::-1], ja_removidos = ja_removidos)
    ja_removidos.append(removed)
    round_res['round'] = len(ja_removidos)
    round_res['removed_feature'] = round_res.removed_feature.astype(str)
    result = pd.concat([result, round_res])

removing number_good_signs
acabou


In [393]:
result

Unnamed: 0,removed_feature,test_score,avg_test_score,std_test_score,round
0,number_good_signs,0.74125,0.74125,0.0,1
1,number_bad_signs,0.74125,0.74125,0.0,1
2,purpose,0.7375,0.7375,0.0,1
3,good_bad_balance,0.7375,0.7375,0.0,1
4,[],0.73125,0.73125,0.0,1
5,saving_accounts,0.72375,0.72375,0.0,1
6,checking_account,0.69375,0.690625,0.004419,1
0,number_bad_signs,0.74125,0.74125,0.0,2
1,good_bad_balance,0.7375,0.7375,0.0,2
2,purpose,0.7375,0.7375,0.0,2
