In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
import time

!pip install numpy==1.23.4

In [None]:
import utils.modeling_utils as mu

In [73]:
from scipy.stats import pearsonr
from sklearn.svm import SVC

In [74]:
# Read the German credit data, drop the 'Unnamed: 0' column and rewrite every
# header in lower case with spaces replaced by underscores.
df = (
    pd.read_csv('../data/german_credit_data.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)

In [75]:
# Human-readable labels for the integer job codes.
job_dic = {
    0: 'unskilled non res',
    1: 'unskilled resident',
    2: 'skilled',
    3: 'highly skilled'
}

# Only map while `job` still holds the numeric codes. Re-running this cell on
# the already-mapped labels would otherwise turn the whole column into NaN,
# because the labels are not keys of `job_dic`.
if pd.api.types.is_numeric_dtype(df['job']):
    df['job'] = df['job'].map(job_dic)

In [76]:
# Column names treated as numerical features.
numerical_features = [
    'age',
    'credit_amount',
    'duration',
]

# Column names treated as categorical features.
categorical_features = [
    'sex',
    'job',
    'housing',
    'saving_accounts',
    'checking_account',
    'purpose',
]

In [None]:
# Missing values in the two account columns become the literal 'undefined'.
df[['saving_accounts', 'checking_account']] = (
    df[['saving_accounts', 'checking_account']].fillna('undefined')
)

In [77]:
# Total number of missing cells in the whole frame.
df.isnull().to_numpy().sum()

In [78]:
# Binary label: 1 where `risk` equals 'bad', 0 otherwise.
df['target'] = df['risk'].eq('bad').astype(int)

# Show three random rows (unseeded, so the rows differ between runs).
df.sample(3)

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,risk,target
384,26,male,unskilled resident,own,moderate,undefined,4272,30,business,good,0
200,52,male,skilled,own,quite rich,undefined,936,9,education,good,0
83,58,female,unskilled resident,own,little,little,1755,24,vacation/others,good,0


In [79]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the label. `risk` and `target` are
# removed from the feature frame so the label cannot leak into the inputs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['risk', 'target']),
    df['target'],
    test_size=0.2,
    random_state=42,
    stratify=df['target'],
)

In [83]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreprocessingTransformer(BaseEstimator, TransformerMixin):
    """Preprocessing step used as the first stage of the modelling pipeline.

    `fit` derives two artefacts from the training data through helper
    functions; `transform` applies them to a copy of the input and then drops
    the columns listed in `dropped_features`.

    NOTE(review): the helpers `sort_categorical_values_by_correlation`,
    `find_positive_and_negative_indicators`, `encode_categorical_values` and
    `apply_indicators_conditions` are not defined in this cell and must already
    exist in the notebook namespace. What they return is not visible here.

    Parameters
    ----------
    dropped_features : list of column labels or None, default None
        Columns removed at the end of `transform`. None means "drop nothing".

    Attributes
    ----------
    categorical_features_order
        Result of `sort_categorical_values_by_correlation`, set by `fit`.
    positive_and_negative_indicators_frme
        Result of `find_positive_and_negative_indicators`, set by `fit`.
    features
        Columns of the most recently transformed frame, set by `transform`.
    """

    def __init__(self, dropped_features=None):
        # Stored untouched: a mutable `[]` default would be shared between
        # instances, so None is resolved lazily in `transform` instead.
        self.dropped_features = dropped_features

    def fit(self, X, y=None):
        """Learn the encoding order and indicator table from `X` and `y`; return self."""
        # Each helper receives its own copy so the caller's frame is never mutated.
        self.categorical_features_order = sort_categorical_values_by_correlation(X.copy(), y)
        self.positive_and_negative_indicators_frme = find_positive_and_negative_indicators(X.copy(), y)
        return self

    def transform(self, X):
        """Return a transformed copy of `X` without the dropped columns."""
        X = encode_categorical_values(X.copy(), self.categorical_features_order)
        X = apply_indicators_conditions(X.copy(), self.positive_and_negative_indicators_frme)
        dropped = [] if self.dropped_features is None else self.dropped_features
        X = X.drop(dropped, axis=1)
        # Remember the surviving columns so they can be inspected after fitting.
        self.features = X.columns
        return X


In [84]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [85]:
## Bayesian optimization

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


In [88]:
def test_pipeline(model, model_params, cv, n_iter, print_scores=True, dropped_features=None,
                  random_state=None):
    """Tune `model` behind the preprocessing step with Bayesian search and report the scores.

    Builds a two-step pipeline (`PreprocessingTransformer` -> `model`), runs
    `BayesSearchCV` over `model_params` and fits it on the notebook-level
    `X_train` / `y_train`.

    Parameters
    ----------
    model : estimator
        Final pipeline step, registered under the name 'model'.
    model_params : dict
        Search space passed to `BayesSearchCV`; keys address pipeline
        parameters (e.g. 'model__num_leaves').
    cv : int or CV splitter
        Cross-validation setting forwarded to `BayesSearchCV`.
    n_iter : int
        Number of search iterations.
    print_scores : bool, default True
        Print the best configuration's test and train scores.
    dropped_features : list or None, default None
        Columns the preprocessing step drops. None means "drop nothing".
    random_state : int, RandomState or None, default None
        Seed forwarded to `BayesSearchCV` so a search can be reproduced.
        None keeps the previous unseeded behaviour.

    Returns
    -------
    (dict, BayesSearchCV)
        The dict holds the best configuration's cross-validated test score,
        its mean train score, the standard deviation of its test score, the
        refitted best estimator and the best parameters. The second element is
        the fitted search object.
    """
    # A mutable `[]` default would be shared between calls; resolve None here.
    dropped_features = [] if dropped_features is None else dropped_features

    # NOTE: there is no scaling step in this pipeline; the model receives the
    # preprocessed features as they come out of the transformer.
    pipeline = Pipeline([
        ('preprocess', PreprocessingTransformer(dropped_features=dropped_features)),
        ('model', model),
    ])

    bayes_search = BayesSearchCV(
        pipeline,
        model_params,
        cv=cv,
        # 0 is the silent level of the scikit-learn search API; -1 is
        # LightGBM's convention and is not a valid verbosity level here.
        verbose=0,
        n_jobs=-1,
        return_train_score=True,
        n_iter=n_iter,
        random_state=random_state,
    )

    # Relies on the X_train / y_train created by the notebook's split cell.
    bayes_search.fit(X_train, y_train)

    best_index = bayes_search.best_index_
    cv_results = bayes_search.cv_results_

    best_estimator_test_score = bayes_search.best_score_
    best_estimator_train_score = cv_results['mean_train_score'][best_index]

    output = {
        'best_estimator_test_score': best_estimator_test_score,
        'best_estimator_train_score': best_estimator_train_score,
        'best_estimator_test_score_std': cv_results['std_test_score'][best_index],
        'best_estimator': bayes_search.best_estimator_,
        'best_params': bayes_search.best_params_
    }
    # NOTE(review): `warn` is not defined in this cell; presumably it flags best
    # parameters that sit on the edge of the search space -- confirm.
    warn(model_params, bayes_search.best_params_)

    if print_scores:
        print('best estimator test score:', best_estimator_test_score)
        print('best estimator train score:', best_estimator_train_score)

    return output, bayes_search


## model selection

In [89]:
# Cross-validation setting and number of search iterations used for model selection.
cv = 5
n_iter = 30

## lgbm (gradient boosting)

In [90]:
# Gradient-boosting candidate. `n_estimators` is also part of the search space
# below, so the value given here is only the starting configuration.
model = lgb.LGBMClassifier(verbose=-1, n_estimators=30)

# Search space; the 'model__' prefix routes each entry to the pipeline's model step.
model_params = dict(
    model__num_leaves=Integer(10, 100, prior='uniform'),
    model__feature_fraction=Real(0.1, 1, prior='uniform'),
    model__learning_rate=Real(0.001, 0.1, prior='log-uniform'),
    model__min_child_samples=Integer(5, 200, prior='uniform'),
    model__boosting_type=Categorical(['gbdt', 'dart', 'goss']),
    model__n_estimators=Integer(30, 300, prior='uniform'),
)

# `lgbm_bo` is the score/parameter summary, `obj` the fitted search object.
lgbm_bo, obj = test_pipeline(model=model, model_params=model_params, cv=cv, n_iter=n_iter)

best estimator test score: 0.75
best estimator train score: 0.8412499999999999


## lgbm (random forest)

In [91]:
# LightGBM with boosting_type='rf', bagging on every iteration over 60% of the rows.
model = lgb.LGBMClassifier(
    verbose=-1,
    boosting_type='rf',
    bagging_freq=1,
    bagging_fraction=0.6,
    n_estimators=100,
)

# Search space; the feature_fraction dimension is deliberately left disabled.
model_params = dict(
    model__num_leaves=Integer(10, 100, prior='uniform'),
    # model__feature_fraction=Real(0.1, 1, prior='uniform'),
    model__min_child_samples=Integer(5, 200, prior='uniform'),
)

# NOTE: test_pipeline returns an (output, search) pair. Unlike the gradient
# boosting cell, the pair is stored whole in `lgbm_rf` rather than unpacked.
lgbm_rf = test_pipeline(model=model, model_params=model_params, cv=cv, n_iter=n_iter)

best estimator test score: 0.7412500000000001
best estimator train score: 0.7918749999999999


## feature selection

In [92]:
# Every candidate for feature selection: the raw numerical and categorical
# columns followed by three further feature names.
# NOTE(review): the last three presumably match columns added by the
# preprocessing step -- confirm against the helper that creates them.
all_features = [
    *numerical_features,
    *categorical_features,
    'positive_negative_balance',
    'number_positive_indicators',
    'number_negative_indicators',
]

In [None]:
def selection_round(n_evaluations, model, model_params, all_features, previously_removed,
                    cv=5, n_iter=5):
    """Run one round of backward feature elimination.

    Every feature that is still in play is removed in turn (on top of the
    features in `previously_removed`) and the pipeline is re-tuned with
    `test_pipeline`. A baseline that removes nothing extra is evaluated as
    well, labelled '[]'. Each candidate is evaluated `n_evaluations` times and
    the scores are averaged.

    Parameters
    ----------
    n_evaluations : int
        Repetitions per candidate; must be at least 1.
    model, model_params
        Forwarded unchanged to `test_pipeline`.
    all_features : list of str
        Full list of candidate feature names.
    previously_removed : list of str
        Features eliminated in earlier rounds; always dropped.
    cv : int, default 5
        Cross-validation setting forwarded to `test_pipeline`.
    n_iter : int, default 5
        Search iterations forwarded to `test_pipeline`.

    Returns
    -------
    (str, pandas.DataFrame)
        The candidate with the highest average score -- '[]' when removing
        nothing is best, which is the stop signal -- and a frame with one row
        per candidate (`removed_feature`, `test_score`, `avg_test_score`)
        sorted by `avg_test_score` in descending order.

    Raises
    ------
    ValueError
        If `n_evaluations` is smaller than 1.
    """
    if n_evaluations < 1:
        raise ValueError('n_evaluations must be at least 1')

    # Remaining features plus the [] baseline; reversed so the baseline runs first.
    remaining = [feat for feat in all_features if feat not in previously_removed]
    candidates = (remaining + [[]])[::-1]

    records = []
    for _ in range(n_evaluations):
        for tested_feature in candidates:
            extra = tested_feature if isinstance(tested_feature, list) else [tested_feature]
            dropped = list(set(extra).union(previously_removed))
            res, _search = test_pipeline(model, model_params, cv, n_iter,
                                         dropped_features=dropped, print_scores=False)
            records.append({
                'removed_feature': str(tested_feature),
                'test_score': res['best_estimator_test_score'],
            })

    # Build the frame once instead of concatenating one tiny frame per evaluation.
    results = pd.DataFrame(records)
    results['avg_test_score'] = results.groupby('removed_feature').test_score.transform('mean')
    # A stable sort makes ties deterministic: the candidate evaluated first
    # (the baseline leads) wins. drop_duplicates then keeps one row per candidate.
    results = (
        results.sort_values('avg_test_score', ascending=False, kind='stable')
        .drop_duplicates('removed_feature')
        .reset_index(drop=True)
    )
    worse_feature = results.iloc[0].removed_feature

    if worse_feature == '[]':
        # 'acabou' is Portuguese for "finished": no removal beats the baseline.
        print('acabou')
    else:
        print('removing', worse_feature)
    return worse_feature, results

In [94]:
n_evaluations_per_round = 4

# Only feature_fraction is tuned during feature selection.
model_params = {
    'model__feature_fraction': Real(0.8, 1, 'uniform')
    }
model = lgb.LGBMClassifier(verbose = -1, boosting_type ='gbdt', n_estimators=100)


previously_removed = []
removed = ''

# One frame per round, concatenated once after the loop instead of growing
# `result` inside it. If the loop is interrupted, the finished rounds are
# still available in `round_results`.
round_results = []
round_number = 0

# Keep eliminating until a round reports that removing nothing is best ('[]').
while removed != '[]':
    start = time.time()
    round_number += 1
    removed, round_res = selection_round(n_evaluations_per_round, model, model_params,
                        all_features[::-1], previously_removed = previously_removed)
    # The '[]' stop sentinel is not a column, so it is kept out of the list of
    # removed features; that list then stays usable as `dropped_features`.
    if removed != '[]':
        previously_removed.append(removed)
    round_res['round'] = round_number
    round_res['round_removed'] = removed
    round_results.append(round_res)
    display(round_res)
    print('round time, ', round(time.time() - start, 1))

result = pd.concat(round_results)

TypeError: selection_round() got an unexpected keyword argument 'previously_removed'

In [None]:
# Scores from every selection round: one row per candidate removal per round.
result

Unnamed: 0,removed_feature,test_score,avg_test_score,std_test_score,round,round_removed
0,number_good_signs,0.74125,0.74125,0.0,1,number_good_signs
1,number_bad_signs,0.74125,0.739688,0.003125,1,number_good_signs
2,housing,0.73875,0.73875,0.0,1,number_good_signs
3,good_bad_balance,0.7375,0.7375,0.0,1,number_good_signs
4,purpose,0.7375,0.7375,0.0,1,number_good_signs
5,age,0.73625,0.73625,0.0,1,number_good_signs
6,sex,0.735,0.735,6.409876000000001e-17,1,number_good_signs
7,[],0.73125,0.73125,0.0,1,number_good_signs
8,job,0.73,0.73,0.0,1,number_good_signs
9,saving_accounts,0.72375,0.72375,0.0,1,number_good_signs
