# Import libraries

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import  AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, HistGradientBoostingClassifier

from sklearn.metrics import classification_report, roc_auc_score, matthews_corrcoef

import pandas as pd

import pickle, os

# Read test set

In [3]:
# Task sub-directory name and the label column of the processed dataset.
model_subdir = 'loan_approval_prediction'
target_col = 'is_approved'

# Load the processed test split from the cleaned-dataset folder.
df = pd.read_csv('../dataset/cleaned/loan_approval_prediction/test_processed_data.csv')

# Separate features (x) from the label (y); df itself is not modified.
x = df.drop(columns=[target_col])
y = df[target_col]



# Get evaluation results

In [15]:
def eval_result(model_name, imb_data_handling_method):
    """Evaluate one pickled classifier on the notebook's test set.

    Loads ``model.pkl`` from
    ``../model/loan_approval_prediction/<imb_data_handling_method>/<model_name>/``
    and scores it against the module-level ``x`` (features) and ``y`` (labels).

    Parameters
    ----------
    model_name : str
        Name of the model sub-directory, e.g. ``'DecisionTreeClassifier'``.
    imb_data_handling_method : str
        Name of the imbalance-handling sub-directory, e.g. ``'imb-data'``.

    Returns
    -------
    result_all_class_df : pandas.DataFrame
        One row per class label plus a ``'macro avg'`` row, holding
        precision, recall and f1-score rounded to two decimals.
    result_dict : dict
        ``model``, ``data-imbalanced-handling``, ``AUC`` and ``MCC`` computed
        over the whole test set (not rounded).
    """
    model_dir = '../model/loan_approval_prediction/{}/{}/'.format(imb_data_handling_method, model_name)

    print('loading model from', model_dir)

    # NOTE(review): pickle.load can execute arbitrary code -- only load model
    # files that were produced by this project.
    with open(os.path.join(model_dir, 'model.pkl'), 'rb') as f:
        model = pickle.load(f)

    pred = model.predict(x)

    # Some estimators (e.g. SVC fitted with its default probability=False)
    # expose no predict_proba; roc_auc_score also accepts the non-thresholded
    # scores returned by decision_function, so fall back to those.
    if hasattr(model, 'predict_proba'):
        prob = model.predict_proba(x)[:, 1]
    else:
        prob = model.decision_function(x)

    result = classification_report(y, pred, output_dict=True)

    print(result)

    # Aggregate entries that are not reported; every remaining metrics dict is
    # either a class label or 'macro avg'. Selecting rows this way (instead of
    # matching the literal keys '0.0'/'1.0') keeps the per-class rows when the
    # labels are stored as ints and the report keys become '0'/'1'.
    skipped_keys = {'micro avg', 'weighted avg', 'samples avg'}

    result_rows = []

    for k, v in result.items():
        # 'accuracy' is a bare float rather than a metrics dict.
        if not isinstance(v, dict) or k in skipped_keys:
            continue

        data_row = {
            'model': model_name,
            'data-imbalanced-handling': imb_data_handling_method,
            'class': k
        }
        for met, val in v.items():
            if met != 'support':
                data_row[met] = round(val, 2)
        result_rows.append(data_row)

    ## store result of each class
    result_all_class_df = pd.DataFrame(result_rows)

    ## store result of all classes
    roc_auc = roc_auc_score(y, prob)
    mcc = matthews_corrcoef(y, pred)

    result_dict = {
        'model': model_name,
        'data-imbalanced-handling': imb_data_handling_method,
        'AUC': roc_auc,
        'MCC': mcc
    }

    return result_all_class_df, result_dict

In [16]:
# Output folder and task label; both are referenced by the (currently
# commented-out) CSV export further down.
result_dir = '../result/'
task_name = 'loan_approval_prediction'

## just for testing: a single model / imbalance-handling combination
model_names = ['DecisionTreeClassifier']
data_imb_handling_methods = ['imb-data']

# Accumulators: per-class metric frames and whole-test-set metric rows.
alL_result_df, all_result_rows = [], []

# Every (model, method) pair, iterated model-major.
combos = [(name, handling) for name in model_names for handling in data_imb_handling_methods]

for model_name, method in combos:
    result_all_class_df, result_dict = eval_result(model_name, method)
    alL_result_df.append(result_all_class_df)
    all_result_rows.append(result_dict)

loading model from ../model/loan_approval_prediction/imb-data/DecisionTreeClassifier/
{'0.0': {'precision': 0.621191104747416, 'recall': 0.5660292738614588, 'f1-score': 0.5923286969877976, 'support': 24527.0}, '1.0': {'precision': 0.9103142036214726, 'recall': 0.927332343373132, 'f1-score': 0.9187444724130893, 'support': 116503.0}, 'accuracy': 0.864496915549883, 'macro avg': {'precision': 0.7657526541844444, 'recall': 0.7466808086172954, 'f1-score': 0.7555365847004434, 'support': 141030.0}, 'weighted avg': {'precision': 0.8600318364224088, 'recall': 0.864496915549883, 'f1-score': 0.8619764108385581, 'support': 141030.0}}


In [13]:
# Stack the per-model class-level frames row-wise and show the result.
pd.concat(alL_result_df, axis=0)

Unnamed: 0,model,data-imbalanced-handling,class,precision,recall,f1-score
0,DecisionTreeClassifier,imb-data,macro avg,0.77,0.75,0.76
0,KNeighborsClassifier,imb-data,macro avg,0.79,0.74,0.76


In [17]:
# One row per evaluated model/method pair with its AUC and MCC.
pd.DataFrame(data=all_result_rows)

Unnamed: 0,model,data-imbalanced-handling,AUC,MCC
0,DecisionTreeClassifier,imb-data,0.84074,0.512078


In [None]:
# Assemble the two final tables: whole-test-set metrics and class-level metrics.
final_result_all_class_df = pd.DataFrame(data=all_result_rows)
final_result_each_class_df = pd.concat(alL_result_df, axis=0)

# CSV export is disabled; uncomment to write both tables under result_dir.
# final_result_each_class_df.to_csv(os.path.join(result_dir, '{}_result_each_class.csv'.format(task_name)), index=False)
# final_result_all_class_df.to_csv(os.path.join(result_dir, '{}_result_all_class.csv'.format(task_name)), index=False)

In [5]:
## just for testing: inspect the dict layout of classification_report
## on a toy three-class example

y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
# Defined but not passed to classification_report below.
target_names = ['class 0', 'class 1', 'class 2']

result = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True)

print(result)

{'0': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 1.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0}, '2': {'precision': 1.0, 'recall': 0.6666666666666666, 'f1-score': 0.8, 'support': 3.0}, 'accuracy': 0.6, 'macro avg': {'precision': 0.5, 'recall': 0.5555555555555555, 'f1-score': 0.48888888888888893, 'support': 5.0}, 'weighted avg': {'precision': 0.7, 'recall': 0.6, 'f1-score': 0.6133333333333334, 'support': 5.0}}


In [6]:
# Echo the report dict as the cell output.
result

{'0': {'precision': 0.5,
  'recall': 1.0,
  'f1-score': 0.6666666666666666,
  'support': 1.0},
 '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0},
 '2': {'precision': 1.0,
  'recall': 0.6666666666666666,
  'f1-score': 0.8,
  'support': 3.0},
 'accuracy': 0.6,
 'macro avg': {'precision': 0.5,
  'recall': 0.5555555555555555,
  'f1-score': 0.48888888888888893,
  'support': 5.0},
 'weighted avg': {'precision': 0.7,
  'recall': 0.6,
  'f1-score': 0.6133333333333334,
  'support': 5.0}}

In [7]:
## just for testing
# Flatten the report dict into one row per class label plus 'macro avg',
# keeping precision / recall / f1-score (rounded to 2 decimals) and dropping support.
data_rows = []
wanted_keys = ['0', '1', '2', 'macro avg']

for k, v in result.items():
    data_row = {}
    if k not in wanted_keys:
        # 'accuracy' and 'weighted avg' are not reported here.
        continue
    data_row['class'] = k
    data_row.update({met: round(val, 2) for met, val in v.items() if met != 'support'})
    data_rows.append(data_row)

pd.DataFrame(data_rows)

Unnamed: 0,class,precision,recall,f1-score
0,0,0.5,1.0,0.67
1,1,0.0,0.0,0.0
2,2,1.0,0.67,0.8
3,macro avg,0.5,0.56,0.49
