<a href="https://colab.research.google.com/github/lukekolbe/AL-in-CreditScoring/blob/main/threshold_tuner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [3]:
############ LIBRARIES

#!pip install scikit-plot


import os
import random
import multiprocessing
import pickle
import copy
import gc
import sys
import json
gc.enable()

import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
#import seaborn as sns
#import scikitplot as skplt

from sklearn.preprocessing import RobustScaler


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold  ##### what is this used for?
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


############ RANDOMNESS
# seed function
def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    
# set seed
seed = 30
seed_everything(seed)


In [4]:
os.chdir('/gdrive/My Drive/ACTIVE LEARNING THESIS/')

# Data Loader

In [5]:
############ DATA IMPORT

## available datasets

  # OK gmsc          # shape:  (150000, 68)
  # OK uk            # shape:  (30000, 51), y mean:  0.04
  # OK lendingclub   # shape:  (41623, 114) y mean:  0.1331235134420873    
  # OK bene2         # shape:  (7190, 28)
  # bene1            # shape:  (3123, 18)
  # hmeq             # shape:  (5960, 20)
  # australian       # shape:  (690, 42)
  # german           # shape:  (1000, 61)
  # thomas           # shape:  (1225, 28)
  # pakdd            # shape:  (50000, 373), y mean:  0.26082

data_name = "australian"
dataset_list = ["bene2", "bene1","gmsc", "uk", "lendingclub", "hmeq", "australian", "german", "thomas", "pakdd"]

def data_loader(dataset):

  #df = pd.read_csv('//home//RDC//kolbeluk1//AL_THESIS//prepared_data//{}.csv'.format(dataset)) #Linux path

  # C:\\Users\\kolbeluk1\\AL_THESIS
  #df = pd.read_csv('C:\\Users\\kolbeluk1\\AL_THESIS\\prepared_data\\{}.csv'.format(dataset))
  df = pd.read_csv('/gdrive/My Drive/ACTIVE LEARNING THESIS/prepared_data/{}.csv'.format(dataset))

  # remove NA
  df = df.dropna()
  df.reset_index(drop = True, inplace = True)

  #print(df)
  # extract label
  df['BAD'][df['BAD']=='BAD']  = 1
  df['BAD'][df['BAD']=='GOOD'] = 0
  df['BAD'] = df['BAD'].astype('int')


  y_temp = df['BAD']
  del df['BAD']

  #one hot encoding
  df = pd.get_dummies(df)

  #transform to numpy array >> same location for df and X
  X = df.to_numpy()
  y = y_temp.to_numpy()

  print("X type: ", type(X), "X shape: ",X.shape,"y shape: ", y.shape, "y mean: ", np.mean(y))

  return X,y


# Helper Functions

In [6]:
# append_record: helper function that adds best-parameter for every model to dict and saves it
def append_record(record, filename):
    with open(f'{filename}', 'a') as f:
        json.dump(record, f)
        f.write(os.linesep)

In [7]:
# loader function that unpacks tuning results and extracts parameters for different model steps
def param_getter(tuned=False, dataset=None):
  filename = f'{dataset}_tuned-params'

  with open(filename, 'r') as f:
    param_list = [json.loads(line) for line in f if line.startswith('{')]

  param_dict = {}
  for i in range(len(param_list)):
    strategy_short = list(param_list[i].keys())[0]
    param_dict[strategy_short] = param_list[i][list(param_list[i].keys())[0]]

  #find cases where some models are not tuned, establish base parameters
  for key, name in [('oracle', 'Oracle'),
                    ('score', 'Score'),
                    ('eer', 'QueryExpectedErrorReduction'), 
                    ('quire', 'QueryInstanceQUIRE'), 
                    ('bmdr','QueryInstanceBMDR'),
                    ('spal', 'QueryInstanceSPAL')]:
    try:
      param_dict[key]
    except KeyError:
      param_dict[key] = {'AL':{'strategy_name':name}, 'CLF':{}}

  # transfer tuned classifier to all models (clf is tuned separately, not in combination with AL model)
  for key in param_dict.keys():
    param_dict[key]['CLF']=param_dict['random']['CLF']

  for key in ['bmdr', 'spal']:
    param_dict[key]['AL']['rho'] = 10 #set this parameter for increased performance
  
  return param_dict

In [11]:
'''
  applies every threshold to a given prediction vector and returns misclassification cost as defined in the cost matrix
'''

def cost_tuner(y, pred_raw, y_mean, thres_array, cost_matrix = None):
  if cost_matrix == None:
    fn_cost = (1-y_mean)/y_mean # set fn cost to the inverse of the probability of the rare class; fix cost of fp to 1
    cost_matrix = [[0, 1],[fn_cost, 0]]

  print(cost_matrix)

  cost_list = []
  for t in thres_array:
    pred_thres =  (pred_raw[:,1] >= t).astype(int)
    cost = np.sum(confusion_matrix(y, pred_thres) * cost_matrix)
    cost_list.append(cost)

  print(cost_list)
  cost_min_fold = min(cost_list)
  print('cost_min_fold', cost_min_fold)
  cost_min_fold_index = cost_list.index(cost_min_fold)
  print('cost_min_fold_index', cost_min_fold_index)
  best_threshold_fold = thres_array[cost_min_fold_index]
  print(f'from cost tuner: best threshold of this fold: {best_threshold_fold}; minimum cost: {cost_min_fold}')

  return cost_list

In [13]:
'''
  the pipeline that loads pre-tuned classifier parameters, processes data (scaling, splitting), 
  trains the tuned classifier, makes a prediction, 
  computes cost for all thresholds, picks threshold that minimizes cost
'''

def threshold_finder(X,y,dataset,folds,cost_matrix):
  seed_everything(seed)

  param_dict = param_getter(tuned=True, dataset=dataset) # load tuned parameters
  clf = LogisticRegression(random_state = seed,**param_dict['random']['CLF']) #set up tuned classifier
  
  skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state=seed)
  idx = []
  for train_index, test_index in skf.split(X, y):
    idx.append((train_index, test_index))

  cost_list = []
  thres_arr =  [np.round(i, 3) for i in iter(np.linspace(0, 1, 1001))] # 1000 possible thresholds between 0 and 1

  y_mean = np.mean(y)


  for f in range(folds-1):
    if len(y)<2000:
      train_idx = idx[f][0]
      test_idx = idx[f][1]
    else:
      #for large datasets, swap order of train and test indices in order to not train on too much data and closer reflect the conditions within the experiment
      train_idx = idx[f][1]
      test_idx = idx[f][0]
    
    if f == 0:
      print('lenghts of train & test: ',len(train_idx), len(test_idx))


    scaler = RobustScaler(with_centering=True, with_scaling=True)
    scaler.fit(X[train_idx,:])
    X_t = scaler.transform(np.array(X)) #scaled version of full dataset 

    clf.fit(X_t[train_idx,:], y[train_idx])
    probabilities = clf.predict_proba(X_t[test_idx,:])

    # for each threshold, compute the misclassification cost and add it to the list
    cost_list.append(cost_tuner(y[test_idx], probabilities, y_mean, thres_arr, cost_matrix = cost_matrix))
  
  #compute mean cost for each threshold across training folds
  mean_cost = np.mean(cost_list, axis = 0)
  cost_min = min(mean_cost)
  cost_min_index = np.where(mean_cost == mean_cost.min())[0][0]
  best_thres = np.round(thres_arr[cost_min_index], 3)

  return best_thres



# Run for all datasets in one go

In [16]:
threshold_dict = {}
folds = 5
cost_matrices = [None] #other cost matrices not feasible


for cost_matrix in cost_matrices:
  for d in dataset_list:
    print(f'##### STARTING TUNING FOR DATASET {d} #####')
    X,y = data_loader(d)

    filename = 'tuned_thresholds'
    
    if cost_matrix != None:
      filename += f"_cost-{cost_matrix[0][1]}-{cost_matrix[1][0]}"

    cost_threshold = threshold_finder(X, y, d, folds, cost_matrix = cost_matrix)
    print(f'for dataset {d}, and cost matrix {cost_matrix}, best threshold is {cost_threshold}')
    print("---------------------------------------------")

    threshold_dict[f'{d}'] = copy.deepcopy(cost_threshold)

  print(threshold_dict)

  with open(f'{filename}', 'wb') as a_file:
    pickle.dump(threshold_dict, a_file)

  a_file.close()

##### STARTING TUNING FOR DATASET bene2 #####
X type:  <class 'numpy.ndarray'> X shape:  (7190, 28) y shape:  (7190,) y mean:  0.3
lenghts of train & test:  1438 5752
[[0, 1], [2.3333333333333335, 0]]
[4027.0, 3960.6666666666665, 3928.6666666666665, 3914.6666666666665, 3895.6666666666665, 3886.6666666666665, 3869.6666666666665, 3854.6666666666665, 3841.0, 3826.0, 3814.0, 3805.0, 3791.0, 3786.3333333333335, 3783.6666666666665, 3779.0, 3767.0, 3763.3333333333335, 3752.3333333333335, 3739.3333333333335, 3731.3333333333335, 3722.3333333333335, 3716.3333333333335, 3705.3333333333335, 3692.3333333333335, 3687.3333333333335, 3681.3333333333335, 3671.3333333333335, 3668.3333333333335, 3662.3333333333335, 3656.3333333333335, 3648.3333333333335, 3644.6666666666665, 3638.6666666666665, 3634.0, 3632.3333333333335, 3625.3333333333335, 3616.3333333333335, 3604.3333333333335, 3591.3333333333335, 3586.6666666666665, 3583.6666666666665, 3574.6666666666665, 3567.6666666666665, 3565.0, 3562.3333333333335

In [29]:
fname = "tuned_thresholds"
infile = open(f'{fname}','rb')
tuned_thresholds_none = pickle.load(infile)
infile.close()

print(tuned_thresholds_none)

{'bene2': 0.283, 'gmsc': 0.057, 'uk': 0.039, 'lendingclub': 0.137, 'hmeq': 0.208, 'australian': 0.523, 'german': 0.307, 'thomas': 0.31, 'pakdd': 0.244, 'bene1': 0.372}
