In [None]:
# Colab only: mount Google Drive at /gdrive (requesting a forced remount) so the
# data and output folders referenced later in this notebook are reachable.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
### FOR RUNNING ON COLAB:
# Removes the cvxpy build present in the runtime and installs a fresh one, then
# installs ALiPy straight from its GitHub repository, upgrades imbalanced-learn
# and installs scikit-optimize.
# NOTE(review): none of these installs is version-pinned, so a re-run may pull
# different releases than the ones this notebook was developed with.

!pip uninstall cvxpy -y
!pip install cvxpy
!pip install git+https://github.com/NUAA-AL/alipy.git
!pip install -U imbalanced-learn
!pip install scikit-optimize

Found existing installation: cvxpy 1.0.31
Uninstalling cvxpy-1.0.31:
  Successfully uninstalled cvxpy-1.0.31
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cvxpy
  Downloading cvxpy-1.2.1-cp37-cp37m-manylinux_2_24_x86_64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 28.7 MB/s 
Installing collected packages: cvxpy
Successfully installed cvxpy-1.2.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/NUAA-AL/alipy.git
  Cloning https://github.com/NUAA-AL/alipy.git to /tmp/pip-req-build-rtzcxcs3
  Running command git clone -q https://github.com/NUAA-AL/alipy.git /tmp/pip-req-build-rtzcxcs3
Building wheels for collected packages: alipy
  Building wheel for alipy (setup.py) ... [?25l[?25hdone
  Created wheel for alipy: filename=alipy-1.2.5-py3-none-any.whl size=121053 sha256=80b8626114d3580799012597aa7d4f746b6cee2dd3a3c6d6f35f2275

In [None]:
# List every installed package with its version, as a record of the environment
# the tuning below was run in.
!pip freeze

In [None]:
############ LIBRARIES

import os
import time
import datetime
import random
import multiprocessing
import pickle
import re
import copy
import gc
import sys
import json
gc.enable()

import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from itertools import cycle


import scipy.stats

#from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
#from sklearn.model_selection import StratifiedKFold  ##### what is this used for?
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score

# FOR CROSS VALIDATED HYPERPARAMETER TUNING
# use imblearn pipeline instead of sklearn pipeline to skip AL sampling process in the prediction phase
from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression

from alipy import ToolBox
from alipy import query_strategy
from alipy.index import IndexCollection
from alipy import data_manipulate
import cvxpy



In [None]:
# Work from the thesis folder on the mounted Drive; relative paths used later
# (e.g. the 'metaparameters/' folder written by append_record) resolve against it.
os.chdir('/gdrive/My Drive/ACTIVE LEARNING THESIS/')

In [None]:
# Display the current working directory to confirm the chdir took effect.
os.getcwd()

'/gdrive/My Drive/ACTIVE LEARNING THESIS'

# CV helper functions

In [None]:
############ RANDOMNESS
def seed_everything(seed):
    """Seed the random sources this notebook relies on.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global generator.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


# notebook-wide seed, applied once up front
seed = 30
seed_everything(seed)

In [None]:
# get strategy

def strategy_getter(X, y, strategy_name="QueryInstanceRandom", train_idx = None, **kwargs):
    """Return an instantiated query strategy from ``alipy.query_strategy``.

    Parameters
    ----------
    X, y :
        Feature matrix and labels, forwarded to the strategy constructor as
        ``X=`` and ``y=``.
    strategy_name : str
        Name of the strategy class to look up in ``alipy.query_strategy``.
    train_idx : optional
        Forwarded to the constructor as ``train_idx=`` only when it is not
        ``None``.
    **kwargs :
        Any further keyword arguments for the strategy constructor.

    Raises
    ------
    KeyError
        If ``alipy.query_strategy`` cannot be imported or exposes no attribute
        called ``strategy_name``.
    """
    # Function-scope import keeps this cell self-contained.
    import importlib

    # Plain attribute lookup instead of exec/eval on the caller-supplied name:
    # no arbitrary code execution, and no reliance on exec() being able to
    # inject names into this function's local namespace.
    try:
        strategy_cls = getattr(importlib.import_module("alipy.query_strategy"), strategy_name)
    except (ImportError, AttributeError):
        raise KeyError("Strategy " + strategy_name + " is not implemented in ALiPy.")

    if train_idx is not None:
        return strategy_cls(X=X, y=y, train_idx=train_idx, **kwargs)
    return strategy_cls(X=X, y=y, **kwargs)

In [None]:
#Custom Data Scaler for use in tuning pipeline
class CustomScaler(BaseEstimator, TransformerMixin):
  """RobustScaler that is fitted on the initial labeled half of the data only.

  ``fit`` draws one stratified shuffle split (seeded with ``seed``) that sets
  aside ``int(0.5 * len(X))`` rows, and fits the scaler on the remaining rows.
  AL_resampler re-creates the same split, so the scaling statistics come only
  from rows that are treated as labeled at the start.

  Parameters
  ----------
  with_centering, with_scaling : bool
      Passed through to ``RobustScaler``.
  seed : int
      ``random_state`` of the stratified split.
  """

  def __init__(self, with_centering=True, with_scaling=True, seed=30):
    self.seed = seed
    self.with_centering = with_centering
    self.with_scaling = with_scaling
    # Kept so the attribute exists right after construction; fit() rebuilds it.
    self.scaler = RobustScaler(with_centering=self.with_centering, with_scaling=self.with_scaling)

  #estimator method
  def fit(self, X, y):
    """Fit the inner scaler on the labeled part of one stratified split of (X, y)."""
    # Rebuild from the *current* parameters: values changed through set_params()
    # after construction would otherwise never reach the scaler being fitted.
    self.scaler = RobustScaler(with_centering=self.with_centering, with_scaling=self.with_scaling)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=int(0.5*len(X)), random_state=self.seed)
    label, _unlabel = next(sss.split(X=X, y=y))  # n_splits=1 -> exactly one split
    self.scaler.fit(X[np.asarray(label), :])
    return self

  #transformation
  def transform(self, X, y = None):
    """Scale ``X`` with the fitted scaler; ``y`` is ignored."""
    # The result is also stored on the instance, as the original class did.
    self.X = self.scaler.transform(X)
    return self.X


In [None]:
# Custom Sampler Class, only run in the fit phase of the pipeline, skipped in predict phase
#####################################
def AL_resampler(X=None
              , y=None
              , strategy_name = None
              , pass_index=False
              , model = None
              , seed=30
              , **kwargs):
  """Run one round of active-learning selection and return the labeled sample.

  Used as the ``func`` of an imblearn ``FunctionSampler``, so it only runs in
  the fit phase of the pipeline.

  The data is split once (stratified, seeded with ``seed``) into an initial
  labeled part and an unlabeled pool of ``int(0.5 * len(X))`` rows; this is the
  same split CustomScaler uses for fitting. The strategy named by
  ``strategy_name`` then selects ``int(0.5 * len(pool))`` rows from the pool,
  which are added to the labeled part.

  Parameters
  ----------
  X, y : array-like
      Features and labels; indexed positionally.
  strategy_name : str
      Name of the ALiPy query strategy class.
  pass_index : bool
      If True, the positions of all rows are handed to the strategy as
      ``train_idx`` (ignored for 'QueryInstanceLAL').
  model : estimator or None
      If given, it is fitted on the initial labeled part and passed to
      ``strategy.select``.
  seed : int
      Seed for the split and for the global random generators.
  **kwargs :
      Extra arguments for the strategy constructor. For 'QueryInstanceLAL' the
      keys ``reg_est``, ``reg_depth`` and ``reg_feat`` are taken out and passed
      to ``train_selector_from_file`` instead.

  Returns
  -------
  tuple
      ``(X[label_idx, :], y[label_idx])`` for the initial labeled rows plus the
      selected rows.
  """
  os.environ['PYTHONHASHSEED'] = str(seed)
  random.seed(seed)
  np.random.seed(seed)

  # re-create the sample that was used to scale the data!
  # scaling only sees labeled data, so there is no leakage; this is the initial
  # labeled sample that all strategies share
  sss = StratifiedShuffleSplit(n_splits=1, test_size=int(0.5*len(X)), random_state=seed) # THESE ARE POSITIONS, NOT INDICES
  label, unlabel = next(sss.split(X=X, y=y))  # n_splits=1 -> exactly one split
  label_idx, unlabel_idx = np.asarray(label), np.asarray(unlabel)

  n_instances = int(0.5*len(unlabel_idx))

  ##############
  # initialize AL strategy
  ##############
  if strategy_name == 'QueryInstanceLAL':
    # special case because of extra RF; might be prohibitively expensive to tune
    param_dict = {**kwargs}
    reg_est = param_dict.pop('reg_est', None)
    reg_depth = param_dict.pop('reg_depth', None)
    reg_feat = param_dict.pop('reg_feat', None)
    if reg_feat is not None:
      # cannot use more features than the data has
      reg_feat = min(reg_feat, np.shape(X)[1])

    strategy = strategy_getter(X, y, strategy_name = strategy_name, mode='LAL_iterative', train_slt=False, **param_dict)
    starttime = time.time()
    strategy.download_data()
    print('duration of data download', time.time() - starttime)

    # Only forward the regressor settings that were actually supplied.
    # NOTE(review): omitted ones presumably fall back to ALiPy's own defaults;
    # confirm against the installed ALiPy version.
    selector_args = {name: value
                     for name, value in (('reg_est', reg_est), ('reg_depth', reg_depth), ('feat', reg_feat))
                     if value is not None}
    strategy.train_selector_from_file(**selector_args)

  elif pass_index:
    t_idx = np.concatenate((label_idx, unlabel_idx))
    strategy = strategy_getter(X, y, strategy_name, t_idx, **kwargs)

  else:
    strategy = strategy_getter(X, y, strategy_name, **kwargs)

  ##############
  # run AL selection once
  ##############

  #fit model then pass it as argument to some AL strategies
  if model is not None:
    model.fit(X[label_idx], y[label_idx])

  if strategy_name == 'QueryInstanceQUIRE': #strategy quire has no batch mode
    label_idx = IndexCollection(label_idx)
    unlabel_idx = IndexCollection(unlabel_idx)

    print('starting selection')
    for n in range(n_instances):
      selection = strategy.select(label_index=label_idx.index, unlabel_index=unlabel_idx.index)[0] # returns a list of len one, hence the [0]
      label_idx.update(selection)
      unlabel_idx.difference_update(selection)
      if n%10 == 0:
        print(f"selected {n} cases")
    label_idx = label_idx.index

  elif strategy_name in ['QueryInstanceBMDR', 'QueryInstanceSPAL']:
    select_idx = strategy.select(label_idx, unlabel_idx, model=model,batch_size=n_instances, qp_solver = 'OSQP')
    label_idx = np.concatenate((label_idx, select_idx))

  elif model is None:
    select_idx = strategy.select(IndexCollection(label_idx), IndexCollection(unlabel_idx), batch_size=n_instances)
    label_idx = np.concatenate((label_idx, select_idx))

  else:
    select_idx = strategy.select(label_idx, unlabel_idx, model=model, batch_size=n_instances)
    label_idx = np.concatenate((label_idx, select_idx))

  return X[label_idx,:], y[label_idx]

In [None]:
def append_record(record, filename):
    """Append ``record`` as one JSON line to ``metaparameters/<filename>``.

    The ``metaparameters`` directory (relative to the current working
    directory) is created if it does not exist yet. Existing content of the
    file is kept; every call adds exactly one line.

    Raises
    ------
    TypeError
        If ``record`` is not JSON-serializable. Nothing is written in that case.
    """
    # Serialize first so a failure cannot leave a truncated line in the file.
    line = json.dumps(record)
    os.makedirs('metaparameters', exist_ok=True)
    with open(f'metaparameters/{filename}', 'a') as f:
        # '\n', not os.linesep: text mode already translates the newline, so
        # os.linesep would end up as '\r\r\n' on Windows.
        f.write(line + '\n')

# Load Data

In [None]:
############ DATA IMPORT

## available datasets

  # OK gmsc          # shape:  (150000, 68)
  # OK uk            # shape:  (30000, 51), y mean:  0.04
  # OK lendingclub   # shape:  (41623, 114) y mean:  0.1331235134420873    
  # OK bene2         # shape:  (7190, 28)
  # bene1            # shape:  (3123, 18)
  # hmeq             # shape:  (5960, 20)
  # australian       # shape:  (690, 42)
  # german           # shape:  (1000, 61)
  # thomas           # shape:  (1225, 28)
  # pakdd            # shape:  (50000, 373), y mean:  0.26082

dataset = "australian"

# prepared CSVs live on the mounted Google Drive
df = pd.read_csv('/gdrive/My Drive/ACTIVE LEARNING THESIS/prepared_data/{}.csv'.format(dataset))

# remove NA
df = df.dropna()
df.reset_index(drop = True, inplace = True)

# extract label: 'BAD' -> 1, 'GOOD' -> 0, any other value is kept and cast to int.
# A new column is assigned instead of the chained `df['BAD'][mask] = ...`, which
# writes to a temporary object and leaves df unchanged under pandas Copy-on-Write.
label_codes = {'BAD': 1, 'GOOD': 0}
df['BAD'] = df['BAD'].map(lambda value: label_codes.get(value, value)).astype('int')


y_temp = df['BAD']
del df['BAD']

#one hot encoding
df = pd.get_dummies(df)

# transform to numpy arrays; an explicit float dtype keeps X numeric even when
# get_dummies returns boolean indicator columns (mixed bool/number frames would
# otherwise convert to an object array)
X = df.to_numpy(dtype=float)
y = y_temp.to_numpy()

print("X type: ", type(X), "X shape: ",X.shape,"y shape: ", y.shape, "y mean: ", np.mean(y))
# X and df are distinct Python objects, as the two ids show
print (id(X), id(df))


X type:  <class 'numpy.ndarray'> X shape:  (690, 42) y shape:  (690,) y mean:  0.4449275362318841
140421149539664 140421705635216


In [None]:
# Name of the per-dataset results file. append_record() writes one JSON line per
# tuned model into 'metaparameters/<filename>'.
# Careful when re-tuning: append_record only appends and never removes existing
# lines, so the file can end up with several entries for the same model. Best run
# once, without a pre-existing parameter file for this dataset.
filename = f'{dataset}_tuned-params'
filename

'australian_tuned-params'

# Split off Validation Set

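Split off the validation set used for hyperparameter tuning. Datasets with more than 25,000 rows are reduced to a stratified sample of 7,500 rows and tuned with 3 CV folds; smaller datasets are used in full, with 10 folds below 2,000 rows and 5 folds otherwise.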

In [None]:
# Determine the number of CV folds from the dataset size and, for large datasets,
# reduce the tuning data to a stratified sample.
#   X_val / y_val     : data the hyperparameter searches are fitted on
#   X_model / y_model : rows not used for tuning (identical to the full data when
#                       no rows are split off); read by the results cell at the end
if len(y) > 25000:
    validation_size = 7500
    folds = 3
    sss = StratifiedShuffleSplit(n_splits=1, test_size=validation_size, random_state=seed)
    # n_splits=1 -> exactly one (remaining, validation) pair of position arrays
    model_idx, validation_idx = (np.asarray(part) for part in next(sss.split(X=X, y=y)))

    # sizes of the two parts (the sum of the positions carries no information)
    print(len(model_idx), len(validation_idx))
    X_val, y_val = X[validation_idx,:], y[validation_idx]
    X_model, y_model = X[model_idx,:], y[model_idx]

elif len(y) < 2000:
    folds = 10
    X_val, y_val = X, y
    X_model, y_model = X, y

else:
    folds = 5
    X_val, y_val = X, y
    X_model, y_model = X, y
    
print(np.shape(X), np.shape(X_val), 'folds: ', folds)

(690, 42) (690, 42) folds:  10


# CV Run

In [None]:
# imblearn's Pipeline (not sklearn's) is used so that the AL sampling step is
# skipped in the prediction phase.
CLF_pipe = Pipeline([
    ('scaler', CustomScaler(seed=seed)),
    ('AL_sampler', FunctionSampler(func=AL_resampler)),    # this will trigger a call to __init__
    ('clf', LogisticRegression(random_state=seed)),
])


In [None]:
## RUN TUNING FOR CLASSIFIER FIRST!
# The logistic regression is tuned with random sampling as the AL step; the
# best parameters are appended to the dataset's results file under `key`.

for key in ['random']:

  param_grid={'clf__solver': ['liblinear'], 
              'clf__penalty': ['l1', 'l2'], 
              'clf__C': [0.001,0.01,0.1,1.,10.,100.], 
              'clf__max_iter': [50, 100, 250, 500], 
              'clf__tol': [0.001, 0.0001, 0.00001],               
              'AL_sampler__kw_args': [{'strategy_name': 'QueryInstanceRandom'}]}

  # cv=folds: use the fold count chosen from the dataset size, exactly like the
  # AL tuning further down, instead of a fixed 10.
  grid_search_clf = RandomizedSearchCV(CLF_pipe, param_distributions=param_grid, n_iter = 500, cv=folds, n_jobs=20, verbose=5, error_score="raise", refit='roc_auc', scoring=['roc_auc'], random_state=seed)
  #random search covers all possible parameters, so it is equivalent to running gridsearch
  grid_search_clf.fit(X_val, y_val)

  ### SAVE RESULTS
  # AL settings of the best candidate, without the entries that are only
  # plumbing for AL_resampler
  AL_params = copy.deepcopy(grid_search_clf.best_params_['AL_sampler__kw_args'])
  AL_params.pop('pass_index', None)
  AL_params.pop('model', None)
  AL_params.pop('skip', None)

  # remaining best parameters belong to the classifier step
  CLF_params = copy.deepcopy(grid_search_clf.best_params_)
  CLF_params.pop('AL_sampler__kw_args', None)
  CLF_params.pop('outlier__kw_args', None)

  # strip the pipeline step prefix so the keys are plain estimator parameters
  for clf_key in list(CLF_params.keys()):
    new_key = re.sub(r'clf__', '', clf_key)
    CLF_params[new_key] = CLF_params.pop(clf_key)

  cv_param_dict = {f'{key}': {#'outlier_rf': outlier_params,
                 'AL': AL_params,
                 'CLF': CLF_params}}
  
  append_record(cv_param_dict, filename)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


In [None]:
#Define tunable parameters for AL models

# Best classifier settings from the previous search, with the AL entry removed
# and the 'clf__' pipeline prefix stripped from the keys.
clf_tuned_params = {
    re.sub(r'clf__', '', name): value
    for name, value in copy.deepcopy(grid_search_clf.best_params_).items()
    if name != 'AL_sampler__kw_args'
}

clf = LogisticRegression(random_state=seed).set_params(**clf_tuned_params)
print(str(clf))

# Same pipeline layout as for classifier tuning, now with the tuned classifier.
# (an 'outlier' FunctionSampler step in front of the scaler is currently disabled)
AL_pipe = Pipeline([
    ('scaler', CustomScaler(seed=seed)),
    ('AL_sampler', FunctionSampler(func=AL_resampler)),    # this will trigger a call to __init__
    ('clf', clf),
])

# One list of candidate kw_args dicts per AL strategy; each dict is handed to
# AL_resampler through FunctionSampler's kw_args.
cv_parameters = {}

cv_parameters['unc'] = [
    {'strategy_name': 'QueryInstanceUncertainty', 'model': clf, 'measure': m}
    for m in ['entropy', 'least_confident', 'margin', 'distance_to_boundary']
]

cv_parameters['qbc'] = [
    {'strategy_name': 'QueryInstanceQBC', 'model': clf, 'method': 'query_by_bagging', 'disagreement': d}
    for d in ['vote_entropy', 'KL_divergence']
]

# NOTE(review): 'uncertainty_meansure' is spelled exactly as in the original cell;
# presumably it matches the keyword ALiPy expects, so confirm before "correcting" it.
cv_parameters['dw'] = [
    {'strategy_name': 'QueryInstanceDensityWeighted', 'model': clf, 'uncertainty_meansure': u, 'distance': d, 'beta': b}
    for u in ['least_confident', 'margin', 'entropy']
    for d in ['cityblock', 'cosine', 'euclidean']
    for b in [0.5, 1, 2]
]

cv_parameters['density'] = [
    {'strategy_name': 'QueryInstanceGraphDensity', 'pass_index': True, 'metric': m}
    for m in ['canberra', 'jaccard', 'cosine', 'hamming']
]

cv_parameters['cors'] = [
    {'strategy_name': 'QueryInstanceCoresetGreedy', 'pass_index': True, 'distance': d}
    for d in ['cityblock', 'cosine', 'euclidean']
]

cv_parameters['lal'] = [
    {'strategy_name': 'QueryInstanceLAL', 'cls_est': cls_est, 'reg_est': reg_est, 'reg_depth': reg_depth, 'reg_feat': reg_feat}
    for cls_est in [16, 32, 64]
    for reg_est in [32, 64, 128]
    for reg_depth in [5, 10, 20]
    for reg_feat in [5, 6, 7]
]

# parameter rho is not tuned because it has massive effect on computation times
cv_parameters['spal'] = [
    {'strategy_name': 'QueryInstanceSPAL', 'kernel': 'rbf', 'mu': mu, 'gamma': g, 'lambda_init': li, 'lambda_pace': lp, 'rho': 10}
    for mu in [0.01, 0.1, 1]
    for g in [0.01, 0.1, 1]
    for li in [0.01, 0.1, 1]
    for lp in [0.001, 0.01, 0.1]
]

# issues with the solver ECOS; parameter rho is not tuned because it has massive effect on computation times
cv_parameters['bmdr'] = [
    {'strategy_name': 'QueryInstanceBMDR', 'kernel': 'rbf', 'beta': b, 'gamma': g, 'rho': 10}
    for b in [100, 1000, 10000]
    for g in [0.01, 0.1, 1]
]

# 'eer' has no tunable parameters
# 'quire' does not run stable enough for tuning on all datasets, some parameter combinations seem to cause issues

LogisticRegression(max_iter=50, penalty='l1', random_state=30,
                   solver='liblinear', tol=1e-05)


In [None]:
# tune AL models
# One search per strategy in cv_parameters; the best AL settings of each search
# are appended to the dataset's results file, and run times are printed in hours.

start = time.time()
print(start)

for key, candidates in cv_parameters.items(): #eer, quire, lal, spal, bmdr
  loop_start = time.time()
  param_grid = {'AL_sampler__kw_args': candidates}

  grid_search_AL = RandomizedSearchCV(AL_pipe, param_distributions=param_grid, n_iter=150, cv=folds, n_jobs=20, verbose=5, error_score="raise", refit='roc_auc', scoring=['roc_auc'], random_state=seed)
  grid_search_AL.fit(X_val, y_val)

  # AL settings of the winner, minus the entries that are only plumbing
  AL_params = {
      name: value
      for name, value in copy.deepcopy(grid_search_AL.best_params_['AL_sampler__kw_args']).items()
      if name not in ('pass_index', 'model', 'skip')
  }

  # everything else in best_params_, with the 'clf__' step prefix stripped
  # NOTE(review): only 'AL_sampler__kw_args' is searched here, so this dict ends
  # up empty; confirm that is intended.
  CLF_params = {
      re.sub(r'clf__', '', name): value
      for name, value in copy.deepcopy(grid_search_AL.best_params_).items()
      if name != 'AL_sampler__kw_args'
  }

  cv_param_dict = {f'{key}': {'AL': AL_params,
                              'CLF': CLF_params}}

  append_record(cv_param_dict, filename)

  print(f'{key} time:',(time.time() - loop_start)/3600)
print('total time:',(time.time() - start)/3600)

1658736200.6394048
Fitting 10 folds for each of 4 candidates, totalling 40 fits
unc time: 0.0002417404121822781
Fitting 10 folds for each of 2 candidates, totalling 20 fits
qbc time: 0.000354471140437656
Fitting 10 folds for each of 27 candidates, totalling 270 fits
dw time: 0.0016161796119478014
Fitting 10 folds for each of 4 candidates, totalling 40 fits
density time: 0.03715307745668623
Fitting 10 folds for each of 3 candidates, totalling 30 fits
cors time: 0.0007801794343524509
Fitting 10 folds for each of 81 candidates, totalling 810 fits


In [None]:
#### PRINT RESULTS
# Reports the search stored in grid_search_AL, i.e. the last strategy tuned.
print("Best parameters set found on development set:", "\n")
print(grid_search_AL.best_params_)
print("Grid scores on development set:", "\n")
means_roc = grid_search_AL.cv_results_['mean_test_roc_auc']
stds_roc = grid_search_AL.cv_results_['std_test_roc_auc']
# distinct loop names, so the score arrays are not overwritten while iterating
for mean_roc, std_roc, params in zip(means_roc, stds_roc, grid_search_AL.cv_results_['params']):
  print("%0.3f (+/-%0.03f) for %r"
        % (mean_roc, std_roc * 2, params))

print("\n", "Detailed classification report:", "\n")
print("The model is trained on the full development set.", "\n")
print("The scores are computed on the full evaluation set.", "\n")
# NOTE(review): X_model / y_model are not created in this cell; they must already
# exist in the kernel, otherwise the next line raises NameError.
y_pred = grid_search_AL.predict(X_model)
print(classification_report(y_model, y_pred))
