In [107]:
import numpy as np
import pandas as pd
import nevergrad as ng
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression, RidgeClassifier

import sys
sys.path.append('utils')

import dataframe_utils
import preprocess_utils
import model_utils
import metrics_utils

### Continuous hyperparameter optimization

In [14]:
# Genetic Algorithm
# [ ] Drop columns not marked in the bit vector 
# [ ] Drop rows with less than X% non-missing values (?)
# [ ] Drop columns with less than N% of the maximum number of non-missing values (?)
# [ ] Drop all NAs
# [ ] Drop all columns with more than 99% of equal values
# [ ] Train and validate model
# [ ] Maximize score function = ( 1 * ((rows * columns) / total datapoints) + 4 * f1_score)/5

# [ ] Use Miss Forest to impute data instead of removing NAs
# If using Miss Forest, final score is rows * columns - datapoints imputed


# - Drop vectors not selected by the column array
# - Drop rows with less than X% non-missing values
# - Drop all NAs
# - Calculate f1

Import dataset for optimization

In [53]:
# Load the spreadsheet and discard the patient identifier together with the
# three admission-indicator columns.
df = pd.read_excel('datasets/dataset.xlsx').drop(columns = [
        'Patient ID',
        'Patient addmited to intensive care unit (1=yes, 0=no)',
        'Patient addmited to semi-intensive unit (1=yes, 0=no)',
        'Patient addmited to regular ward (1=yes, 0=no)'])

# Name of the target column.
class_label = 'SARS-Cov-2 exam result'

# Flag the columns that hold no non-null value at all and remove them.
empty_features = df.count() == 0
df = df.loc[:, ~empty_features.values]

# Split into predictors and a single-column target frame.
df_x = df.drop(columns = [class_label])
df_y = df.loc[:, [class_label]]

In [83]:
def function_to_optimize1(args,verbose = False):
    """Objective function driven by three completeness thresholds.

    The module-level ``df`` is filtered with ``drop_by_data_threshold`` by
    column, then by row, then by column again, using ``args[0]``, ``args[1]``
    and ``args[2]`` respectively. Remaining rows containing NaN are dropped,
    the frame is encoded, split and handed to ``find_best_model``.

    Parameters
    ----------
    args : indexable
        Must provide at least three thresholds at positions 0, 1 and 2.
    verbose : bool, default False
        When True, print the frame shape and score and return the model
        details instead of the scalar objective.

    Returns
    -------
    ``1`` when the filtered frame has fewer than 30 cells; otherwise
    ``-best_score``, or ``(best_model, best_params, best_score)`` when
    ``verbose`` is True.
    """
    # Each step consumes the output of the previous one.
    filter_steps = (
        (args[0], 'column'),
        (args[1], 'row'),
        (args[2], 'column'),
    )
    df_final = df
    for step_threshold, step_axis in filter_steps:
        df_final = drop_by_data_threshold(df_final, threshold = step_threshold, axis = step_axis)
    df_final.dropna(inplace = True)

    if verbose:
        print(df_final.shape)

    # Too little data left to train on: return the fixed value 1.
    n_rows, n_cols = df_final.shape
    if n_rows * n_cols < 30:
        return 1

    df_final = encode_features(df_final)
    X_train_norm, _X_validation_norm, y_train, _y_validation = prepare_data_for_model(df_final)
    best_model, best_params, best_score = find_best_model(X_train_norm, y_train)

    if not verbose:
        # The score is negated before being returned.
        return -best_score

    print('shape = ',df_final.shape)
    print('f1 = ', best_score)
    return best_model, best_params, best_score

In [84]:
def function_to_optimize2(*args,verbose = False):
    """Objective function: score a column selection plus a row threshold.

    Parameters
    ----------
    *args
        Every positional value except the last is passed as the column
        indexer to ``df_x.loc[:, ...]`` (presumably a boolean mask with one
        entry per column of ``df_x`` -- confirm against the optimizer
        parametrization). The last value is the threshold given to
        ``drop_by_data_threshold`` with ``axis='row'``.
    verbose : bool, default False
        When True, print the frame shape and score and return the model
        details instead of the scalar objective.

    Returns
    -------
    ``1`` when the filtered frame has fewer than 30 cells; otherwise
    ``-best_score``, or ``(best_model, best_params, best_score)`` when
    ``verbose`` is True.
    """
    inputs = list(args)

    column_selection = inputs[:-1]
    row_threshold = inputs[-1]

    # Put the target column back next to the selected predictors.
    df_selected = df_x.loc[:,column_selection]
    df_selected = pd.concat([df_y, df_selected], axis=1, sort=False)

    df_final = drop_by_data_threshold(df_selected,threshold = row_threshold, axis = 'row')
    # Reassign rather than drop in place: df_final is derived from another
    # frame, and in-place mutation of such objects triggers pandas'
    # SettingWithCopyWarning.
    df_final = df_final.dropna()

    if verbose: print(df_final.shape)
    # Too little data left to train on: return the fixed value 1.
    if (df_final.shape[0] * df_final.shape[1]) < 30: return 1

    df_final = encode_features(df_final)
    X_train_norm, X_validation_norm, y_train, y_validation = prepare_data_for_model(df_final)
    best_model, best_params, best_score = find_best_model(X_train_norm,y_train)

    # This branch previously tested `return_model`, a name that is not defined
    # in this function, so the call raised NameError on a fresh kernel. The
    # `verbose` flag is the intended switch, as in function_to_optimize1.
    if verbose:
        print('shape = ',df_final.shape)
        print('f1 = ', best_score)
        return best_model, best_params, best_score

    return -best_score

In [85]:
def find_best_model(X_train,y_train):
    """Search logistic-regression hyperparameters with 5-fold cross-validation.

    Delegates to
    ``model_utils.find_best_classification_model_with_cross_validation``
    with a class-weight-balanced ``LogisticRegression`` and the ``'f1'`` metric.

    Parameters
    ----------
    X_train : object exposing ``.values``
        Training features.
    y_train : object exposing ``.values``
        Training target; it is flattened with ``ravel()`` before the search.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_score)`` as produced by the helper.
    """
    search_space = {
        'C' : [0.1, 1, 10, 100],
        'penalty' : ['l2', 'elasticnet'],
        'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    }
    estimator = LogisticRegression(random_state=0, class_weight = 'balanced')

    features = X_train.values
    target = y_train.values.ravel()

    model, params, cv_score = model_utils.find_best_classification_model_with_cross_validation(
        estimator,
        search_space,
        features,
        target,
        k_folds = 5,
        metric = 'f1')
    return model, params, cv_score

def encode_features(df):
    """Encode the categorical test results and the class label as integers.

    In the object-dtype columns reported by
    ``dataframe_utils.get_column_categories``, ``'detected'`` becomes 1 and
    ``'not_detected'`` becomes 0. In the ``class_label`` column,
    ``'positive'`` becomes 1 and ``'negative'`` becomes 0. Every column whose
    dtype is not float64 is then cast to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``class_label`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after encoding.
    """
    mask_non_numeric = df.dtypes == object
    mask_non_numeric = mask_non_numeric[mask_non_numeric]
    # presumably the returned mapping is keyed by column name -- verify
    # against dataframe_utils.
    unique_values = dataframe_utils.get_column_categories(df[mask_non_numeric.index],verbose = 0)

    for col in unique_values.keys():
        df.loc[df[col] == 'detected',col] = 1
        df.loc[df[col] == 'not_detected',col] = 0

    df.loc[df[class_label] == 'positive',class_label] = 1
    df.loc[df[class_label] == 'negative',class_label] = 0

    # `np.float` was only an alias of the builtin float (i.e. float64) and was
    # removed in NumPy 1.24; np.float64 performs the same dtype comparison on
    # every NumPy version.
    for col in df.columns:
        if df[col].dtypes == np.float64:
            continue
        df[col] = df[col].astype(int)
    return df

def prepare_data_for_model(df):
    """Split ``df`` into train/validation sets and min-max scale the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded frame containing the ``class_label`` column and the features.

    Returns
    -------
    tuple
        ``(X_train_norm, X_validation_norm, y_train, y_validation)``. The two
        feature frames are rebuilt from the scaled arrays, so they carry a
        default index, while the target frames come straight from
        ``train_test_split``.
    """
    columns_x = df.drop(columns = [class_label]).columns
    column_y = [class_label]

    # Fixed 80/20 split with a fixed seed so repeated evaluations are comparable.
    X_train, X_validation, y_train, y_validation = train_test_split(
    df[columns_x], df[column_y],  test_size=0.20, random_state=42)

    # Fit the scaler on the training rows only. Fitting on the whole frame
    # (as was done before) lets the validation rows influence the min/max
    # used to transform the training data, i.e. leaks validation information.
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    X_train_norm = pd.DataFrame(data=scaler.transform(X_train),columns=X_train.columns)
    X_validation_norm = pd.DataFrame(data=scaler.transform(X_validation),columns=X_train.columns)
    return X_train_norm, X_validation_norm, y_train, y_validation


def drop_by_data_threshold(df,threshold,axis):
    """Keep only the rows or columns of ``df`` that hold enough non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    threshold : float
        Fraction of the maximum possible number of non-null values an entry
        must reach to be kept. The required count is
        ``int(threshold * size)``, where ``size`` is the number of columns
        when filtering rows and the number of rows when filtering columns.
    axis : {'row', 'col', 'column', 0, 1}
        ``'row'`` (or 1) filters rows; ``'col'``/``'column'`` (or 0) filters
        columns.

    Returns
    -------
    pandas.DataFrame
        The filtered frame.
    """
    # Translate the textual axis name into the number used by DataFrame.count.
    if axis == 'row':
        axis_number = 1
    elif axis in ('col', 'column'):
        axis_number = 0
    else:
        axis_number = axis

    minimum_count = int(threshold * df.shape[axis_number])
    keep = df.count(axis = axis_number) >= minimum_count

    if axis_number == 1:
        return df[keep]
    if axis_number == 0:
        return df.loc[:, keep.values]

In [71]:
# The final result was unsatisfactory most of the time. Only once did I reach an f1 of 0.699, the highest so far.

# exhaustive search
def grid_search(*args):

    itens = [x for x in args]
    list_size = len(itens)
    current = 0

    threshold1 = np.arange(0, 1.05, 0.05)
    threshold2 = np.arange(0, 1.05, 0.05)
    threshold3 = np.arange(0, 1.05, 0.05)

    options = np.meshgrid(threshold1, threshold2, threshold3, sparse=False)

    comb = np.array(np.meshgrid(x, y, z)).T.reshape(-1,3)
    comb.shape