In [2]:
# # Caso ja não tenha instalado, instale os seguintes pacotes:

# !pip install cmake
# !pip install xgboost
# !pip install sklearn
# !pip install sklearn-deap
# # (para usar o EvolutionarySearch >> https://github.com/rsteca/sklearn-deap)
# !pip install scikit-plot

In [3]:
import multiprocessing 
 
print(multiprocessing.cpu_count()) 

64


In [4]:
import pandas as pd
import pyspark.sql.functions as F
from datetime import datetime
from pyspark.sql.types import *
from pyspark import StorageLevel

import numpy as np
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("mode.chained_assignment", None)

In [5]:
import xgboost as xgb #XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
import matplotlib.pylab as plt

from sklearn import metrics
from evolutionary_search import EvolutionaryAlgorithmSearchCV
import scikitplot as skplt

In [6]:
import sklearn
import scikitplot as skplt
from sklearn.metrics import classification_report, confusion_matrix, precision_score

<hr />
<hr />
<hr />

In [7]:
# undersamp_col = ['02-KMODES', '03-STRSAMP-AG', '04-STRSAMP-EW'] 
# dfs = ['ds-1', 'ds-2', 'ds-3']
# cols_sets = ['cols_set_1', 'cols_set_2', 'cols_set_3']

undersamp_col = ['02-KMODES']
dfs = ['ds-1']
cols_sets = ['cols_set_3']

In [8]:
# lists of params
model_MaxEstimators = [50] 
model_maxDepth = [10] 

list_of_param_dicts = []

for maxIter in model_MaxEstimators:
    for maxDepth in model_maxDepth: 
        params_dict = {}
        params_dict['MaxEstimators'] = maxIter
        params_dict['maxDepth'] = maxDepth
        list_of_param_dicts.append(params_dict)

print("There is {} set of params.".format(len(list_of_param_dicts)))
# list_of_param_dicts

There is 1 set of params.


In [9]:
prefix = 'gs://ai-covid19-datalake/trusted/experiment_map/'

<hr />
<hr />
<hr />

In [10]:
# filename = 'gs://ai-covid19-datalake/trusted/experiment_map/02-KMODES/ds-1/cols_set_1/experiment0.parquet'
# df = spark.read.parquet(filename).sample(0.3)

In [11]:
# df = df.toPandas()

In [12]:
# params_dict = {'MaxEstimators': 10,
#                'maxDepth': 3}
# cols = 'cols_set_1'
# experiment_filter = 'ds-1'
# undersampling_method = '03-STRSAMP-AG', 
# experiment_id = 0

In [13]:
# model = run_xgboost(df, params_dict, cols, filename, experiment_filter, undersampling_method, experiment_id)

In [14]:
# print('finished')

In [15]:
# model['model_time_exec']

In [16]:
# model['model_AUC_PR']

<hr />
<hr />
<hr />

In [17]:
# Ref: https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 


def run_xgboost(exp_df, params_dict, cols, filename, experiment_filter, 
            undersampling_method, experiment_id):
    
    import time
    start_time = time.time()
    
    n_covid = len(exp_df[exp_df['CLASSI_FIN']==1.0])
    n_not_covid = len(exp_df[exp_df['CLASSI_FIN']==0.0])
    
    id_cols = ['NU_NOTIFIC', 'CLASSI_FIN']
    
    for column in exp_df.columns: 
        exp_df[column] = exp_df[column].astype("category")
        if column != "CLASSI_FIN":
            exp_df = encode_and_bind(exp_df, column)
            
    # Seleção das variáveis que serão submetidas ao modelo
    x = exp_df.drop("CLASSI_FIN", axis=1)
    y = exp_df.CLASSI_FIN
    
    
    X_train, X_test, y_train, y_test = train_test_split(x, 
                                                        y, 
                                                        test_size=0.3, 
                                                        random_state=2021)

    # Gerando o modelo
    model = xgb.XGBClassifier(objective="binary:logistic",
                              n_jobs = 30,
                              colsample_bytree = 0.3,
                              learning_rate=0.1, 
                              max_depth= params_dict['maxDepth'],
                              n_estimators = params_dict['MaxEstimators'],
                              min_child_weight = 1,
                              subsample=0.5, 
                              scale_pos_weight=2, 
                              eval_metric="error", 
                              booster='dart')

    model.fit(X_train,y_train)
    # Predizendo
    pred = model.predict(X_test)
    
    
    # validation 
    fpr, tpr, thresholds_auc_roc = metrics.roc_curve(y_test, pred)
    auc_ROC = metrics.auc(fpr, tpr)
    
    precision, recall, thresholds = metrics.precision_recall_curve(y_test, pred)
    aupr_ROC = metrics.auc(recall, precision)

    de_para = {1.0: 'covid', 0.0: 'nao_covid'}
    y_test = y_test.replace(de_para)
    pred = pd.Series(pred).replace(de_para)

    report = metrics.classification_report(y_test,pred,output_dict=True)
    conf_matrix = metrics.confusion_matrix(y_test, pred)
    
    
    # Gerando os metadados
    result_dict = {}

    result_dict['experiment_filter'] = experiment_filter
    result_dict['undersampling_method'] = undersampling_method
    result_dict['filename'] = filename
    result_dict['experiment_id'] = experiment_id
    result_dict['n_covid'] = n_covid
    result_dict['n_not_covid'] = n_not_covid
    result_dict['model_name'] = 'XGBoost'
    result_dict['params'] = params_dict
    result_dict['model_AUC_ROC'] = auc_ROC
    result_dict['model_AUC_PR'] = aupr_ROC
    result_dict['model_covid_precision'] = report['covid']['precision']
    result_dict['model_covid_recall'] = report['covid']['recall']
    result_dict['model_covid_f1'] = report['covid']['f1-score']
    result_dict['model_not_covid_precision'] = report['nao_covid']['precision']
    result_dict['model_not_covid_recall'] = report['nao_covid']['recall']
    result_dict['model_not_covid_f1'] = report['nao_covid']['f1-score']
    result_dict['model_avg_precision'] = report['macro avg']['precision']
    result_dict['model_avg_recall'] = report['macro avg']['recall']
    result_dict['model_avg_f1'] = report['macro avg']['f1-score']
    result_dict['model_avg_acc'] = report['accuracy']
    result_dict['model_TP'] = conf_matrix[0][0]
    result_dict['model_TN'] = conf_matrix[1][1]
    result_dict['model_FN'] = conf_matrix[0][1]
    result_dict['model_FP'] = conf_matrix[1][0]
    result_dict['model_time_exec'] = time.time() - start_time
    result_dict['model_col_set'] = cols
    
    return result_dict

<hr />
<hr />
<hr />

In [18]:
experiments = []

### Datasets:

In [None]:
for uc in undersamp_col: 
    for ds in dfs:
        for col_set in cols_sets:
            for params_dict in list_of_param_dicts: 
                for id_exp in range(5):
                    filename = prefix + uc + '/' + ds + '/' + col_set + '/' + 'experiment' + str(id_exp) + '.parquet'
                    exp_dataframe = spark.read.parquet(filename)
                    exp_dataframe = exp_dataframe.toPandas()
                    print('read {}'.format(filename))
                    
                    undersampling_method = uc
                    experiment_filter = ds
                    experiment_id = id_exp

                    try:                     
                        model = run_xgboost(exp_dataframe, params_dict, col_set, filename, experiment_filter, undersampling_method, experiment_id)
                        experiments.append(model)

                        print("Parameters ==> {}\n Results: \n AUC_PR: {} \n Precision: {} \n Time: {}".format(str(params_dict), str(model['model_AUC_PR']), str(model['model_avg_precision']), str(model['model_time_exec'])))
                        print('=========================== \n')
                    except:
                        print('=========== W A R N I N G =========== \n')
                        print('Something wrong with the exp: {}, {}, {}'.format(filename, params_dict, col_set))

read gs://ai-covid19-datalake/trusted/experiment_map/02-KMODES/ds-1/cols_set_3/experiment0.parquet




Parameters ==> {'MaxEstimators': 50, 'maxDepth': 10}
 Results: 
 AUC_PR: 0.9368176488063367 
 Precision: 0.9262476181826504 
 Time: 6478.095958471298

read gs://ai-covid19-datalake/trusted/experiment_map/02-KMODES/ds-1/cols_set_3/experiment1.parquet




In [None]:
for i in range(len(experiments)):
    for d in list(experiments[i].keys()):
        experiments[i][d] = str(experiments[i][d])

In [None]:
# experiments

In [None]:
cols = ['experiment_filter', 'undersampling_method', 'filename', 'experiment_id', 'n_covid', 'n_not_covid', 'model_name', 'params', 'model_AUC_ROC', 'model_AUC_PR', 'model_covid_precision', 'model_covid_recall', 'model_covid_f1', 'model_not_covid_precision', 'model_not_covid_recall', 'model_not_covid_f1', 'model_avg_precision', 'model_avg_recall', 'model_avg_f1', 'model_avg_acc', 'model_TP', 'model_TN', 'model_FN', 'model_FP', 'model_time_exec', 'model_col_set']

In [None]:
intermed_results = spark.createDataFrame(data=experiments).select(cols)
intermed_results.toPandas()

In [None]:
intermed_results.write.parquet('gs://ai-covid19-datalake/trusted/intermed_results/KMODES/XGBOOST_experiments-kmodes-ds1-cs3.parquet', mode='overwrite')

In [None]:
print('finished')

In [None]:
intermed_results.show()