In [1]:
import pandas as pd
import pyspark.sql.functions as F
from datetime import datetime
from pyspark.sql.types import *
from pyspark import StorageLevel

import numpy as np
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("mode.chained_assignment", None)

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [3]:
# !pip install scikit-plot

In [4]:
import sklearn
import scikitplot as skplt
from sklearn.metrics import classification_report, confusion_matrix, precision_score

<hr />
<hr />
<hr />

In [5]:
result_schema = StructType([
                    StructField('experiment_filter', StringType(), True),
                    StructField('undersampling_method', StringType(), True),
                    StructField('undersampling_column', StringType(), True),
                    StructField('filename', StringType(), True),
                    StructField('experiment_id', StringType(), True),
                    StructField('n_covid', IntegerType(), True),
                    StructField('n_not_covid', IntegerType(), True),
                    StructField('model_name', StringType(), True),
                    StructField('model_seed', StringType(), True),
                    StructField('model_maxIter', IntegerType(), True),
                    StructField('model_maxDepth', IntegerType(), True),
                    StructField('model_maxBins', IntegerType(), True),
                    StructField('model_minInstancesPerNode', IntegerType(), True),
                    StructField('model_minInfoGain', FloatType(), True),
                    StructField('model_featureSubsetStrategy', StringType(), True),
                    StructField('model_n_estimators', IntegerType(), True),
                    StructField('model_learning_rate', FloatType(), True),
                    StructField('model_impurity', StringType(), True),
                    StructField('model_AUC_ROC', StringType(), True),
                    StructField('model_AUC_PR', StringType(), True),
                    StructField('model_covid_precision', StringType(), True),
                    StructField('model_covid_recall', StringType(), True),
                    StructField('model_covid_f1', StringType(), True),
                    StructField('model_not_covid_precision', StringType(), True),
                    StructField('model_not_covid_recall', StringType(), True),
                    StructField('model_not_covid_f1', StringType(), True),
                    StructField('model_avg_precision', StringType(), True),
                    StructField('model_avg_recall', StringType(), True),
                    StructField('model_avg_f1', StringType(), True),
                    StructField('model_avg_acc', StringType(), True),
                    StructField('model_TP', StringType(), True),
                    StructField('model_TN', StringType(), True),
                    StructField('model_FN', StringType(), True),
                    StructField('model_FP', StringType(), True),
                    StructField('model_time_exec', StringType(), True),
                    StructField('model_col_set', StringType(), True)
                          ])

In [6]:
# GBT PARAMS

# params_dict = {'maxIter': 20, ===> [100, 200, 300]
#               'maxDepth': 3, ===> [3, 5, 10]
#               'maxBins': 20, ===> [16, 32, 64]
#               'learningRate': 0.01,===> [0.01, 0.1, 0.5]
#               'loss': 'logLoss'}  ===> ["logLoss", "leastSquaresError", "leastAbsoluteError"]

# .addGrid(gbt.seed, [2021]) \
#                             .addGrid(gbt.maxIter, [100, 300, 500]) \
#                             .addGrid(gbt.maxDepth, [3, 5, 10, 15]) \
#                             .addGrid(gbt.maxBins, [50, 200]) \
#                             .addGrid(gbt.minInstancesPerNode, [1, 15]) \
#                             .addGrid(gbt.minInfoGain, [0, 3]) \
#                             .addGrid(gbt.featureSubsetStrategy, ['all', 'auto']).build()


# XGBOOST PARAMS
#     'max_depth': range (2, 10, 1),
#     'n_estimators': range(60, 220, 40),
#     'learning_rate': [0.1, 0.01, 0.05]


# RF PARAMS
#                             .addGrid(rf.seed, [2021]) \
#                             .addGrid(rf.numTrees, range(10, 150, 30)) \
#                             .addGrid(rf.maxDepth, range(3, 15, 2)) \
#                             .addGrid(rf.maxBins, range(10, 90, 30)) \
#                             .addGrid(rf.minInstancesPerNode, range(1, 100, 20)) \
#                             .addGrid(rf.minInfoGain, range(0, 10, 2)) \
#                             .addGrid(rf.featureSubsetStrategy, ['all', 'auto', 'onethird', 'sqrt', 'log2']) \
#                             .addGrid(rf.impurity, ['gini', 'entropy']).build()

<hr />
<hr />
<hr />

In [7]:
cols_sets = {'cols_set_1': ['NU_NOTIFIC', 'CLASSI_FIN', 'CRITERIO', 'EVOLUCAO', 
                            'AGE_GROUP', 'DIST_PRI_NOTIFIC_Q',
                            'DIST_PRI_INTERNA_Q', 'DIST_PRI_ENTUTI_Q', 'DIST_PRI_SAIDUTI_Q', 'DIST_PRI_EVOLUCA_Q', 'DIST_PRI_ENCERRA_Q',
                            'SYMP_GROUP1', 'SYMP_GROUP2', 'SYMP_GROUP3', 'SYMP_GROUP4',
                            'RF_GROUP1', 'RF_GROUP2', 'RF_GROUP3', 'RF_GROUP4',
                            'SUPORT_VEN', 'UTI', 'HOSPITAL',
                            'DIST_PRI_RAIOX_Q', 'DIST_PRI_COLETA_Q', 'DIST_PRI_TOMO_Q', 'DIST_PRI_IF_Q', 'DIST_PRI_TRA_Q', 'DIST_PRI_PCR_Q', 'DIST_PRI_SOR_Q',
                            'GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q', 'GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q', 'GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q', 'GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q',
                            'GMR_TRANSIT_STATIONS_2WEEKS_Q', 'GMR_RETAIL_AND_RECREATION_2WEEKS_Q', 'GMR_RESIDENTIAL_PERCENT_2WEEKS_Q', 'GMR_WORKPLACES_PERCENT_2WEEKS_Q',
                            'INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q', 'INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q'],
             'cols_set_2': ['CS_SEXO','CS_RACA', 'CRITERIO', 'SURTO_SG', 'SUPORT_VEN', 'EVOLUCAO',
                            'OUTRO_SIN', 'AGE_GROUP', 
                            'SYMP_GROUP1',  'SYMP_GROUP2', 'SYMP_GROUP3', 'SYMP_GROUP4',
                            'RF_GROUP1', 'RF_GROUP2', 'RF_GROUP3',  'RF_GROUP4',
                            'DIST_PRI_EVOLUCA_Q', 'DIST_PRI_ENCERRA_Q', 'DIST_PRI_INTERNA_Q',
                            'DIST_PRI_NOTIFIC_Q', 'DIST_PRI_COLETA_Q', 'DIST_PRI_PCR_Q', 'CLASSI_FIN'],
             'cols_set_3': ['SYMP_GROUP1', 'SYMP_GROUP2', 'SYMP_GROUP3', 'SYMP_GROUP4',
                            'AGE_GROUP', 'CS_SEXO', 'CS_RACA', 'EVOLUCAO', 'CS_ESCOL_N',
                            'UTI', 'SUPORT_VEN', 'DIST_PRI_ENTUTI_Q', 'DIST_PRI_INTERNA_Q',
                            'CLASSI_FIN']}

<hr />
<hr />
<hr />

In [8]:
# super_srag = spark.read.parquet('gs://ai-covid19-datalake/standard/super-srag/super_srag_v2.parquet').persist(StorageLevel.MEMORY_ONLY)
# super_srag.limit(2).toPandas()

In [9]:
# super_srag = super_srag.na.fill('9999')
# super_srag = super_srag.na.fill(9999)

<hr />
<hr />
<hr />

In [10]:
def run_gbt(exp_df, params_dict, cols, filename, exp_name, experiment_filter, 
            undersampling_method, undersampling_column, experiment_id):
    import time
    start_time = time.time()
    
    n_covid = exp_df.filter(F.col('CLASSI_FIN') == 1.0).count()
    n_not_covid = exp_df.filter(F.col('CLASSI_FIN') == 0.0).count()
    
    
    id_cols = ['NU_NOTIFIC', 'CLASSI_FIN']

    labelIndexer = StringIndexer(inputCol="CLASSI_FIN", outputCol="indexedLabel").fit(exp_df)    
    
    input_cols = [x for x in exp_df.columns if x not in id_cols]
    assembler = VectorAssembler(inputCols = input_cols, outputCol= 'features')
    exp_df = assembler.transform(exp_df)
    
    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=20).fit(exp_df)
    
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = exp_df.randomSplit([0.7, 0.3])
    trainingData = trainingData.persist(StorageLevel.MEMORY_ONLY)
    testData = testData.persist(StorageLevel.MEMORY_ONLY)
    
    # Train a RandomForest model.
    gbt = GBTClassifier(labelCol = "indexedLabel", featuresCol = "indexedFeatures",
                       maxIter = params_dict['maxIter'], maxDepth = params_dict['maxDepth'], maxBins = params_dict['maxBins'])
    
    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)
    
    # Make predictions.
    predictions = model.transform(testData)    
    
    
    pred = predictions.select(['CLASSI_FIN', 'predictedLabel'])\
                  .withColumn('predictedLabel', F.col('predictedLabel').cast('double'))\
                  .withColumn('predictedLabel', F.when(F.col('predictedLabel') == 1.0, 'covid').otherwise('n-covid'))\
                  .withColumn('CLASSI_FIN', F.when(F.col('CLASSI_FIN') == 1.0, 'covid').otherwise('n-covid'))\
                  .toPandas()

    y_true = pred['CLASSI_FIN'].tolist()
    y_pred = pred['predictedLabel'].tolist()
    
    report = classification_report(y_true, y_pred, output_dict=True)
    
    
    evaluator_ROC = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="prediction", metricName="areaUnderROC")
    accuracy_ROC = evaluator_ROC.evaluate(predictions)


    
    evaluator_PR = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="prediction", metricName="areaUnderPR")
    accuracy_PR = evaluator_PR.evaluate(predictions)
    
    conf_matrix = confusion_matrix(y_true, y_pred)

    result_dict = {}
    
    result_dict['experiment_filter'] = experiment_filter
    result_dict['undersampling_method'] = undersampling_method
    result_dict['undersampling_column'] = undersampling_column
    result_dict['filename'] = filename
    result_dict['experiment_id'] = experiment_id
    result_dict['n_covid'] = n_covid
    result_dict['n_not_covid'] = n_not_covid
    result_dict['model_name'] = 'GBT'
    result_dict['model_maxIter'] = params_dict['maxIter']
    result_dict['model_maxDepth'] = params_dict['maxDepth']
    result_dict['model_maxBins'] = params_dict['maxBins']
    result_dict['model_AUC_ROC'] = accuracy_ROC
    result_dict['model_AUC_PR'] = accuracy_PR
    result_dict['model_covid_precision'] = report['covid']['precision']
    result_dict['model_covid_recall'] = report['covid']['recall']
    result_dict['model_covid_f1'] = report['covid']['f1-score']
    result_dict['model_not_covid_precision'] = report['n-covid']['precision']
    result_dict['model_not_covid_recall'] = report['n-covid']['recall']
    result_dict['model_not_covid_f1'] = report['n-covid']['f1-score']
    result_dict['model_avg_precision'] = report['macro avg']['precision']
    result_dict['model_avg_recall'] = report['macro avg']['recall']
    result_dict['model_avg_f1'] = report['macro avg']['f1-score']
    result_dict['model_avg_acc'] = report['accuracy']
    result_dict['model_TP'] = conf_matrix[0][0]
    result_dict['model_TN'] = conf_matrix[1][1]
    result_dict['model_FN'] = conf_matrix[0][1]
    result_dict['model_FP'] = conf_matrix[1][0]
    result_dict['model_time_exec'] = time.time() - start_time
    result_dict['model_col_set'] = cols
    
    return result_dict

<hr />
<hr />
<hr />

# Running GBT on 10 samples for each experiment
### 3x col sets -> ['cols_set_1', 'cols_set_2', 'cols_set_3']
### 3x model_maxIter -> [100, 200, 300]
### 3x model_maxDepth -> [5, 10, 15]
### 3x model_maxBins -> [16, 32, 64]
Total: 10 * 3 * 3 * 3 * 3 = 810

In [11]:
# lists of params
model_maxIter = [100] # [100, 200, 300]
model_maxDepth = [7] # [5, 10, 15]
model_maxBins = [32] # [16, 32, 64]

In [12]:
experiments = []

### Datasets: strat_samp_lab_agegrp

In [13]:
for id_exp in range(3):
    for col_set in list(cols_sets.keys()):
        str_file = 'gs://ai-covid19-datalake/trusted/experiment_map/datasets/kmodes_nofilter_agegrp_' + col_set + '_' + str(id_exp)
        exp_dataframe = spark.read.parquet(str_file)
        print('read it')
        filename = str_file.split('/')[-1].split('.')[0] # strat_samp_lab_agegrp_0
        experiment_name = filename
        undersampling_method = "KMODES"
        experiment_filter = ""
        undersampling_column = "AGEGRP"
        
        for maxIter in model_maxIter:
            for maxDepth in model_maxDepth:
                for maxBins in model_maxBins:
                    params_dict = {'maxIter': maxIter, 'maxDepth': maxDepth, 'maxBins': maxBins}
                    try:                     
                        model = run_gbt(exp_dataframe, params_dict, col_set, filename, experiment_name, experiment_filter, undersampling_method, undersampling_column, id_exp)
                        experiments.append(model)

                        print(experiment_name, str(params_dict), str(model['model_AUC_PR']), str(model['model_time_exec']))
                    except:
                        print('Something wrong with the exp: {}, {}, {}'.format(filename, params_dict, col_set))

read it
kmodes_nofilter_agegrp_cols_set_1_0 {'maxIter': 100, 'maxDepth': 7, 'maxBins': 32} 0.8920245001805328 143.51273012161255
read it
kmodes_nofilter_agegrp_cols_set_2_0 {'maxIter': 100, 'maxDepth': 7, 'maxBins': 32} 0.8673900172277244 130.17175960540771
read it
kmodes_nofilter_agegrp_cols_set_3_0 {'maxIter': 100, 'maxDepth': 7, 'maxBins': 32} 0.8301805112759606 129.11291027069092
read it
kmodes_nofilter_agegrp_cols_set_1_1 {'maxIter': 100, 'maxDepth': 7, 'maxBins': 32} 0.8918187416016999 128.96318674087524
read it
kmodes_nofilter_agegrp_cols_set_2_1 {'maxIter': 100, 'maxDepth': 7, 'maxBins': 32} 0.8727008001502349 131.09679698944092
read it
kmodes_nofilter_agegrp_cols_set_3_1 {'maxIter': 100, 'maxDepth': 7, 'maxBins': 32} 0.829679416286569 128.60937213897705
read it
kmodes_nofilter_agegrp_cols_set_1_2 {'maxIter': 100, 'maxDepth': 7, 'maxBins': 32} 0.8937205250913147 130.92003107070923
read it
kmodes_nofilter_agegrp_cols_set_2_2 {'maxIter': 100, 'maxDepth': 7, 'maxBins': 32} 0.86921

In [14]:
for i in range(len(experiments)):
    for d in list(experiments[i].keys()):
        experiments[i][d] = str(experiments[i][d])

In [15]:
experiments

[{'experiment_filter': '',
  'undersampling_method': 'KMODES',
  'undersampling_column': 'AGEGRP',
  'filename': 'kmodes_nofilter_agegrp_cols_set_1_0',
  'experiment_id': '0',
  'n_covid': '89020',
  'n_not_covid': '44537',
  'model_name': 'GBT',
  'model_maxIter': '100',
  'model_maxDepth': '7',
  'model_maxBins': '32',
  'model_AUC_ROC': '0.922881591969176',
  'model_AUC_PR': '0.8920245001805328',
  'model_covid_precision': '0.9415572541418279',
  'model_covid_recall': '0.9657151431036423',
  'model_covid_f1': '0.9534832043257657',
  'model_not_covid_precision': '0.9276784301313499',
  'model_not_covid_recall': '0.8800480408347096',
  'model_not_covid_f1': '0.903235747303544',
  'model_avg_precision': '0.9346178421365889',
  'model_avg_recall': '0.922881591969176',
  'model_avg_f1': '0.9283594758146548',
  'model_avg_acc': '0.9371701558240164',
  'model_TP': '25745',
  'model_TN': '11724',
  'model_FN': '914',
  'model_FP': '1598',
  'model_time_exec': '143.51273012161255',
  'model_

In [16]:
cols = ['experiment_filter', 'undersampling_method', 'undersampling_column', 'filename', 'experiment_id', 'n_covid', 'n_not_covid', 'model_name', 'model_maxIter', 'model_maxDepth', 'model_maxBins', 'model_AUC_ROC', 'model_AUC_PR', 'model_covid_precision', 'model_covid_recall', 'model_covid_f1', 'model_not_covid_precision', 'model_not_covid_recall', 'model_not_covid_f1', 'model_avg_precision', 'model_avg_recall', 'model_avg_f1', 'model_avg_acc', 'model_TP', 'model_TN', 'model_FN', 'model_FP', 'model_time_exec', 'model_col_set']

In [17]:
intermed_results = spark.createDataFrame(data=experiments).select(cols)
intermed_results.toPandas()



Unnamed: 0,experiment_filter,undersampling_method,undersampling_column,filename,experiment_id,n_covid,n_not_covid,model_name,model_maxIter,model_maxDepth,model_maxBins,model_AUC_ROC,model_AUC_PR,model_covid_precision,model_covid_recall,model_covid_f1,model_not_covid_precision,model_not_covid_recall,model_not_covid_f1,model_avg_precision,model_avg_recall,model_avg_f1,model_avg_acc,model_TP,model_TN,model_FN,model_FP,model_time_exec,model_col_set
0,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_1_0,0,89020,44537,GBT,100,7,32,0.922881591969176,0.8920245001805328,0.941557254141828,0.9657151431036424,0.9534832043257656,0.92767843013135,0.8800480408347096,0.903235747303544,0.9346178421365888,0.922881591969176,0.9283594758146548,0.9371701558240164,25745,11724,914,1598,143.51273012161255,cols_set_1
1,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_2_0,0,89020,44537,GBT,100,7,32,0.9069237317259432,0.8673900172277244,0.9306693741388063,0.9571865443425076,0.9437417267245184,0.9087077534791252,0.8566609191093785,0.881917110442232,0.9196885638089658,0.9069237317259432,0.9128294185833752,0.9237914975219784,25666,11427,1148,1912,130.17175960540771,cols_set_2
2,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_3_0,0,89020,44537,GBT,100,7,32,0.8831486919515089,0.8301805112759606,0.9146892043197168,0.9436247112303452,0.9289316827143512,0.8786785341993425,0.8226726726726726,0.8497537900818115,0.8966838692595296,0.8831486919515088,0.8893427363980814,0.9035061507047164,25325,10958,1513,2362,129.11291027069092,cols_set_3
3,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_1_1,1,89020,44537,GBT,100,7,32,0.923032100897186,0.8918187416016999,0.9417148714944718,0.965395586248311,0.9534082063827422,0.9272354194617632,0.880668615546061,0.9033522989389512,0.9344751454781176,0.923032100897186,0.9283802526608468,0.9371264224084032,25722,11749,922,1592,128.96318674087524,cols_set_1
4,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_2_1,1,89020,44537,GBT,100,7,32,0.9080901137412782,0.8727008001502349,0.9299177643548504,0.9593453207702992,0.9444023576800132,0.9141157811260904,0.8568349067122575,0.8845489774776503,0.9220167727404704,0.9080901137412783,0.9144756675788316,0.9249476204729123,25556,11527,1083,1926,131.09679698944092,cols_set_2
5,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_3_1,1,89020,44537,GBT,100,7,32,0.8847630194191053,0.829679416286569,0.9163471971066908,0.9417908783407056,0.9288948362143236,0.8764984227129338,0.8277351604975051,0.8514191596123644,0.8964228099098123,0.8847630194191053,0.890156997913344,0.903818497396479,25337,11114,1566,2313,128.60937213897705,cols_set_3
6,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_1_2,2,89020,44537,GBT,100,7,32,0.92117974892682,0.8937205250913147,0.9385511707156206,0.9668214245272336,0.9524765729585007,0.9306563584727044,0.8755380733264064,0.9022562141491396,0.9346037645941624,0.92117974892682,0.92736639355382,0.936047238972152,25614,11797,879,1677,130.92003107070923,cols_set_1
7,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_2_2,2,89020,44537,GBT,100,7,32,0.9070126667496018,0.8692115904284886,0.9298258424915112,0.9576940433212996,0.9435542135195716,0.9105794451951356,0.856331290177904,0.8826225971724643,0.9202026438433234,0.9070126667496018,0.913088405346018,0.923767825869402,25467,11456,1125,1922,132.80407404899597,cols_set_2
8,,KMODES,AGEGRP,kmodes_nofilter_agegrp_cols_set_3_2,2,89020,44537,GBT,100,7,32,0.8852518044408432,0.8316993406401229,0.9161944202266784,0.942770633971292,0.929292557111275,0.8786461636017755,0.8277329749103942,0.8524300215318363,0.8974202919142269,0.8852518044408431,0.8908612893215557,0.9043941809485851,25221,11085,1531,2307,128.2457070350647,cols_set_3


In [18]:
# intermed_results.write.parquet('gs://ai-covid19-datalake/trusted/int_exp_results_gbt_strat_samp_lab_agegrp-0-9.parquet', mode='overwrite')