In [1]:
import pandas as pd
import pyspark.sql.functions as F
from datetime import datetime
from pyspark.sql.types import *
from pyspark import StorageLevel

import numpy as np
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("mode.chained_assignment", None)

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [3]:
# !pip install scikit-plot

In [4]:
import sklearn
import scikitplot as skplt
from sklearn.metrics import classification_report, confusion_matrix, precision_score

<hr />
<hr />
<hr />

In [5]:
result_schema = StructType([
                    StructField('experiment_filter', StringType(), True),
                    StructField('undersampling_method', StringType(), True),
                    StructField('undersampling_column', StringType(), True),
                    StructField('filename', StringType(), True),
                    StructField('experiment_id', StringType(), True),
                    StructField('n_covid', IntegerType(), True),
                    StructField('n_not_covid', IntegerType(), True),
                    StructField('model_name', StringType(), True),
                    StructField('model_seed', StringType(), True),
                    StructField('model_maxIter', IntegerType(), True),
                    StructField('model_maxDepth', IntegerType(), True),
                    StructField('model_maxBins', IntegerType(), True),
                    StructField('model_minInstancesPerNode', IntegerType(), True),
                    StructField('model_minInfoGain', FloatType(), True),
                    StructField('model_featureSubsetStrategy', StringType(), True),
                    StructField('model_n_estimators', IntegerType(), True),
                    StructField('model_learning_rate', FloatType(), True),
                    StructField('model_impurity', StringType(), True),
                    StructField('model_AUC_ROC', StringType(), True),
                    StructField('model_AUC_PR', StringType(), True),
                    StructField('model_covid_precision', StringType(), True),
                    StructField('model_covid_recall', StringType(), True),
                    StructField('model_covid_f1', StringType(), True),
                    StructField('model_not_covid_precision', StringType(), True),
                    StructField('model_not_covid_recall', StringType(), True),
                    StructField('model_not_covid_f1', StringType(), True),
                    StructField('model_avg_precision', StringType(), True),
                    StructField('model_avg_recall', StringType(), True),
                    StructField('model_avg_f1', StringType(), True),
                    StructField('model_avg_acc', StringType(), True),
                    StructField('model_TP', StringType(), True),
                    StructField('model_TN', StringType(), True),
                    StructField('model_FN', StringType(), True),
                    StructField('model_FP', StringType(), True),
                    StructField('model_time_exec', StringType(), True),
                    StructField('model_col_set', StringType(), True)
                          ])

<hr />
<hr />
<hr />

In [6]:
cols_sets = ['cols_set_1', 'cols_set_2', 'cols_set_3']
undersamp_col = ['03-STRSAMP-AG', '04-STRSAMP-EW']
dfs = ['ds-3'] #, 'ds-2', 'ds-3']

In [7]:
# lists of params
model_numTrees = [20, 50] 
model_maxDepth = [3, 5, 7] 
model_maxBins = [32, 64]

list_of_param_dicts = []

for numTrees in model_numTrees:
    for maxDepth in model_maxDepth:
        for maxBins in model_maxBins: 
            params_dict = {}
            params_dict['numTrees'] = numTrees
            params_dict['maxDepth'] = maxDepth
            params_dict['maxBins'] = maxBins
            list_of_param_dicts.append(params_dict)

print("There is {} set of params.".format(len(list_of_param_dicts)))
# list_of_param_dicts

There is 12 set of params.


In [8]:
prefix = 'gs://ai-covid19-datalake/trusted/experiment_map/'

<hr />
<hr />
<hr />

In [9]:
# filename = 'gs://ai-covid19-datalake/trusted/experiment_map/03-STRSAMP-AG/ds-1/cols_set_1/experiment0.parquet'
# df = spark.read.parquet(filename)
# df.limit(2).toPandas()

In [10]:
# params_dict = {'numTrees': 50,
#                'maxDepth': 3,
#                'maxBins': 32}
# cols = 'cols_set_1'
# experiment_filter = 'ds-1'
# undersampling_method = '03-STRSAMP-AG', 
# experiment_id = 0

In [11]:
# run_rf(df, params_dict, cols, filename, experiment_filter, undersampling_method, experiment_id)

<hr />
<hr />
<hr />

In [12]:
def run_rf(exp_df, params_dict, cols, filename, experiment_filter, 
            undersampling_method, experiment_id):
    import time
    start_time = time.time()
    
    n_covid = exp_df.filter(F.col('CLASSI_FIN') == 1.0).count()
    n_not_covid = exp_df.filter(F.col('CLASSI_FIN') == 0.0).count()
    
    
    id_cols = ['NU_NOTIFIC', 'CLASSI_FIN']

    labelIndexer = StringIndexer(inputCol="CLASSI_FIN", outputCol="indexedLabel").fit(exp_df)    
    
    input_cols = [x for x in exp_df.columns if x not in id_cols]
    assembler = VectorAssembler(inputCols = input_cols, outputCol= 'features')
    exp_df = assembler.transform(exp_df)
    
    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=30).fit(exp_df)
    
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = exp_df.randomSplit([0.7, 0.3])
    trainingData = trainingData.persist(StorageLevel.MEMORY_ONLY)
    testData = testData.persist(StorageLevel.MEMORY_ONLY)
    
    # Train a RandomForest model.
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                               numTrees = params_dict['numTrees'], 
                               maxDepth = params_dict['maxDepth'], 
                               maxBins = params_dict['maxBins'])
    
    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)
    
    # Make predictions.
    predictions = model.transform(testData)    
    
    
    pred = predictions.select(['CLASSI_FIN', 'predictedLabel'])\
                  .withColumn('predictedLabel', F.col('predictedLabel').cast('double'))\
                  .withColumn('predictedLabel', F.when(F.col('predictedLabel') == 1.0, 'covid').otherwise('n-covid'))\
                  .withColumn('CLASSI_FIN', F.when(F.col('CLASSI_FIN') == 1.0, 'covid').otherwise('n-covid'))\
                  .toPandas()

    y_true = pred['CLASSI_FIN'].tolist()
    y_pred = pred['predictedLabel'].tolist()
    
    report = classification_report(y_true, y_pred, output_dict=True)
    
    
    evaluator_ROC = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="prediction", metricName="areaUnderROC")
    accuracy_ROC = evaluator_ROC.evaluate(predictions)


    
    evaluator_PR = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="prediction", metricName="areaUnderPR")
    accuracy_PR = evaluator_PR.evaluate(predictions)
    
    conf_matrix = confusion_matrix(y_true, y_pred)

    result_dict = {}
    
    result_dict['experiment_filter'] = experiment_filter
    result_dict['undersampling_method'] = undersampling_method
    result_dict['filename'] = filename
    result_dict['experiment_id'] = experiment_id
    result_dict['n_covid'] = n_covid
    result_dict['n_not_covid'] = n_not_covid
    result_dict['model_name'] = 'RF'
    result_dict['params'] = params_dict
    result_dict['model_AUC_ROC'] = accuracy_ROC
    result_dict['model_AUC_PR'] = accuracy_PR
    result_dict['model_covid_precision'] = report['covid']['precision']
    result_dict['model_covid_recall'] = report['covid']['recall']
    result_dict['model_covid_f1'] = report['covid']['f1-score']
    result_dict['model_not_covid_precision'] = report['n-covid']['precision']
    result_dict['model_not_covid_recall'] = report['n-covid']['recall']
    result_dict['model_not_covid_f1'] = report['n-covid']['f1-score']
    result_dict['model_avg_precision'] = report['macro avg']['precision']
    result_dict['model_avg_recall'] = report['macro avg']['recall']
    result_dict['model_avg_f1'] = report['macro avg']['f1-score']
    result_dict['model_avg_acc'] = report['accuracy']
    result_dict['model_TP'] = conf_matrix[0][0]
    result_dict['model_TN'] = conf_matrix[1][1]
    result_dict['model_FN'] = conf_matrix[0][1]
    result_dict['model_FP'] = conf_matrix[1][0]
    result_dict['model_time_exec'] = time.time() - start_time
    result_dict['model_col_set'] = cols
    
    return result_dict

<hr />
<hr />
<hr />

# Running GBT on 10 samples for each experiment
### 3x col sets -> ['cols_set_1', 'cols_set_2', 'cols_set_3']
### 3x model_maxIter -> [100, 200, 300]
### 3x model_maxDepth -> [5, 10, 15]
### 3x model_maxBins -> [16, 32, 64]
Total: 10 * 3 * 3 * 3 * 3 = 810

In [13]:
experiments = []

### Datasets: strat_samp_lab_agegrp

In [None]:
for uc in undersamp_col: 
    for ds in dfs:
        for col_set in cols_sets:
            for params_dict in list_of_param_dicts: 
                for id_exp in range(50):
                    filename = prefix + uc + '/' + ds + '/' + col_set + '/' + 'experiment' + str(id_exp) + '.parquet'
                    exp_dataframe = spark.read.parquet(filename)
                    print('read {}'.format(filename))
                    
                    undersampling_method = uc
                    experiment_filter = ds
                    experiment_id = id_exp

                    try:                     
                        model = run_rf(exp_dataframe, params_dict, col_set, filename, experiment_filter, undersampling_method, experiment_id)
                        experiments.append(model)

                        print("Parameters ==> {}\n Results: \n AUC_PR: {} \n Precision: {} \n Time: {}".format(str(params_dict), str(model['model_AUC_PR']), str(model['model_avg_precision']), str(model['model_time_exec'])))
                        print('=========================== \n')
                    except:
                        print('=========== W A R N I N G =========== \n')
                        print('Something wrong with the exp: {}, {}, {}'.format(filename, params_dict, col_set))

read gs://ai-covid19-datalake/trusted/experiment_map/03-STRSAMP-AG/ds-3/cols_set_1/experiment0.parquet
Parameters ==> {'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}
 Results: 
 AUC_PR: 0.9052569182104682 
 Precision: 0.9276006934150963 
 Time: 20.42249846458435

read gs://ai-covid19-datalake/trusted/experiment_map/03-STRSAMP-AG/ds-3/cols_set_1/experiment1.parquet
Parameters ==> {'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}
 Results: 
 AUC_PR: 0.9005226833545339 
 Precision: 0.9265740760209726 
 Time: 10.234735012054443

read gs://ai-covid19-datalake/trusted/experiment_map/03-STRSAMP-AG/ds-3/cols_set_1/experiment2.parquet
Parameters ==> {'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}
 Results: 
 AUC_PR: 0.9034201062127387 
 Precision: 0.9282163052609189 
 Time: 13.86210298538208

read gs://ai-covid19-datalake/trusted/experiment_map/03-STRSAMP-AG/ds-3/cols_set_1/experiment3.parquet
Parameters ==> {'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}
 Results: 
 AUC_PR: 0.9066368937176082 
 Prec

<hr />
<hr />
<hr />

In [None]:
for i in range(len(experiments)):
    for d in list(experiments[i].keys()):
        experiments[i][d] = str(experiments[i][d])

In [None]:
# experiments

In [None]:
cols = ['experiment_filter', 'undersampling_method', 'filename', 'experiment_id', 'n_covid', 'n_not_covid', 'model_name', 'params', 'model_AUC_ROC', 'model_AUC_PR', 'model_covid_precision', 'model_covid_recall', 'model_covid_f1', 'model_not_covid_precision', 'model_not_covid_recall', 'model_not_covid_f1', 'model_avg_precision', 'model_avg_recall', 'model_avg_f1', 'model_avg_acc', 'model_TP', 'model_TN', 'model_FN', 'model_FP', 'model_time_exec', 'model_col_set']

In [None]:
intermed_results = spark.createDataFrame(data=experiments).select(cols)
intermed_results.toPandas()



Unnamed: 0,experiment_filter,undersampling_method,filename,experiment_id,n_covid,n_not_covid,model_name,params,model_AUC_ROC,model_AUC_PR,model_covid_precision,model_covid_recall,model_covid_f1,model_not_covid_precision,model_not_covid_recall,model_not_covid_f1,model_avg_precision,model_avg_recall,model_avg_f1,model_avg_acc,model_TP,model_TN,model_FN,model_FP,model_time_exec,model_col_set
0,ds-3,03-STRSAMP-AG,gs://ai-covid19-datalake/trusted/experiment_ma...,0,76008,44538,RF,"{'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}",0.8857310344873445,0.9052569182104682,0.8861614235780108,0.9851194418686802,0.9330238588127052,0.9690399632521819,0.7863426271060087,0.8681838758796658,0.9276006934150963,0.8857310344873444,0.9006038673461856,0.911178281245667,22310,10548,337,2866,20.42249846458435,cols_set_1
1,ds-3,03-STRSAMP-AG,gs://ai-covid19-datalake/trusted/experiment_ma...,1,76098,44538,RF,"{'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}",0.8876125854286899,0.9005226833545339,0.8911980925889131,0.9817887317777875,0.9343026162306282,0.9619500594530321,0.7934364390795926,0.8696047626922442,0.9265740760209726,0.88761258542869,0.9019536894614362,0.9126267383234529,22427,10517,416,2738,10.234735012054443,cols_set_1
2,ds-3,03-STRSAMP-AG,gs://ai-covid19-datalake/trusted/experiment_ma...,2,76542,44538,RF,"{'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}",0.8852348785414577,0.9034201062127387,0.888396501457726,0.9850437481143054,0.9342272002616195,0.968036109064112,0.7854260089686098,0.867222313913187,0.9282163052609189,0.8852348785414577,0.9007247570874033,0.9120308356797244,22854,10509,347,2871,13.86210298538208,cols_set_1
3,ds-3,03-STRSAMP-AG,gs://ai-covid19-datalake/trusted/experiment_ma...,3,75793,44538,RF,"{'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}",0.8917893099274375,0.9066368937176082,0.8935005792354093,0.9836837012929897,0.9364258650645789,0.9663583605368153,0.7998949185618854,0.8752823292677918,0.9299294698861122,0.8917893099274375,0.9058540971661854,0.9157815923019328,22367,10657,371,2666,9.374481201171875,cols_set_1
4,ds-3,03-STRSAMP-AG,gs://ai-covid19-datalake/trusted/experiment_ma...,4,74465,44538,RF,"{'numTrees': 20, 'maxDepth': 3, 'maxBins': 32}",0.8870486418983298,0.9067890791681575,0.886565603763722,0.9852974035840372,0.933327689116539,0.9697192820984813,0.7887998802126226,0.8699529353480308,0.9281424429311016,0.8870486418983299,0.9016403122322849,0.911848659539934,22048,10536,329,2821,9.18627142906189,cols_set_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,ds-3,04-STRSAMP-EW,gs://ai-covid19-datalake/trusted/experiment_ma...,45,74887,44538,RF,"{'numTrees': 50, 'maxDepth': 7, 'maxBins': 64}",0.9069852975142528,0.9029935152835163,0.9107481959741739,0.9724687964673546,0.9405970799738504,0.9483952702702703,0.841501798561151,0.8917566709021602,0.9295717331222221,0.9069852975142528,0.9161768754380053,0.9232912176041872,21582,11229,611,2115,14.487770318984985,cols_set_3
3596,ds-3,04-STRSAMP-EW,gs://ai-covid19-datalake/trusted/experiment_ma...,46,75443,44538,RF,"{'numTrees': 50, 'maxDepth': 7, 'maxBins': 64}",0.9105820020584812,0.9056322609251706,0.9149675528574419,0.9731053522130199,0.9431413589970438,0.9494137353433836,0.8480586519039426,0.8958786106610819,0.9321906441004127,0.9105820020584812,0.9195099848290629,0.9264480111653873,21854,11336,604,2031,14.88740849494934,cols_set_3
3597,ds-3,04-STRSAMP-EW,gs://ai-covid19-datalake/trusted/experiment_ma...,47,74333,44538,RF,"{'numTrees': 50, 'maxDepth': 7, 'maxBins': 64}",0.9074530019202707,0.9003885851023646,0.9132397191574724,0.971113678784108,0.94128796037045,0.9452124072825354,0.8437923250564334,0.8916275741432774,0.9292260632200039,0.9074530019202707,0.9164577672568637,0.9238377291014752,21852,11214,650,2076,15.61817193031311,cols_set_3
3598,ds-3,04-STRSAMP-EW,gs://ai-covid19-datalake/trusted/experiment_ma...,48,75545,44538,RF,"{'numTrees': 50, 'maxDepth': 7, 'maxBins': 64}",0.912008463071735,0.9049108778966718,0.9185219283562103,0.9731323431611616,0.9450388581516006,0.9482758620689655,0.8508845829823083,0.8969442538247285,0.9333988952125879,0.9120084630717349,0.9209915559881645,0.9283106817926542,21949,11110,606,1947,14.372165441513062,cols_set_3


In [None]:
intermed_results.write.parquet('gs://ai-covid19-datalake/trusted/intermed_results/STRSAMP/RF_experiments-ds3.parquet', mode='overwrite')

In [None]:
print('finished')

finished


In [None]:
intermed_results.show()

+-----------------+--------------------+--------------------+-------------+-------+-----------+----------+--------------------+------------------+------------------+---------------------+------------------+------------------+-------------------------+----------------------+------------------+-------------------+------------------+------------------+------------------+--------+--------+--------+--------+------------------+-------------+
|experiment_filter|undersampling_method|            filename|experiment_id|n_covid|n_not_covid|model_name|              params|     model_AUC_ROC|      model_AUC_PR|model_covid_precision|model_covid_recall|    model_covid_f1|model_not_covid_precision|model_not_covid_recall|model_not_covid_f1|model_avg_precision|  model_avg_recall|      model_avg_f1|     model_avg_acc|model_TP|model_TN|model_FN|model_FP|   model_time_exec|model_col_set|
+-----------------+--------------------+--------------------+-------------+-------+-----------+----------+--------------