In [2]:
# !pip install kmodes

In [2]:
import pandas as pd
import pyspark.sql.functions as F
from datetime import datetime
from pyspark.sql.types import *
from pyspark import StorageLevel
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score

import time
import numpy as np
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("mode.chained_assignment", None)

<hr />
<hr />
<hr />

In [3]:
filters = ['nofilter', 'pcrpositive', 'labpositive']

cols_sets = {'cols_set_1': ['NU_NOTIFIC', 'CLASSI_FIN', 'CRITERIO', 'EVOLUCAO', 
                            'AGE_GROUP', 'DIST_PRI_NOTIFIC_Q',
                            'DIST_PRI_INTERNA_Q', 'DIST_PRI_ENTUTI_Q', 'DIST_PRI_SAIDUTI_Q', 'DIST_PRI_EVOLUCA_Q', 'DIST_PRI_ENCERRA_Q',
                            'SYMP_GROUP1', 'SYMP_GROUP2', 'SYMP_GROUP3', 'SYMP_GROUP4',
                            'RF_GROUP1', 'RF_GROUP2', 'RF_GROUP3', 'RF_GROUP4',
                            'SUPORT_VEN', 'UTI', 'HOSPITAL',
                            'DIST_PRI_RAIOX_Q', 'DIST_PRI_COLETA_Q', 'DIST_PRI_TOMO_Q', 'DIST_PRI_IF_Q', 'DIST_PRI_TRA_Q', 'DIST_PRI_PCR_Q', 'DIST_PRI_SOR_Q',
                            'GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q', 'GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q', 'GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q', 'GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q',
                            'GMR_TRANSIT_STATIONS_2WEEKS_Q', 'GMR_RETAIL_AND_RECREATION_2WEEKS_Q', 'GMR_RESIDENTIAL_PERCENT_2WEEKS_Q', 'GMR_WORKPLACES_PERCENT_2WEEKS_Q',
                            'INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q', 'INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q'],
             'cols_set_2': ['NU_NOTIFIC', 'SG_UF', 'CS_SEXO','CS_RACA', 'CRITERIO', 'SURTO_SG', 'SUPORT_VEN', 'EVOLUCAO',
                            'OUTRO_SIN', 'AGE_GROUP', 
                            'SYMP_GROUP1',  'SYMP_GROUP2', 'SYMP_GROUP3', 'SYMP_GROUP4',
                            'RF_GROUP1', 'RF_GROUP2', 'RF_GROUP3',  'RF_GROUP4',
                            'DIST_PRI_EVOLUCA_Q', 'DIST_PRI_ENCERRA_Q', 'DIST_PRI_INTERNA_Q',
                            'DIST_PRI_NOTIFIC_Q', 'DIST_PRI_COLETA_Q', 'DIST_PRI_PCR_Q', 'CLASSI_FIN'],
             'cols_set_3': ['NU_NOTIFIC', 'SG_UF', 'SG_UF_NOT', 'SYMP_GROUP1', 'SYMP_GROUP2', 'SYMP_GROUP3', 'SYMP_GROUP4',
                            'AGE_GROUP', 'CS_SEXO', 'CS_RACA', 'EVOLUCAO', 'CS_ESCOL_N',
                            'UTI', 'SUPORT_VEN', 'DIST_PRI_ENTUTI_Q', 'DIST_PRI_INTERNA_Q',
                            'CLASSI_FIN']}


undersamp_col = {'02-KMODES': 'KMODES_CLUSTER'}

# state_codes_dict = {
#                 'RO': 11, 'AC': 12, 'AM': 13, 'RR': 14, 'PA': 15, 'AP': 16, 'TO': 17, 
#                 'MA': 21, 'PI': 22, 'CE': 23, 'RN': 24, 'PB': 25, 'PE': 26, 'AL': 27, 'SE': 28, 'BA': 29,
#                 'MG': 31, 'ES': 32, 'RJ': 33, 'SP': 35, 
#                 'PR': 41, 'SC': 42, 'RS': 43,
#                 'MS': 50, 'MT': 51, 'GO': 52, 'DF': 53
#               }

In [4]:
def dropDupeDfCols(df):
    newcols = []
    dupcols = []

    for i in range(len(df.columns)):
        if df.columns[i] not in newcols:
            newcols.append(df.columns[i])
        else:
            dupcols.append(i)

    df = df.toDF(*[str(i) for i in range(len(df.columns))])
    for dupcol in dupcols:
        df = df.drop(str(dupcol))

    return df.toDF(*newcols)


def format_state(state_col):
    state_codes_dict = {
                'RO': 11, 'AC': 12, 'AM': 13, 'RR': 14, 'PA': 15, 'AP': 16, 'TO': 17, 
                'MA': 21, 'PI': 22, 'CE': 23, 'RN': 24, 'PB': 25, 'PE': 26, 'AL': 27, 'SE': 28, 'BA': 29,
                'MG': 31, 'ES': 32, 'RJ': 33, 'SP': 35, 
                'PR': 41, 'SC': 42, 'RS': 43,
                'MS': 50, 'MT': 51, 'GO': 52, 'DF': 53
              }
    if state_col != None:
        return str(state_codes_dict[state_col])
udf_format_state = F.udf(format_state, StringType())

def format_sex(df, sex_col_name):
    return df.withColumn(sex_col_name, F.when(F.col(sex_col_name) == 'F', '2').otherwise('1'))

def preprocessing(df):
    for col in df.columns:
        if col == 'CS_SEXO':
            df = format_sex(df, col)
            
        if col in ['SG_UF', 'SG_UF_NOT']:
            df = df.withColumn(col, udf_format_state(F.col(col)))
            
        if (col not in ['NU_NOTIFIC']) and ((col in cols_sets['cols_set_1']) or (col in cols_sets['cols_set_2']) or (col in cols_sets['cols_set_3'])):
            df = df.withColumn(col, F.col(col).cast('float'))
    
    df = df.na.fill('9999')
    df = df.na.fill(9999)
    df = df.withColumn('CLASSI_FIN', F.when(F.col('CLASSI_FIN') == 5, 1.0).otherwise(0.0))
    
    return dropDupeDfCols(df)

<hr />
<hr />
<hr />

# Reading data

In [5]:
not_covid = spark.read.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_notcovid_nofilter.parquet')
not_covid.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309712285,M,2020-09-18,0,1,6,4,,0.0,SP,352620,4,4,2,2021-01-26,SP,352220,9,2,2,,1,1.0,2,2.0,,,1,1,2,3,8,1,1,2.0,,1,1,1,1,,,,,,,,1,1,2021-01-27,1,6,3,2.0,4,6,4,4,4,,,1,1,6,6,4,4,6,6,1,6,2,1,6,04-2021,-25.895942,6.296242,-28.311178,-6.281181,6.003215,-23.171171,-26.123958,4.771699,-24.164021,-7.336648,5.483269,-25.172172,-23.547269,8.872993,-20.801375,-4.632532,5.748876,-25.86341,24.476172,72.664078,4.339216,24.36574,75.171226,6.304,24.070494,79.940901,6.042424,3,2,3,3,2,4,3,2,3,3,1,4,4,3,4,3,2,4,4,3,4,4,4,5,4,5,4,2021
1,85899613066,M,2010-03-11,10,3,6,9,,,SP,355030,4,5,5,2021-01-28,SP,355030,2,2,9,9.0,2,,3,,,,1,1,2,8,1,1,1,,,1,1,1,1,,,,6.0,,6.0,,1,1,2021-01-31,3,9,9,,3,3,4,4,4,,,2,6,6,6,1,2,6,6,2,6,1,6,6,04-2021,-25.895942,6.296242,-28.311178,-6.281181,6.003215,-23.171171,-26.123958,4.771699,-24.164021,-7.336648,5.483269,-25.172172,-23.547269,8.872993,-20.801375,-4.632532,5.748876,-25.86341,24.476172,72.664078,4.339216,24.36574,75.171226,6.304,24.070494,79.940901,6.042424,3,2,3,3,2,4,3,2,3,3,1,4,4,3,4,3,2,4,4,3,4,4,4,5,4,5,4,2021


In [6]:
df1 = spark.read.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_nofilter.parquet')
df1.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309859740,M,1968-12-02,52,7,6,4,,4.0,BA,290730,19,20,7,2021-05-13,BA,291920,,,,9.0,1,6,2,2,,,1,,5,4,6,1,1,2.0,,9,9,9,9,,2.0,,,,,,1,1,2021-05-14,1,2,9,,6,,4,4,4,,1.0,3,3,6,6,6,1,6,6,1,6,6,6,1,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4,5,4,5,2,2,3,4,3,3,3,1,2,3,2,3,4,3,4,4,2,2021
1,68719688318,F,1968-12-02,52,7,5,4,,,BA,292370,19,19,5,2021-05-10,BA,290520,,2.0,9.0,,1,6,2,1,6.0,,1,2.0,5,8,6,2,5,,,1,1,5,1,,,,,,1.0,5.0,1,1,2021-05-14,4,6,2,4.0,6,,4,4,4,,,2,3,3,6,2,1,6,2,3,6,6,3,6,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4,5,4,5,2,2,3,4,3,3,3,1,2,3,2,3,4,3,4,4,2,2021


In [7]:
df2 = spark.read.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_pcrpositive.parquet')
df2.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,68719818720,F,1988-06-04,32,5,5,4,,3.0,BA,292740,19,20,7,2021-05-12,BA,292740,,2,9,9,1,6,9,2,,,1,,5,3,1,1,4,1.0,PROSTACAO INTENSA,9,9,9,9,34.0,2.0,,6.0,,1,6.0,1,1,2021-05-15,3,9,9,,2,7,4,4,4,,,3,3,6,6,6,1,6,2,2,6,3,6,6,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4,5,4,5,2,2,3,4,3,3,3,1,2,3,2,3,4,3,4,4,2,2021
1,77309826880,M,1964-05-31,56,7,6,9,,,BA,292740,19,19,2,2021-05-10,BA,292740,,2,9,9,1,1,3,1,1.0,,1,,5,3,2,1,1,,,1,1,1,1,,,,,,6,,1,1,2021-05-10,0,9,9,,2,2,4,4,4,,,1,1,1,6,6,6,6,6,1,6,1,6,6,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4,5,4,5,2,2,3,4,3,3,3,1,2,3,2,3,4,3,4,4,2,2021


In [8]:
df3 = spark.read.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_labpositive.parquet')
df3.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309859740,M,1968-12-02,52,7,6,4,,4.0,BA,290730,19,20,7,2021-05-13,BA,291920,,,,9.0,1,6,2,2,,,1,,5,4,6,1,1,2.0,,9,9,9,9,,2.0,,,,,,1,1,2021-05-14,1,2,9,,6,,4,4,4,,1.0,3,3,6,6,6,1,6,6,1,6,6,6,1,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4,5,4,5,2,2,3,4,3,3,3,1,2,3,2,3,4,3,4,4,2,2021
1,68719688318,F,1968-12-02,52,7,5,4,,,BA,292370,19,19,5,2021-05-10,BA,290520,,2.0,9.0,,1,6,2,1,6.0,,1,2.0,5,8,6,2,5,,,1,1,5,1,,,,,,1.0,5.0,1,1,2021-05-14,4,6,2,4.0,6,,4,4,4,,,2,3,3,6,2,1,6,2,3,6,6,3,6,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4,5,4,5,2,2,3,4,3,3,3,1,2,3,2,3,4,3,4,4,2,2021


<hr />
<hr />
<hr />

# Preprocessing data

In [9]:
not_covid = preprocessing(not_covid)
not_covid.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309712285,1.0,2020-09-18,0,1.0,6,4.0,9999,0.0,35.0,352620,4,4,2,2021-01-26,35.0,352220,9.0,2,2,9999,1.0,1,2.0,2.0,9999,9999,1.0,1.0,0.0,3.0,8.0,1.0,1.0,2.0,9999,1.0,1.0,1.0,1.0,9999,9999,9999,9999,9999,9999,9999,1,1,2021-01-27,1,6,3,2,4,6,4,4,4,9999,9999,1.0,1.0,6.0,6.0,4.0,4.0,6.0,6.0,1.0,6.0,2.0,1.0,6.0,04-2021,-25.895942,6.296242,-28.311178,-6.281181,6.003215,-23.171171,-26.123958,4.771699,-24.164021,-7.336648,5.483269,-25.172172,-23.547269,8.872993,-20.801375,-4.632532,5.748876,-25.86341,24.476172,72.664078,4.339216,24.36574,75.171226,6.304,24.070494,79.940901,6.042424,3,2,3,3,2,4,3.0,2,3.0,3.0,1.0,4,4.0,3,4.0,3.0,2.0,4,4,3,4,4,4.0,5,4,5.0,4,2021
1,85899613066,1.0,2010-03-11,10,3.0,6,9.0,9999,9999.0,35.0,355030,4,5,5,2021-01-28,35.0,355030,2.0,2,9,9,2.0,9999,3.0,9999.0,9999,9999,1.0,1.0,0.0,8.0,1.0,1.0,1.0,9999.0,9999,1.0,1.0,1.0,1.0,9999,9999,9999,6,9999,6,9999,1,1,2021-01-31,3,9,9,9999,3,3,4,4,4,9999,9999,2.0,6.0,6.0,6.0,1.0,2.0,6.0,6.0,2.0,6.0,1.0,6.0,6.0,04-2021,-25.895942,6.296242,-28.311178,-6.281181,6.003215,-23.171171,-26.123958,4.771699,-24.164021,-7.336648,5.483269,-25.172172,-23.547269,8.872993,-20.801375,-4.632532,5.748876,-25.86341,24.476172,72.664078,4.339216,24.36574,75.171226,6.304,24.070494,79.940901,6.042424,3,2,3,3,2,4,3.0,2,3.0,3.0,1.0,4,4.0,3,4.0,3.0,2.0,4,4,3,4,4,4.0,5,4,5.0,4,2021


In [10]:
df1 = preprocessing(df1)
df1.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309859740,1.0,1968-12-02,52,7.0,6,4.0,9999,4.0,29.0,290730,19,20,7,2021-05-13,29.0,291920,9999.0,9999,9999,9,1.0,6,2.0,2.0,9999,9999,1.0,9999.0,1.0,4.0,6.0,1.0,1.0,2.0,9999,9.0,9.0,9.0,9.0,9999,2,9999,9999,9999,9999,9999,1,1,2021-05-14,1,2,9,9999,6,9999,4,4,4,9999,1,3.0,3.0,6.0,6.0,6.0,1.0,6.0,6.0,1.0,6.0,6.0,6.0,1.0,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4.0,5,4.0,5.0,2.0,2,3.0,4,3.0,3.0,3.0,1,2,3,2,3,4.0,3,4,4.0,2,2021
1,68719688318,2.0,1968-12-02,52,7.0,5,4.0,9999,9999.0,29.0,292370,19,19,5,2021-05-10,29.0,290520,9999.0,2,9,9999,1.0,6,2.0,1.0,6,9999,1.0,2.0,1.0,8.0,6.0,2.0,5.0,9999.0,9999,1.0,1.0,5.0,1.0,9999,9999,9999,9999,9999,1,5,1,1,2021-05-14,4,6,2,4,6,9999,4,4,4,9999,9999,2.0,3.0,3.0,6.0,2.0,1.0,6.0,2.0,3.0,6.0,6.0,3.0,6.0,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4.0,5,4.0,5.0,2.0,2,3.0,4,3.0,3.0,3.0,1,2,3,2,3,4.0,3,4,4.0,2,2021


In [11]:
df2 = preprocessing(df2)
df2.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,68719818720,2.0,1988-06-04,32,5.0,5,4.0,9999,3.0,29.0,292740,19,20,7,2021-05-12,29.0,292740,9999.0,2,9,9,1.0,6,9.0,2.0,9999,9999,1.0,9999.0,1.0,3.0,1.0,1.0,4.0,1.0,PROSTACAO INTENSA,9.0,9.0,9.0,9.0,34,2,9999,6,9999,1,6,1,1,2021-05-15,3,9,9,9999,2,7,4,4,4,9999,9999,3.0,3.0,6.0,6.0,6.0,1.0,6.0,2.0,2.0,6.0,3.0,6.0,6.0,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4.0,5,4.0,5.0,2.0,2,3.0,4,3.0,3.0,3.0,1,2,3,2,3,4.0,3,4,4.0,2,2021
1,77309826880,1.0,1964-05-31,56,7.0,6,9.0,9999,9999.0,29.0,292740,19,19,2,2021-05-10,29.0,292740,9999.0,2,9,9,1.0,1,3.0,1.0,1,9999,1.0,9999.0,1.0,3.0,2.0,1.0,1.0,9999.0,9999,1.0,1.0,1.0,1.0,9999,9999,9999,9999,9999,6,9999,1,1,2021-05-10,0,9,9,9999,2,2,4,4,4,9999,9999,1.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,1.0,6.0,1.0,6.0,6.0,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4.0,5,4.0,5.0,2.0,2,3.0,4,3.0,3.0,3.0,1,2,3,2,3,4.0,3,4,4.0,2,2021


In [12]:
df3 = preprocessing(df3)
df3.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309859740,1.0,1968-12-02,52,7.0,6,4.0,9999,4.0,29.0,290730,19,20,7,2021-05-13,29.0,291920,9999.0,9999,9999,9,1.0,6,2.0,2.0,9999,9999,1.0,9999.0,1.0,4.0,6.0,1.0,1.0,2.0,9999,9.0,9.0,9.0,9.0,9999,2,9999,9999,9999,9999,9999,1,1,2021-05-14,1,2,9,9999,6,9999,4,4,4,9999,1,3.0,3.0,6.0,6.0,6.0,1.0,6.0,6.0,1.0,6.0,6.0,6.0,1.0,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4.0,5,4.0,5.0,2.0,2,3.0,4,3.0,3.0,3.0,1,2,3,2,3,4.0,3,4,4.0,2,2021
1,68719688318,2.0,1968-12-02,52,7.0,5,4.0,9999,9999.0,29.0,292370,19,19,5,2021-05-10,29.0,290520,9999.0,2,9,9999,1.0,6,2.0,1.0,6,9999,1.0,2.0,1.0,8.0,6.0,2.0,5.0,9999.0,9999,1.0,1.0,5.0,1.0,9999,9999,9999,9999,9999,1,5,1,1,2021-05-14,4,6,2,4,6,9999,4,4,4,9999,9999,2.0,3.0,3.0,6.0,2.0,1.0,6.0,2.0,3.0,6.0,6.0,3.0,6.0,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4.0,5,4.0,5.0,2.0,2,3.0,4,3.0,3.0,3.0,1,2,3,2,3,4.0,3,4,4.0,2,2021


<hr />
<hr />
<hr />

In [19]:
kmodes_vars = ['NU_NOTIFIC', 'CLASSI_FIN', 
              'CS_SEXO', 'CS_RACA', 'CS_ESCOL_N', 'SG_UF', 'SG_UF_NOT',
              'CRITERIO', 'SUPORT_VEN', 'EVOLUCAO','AGE_GROUP', 'UTI', 
              'SYMP_GROUP1', 'SYMP_GROUP2','SYMP_GROUP3', 'SYMP_GROUP4',
              'RF_GROUP1','RF_GROUP2', 'RF_GROUP3', 'RF_GROUP4',
              'DIST_PRI_EVOLUCA_Q','DIST_PRI_ENCERRA_Q','DIST_PRI_INTERNA_Q','DIST_PRI_NOTIFIC_Q',
              'DIST_PRI_COLETA_Q','DIST_PRI_PCR_Q', 'DIST_PRI_ENTUTI_Q']

# took of OUTRO_SIN because it is not categorical

In [20]:
df1 = df1.persist(StorageLevel.MEMORY_ONLY)
df1.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309859740,1.0,1968-12-02,52,7.0,6,4.0,9999,4.0,29.0,290730,19,20,7,2021-05-13,29.0,291920,9999.0,9999,9999,9,1.0,6,2.0,2.0,9999,9999,1.0,9999.0,1.0,4.0,6.0,1.0,1.0,2.0,9999,9.0,9.0,9.0,9.0,9999,2,9999,9999,9999,9999,9999,1,1,2021-05-14,1,2,9,9999,6,9999,4,4,4,9999,1,3.0,3.0,6.0,6.0,6.0,1.0,6.0,6.0,1.0,6.0,6.0,6.0,1.0,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4.0,5,4.0,5.0,2.0,2,3.0,4,3.0,3.0,3.0,1,2,3,2,3,4.0,3,4,4.0,2,2021
1,68719688318,2.0,1968-12-02,52,7.0,5,4.0,9999,9999.0,29.0,292370,19,19,5,2021-05-10,29.0,290520,9999.0,2,9,9999,1.0,6,2.0,1.0,6,9999,1.0,2.0,1.0,8.0,6.0,2.0,5.0,9999.0,9999,1.0,1.0,5.0,1.0,9999,9999,9999,9999,9999,1,5,1,1,2021-05-14,4,6,2,4,6,9999,4,4,4,9999,9999,2.0,3.0,3.0,6.0,2.0,1.0,6.0,2.0,3.0,6.0,6.0,3.0,6.0,19-2021,-26.208333,31.788462,-24.079137,-0.099905,7.358333,-44.62533,-25.097436,34.628571,-20.930314,0.773764,6.81768,-44.472296,-31.233161,17.257009,-30.477193,-4.605239,8.130556,-46.526854,22.265185,71.299468,1.115789,23.258355,74.984254,2.16,24.092008,74.733094,1.653012,3,5,3,4,2,2,4.0,5,4.0,5.0,2.0,2,3.0,4,3.0,3.0,3.0,1,2,3,2,3,4.0,3,4,4.0,2,2021


In [21]:
kmodes = KModes(n_clusters=10, init = "Cao",n_jobs=-1,n_init = 1 , verbose=1)

## labeling ds-1

In [22]:
df1_ = df1.select(kmodes_vars).toPandas()

In [None]:
start = time.time()
labels = kmodes.fit_predict(df1_)
print(time.time() - start)
labels

8378.65731048584


array([7, 4, 1, ..., 7, 7, 0], dtype=uint16)

In [None]:
df1_.insert(0, "KMODES_CLUSTER", labels, True)

In [None]:
df1_

Unnamed: 0,KMODES_CLUSTER,NU_NOTIFIC,CLASSI_FIN,CS_SEXO,CS_RACA,CS_ESCOL_N,SG_UF,SG_UF_NOT,CRITERIO,SUPORT_VEN,EVOLUCAO,AGE_GROUP,UTI,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_INTERNA_Q,DIST_PRI_NOTIFIC_Q,DIST_PRI_COLETA_Q,DIST_PRI_PCR_Q,DIST_PRI_ENTUTI_Q
0,7,77309859740,1.0,1.0,4.0,4.0,29.0,29.0,1.0,2.0,9999.0,7.0,2.0,4.0,6.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,1.0,3.0,3.0,1.0,6.0,6.0
1,4,68719688318,1.0,2.0,4.0,9999.0,29.0,29.0,1.0,2.0,2.0,7.0,1.0,8.0,6.0,2.0,5.0,1.0,1.0,5.0,1.0,2.0,1.0,3.0,2.0,3.0,6.0,3.0
2,1,68719818720,1.0,2.0,4.0,3.0,29.0,29.0,1.0,9.0,9999.0,5.0,2.0,3.0,1.0,1.0,4.0,9.0,9.0,9.0,9.0,6.0,1.0,3.0,3.0,2.0,3.0,6.0
3,6,68719890494,1.0,1.0,9.0,9999.0,29.0,29.0,1.0,2.0,9999.0,7.0,1.0,3.0,6.0,2.0,1.0,1.0,1.0,1.0,1.0,6.0,6.0,3.0,1.0,3.0,6.0,3.0
4,2,68719918840,1.0,2.0,4.0,5.0,29.0,29.0,9999.0,2.0,9999.0,2.0,9999.0,1.0,9.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,6.0,6.0,1.0,1.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454137,1,214748978383,1.0,2.0,4.0,9999.0,28.0,28.0,1.0,1.0,1.0,7.0,1.0,1.0,3.0,1.0,1.0,9.0,9.0,9.0,9.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0
2454138,7,214749032736,1.0,1.0,9.0,9.0,28.0,28.0,1.0,2.0,9.0,10.0,2.0,4.0,3.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,1.0,4.0,3.0,4.0,3.0,6.0
2454139,7,231928850240,1.0,1.0,9.0,9.0,28.0,28.0,1.0,1.0,9.0,9.0,2.0,3.0,8.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,2.0,5.0,4.0,2.0,1.0,6.0
2454140,7,283468444634,1.0,1.0,4.0,9.0,28.0,28.0,1.0,3.0,2.0,9.0,2.0,4.0,5.0,1.0,1.0,9.0,9.0,9.0,9.0,5.0,4.0,1.0,2.0,1.0,1.0,6.0


In [None]:
# taking only 'KMODES_CLUSTER' and 'NU_NOTIFIC' to ensure join to original data
df1_new = spark.createDataFrame(df1_[['KMODES_CLUSTER', 'NU_NOTIFIC']])

# writing temp labeled data
df1_new.write.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_nofilter_kmodes.parquet', mode='overwrite')

In [None]:
df1_new.limit(3).toPandas()

Unnamed: 0,KMODES_CLUSTER,NU_NOTIFIC
0,7,77309859740
1,4,68719688318
2,1,68719818720


In [None]:
df1_new.select('KMODES_CLUSTER').groupBy('KMODES_CLUSTER').count().orderBy('KMODES_CLUSTER').show()

+--------------+------+
|KMODES_CLUSTER| count|
+--------------+------+
|             0|652057|
|             1|244593|
|             2|225371|
|             3|219584|
|             4|294645|
|             5|204332|
|             6|111561|
|             7|329451|
|             8| 64965|
|             9|107583|
+--------------+------+



In [None]:
# joining to original data
df1 = df1.join(df1_new, 'NU_NOTIFIC', 'left')

In [None]:
df1.limit(3).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO,KMODES_CLUSTER
0,25769809653,1.0,1955-02-10,65,8.0,6,1.0,9999,1.0,35.0,350760,12,13,10,2020-03-18,35.0,350760,1.0,2,2,9,1.0,10,3.0,2.0,9999,9999,1.0,1.0,1.0,4.0,3.0,1.0,1.0,2.0,9999,3.0,1.0,1.0,1.0,9999,2,9999,9999,9999,9999,9999,1,1,2020-03-28,10,9,9,9999,2,10,4,4,4,9999,9999,4.0,4.0,6.0,6.0,3.0,3.0,6.0,6.0,5.0,6.0,4.0,6.0,6.0,12-2020,-22.441517,2.559225,-26.850227,-1.403258,6.45553,-24.633056,3.674725,4.602888,-0.581694,16.448507,-1.695273,-0.370824,0.014146,4.24297,-3.003748,15.850823,-1.205669,-6.19869,24.274361,75.761783,5.256627,23.943891,66.591151,0.461628,21.773845,71.656916,2.557396,4,2,3,4,2,4,5.0,2,5.0,5.0,1.0,5,5.0,2,5.0,5.0,1.0,5,4,4,4,4,2.0,2,2,3.0,3,2020,0
1,25769809829,1.0,1926-03-16,94,11.0,6,3.0,9999,2.0,41.0,410690,12,14,14,2020-03-18,41.0,410690,2.0,2,2,9,1.0,14,2.0,1.0,14,9999,1.0,2.0,1.0,4.0,8.0,1.0,1.0,1.0,TAQUICARDIA,3.0,2.0,1.0,1.0,9999,1,EM TRATAMENTO DE MIELOMA MULTI,6,9999,9999,9999,1,1,2020-04-01,14,9,9,9999,2,16,4,4,4,9999,9999,5.0,5.0,5.0,4.0,4.0,3.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,12-2020,-23.794479,1.968,-27.203655,2.114202,5.37741,-30.843416,-0.522876,3.989333,1.758355,20.726154,-2.0475,-4.044715,-3.718354,3.696,-1.017949,20.655332,-1.640704,-12.542308,23.934508,70.172811,3.485714,24.470405,58.925729,0.401504,21.874305,66.741533,0.628148,4,2,3,5,1,3,5.0,2,5.0,5.0,1.0,5,5.0,2,5.0,5.0,1.0,5,4,3,4,4,1.0,2,2,2.0,2,2020,3
2,25769810062,1.0,1962-06-02,57,7.0,6,9999.0,9999,9999.0,35.0,355030,17,19,15,2020-04-20,35.0,355030,2.0,2,2,9999,1.0,10,2.0,9999.0,9999,9999,1.0,1.0,1.0,4.0,2.0,1.0,1.0,9999.0,9999,1.0,1.0,5.0,1.0,9999,9999,9999,9999,9999,9999,9999,1,1,2020-04-30,10,7,9,9999,2,17,4,4,4,9999,9999,5.0,4.0,6.0,6.0,5.0,4.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,17-2020,-50.2173,-13.987472,-57.145412,-32.3096,16.897321,-36.537678,-50.519372,-16.036903,-57.530315,-28.299213,16.796902,-40.855276,-51.254777,-10.139073,-58.047691,-37.531577,17.824561,-40.216769,20.903427,68.705156,0.214765,20.174017,71.776044,1.997403,21.555626,69.541469,1.42987,1,1,1,1,5,3,1.0,1,1.0,1.0,5.0,2,1.0,1,1.0,1.0,5.0,2,2,2,1,2,3.0,3,2,2.0,2,2020,0


In [None]:
count = df1.count()
df1.select('KMODES_CLUSTER').groupBy('KMODES_CLUSTER').count().orderBy('KMODES_CLUSTER')\
                            .withColumn('%', (F.col('count')/count)*100).show()
df1.select('KMODES_CLUSTER').printSchema()

+--------------+------+------------------+
|KMODES_CLUSTER| count|                 %|
+--------------+------+------------------+
|             0|652057| 26.56965244879881|
|             1|244593| 9.966538203575832|
|             2|225371| 9.183290942414905|
|             3|219584| 8.947485516323015|
|             4|294645|12.006028990987481|
|             5|204332| 8.326005585658859|
|             6|111561| 4.545824976712839|
|             7|329451|13.424284332365447|
|             8| 64965|2.6471573364540437|
|             9|107583| 4.383731666708773|
+--------------+------+------------------+

root
 |-- KMODES_CLUSTER: long (nullable = true)



In [None]:
df1.write.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_nofilter_kmodes.parquet', mode='overwrite')

<hr />
<hr />
<hr />

## labeling ds-2

In [None]:
df2_ = df2.select(kmodes_vars).toPandas()

In [None]:
start = time.time()
labels = kmodes.fit_predict(df2_)
print(time.time() - start)
labels

5029.200259923935


array([1, 5, 3, ..., 6, 1, 6], dtype=uint16)

In [None]:
df2_.insert(0, "KMODES_CLUSTER", labels, True)

In [None]:
df2_

Unnamed: 0,KMODES_CLUSTER,NU_NOTIFIC,CLASSI_FIN,CS_SEXO,CS_RACA,CS_ESCOL_N,SG_UF,SG_UF_NOT,CRITERIO,SUPORT_VEN,EVOLUCAO,AGE_GROUP,UTI,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_INTERNA_Q,DIST_PRI_NOTIFIC_Q,DIST_PRI_COLETA_Q,DIST_PRI_PCR_Q,DIST_PRI_ENTUTI_Q
0,1,68719818720,1.0,2.0,4.0,3.0,29.0,29.0,1.0,9.0,9999.0,5.0,2.0,3.0,1.0,1.0,4.0,9.0,9.0,9.0,9.0,6.0,1.0,3.0,3.0,2.0,3.0,6.0
1,5,77309826880,1.0,1.0,9.0,9999.0,29.0,29.0,1.0,3.0,9999.0,7.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,6.0,1.0,1.0,1.0,1.0,1.0
2,3,85899634771,1.0,2.0,9.0,9.0,29.0,29.0,1.0,3.0,9999.0,7.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,1.0,2.0,2.0,2.0,1.0,6.0
3,6,94489525905,1.0,2.0,1.0,4.0,28.0,29.0,1.0,9.0,9999.0,6.0,2.0,4.0,6.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,1.0,4.0,4.0,1.0,1.0,6.0
4,1,94489567748,1.0,2.0,4.0,2.0,29.0,29.0,1.0,2.0,9999.0,10.0,1.0,3.0,8.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,6.0,1.0,3.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1507876,1,214748978383,1.0,2.0,4.0,9999.0,28.0,28.0,1.0,1.0,1.0,7.0,1.0,1.0,3.0,1.0,1.0,9.0,9.0,9.0,9.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0
1507877,6,214749032736,1.0,1.0,9.0,9.0,28.0,28.0,1.0,2.0,9.0,10.0,2.0,4.0,3.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,1.0,4.0,3.0,4.0,3.0,6.0
1507878,6,231928850240,1.0,1.0,9.0,9.0,28.0,28.0,1.0,1.0,9.0,9.0,2.0,3.0,8.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,2.0,5.0,4.0,2.0,1.0,6.0
1507879,1,283468444634,1.0,1.0,4.0,9.0,28.0,28.0,1.0,3.0,2.0,9.0,2.0,4.0,5.0,1.0,1.0,9.0,9.0,9.0,9.0,5.0,4.0,1.0,2.0,1.0,1.0,6.0


In [None]:
# taking only 'KMODES_CLUSTER' and 'NU_NOTIFIC' to ensure join to original data
df2_new = spark.createDataFrame(df2_[['KMODES_CLUSTER', 'NU_NOTIFIC']])

# writing temp labeled data
df2_new.write.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_pcrpositive_kmodes.parquet', mode='overwrite')

In [None]:
df2_new.limit(3).toPandas()

Unnamed: 0,KMODES_CLUSTER,NU_NOTIFIC
0,1,68719818720
1,5,77309826880
2,3,85899634771


In [None]:
df2_new.select('KMODES_CLUSTER').groupBy('KMODES_CLUSTER').count().orderBy('KMODES_CLUSTER').show()

+--------------+------+
|KMODES_CLUSTER| count|
+--------------+------+
|             0|426614|
|             1|180434|
|             2| 79973|
|             3|137234|
|             4|162260|
|             5|123134|
|             6|180490|
|             7| 70570|
|             8| 85506|
|             9| 61666|
+--------------+------+



In [None]:
# joining to original data
df2 = df2.join(df2_new, 'NU_NOTIFIC', 'left')

In [None]:
df2.limit(3).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO,KMODES_CLUSTER
0,25769809653,1.0,1955-02-10,65,8.0,6,1.0,9999,1.0,35.0,350760,12,13,10,2020-03-18,35.0,350760,1.0,2,2,9,1.0,10,3.0,2.0,9999,9999,1.0,1.0,1.0,4.0,3.0,1.0,1.0,2.0,9999,3.0,1.0,1.0,1.0,9999,2,9999,9999,9999,9999,9999,1,1,2020-03-28,10,9,9,9999,2,10,4,4,4,9999,9999,4.0,4.0,6.0,6.0,3.0,3.0,6.0,6.0,5.0,6.0,4.0,6.0,6.0,12-2020,-22.441517,2.559225,-26.850227,-1.403258,6.45553,-24.633056,3.674725,4.602888,-0.581694,16.448507,-1.695273,-0.370824,0.014146,4.24297,-3.003748,15.850823,-1.205669,-6.19869,24.274361,75.761783,5.256627,23.943891,66.591151,0.461628,21.773845,71.656916,2.557396,4,2,3,4,2,4,5.0,2,5.0,5.0,1.0,5,5.0,2,5.0,5.0,1.0,5,4,4,4,4,2.0,2,2,3.0,3,2020,0
1,25769809829,1.0,1926-03-16,94,11.0,6,3.0,9999,2.0,41.0,410690,12,14,14,2020-03-18,41.0,410690,2.0,2,2,9,1.0,14,2.0,1.0,14,9999,1.0,2.0,1.0,4.0,8.0,1.0,1.0,1.0,TAQUICARDIA,3.0,2.0,1.0,1.0,9999,1,EM TRATAMENTO DE MIELOMA MULTI,6,9999,9999,9999,1,1,2020-04-01,14,9,9,9999,2,16,4,4,4,9999,9999,5.0,5.0,5.0,4.0,4.0,3.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,12-2020,-23.794479,1.968,-27.203655,2.114202,5.37741,-30.843416,-0.522876,3.989333,1.758355,20.726154,-2.0475,-4.044715,-3.718354,3.696,-1.017949,20.655332,-1.640704,-12.542308,23.934508,70.172811,3.485714,24.470405,58.925729,0.401504,21.874305,66.741533,0.628148,4,2,3,5,1,3,5.0,2,5.0,5.0,1.0,5,5.0,2,5.0,5.0,1.0,5,4,3,4,4,1.0,2,2,2.0,2,2020,4
2,25769810062,1.0,1962-06-02,57,7.0,6,9999.0,9999,9999.0,35.0,355030,17,19,15,2020-04-20,35.0,355030,2.0,2,2,9999,1.0,10,2.0,9999.0,9999,9999,1.0,1.0,1.0,4.0,2.0,1.0,1.0,9999.0,9999,1.0,1.0,5.0,1.0,9999,9999,9999,9999,9999,9999,9999,1,1,2020-04-30,10,7,9,9999,2,17,4,4,4,9999,9999,5.0,4.0,6.0,6.0,5.0,4.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,17-2020,-50.2173,-13.987472,-57.145412,-32.3096,16.897321,-36.537678,-50.519372,-16.036903,-57.530315,-28.299213,16.796902,-40.855276,-51.254777,-10.139073,-58.047691,-37.531577,17.824561,-40.216769,20.903427,68.705156,0.214765,20.174017,71.776044,1.997403,21.555626,69.541469,1.42987,1,1,1,1,5,3,1.0,1,1.0,1.0,5.0,2,1.0,1,1.0,1.0,5.0,2,2,2,1,2,3.0,3,2,2.0,2,2020,4


In [None]:
count = df2.count()
df2.select('KMODES_CLUSTER').groupBy('KMODES_CLUSTER').count().orderBy('KMODES_CLUSTER')\
                            .withColumn('%', (F.col('count')/count)*100).show()
df2.select('KMODES_CLUSTER').printSchema()

+--------------+------+------------------+
|KMODES_CLUSTER| count|                 %|
+--------------+------+------------------+
|             0|426614|  28.2922856644523|
|             1|180434| 11.96606363499507|
|             2| 79973|5.3036678623843665|
|             3|137234| 9.101116069504158|
|             4|162260|10.760796110568407|
|             5|123134| 8.166029016878653|
|             6|180490|11.969777455913299|
|             7| 70570| 4.680077539275314|
|             8| 85506| 5.670606632751523|
|             9| 61666|  4.08958001327691|
+--------------+------+------------------+

root
 |-- KMODES_CLUSTER: long (nullable = true)



In [None]:
df2.write.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_pcrpositive_kmodes.parquet', mode='overwrite')

<hr />
<hr />
<hr />

## labeling ds-3

In [None]:
df3_ = df3.select(kmodes_vars).toPandas()

In [None]:
start = time.time()
labels = kmodes.fit_predict(df3_)
print(time.time() - start)
labels

7407.238794326782


array([6, 4, 1, ..., 6, 6, 0], dtype=uint16)

In [None]:
df3_.insert(0, "KMODES_CLUSTER", labels, True)

In [None]:
df3_

Unnamed: 0,KMODES_CLUSTER,NU_NOTIFIC,CLASSI_FIN,CS_SEXO,CS_RACA,CS_ESCOL_N,SG_UF,SG_UF_NOT,CRITERIO,SUPORT_VEN,EVOLUCAO,AGE_GROUP,UTI,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_INTERNA_Q,DIST_PRI_NOTIFIC_Q,DIST_PRI_COLETA_Q,DIST_PRI_PCR_Q,DIST_PRI_ENTUTI_Q
0,6,77309859740,1.0,1.0,4.0,4.0,29.0,29.0,1.0,2.0,9999.0,7.0,2.0,4.0,6.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,1.0,3.0,3.0,1.0,6.0,6.0
1,4,68719688318,1.0,2.0,4.0,9999.0,29.0,29.0,1.0,2.0,2.0,7.0,1.0,8.0,6.0,2.0,5.0,1.0,1.0,5.0,1.0,2.0,1.0,3.0,2.0,3.0,6.0,3.0
2,1,68719818720,1.0,2.0,4.0,3.0,29.0,29.0,1.0,9.0,9999.0,5.0,2.0,3.0,1.0,1.0,4.0,9.0,9.0,9.0,9.0,6.0,1.0,3.0,3.0,2.0,3.0,6.0
3,0,68719890494,1.0,1.0,9.0,9999.0,29.0,29.0,1.0,2.0,9999.0,7.0,1.0,3.0,6.0,2.0,1.0,1.0,1.0,1.0,1.0,6.0,6.0,3.0,1.0,3.0,6.0,3.0
4,7,77309826880,1.0,1.0,9.0,9999.0,29.0,29.0,1.0,3.0,9999.0,7.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,6.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2157540,1,214748978383,1.0,2.0,4.0,9999.0,28.0,28.0,1.0,1.0,1.0,7.0,1.0,1.0,3.0,1.0,1.0,9.0,9.0,9.0,9.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0
2157541,6,214749032736,1.0,1.0,9.0,9.0,28.0,28.0,1.0,2.0,9.0,10.0,2.0,4.0,3.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,1.0,4.0,3.0,4.0,3.0,6.0
2157542,6,231928850240,1.0,1.0,9.0,9.0,28.0,28.0,1.0,1.0,9.0,9.0,2.0,3.0,8.0,1.0,1.0,9.0,9.0,9.0,9.0,6.0,2.0,5.0,4.0,2.0,1.0,6.0
2157543,6,283468444634,1.0,1.0,4.0,9.0,28.0,28.0,1.0,3.0,2.0,9.0,2.0,4.0,5.0,1.0,1.0,9.0,9.0,9.0,9.0,5.0,4.0,1.0,2.0,1.0,1.0,6.0


In [None]:
# taking only 'KMODES_CLUSTER' and 'NU_NOTIFIC' to ensure join to original data
df3_new = spark.createDataFrame(df3_[['KMODES_CLUSTER', 'NU_NOTIFIC']])

# writing temp labeled data
df3_new.write.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_labpositive_kmodes.parquet', mode='overwrite')

In [None]:
df3_new.limit(3).toPandas()

Unnamed: 0,KMODES_CLUSTER,NU_NOTIFIC
0,6,77309859740
1,4,68719688318
2,1,68719818720


In [None]:
df3_new.select('KMODES_CLUSTER').groupBy('KMODES_CLUSTER').count().orderBy('KMODES_CLUSTER').show()

+--------------+------+
|KMODES_CLUSTER| count|
+--------------+------+
|             0|521135|
|             1|221582|
|             2|144643|
|             3|194429|
|             4|252804|
|             5|182931|
|             6|294211|
|             7|160028|
|             8|108715|
|             9| 77067|
+--------------+------+



In [None]:
# joining to original data
df3 = df3.join(df3_new, 'NU_NOTIFIC', 'left')

In [None]:
df3.limit(3).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO,KMODES_CLUSTER
0,25769809653,1.0,1955-02-10,65,8.0,6,1.0,9999,1.0,35.0,350760,12,13,10,2020-03-18,35.0,350760,1.0,2,2,9,1.0,10,3.0,2.0,9999,9999,1.0,1.0,1.0,4.0,3.0,1.0,1.0,2.0,9999,3.0,1.0,1.0,1.0,9999,2,9999,9999,9999,9999,9999,1,1,2020-03-28,10,9,9,9999,2,10,4,4,4,9999,9999,4.0,4.0,6.0,6.0,3.0,3.0,6.0,6.0,5.0,6.0,4.0,6.0,6.0,12-2020,-22.441517,2.559225,-26.850227,-1.403258,6.45553,-24.633056,3.674725,4.602888,-0.581694,16.448507,-1.695273,-0.370824,0.014146,4.24297,-3.003748,15.850823,-1.205669,-6.19869,24.274361,75.761783,5.256627,23.943891,66.591151,0.461628,21.773845,71.656916,2.557396,4,2,3,4,2,4,5.0,2,5.0,5.0,1.0,5,5.0,2,5.0,5.0,1.0,5,4,4,4,4,2.0,2,2,3.0,3,2020,0
1,25769809829,1.0,1926-03-16,94,11.0,6,3.0,9999,2.0,41.0,410690,12,14,14,2020-03-18,41.0,410690,2.0,2,2,9,1.0,14,2.0,1.0,14,9999,1.0,2.0,1.0,4.0,8.0,1.0,1.0,1.0,TAQUICARDIA,3.0,2.0,1.0,1.0,9999,1,EM TRATAMENTO DE MIELOMA MULTI,6,9999,9999,9999,1,1,2020-04-01,14,9,9,9999,2,16,4,4,4,9999,9999,5.0,5.0,5.0,4.0,4.0,3.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,12-2020,-23.794479,1.968,-27.203655,2.114202,5.37741,-30.843416,-0.522876,3.989333,1.758355,20.726154,-2.0475,-4.044715,-3.718354,3.696,-1.017949,20.655332,-1.640704,-12.542308,23.934508,70.172811,3.485714,24.470405,58.925729,0.401504,21.874305,66.741533,0.628148,4,2,3,5,1,3,5.0,2,5.0,5.0,1.0,5,5.0,2,5.0,5.0,1.0,5,4,3,4,4,1.0,2,2,2.0,2,2020,3
2,25769810062,1.0,1962-06-02,57,7.0,6,9999.0,9999,9999.0,35.0,355030,17,19,15,2020-04-20,35.0,355030,2.0,2,2,9999,1.0,10,2.0,9999.0,9999,9999,1.0,1.0,1.0,4.0,2.0,1.0,1.0,9999.0,9999,1.0,1.0,5.0,1.0,9999,9999,9999,9999,9999,9999,9999,1,1,2020-04-30,10,7,9,9999,2,17,4,4,4,9999,9999,5.0,4.0,6.0,6.0,5.0,4.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,17-2020,-50.2173,-13.987472,-57.145412,-32.3096,16.897321,-36.537678,-50.519372,-16.036903,-57.530315,-28.299213,16.796902,-40.855276,-51.254777,-10.139073,-58.047691,-37.531577,17.824561,-40.216769,20.903427,68.705156,0.214765,20.174017,71.776044,1.997403,21.555626,69.541469,1.42987,1,1,1,1,5,3,1.0,1,1.0,1.0,5.0,2,1.0,1,1.0,1.0,5.0,2,2,2,1,2,3.0,3,2,2.0,2,2020,0


In [None]:
count = df3.count()
df3.select('KMODES_CLUSTER').groupBy('KMODES_CLUSTER').count().orderBy('KMODES_CLUSTER')\
                            .withColumn('%', (F.col('count')/count)*100).show()
df3.select('KMODES_CLUSTER').printSchema()

+--------------+------+------------------+
|KMODES_CLUSTER| count|                 %|
+--------------+------+------------------+
|             0|521135| 24.15407326382532|
|             1|221582|10.270098653794012|
|             2|144643| 6.704054840107622|
|             3|194429|  9.01158492638624|
|             4|252804|11.717206361860356|
|             5|182931| 8.478664407926601|
|             6|294211| 13.63637838376488|
|             7|160028| 7.417133825713948|
|             8|108715| 5.038828854091109|
|             9| 77067|3.5719764825299123|
+--------------+------+------------------+

root
 |-- KMODES_CLUSTER: long (nullable = true)



In [None]:
df3.write.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_labpositive_kmodes.parquet', mode='overwrite')

In [None]:
print('finished writing')

finished writing


<hr />
<hr />
<hr />

# Reading data (now with KMODES attribute)

In [12]:
not_covid = spark.read.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_notcovid_nofilter.parquet')
not_covid.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309712285,M,2020-09-18,0,1,6,4,,0.0,SP,352620,4,4,2,2021-01-26,SP,352220,9,2,2,,1,1.0,2,2.0,,,1,1,2,3,8,1,1,2.0,,1,1,1,1,,,,,,,,1,1,2021-01-27,1,6,3,2.0,4,6,4,4,4,,,1,1,6,6,4,4,6,6,1,6,2,1,6,04-2021,-25.895942,6.296242,-28.311178,-6.281181,6.003215,-23.171171,-26.123958,4.771699,-24.164021,-7.336648,5.483269,-25.172172,-23.547269,8.872993,-20.801375,-4.632532,5.748876,-25.86341,24.476172,72.664078,4.339216,24.36574,75.171226,6.304,24.070494,79.940901,6.042424,3,2,3,3,2,4,3,2,3,3,1,4,4,3,4,3,2,4,4,3,4,4,4,5,4,5,4,2021
1,85899613066,M,2010-03-11,10,3,6,9,,,SP,355030,4,5,5,2021-01-28,SP,355030,2,2,9,9.0,2,,3,,,,1,1,2,8,1,1,1,,,1,1,1,1,,,,6.0,,6.0,,1,1,2021-01-31,3,9,9,,3,3,4,4,4,,,2,6,6,6,1,2,6,6,2,6,1,6,6,04-2021,-25.895942,6.296242,-28.311178,-6.281181,6.003215,-23.171171,-26.123958,4.771699,-24.164021,-7.336648,5.483269,-25.172172,-23.547269,8.872993,-20.801375,-4.632532,5.748876,-25.86341,24.476172,72.664078,4.339216,24.36574,75.171226,6.304,24.070494,79.940901,6.042424,3,2,3,3,2,4,3,2,3,3,1,4,4,3,4,3,2,4,4,3,4,4,4,5,4,5,4,2021


In [13]:
df1 = spark.read.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_nofilter_kmodes.parquet')
df1.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO,KMODES_CLUSTER
0,25769811359,2.0,1965-05-04,55,7.0,5,1.0,9999,9.0,33.0,330455,22,23,8,2020-05-24,33.0,330455,2.0,1,2,9999,1.0,8,2.0,2.0,9999,9999,4.0,9999.0,1.0,3.0,1.0,1.0,1.0,9999.0,9999,1.0,1.0,1.0,1.0,9999,1,HAS,5,8,1,9999,9999,9999,,9999,9,9,9999,6,9999,4,4,4,9999,9999,3.0,4.0,6.0,6.0,6.0,6.0,4.0,6.0,6.0,6.0,6.0,6.0,6.0,22-2020,-50.345763,-7.057057,-51.131579,-25.053942,15.68,-53.498489,-51.535117,-11.102102,-52.811209,-26.890041,16.522034,-52.897898,-52.550336,-12.186186,-54.727811,-28.160825,16.948097,-56.337349,19.012543,75.959249,2.971429,21.98708,77.203198,0.057143,20.599825,78.219749,3.543182,1,1,1,1,5,1,1.0,1,1.0,1.0,5.0,1,1.0,1,1.0,1.0,5.0,1,1,4,3,2,4.0,1,2,4.0,3,2020,6
1,25769811370,2.0,1950-02-02,70,9.0,5,1.0,9999,9.0,26.0,260290,21,22,7,2020-05-19,26.0,261160,2.0,2,2,9,1.0,3,2.0,1.0,3,9999,1.0,2.0,1.0,3.0,4.0,1.0,1.0,2.0,9999,3.0,1.0,5.0,1.0,9999,1,EX-TABAGISTA,6,9999,9999,9999,1,1,2020-06-06,18,7,9,9999,2,21,4,4,4,9999,9999,3.0,2.0,2.0,6.0,5.0,4.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,21-2020,-55.372881,-17.528662,-62.81068,-29.111111,18.525974,-49.909091,-52.884956,-14.348101,-61.270531,-27.875,17.575,-48.531401,-48.917355,-5.03268,-57.066986,-26.817102,15.740964,-44.960784,24.574487,72.588433,1.187755,24.168571,73.145833,0.783673,24.660119,70.371429,0.183673,1,1,1,1,5,1,1.0,1,1.0,1.0,5.0,1,1.0,1,1.0,1.0,5.0,2,4,3,2,4,3.0,2,4,3.0,1,2020,3


In [14]:
df2 = spark.read.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_pcrpositive_kmodes.parquet')
df2.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO,KMODES_CLUSTER
0,25769810023,1.0,1948-07-29,71,9.0,6,4.0,9999,9.0,35.0,355030,16,17,7,2020-04-18,35.0,355030,2.0,2,2,1,1.0,4,3.0,9.0,9999,9999,1.0,1.0,1.0,8.0,8.0,1.0,1.0,2.0,9999,3.0,1.0,1.0,1.0,9999,2,9999,6,9999,9999,9999,1,1,2020-04-22,4,9,9,9999,2,6,4,4,4,9999,9999,3.0,2.0,6.0,6.0,2.0,4.0,6.0,6.0,3.0,6.0,2.0,6.0,6.0,16-2020,-50.519372,-16.036903,-57.530315,-28.299213,16.796902,-40.855276,-51.254777,-10.139073,-58.047691,-37.531577,17.824561,-40.216769,-54.925532,-19.774049,-61.253203,-34.75445,18.749331,-45.325581,20.174017,71.776044,1.997403,21.555626,69.541469,1.42987,23.257402,73.122932,4.262025,1,1,1,1,5,2,1.0,1,1.0,1.0,5.0,2,1.0,1,1.0,1.0,5.0,2,2,3,3,2,2.0,2,3,3.0,4,2020,4
1,25769810115,1.0,1963-02-06,57,7.0,6,4.0,9999,3.0,21.0,211130,15,17,16,2020-04-08,21.0,211130,2.0,2,2,2,1.0,12,2.0,1.0,12,9999,9999.0,9999.0,1.0,4.0,3.0,1.0,1.0,1.0,"DOR RETROCULAR, DISGEUSIA",3.0,1.0,5.0,1.0,9999,2,9999,6,9999,9999,9999,1,4,2020-04-20,12,7,9,9999,2,15,4,4,4,9999,9999,5.0,5.0,5.0,3.0,6.0,6.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,15-2020,-61.803571,-16.692308,-54.291667,-36.98995,13.984848,-52.117647,-61.688525,-16.933333,-55.680556,-30.226804,14.768116,-54.228571,-61.650794,-26.4,-65.0,-39.15942,15.69697,-59.351351,27.155383,80.216322,6.67619,27.252573,77.409525,4.324242,26.409964,81.153399,8.221538,1,1,1,1,5,1,1.0,1,1.0,1.0,5.0,1,1.0,1,1.0,1.0,5.0,1,5,5,5,5,4.0,4,5,5.0,5,2020,3


In [15]:
df3 = spark.read.parquet('gs://ai-covid19-datalake/trusted/super-srag_filters/super-srag_covid_labpositive_kmodes.parquet')
df3.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO,KMODES_CLUSTER
0,25769810426,2.0,1926-05-05,93,11.0,5,9.0,9999,9.0,35.0,355030,18,19,7,2020-05-01,35.0,355030,1.0,2,2,9,1.0,6,3.0,1.0,6,9999,1.0,1.0,1.0,4.0,9.0,2.0,1.0,1.0,QUEDA DO ESTADO GERAL,3.0,1.0,1.0,1.0,9999,1,"DISFAGIA, NEO MAMA, HIPOTIREOI",9999,9999,9999,9999,1,1,2020-05-01,0,9,9,9999,2,5,4,4,4,9999,9999,3.0,3.0,3.0,1.0,5.0,4.0,6.0,6.0,1.0,6.0,2.0,6.0,6.0,18-2020,-48.729253,-12.909818,-55.344186,-29.583796,15.792982,-34.934694,-50.2173,-13.987472,-57.145412,-32.3096,16.897321,-36.537678,-50.519372,-16.036903,-57.530315,-28.299213,16.796902,-40.855276,20.891993,63.837136,0.062937,20.903427,68.705156,0.214765,20.174017,71.776044,1.997403,1,1,1,1,5,3,1.0,1,1.0,1.0,5.0,3,1.0,1,1.0,1.0,5.0,2,2,1,1,2,2.0,1,1,3.0,3,2020,5
1,25769810863,2.0,1934-09-03,85,10.0,6,4.0,9999,9.0,35.0,350570,16,20,30,2020-04-15,35.0,350570,2.0,2,2,9,1.0,30,1.0,2.0,9999,9999,1.0,2.0,1.0,4.0,4.0,1.0,1.0,9999.0,9999,3.0,1.0,1.0,1.0,9999,9999,9999,3,30,9999,9999,1,9999,2020-05-20,35,7,9,9999,2,40,4,4,4,9999,9999,5.0,5.0,6.0,6.0,5.0,5.0,5.0,6.0,5.0,6.0,5.0,6.0,6.0,16-2020,-50.519372,-16.036903,-57.530315,-28.299213,16.796902,-40.855276,-51.254777,-10.139073,-58.047691,-37.531577,17.824561,-40.216769,-54.925532,-19.774049,-61.253203,-34.75445,18.749331,-45.325581,20.174017,71.776044,1.997403,21.555626,69.541469,1.42987,23.257402,73.122932,4.262025,1,1,1,1,5,2,1.0,1,1.0,1.0,5.0,2,1.0,1,1.0,1.0,5.0,2,2,3,3,2,2.0,2,3,3.0,4,2020,3


<hr />
<hr />
<hr />

# Preprocessing (not-covid) data

In [16]:
not_covid = preprocessing(not_covid)
not_covid.limit(2).toPandas()

Unnamed: 0,NU_NOTIFIC,CS_SEXO,DT_NASC,AGE_AT_NOTIF,AGE_GROUP,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,SG_UF,CO_MUN_RES,SEM_PRI,SEM_NOT,DIST_PRI_NOTIFIC,DT_SIN_PRI,SG_UF_NOT,CO_MUN_NOT,SURTO_SG,NOSOCOMIAL,AVE_SUINO,VACINA,HOSPITAL,DIST_PRI_INTERNA,SUPORT_VEN,UTI,DIST_PRI_ENTUTI,CLASSI_OUT,CRITERIO,EVOLUCAO,CLASSI_FIN,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,OUTRO_SIN,OUTRO_DES,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,OBES_IMC,OUT_MORBI,MORB_DESC,RAIOX_RES,DIST_PRI_RAIOX,TOMO_RES,DIST_PRI_TOMO,AMOSTRA,TP_AMOSTRA,DT_COLETA,DIST_PRI_COLETA,PP_IF_RESUL,PP_TRA_RESUL,DIST_PRI_TRA,PP_PCR_RESUL,DIST_PRI_PCR,PP_RES_SOR_IGA,PP_RES_SOR_IGM,PP_RES_SOR_IGG,DIST_PRI_SOR,DIST_PRI_IF,DIST_PRI_NOTIFIC_Q,DIST_PRI_INTERNA_Q,DIST_PRI_ENTUTI_Q,DIST_PRI_SAIDUTI_Q,DIST_PRI_EVOLUCA_Q,DIST_PRI_ENCERRA_Q,DIST_PRI_RAIOX_Q,DIST_PRI_TOMO_Q,DIST_PRI_COLETA_Q,DIST_PRI_SOR_Q,DIST_PRI_PCR_Q,DIST_PRI_TRA_Q,DIST_PRI_IF_Q,EPI_WEEK_YEAR,GMR_TRANSIT_STATIONS_AVG,GMR_GROCERY_AND_PHARMACY_AVG,GMR_RETAIL_AND_RECREATION_AVG,GMR_WORKPLACES_PERCENT_AVG,GMR_RESIDENTIAL_PERCENT_AVG,GMR_PARKS_PERCENT_AVG,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_AVG,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_AVG,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_AVG,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_AVG,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_AVG,GMR_PARKS_PERCENT_1WEEK_BEFORE_AVG,GMR_TRANSIT_STATIONS_2WEEKS_AVG,GMR_GROCERY_AND_PHARMACY_2WEEKS_AVG,GMR_RETAIL_AND_RECREATION_2WEEKS_AVG,GMR_WORKPLACES_PERCENT_2WEEKS_AVG,GMR_RESIDENTIAL_PERCENT_2WEEKS_AVG,GMR_PARKS_PERCENT_2WEEKS_AVG,INMET_TEMP_C_AVG,INMET_RELATIVE_AIR_HUMIDITY_AVG,INMET_DAILY_PRECIPT_AVG,INMET_TEMP_C_1WEEK_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_AVG,INMET_DAILY_PRECIPT_1WEEK_BEFORE_AVG,INMET_TEMP_C_2WEEKS_BEFORE_AVG,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_AVG,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_AVG,GMR_TRANSIT_STATIONS_Q,GMR_GROCERY_AND_PHARMACY_Q,GMR_RETAIL_AND_RECREATION_Q,GMR_WORKPLACES_PERCENT_Q,GMR_RESIDENTIAL_PERCENT_Q,GMR_PARKS_PERCENT_Q,GMR_TRANSIT_STATIONS_1WEEK_BEFORE_Q,GMR_GROCERY_AND_PHARMACY_1WEEK_BEFORE_Q,GMR_RETAIL_AND_RECREATION_1WEEK_BEFORE_Q,GMR_WORKPLACES_PERCENT_1WEEK_BEFORE_Q,GMR_RESIDENTIAL_PERCENT_1WEEK_BEFORE_Q,GMR_PARKS_PERCENT_1WEEK_BEFORE_Q,GMR_TRANSIT_STATIONS_2WEEKS_Q,GMR_GROCERY_AND_PHARMACY_2WEEKS_Q,GMR_RETAIL_AND_RECREATION_2WEEKS_Q,GMR_WORKPLACES_PERCENT_2WEEKS_Q,GMR_RESIDENTIAL_PERCENT_2WEEKS_Q,GMR_PARKS_PERCENT_2WEEKS_Q,INMET_TEMP_C_Q,INMET_RELATIVE_AIR_HUMIDITY_Q,INMET_DAILY_PRECIPT_Q,INMET_TEMP_C_1WEEK_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_1WEEK_BEFORE_Q,INMET_DAILY_PRECIPT_1WEEK_BEFORE_Q,INMET_TEMP_C_2WEEKS_BEFORE_Q,INMET_RELATIVE_AIR_HUMIDITY_2WEEKS_BEFORE_Q,INMET_DAILY_PRECIPT_2WEEKS_BEFORE_Q,ANO
0,77309712285,1.0,2020-09-18,0,1.0,6,4.0,9999,0.0,35.0,352620,4,4,2,2021-01-26,35.0,352220,9.0,2,2,9999,1.0,1,2.0,2.0,9999,9999,1.0,1.0,0.0,3.0,8.0,1.0,1.0,2.0,9999,1.0,1.0,1.0,1.0,9999,9999,9999,9999,9999,9999,9999,1,1,2021-01-27,1,6,3,2,4,6,4,4,4,9999,9999,1.0,1.0,6.0,6.0,4.0,4.0,6.0,6.0,1.0,6.0,2.0,1.0,6.0,04-2021,-25.895942,6.296242,-28.311178,-6.281181,6.003215,-23.171171,-26.123958,4.771699,-24.164021,-7.336648,5.483269,-25.172172,-23.547269,8.872993,-20.801375,-4.632532,5.748876,-25.86341,24.476172,72.664078,4.339216,24.36574,75.171226,6.304,24.070494,79.940901,6.042424,3,2,3,3,2,4,3.0,2,3.0,3.0,1.0,4,4.0,3,4.0,3.0,2.0,4,4,3,4,4,4.0,5,4,5.0,4,2021
1,85899613066,1.0,2010-03-11,10,3.0,6,9.0,9999,9999.0,35.0,355030,4,5,5,2021-01-28,35.0,355030,2.0,2,9,9,2.0,9999,3.0,9999.0,9999,9999,1.0,1.0,0.0,8.0,1.0,1.0,1.0,9999.0,9999,1.0,1.0,1.0,1.0,9999,9999,9999,6,9999,6,9999,1,1,2021-01-31,3,9,9,9999,3,3,4,4,4,9999,9999,2.0,6.0,6.0,6.0,1.0,2.0,6.0,6.0,2.0,6.0,1.0,6.0,6.0,04-2021,-25.895942,6.296242,-28.311178,-6.281181,6.003215,-23.171171,-26.123958,4.771699,-24.164021,-7.336648,5.483269,-25.172172,-23.547269,8.872993,-20.801375,-4.632532,5.748876,-25.86341,24.476172,72.664078,4.339216,24.36574,75.171226,6.304,24.070494,79.940901,6.042424,3,2,3,3,2,4,3.0,2,3.0,3.0,1.0,4,4.0,3,4.0,3.0,2.0,4,4,3,4,4,4.0,5,4,5.0,4,2021


<hr />
<hr />
<hr />

# Creating all experiment datasets for KMODES sampling strategy

In [None]:
dfs = {
    'ds-1': {'df': df1.persist(StorageLevel.MEMORY_ONLY), 'fractions': {'KMODES_CLUSTER': {}}},
    'ds-2': {'df': df2.persist(StorageLevel.MEMORY_ONLY), 'fractions': {'KMODES_CLUSTER': {}}},
    'ds-3': {'df': df3.persist(StorageLevel.MEMORY_ONLY), 'fractions': {'KMODES_CLUSTER': {}}}
}

In [None]:
for df in list(dfs.keys()): 
    ag_list = dfs[df]['df'].select('KMODES_CLUSTER').filter(F.col('KMODES_CLUSTER').isNotNull()).distinct().rdd.flatMap(lambda x: x).collect()
    for ag in ag_list:
        dfs[df]['fractions']['KMODES_CLUSTER'][ag] = 0.02
    
    print('===== {} ====='.format(df))
    print(dfs[df]['fractions'])

===== ds-1 =====
{'KMODES_CLUSTER': {0: 0.02, 7: 0.02, 6: 0.02, 9: 0.02, 5: 0.02, 1: 0.02, 3: 0.02, 8: 0.02, 2: 0.02, 4: 0.02}}
===== ds-2 =====
{'KMODES_CLUSTER': {0: 0.02, 7: 0.02, 6: 0.02, 9: 0.02, 5: 0.02, 1: 0.02, 3: 0.02, 8: 0.02, 2: 0.02, 4: 0.02}}
===== ds-3 =====
{'KMODES_CLUSTER': {0: 0.02, 7: 0.02, 6: 0.02, 9: 0.02, 5: 0.02, 1: 0.02, 3: 0.02, 8: 0.02, 2: 0.02, 4: 0.02}}


<hr />
<hr />
<hr />

In [None]:
for undersamp_met in list(undersamp_col.keys()): 
    for ds in list(dfs.keys()):
        for col_set in list(cols_sets.keys()):
            for exp_id in range(50): 
                df = dfs[ds]['df'].select(cols_sets[col_set])

                fractions = dfs[ds]['fractions'][undersamp_col[undersamp_met]]
                df = df.sampleBy(undersamp_col[undersamp_met], fractions=fractions, seed=exp_id)
                df = df.union(not_covid.select(cols_sets[col_set]))
                
                prefix = 'gs://ai-covid19-datalake/trusted/experiment_map/'
                filename = 'experiment' + str(exp_id) + '.parquet'
                uri = prefix + undersamp_met + '/' + ds + '/' + col_set + '/' + filename
                
                df.write.parquet(uri, mode='overwrite')

                print('Dataframe written: {}'.format(uri))
                df.select('CLASSI_FIN').groupBy('CLASSI_FIN').count().show()
                print('')
                

Dataframe written: gs://ai-covid19-datalake/trusted/experiment_map/02-KMODES/ds-1/cols_set_1/experiment0.parquet
+----------+-----+
|CLASSI_FIN|count|
+----------+-----+
|       0.0|44538|
|       1.0|53827|
+----------+-----+


Dataframe written: gs://ai-covid19-datalake/trusted/experiment_map/02-KMODES/ds-1/cols_set_1/experiment1.parquet
+----------+-----+
|CLASSI_FIN|count|
+----------+-----+
|       0.0|44538|
|       1.0|52360|
+----------+-----+


Dataframe written: gs://ai-covid19-datalake/trusted/experiment_map/02-KMODES/ds-1/cols_set_1/experiment2.parquet
+----------+-----+
|CLASSI_FIN|count|
+----------+-----+
|       0.0|44538|
|       1.0|50618|
+----------+-----+


Dataframe written: gs://ai-covid19-datalake/trusted/experiment_map/02-KMODES/ds-1/cols_set_1/experiment3.parquet
+----------+-----+
|CLASSI_FIN|count|
+----------+-----+
|       0.0|44538|
|       1.0|50145|
+----------+-----+


Dataframe written: gs://ai-covid19-datalake/trusted/experiment_map/02-KMODES/ds-1/co

In [60]:
print('finished')

finished


<hr />
<hr />
<hr />