In [1]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [2]:
pd.set_option("display.max_rows", 101)
pd.set_option("display.max_columns", 101)

<hr />

### reading preprocessed dataframes

In [3]:
srag_2019 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2019_v3_new_attr')

In [4]:
srag_2020 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2020_v3_new_attr')

In [5]:
srag_2021 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2021_v3_new_attr')

In [6]:
gmr = spark.read.csv('gs://ai-covid19-datalake/standard/google-mobility/pp_google-mobility_report-28-05-2021.csv', header=True, inferSchema=True)

In [7]:
ibge = spark.read.csv('gs://ai-covid19-datalake/standard/ibge-data/pp_ibge-municipality-to-code-28-05-2021.csv', header=True, inferSchema=True)

In [8]:
inmet = spark.read.csv('gs://ai-covid19-datalake/standard/inmet-data/pp_inmet_meteorological_data-01012019-a-28052021', header=True, inferSchema=True)

In [9]:
epi_weeks = spark.read.csv('gs://ai-covid19-datalake/raw/epidemiological-data/epi_weeks.csv', header=True, inferSchema=True)

<hr />

# Preparing srag to receive the new information from UF's and Epidemiological Weeks

In [10]:
# # dropping duplicate columns
# def dropDupeDfCols(df):
#     newcols = []
#     dupcols = []

#     for i in range(len(df.columns)):
#         if df.columns[i] not in newcols:
#             newcols.append(df.columns[i])
#         else:
#             dupcols.append(i)

#     df = df.toDF(*[str(i) for i in range(len(df.columns))])
#     for dupcol in dupcols:
#         df = df.drop(str(dupcol))

#     return df.toDF(*newcols)

In [11]:
# srag_2019 = dropDupeDfCols(srag_2019)
# srag_2020 = dropDupeDfCols(srag_2020)
# srag_2021 = dropDupeDfCols(srag_2021)

In [12]:
srag_2019 = srag_2019.withColumn('ANO', F.lit('2019'))
srag_2020 = srag_2020.withColumn('ANO', F.lit('2020'))
srag_2021 = srag_2021.withColumn('ANO', F.lit('2021'))

In [13]:
def get_epi_week_year(epi_week, year):
    return str(epi_week) + '-' + str(year)
udf_get_epi_week_year = F.udf(get_epi_week_year, StringType())

In [14]:
# adding 'epi_week_year' tag
srag_2019 = srag_2019.withColumn('epi_week_year', udf_get_epi_week_year(F.col('SEM_PRI'), F.col('ANO')))
srag_2020 = srag_2020.withColumn('epi_week_year', udf_get_epi_week_year(F.col('SEM_PRI'), F.col('ANO')))
srag_2021 = srag_2021.withColumn('epi_week_year', udf_get_epi_week_year(F.col('SEM_PRI'), F.col('ANO')))

In [15]:
# padding
srag_2019 = srag_2019.withColumn('epi_week_year', F.lpad(F.col('epi_week_year'), 7, '0'))
srag_2020 = srag_2020.withColumn('epi_week_year', F.lpad(F.col('epi_week_year'), 7, '0'))
srag_2021 = srag_2021.withColumn('epi_week_year', F.lpad(F.col('epi_week_year'), 7, '0'))

In [16]:
def get_uf(cod_mun_res):
    return str(cod_mun_res)[:2]
udf_get_uf = F.udf(get_uf, StringType())

In [17]:
srag_2019 = srag_2019.withColumn('UF', udf_get_uf(F.col('CO_MUN_NOT')))
srag_2020 = srag_2020.withColumn('UF', udf_get_uf(F.col('CO_MUN_NOT')))
srag_2021 = srag_2021.withColumn('UF', udf_get_uf(F.col('CO_MUN_NOT')))

# Google Mobility Report with state codes by epi_weeks

###  adding epidemiological weeks

In [18]:
# converting date type variables
gmr = gmr.withColumn('date', F.to_date(F.col('date'), 'dd/MM/yyyy'))
epi_weeks = epi_weeks.withColumn('date', F.to_date(F.col('date'), 'dd/MM/yyyy'))

In [19]:
# find the date from 7 or 14 days before
epi_weeks = epi_weeks.withColumn('date_7_days_early', F.date_sub(F.col('date'), 7))
epi_weeks = epi_weeks.withColumn('date_14_days_early', F.date_sub(F.col('date'), 14))

In [20]:
# finding the epi_week number from 1 or 2 weeks before
def find_one_or_two_epi_weeks(n_weeks, col):
    return col - n_weeks
udf_find_one_or_two_epi_weeks = F.udf(find_one_or_two_epi_weeks, IntegerType())

In [21]:
epi_weeks = epi_weeks.withColumn('epi_week_7_days_early', udf_find_one_or_two_epi_weeks(F.lit(1), F.col('epi_week')))
epi_weeks = epi_weeks.withColumn('epi_week_14_days_early', udf_find_one_or_two_epi_weeks(F.lit(2), F.col('epi_week')))

In [22]:
# creating the epi_week tag for the new found early epi_weeks
def set_epi_week_year(col1, col2):
    string = str(col1) + '-' + str(col2)
    return string
udf_set_epi_week_year = F.udf(set_epi_week_year, StringType())

In [23]:
epi_weeks = epi_weeks.withColumn('epi_week_year_7_days_early', udf_set_epi_week_year(F.col('epi_week_7_days_early'), F.col('epi_year')))
epi_weeks = epi_weeks.withColumn('epi_week_year_14_early', udf_set_epi_week_year(F.col('epi_week_14_days_early'), F.col('epi_year')))

In [24]:
# make sure to erase inexistent epi_week
epi_weeks = epi_weeks.withColumn('epi_week_year_7_days_early', F.when((F.col('epi_week_7_days_early') < 1), None).otherwise(F.col('epi_week_year_7_days_early')))
epi_weeks = epi_weeks.withColumn('epi_week_7_days_early', F.when((F.col('epi_week_7_days_early') < 1), None).otherwise(F.col('epi_week_7_days_early')))

epi_weeks = epi_weeks.withColumn('epi_week_year_14_early', F.when((F.col('epi_week_14_days_early') < 1), None).otherwise(F.col('epi_week_year_14_early')))
epi_weeks = epi_weeks.withColumn('epi_week_14_days_early', F.when((F.col('epi_week_14_days_early') < 1), None).otherwise(F.col('epi_week_14_days_early')))

In [25]:
# joining using the actual date
epi_weeks_actual = epi_weeks.select(['date', 'epi_week', 'epi_year', 'epi_week_year'])
gmr = gmr.join(epi_weeks_actual, 'date', 'left')

In [26]:
# joining using date from 7 days before
epi_weeks_7 = epi_weeks.select(['date_7_days_early', 'epi_week_7_days_early', 'epi_week_year_7_days_early'])
gmr = gmr.join(epi_weeks_7, F.date_add(gmr.date, 7) == epi_weeks.date_7_days_early, 'left')

In [27]:
# joining using date from 14 days before
epi_weeks_14 = epi_weeks.select(['date_14_days_early', 'epi_week_14_days_early', 'epi_week_year_14_early'])
gmr = gmr.join(epi_weeks_14, F.date_add(gmr.date, 14) == epi_weeks.date_14_days_early, 'left')

In [28]:
epi_weeks.limit(10).toPandas()

Unnamed: 0,date,epi_week,epi_year,epi_week_year,date_7_days_early,date_14_days_early,epi_week_7_days_early,epi_week_14_days_early,epi_week_year_7_days_early,epi_week_year_14_early
0,2018-12-30,1,2019,1-2019,2018-12-23,2018-12-16,,,,
1,2018-12-31,1,2019,1-2019,2018-12-24,2018-12-17,,,,
2,2019-01-01,1,2019,1-2019,2018-12-25,2018-12-18,,,,
3,2019-01-02,1,2019,1-2019,2018-12-26,2018-12-19,,,,
4,2019-01-03,1,2019,1-2019,2018-12-27,2018-12-20,,,,
5,2019-01-04,1,2019,1-2019,2018-12-28,2018-12-21,,,,
6,2019-01-05,1,2019,1-2019,2018-12-29,2018-12-22,,,,
7,2019-01-06,2,2019,2-2019,2018-12-30,2018-12-23,1.0,,1-2019,
8,2019-01-07,2,2019,2-2019,2018-12-31,2018-12-24,1.0,,1-2019,
9,2019-01-08,2,2019,2-2019,2019-01-01,2018-12-25,1.0,,1-2019,


In [29]:
gmr.limit(5).toPandas()

Unnamed: 0,date,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,epi_week,epi_year,epi_week_year,date_7_days_early,epi_week_7_days_early,epi_week_year_7_days_early,date_14_days_early,epi_week_14_days_early,epi_week_year_14_early
0,2020-02-25,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,-41,,9,2020,9-2020,2020-03-03,10,10-2020,2020-03-10,11,11-2020
1,2020-03-24,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,-40,,13,2020,13-2020,2020-03-31,14,14-2020,2020-04-07,15,15-2020
2,2020-03-25,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,-43,,13,2020,13-2020,2020-04-01,14,14-2020,2020-04-08,15,15-2020
3,2020-04-10,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,-65,,15,2020,15-2020,2020-04-17,16,16-2020,2020-04-24,17,17-2020
4,2020-04-21,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,-44,,17,2020,17-2020,2020-04-28,18,18-2020,2020-05-05,19,19-2020


### adding state code

In [30]:
gmr = gmr.join(ibge.select('UF', 'sub_region_1'), 'sub_region_1', 'left')

### aggregating

In [31]:
gmr1 = gmr.groupBy(['UF', 'epi_week_year']).agg({"residential_percent_change_from_baseline":"avg",
                                          "workplaces_percent_change_from_baseline":"avg",
                                          "transit_stations_percent_change_from_baseline":"avg",
                                          "parks_percent_change_from_baseline":"avg",
                                          "grocery_and_pharmacy_percent_change_from_baseline":"avg",
                                          "retail_and_recreation_percent_change_from_baseline":"avg"})\
                                    .withColumnRenamed('avg(transit_stations_percent_change_from_baseline)', 'gmr_transit_stations_avg')\
                                    .withColumnRenamed('avg(grocery_and_pharmacy_percent_change_from_baseline)', 'gmr_grocery_and_pharmacy_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_avg')\
                                    .withColumnRenamed('avg(workplaces_percent_change_from_baseline)', 'gmr_workplaces_percent_avg')\
                                    .withColumnRenamed('avg(residential_percent_change_from_baseline)', 'gmr_residential_percent_avg')\
                                    .withColumnRenamed('avg(parks_percent_change_from_baseline)', 'gmr_parks_percent_avg')\
                                    .orderBy('UF')

In [32]:
gmr7 = gmr.groupBy(['UF', 'epi_week_year_7_days_early']).agg({"residential_percent_change_from_baseline":"avg",
                                          "workplaces_percent_change_from_baseline":"avg",
                                          "transit_stations_percent_change_from_baseline":"avg",
                                          "parks_percent_change_from_baseline":"avg",
                                          "grocery_and_pharmacy_percent_change_from_baseline":"avg",
                                          "retail_and_recreation_percent_change_from_baseline":"avg"})\
                                    .withColumnRenamed('avg(transit_stations_percent_change_from_baseline)', 'gmr_transit_stations_1week_before_avg')\
                                    .withColumnRenamed('avg(grocery_and_pharmacy_percent_change_from_baseline)', 'gmr_grocery_and_pharmacy_1week_before_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_1week_before_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_1week_before_avg')\
                                    .withColumnRenamed('avg(workplaces_percent_change_from_baseline)', 'gmr_workplaces_percent_1week_before_avg')\
                                    .withColumnRenamed('avg(residential_percent_change_from_baseline)', 'gmr_residential_percent_1week_before_avg')\
                                    .withColumnRenamed('avg(parks_percent_change_from_baseline)', 'gmr_parks_percent_1week_before_avg')\
                                    .orderBy('UF')

In [33]:
gmr14 = gmr.groupBy(['UF', 'epi_week_year_14_early']).agg({"residential_percent_change_from_baseline":"avg",
                                          "workplaces_percent_change_from_baseline":"avg",
                                          "transit_stations_percent_change_from_baseline":"avg",
                                          "parks_percent_change_from_baseline":"avg",
                                          "grocery_and_pharmacy_percent_change_from_baseline":"avg",
                                          "retail_and_recreation_percent_change_from_baseline":"avg"})\
                                    .withColumnRenamed('avg(transit_stations_percent_change_from_baseline)', 'gmr_transit_stations_2weeks_avg')\
                                    .withColumnRenamed('avg(grocery_and_pharmacy_percent_change_from_baseline)', 'gmr_grocery_and_pharmacy_2weeks_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_2weeks_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_2weeks_avg')\
                                    .withColumnRenamed('avg(workplaces_percent_change_from_baseline)', 'gmr_workplaces_percent_2weeks_avg')\
                                    .withColumnRenamed('avg(residential_percent_change_from_baseline)', 'gmr_residential_percent_2weeks_avg')\
                                    .withColumnRenamed('avg(parks_percent_change_from_baseline)', 'gmr_parks_percent_2weeks_avg')\
                                    .orderBy('UF')

##### making some validations

In [34]:
gmr1.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg
0,11,22-2020,-54.5,2.830189,-33.387755,-3.505882,10.82,-27.666667
1,11,23-2020,-53.393939,9.166667,-29.02,-4.119048,10.52,-25.314286
2,11,24-2020,-59.96875,6.259259,-31.9,-9.711268,12.0,-24.78125
3,11,25-2020,-57.714286,2.0,-29.877551,-2.129252,11.56,-25.114286


In [35]:
gmr7.filter((F.col('UF')=='11') &  ((F.col('epi_week_year_7_days_early') == '22-2020') | (F.col('epi_week_year_7_days_early') == '23-2020') | (F.col('epi_week_year_7_days_early') == '24-2020') | (F.col('epi_week_year_7_days_early') == '25-2020')))\
    .orderBy('epi_week_year_7_days_early').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year_7_days_early,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg
0,11,22-2020,-55.1875,-2.277778,-35.673469,-5.617978,10.469388,-30.558824
1,11,23-2020,-54.5,2.830189,-33.387755,-3.505882,10.82,-27.666667
2,11,24-2020,-53.393939,9.166667,-29.02,-4.119048,10.52,-25.314286
3,11,25-2020,-59.96875,6.259259,-31.9,-9.711268,12.0,-24.78125


In [36]:
gmr14.filter((F.col('UF')=='11') &  ((F.col('epi_week_year_14_early') == '22-2020') | (F.col('epi_week_year_14_early') == '23-2020') | (F.col('epi_week_year_14_early') == '24-2020') | (F.col('epi_week_year_14_early') == '25-2020')))\
    .orderBy('epi_week_year_14_early').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year_14_early,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg
0,11,22-2020,-53.9375,-1.611111,-35.693878,-6.586207,10.0,-30.575758
1,11,23-2020,-55.1875,-2.277778,-35.673469,-5.617978,10.469388,-30.558824
2,11,24-2020,-54.5,2.830189,-33.387755,-3.505882,10.82,-27.666667
3,11,25-2020,-53.393939,9.166667,-29.02,-4.119048,10.52,-25.314286


In [37]:
# joining all for 1 week early
gmr7 = gmr7.withColumnRenamed('UF', 'UF7')
cond = [gmr1.UF == gmr7.UF7, gmr1.epi_week_year == gmr7.epi_week_year_7_days_early]
gmr_agg = gmr1.join(gmr7, cond, 'left')

gmr_agg.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,UF7,epi_week_year_7_days_early,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg
0,11,22-2020,-54.5,2.830189,-33.387755,-3.505882,10.82,-27.666667,11,22-2020,-55.1875,-2.277778,-35.673469,-5.617978,10.469388,-30.558824
1,11,23-2020,-53.393939,9.166667,-29.02,-4.119048,10.52,-25.314286,11,23-2020,-54.5,2.830189,-33.387755,-3.505882,10.82,-27.666667
2,11,24-2020,-59.96875,6.259259,-31.9,-9.711268,12.0,-24.78125,11,24-2020,-53.393939,9.166667,-29.02,-4.119048,10.52,-25.314286
3,11,25-2020,-57.714286,2.0,-29.877551,-2.129252,11.56,-25.114286,11,25-2020,-59.96875,6.259259,-31.9,-9.711268,12.0,-24.78125


In [38]:
# joining all for 2 weeks early
gmr14 = gmr14.withColumnRenamed('UF', 'UF14')
cond = [gmr1.UF == gmr14.UF14, gmr1.epi_week_year == gmr14.epi_week_year_14_early]
gmr_agg = gmr_agg.join(gmr14, cond, 'left')

gmr_agg.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,UF7,epi_week_year_7_days_early,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg,UF14,epi_week_year_14_early,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg
0,11,22-2020,-54.5,2.830189,-33.387755,-3.505882,10.82,-27.666667,11,22-2020,-55.1875,-2.277778,-35.673469,-5.617978,10.469388,-30.558824,11,22-2020,-53.9375,-1.611111,-35.693878,-6.586207,10.0,-30.575758
1,11,23-2020,-53.393939,9.166667,-29.02,-4.119048,10.52,-25.314286,11,23-2020,-54.5,2.830189,-33.387755,-3.505882,10.82,-27.666667,11,23-2020,-55.1875,-2.277778,-35.673469,-5.617978,10.469388,-30.558824
2,11,24-2020,-59.96875,6.259259,-31.9,-9.711268,12.0,-24.78125,11,24-2020,-53.393939,9.166667,-29.02,-4.119048,10.52,-25.314286,11,24-2020,-54.5,2.830189,-33.387755,-3.505882,10.82,-27.666667
3,11,25-2020,-57.714286,2.0,-29.877551,-2.129252,11.56,-25.114286,11,25-2020,-59.96875,6.259259,-31.9,-9.711268,12.0,-24.78125,11,25-2020,-53.393939,9.166667,-29.02,-4.119048,10.52,-25.314286


In [39]:
# casting to string
gmr_agg = gmr_agg.withColumn('UF', F.col('UF').cast('string'))

In [40]:
# selecting variables of interest
gmr_agg = gmr_agg.select(['UF', 'epi_week_year',
                          'gmr_transit_stations_avg', 'gmr_grocery_and_pharmacy_avg', 'gmr_retail_and_recreation_avg', 'gmr_workplaces_percent_avg', 'gmr_residential_percent_avg', 'gmr_parks_percent_avg',
                          'gmr_transit_stations_1week_before_avg', 'gmr_grocery_and_pharmacy_1week_before_avg', 'gmr_retail_and_recreation_1week_before_avg', 'gmr_workplaces_percent_1week_before_avg', 'gmr_residential_percent_1week_before_avg', 'gmr_parks_percent_1week_before_avg',
                          'gmr_transit_stations_2weeks_avg', 'gmr_grocery_and_pharmacy_2weeks_avg', 'gmr_retail_and_recreation_2weeks_avg', 'gmr_workplaces_percent_2weeks_avg', 'gmr_residential_percent_2weeks_avg', 'gmr_parks_percent_2weeks_avg'])

In [41]:
# padding 'epi_week_year' col
gmr_agg = gmr_agg.withColumn('epi_week_year', F.lpad(F.col('epi_week_year'), 7, '0'))

In [42]:
gmr_agg.limit(10).toPandas()

Unnamed: 0,UF,epi_week_year,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg
0,14,30-2020,-44.142857,7.0,-32.285714,2.538462,12.285714,-45.714286,-42.857143,5.571429,-38.142857,-1.692308,12.714286,-49.571429,-42.714286,14.285714,-34.857143,-5.384615,12.714286,-50.142857
1,15,20-2021,-6.530612,55.402439,11.496241,16.412844,6.491429,3.163265,1.166667,57.160494,11.992424,12.783784,6.104651,-4.817568,-0.191919,67.111111,17.88189,13.604072,5.468208,-3.153846
2,15,37-2020,-11.45,27.414634,7.296875,2.766304,7.508772,17.457627,-2.285714,23.933333,0.382353,-0.185714,7.0,22.444444,-14.428571,12.666667,-8.058824,-0.857143,8.559322,7.181818
3,17,23-2020,-57.0,9.571429,-25.272727,-6.54902,13.096774,-46.0,-64.034483,-2.628571,-35.69697,-11.320755,14.566667,-53.666667,-69.241379,-10.428571,-42.818182,-21.315789,16.133333,-54.235294
4,26,07-2020,-8.526316,-0.925926,-0.545455,6.139535,-0.038462,-10.272727,,,,,,,,,,,,
5,29,38-2020,-41.8125,13.540284,-23.806228,-0.854545,10.24537,-34.438395,-44.825581,13.649485,-27.350365,-6.192593,9.843602,-35.166667,-47.95,15.066667,-31.105263,-9.398438,9.708333,-55.911765
6,29,08-2021,-28.341969,7.64486,-34.619718,-9.177298,7.898876,-48.387755,-25.602151,4.910377,-27.978799,-11.994371,6.462604,-35.345646,-21.706186,10.200935,-22.259786,-5.385269,4.81044,-37.036649
7,32,08-2020,-0.109756,3.117647,-4.833333,18.933962,-0.893805,-11.224,-8.75,0.214286,-5.0,7.166667,0.636364,-17.952381,,,,,,
8,41,48-2020,-20.229358,13.529412,-9.40874,7.981399,4.53012,-32.946429,-18.538462,13.38874,-9.782946,8.685284,4.791165,-34.879859,-16.993846,14.275401,-6.875325,9.244681,4.248,-32.176895
9,42,31-2020,-55.453061,-12.080429,-34.752604,-7.491713,11.841176,-54.762376,-52.137652,-12.149733,-34.707572,-7.229075,11.390029,-50.454839,-49.948617,-11.029333,-31.389034,-5.820994,11.008798,-52.990164


### Joining mobility data for srags

#### 2019

In [43]:
print("How much distincts before? ", srag_2019.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks before? ", srag_2019.select('epi_week_year').distinct().count())

How much distincts before?  48554
How much distincts epi_weeks before?  53


In [44]:
srag_2019 = srag_2019.join(gmr_agg, ['UF', 'epi_week_year'], 'left')

In [45]:
print("How much distincts after? ", srag_2019.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks after? ", srag_2019.select('epi_week_year').distinct().count())

How much distincts after?  48554
How much distincts epi_weeks after?  53


#### 2020

In [46]:
print("How much distincts before? ", srag_2020.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks before? ", srag_2020.select('epi_week_year').distinct().count())

How much distincts before?  1193735
How much distincts epi_weeks before?  53


In [47]:
srag_2020 = srag_2020.join(gmr_agg, ['UF', 'epi_week_year'], 'left')

In [48]:
print("How much distincts after? ", srag_2020.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks after? ", srag_2020.select('epi_week_year').distinct().count())

How much distincts after?  1193735
How much distincts epi_weeks after?  53


#### 2021

In [49]:
print("How much distincts before? ", srag_2021.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks before? ", srag_2021.select('epi_week_year').distinct().count())

How much distincts before?  868367
How much distincts epi_weeks before?  21


In [50]:
srag_2021 = srag_2021.join(gmr_agg, ['UF', 'epi_week_year'], 'left')

In [51]:
print("How much distincts after? ", srag_2021.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks after? ", srag_2021.select('epi_week_year').distinct().count())

How much distincts after?  868367
How much distincts epi_weeks after?  21


# Meteorological data with state codes by epi_weeks

In [52]:
inmet.limit(5).toPandas()

Unnamed: 0,station_id,measurement_date,total_daily_precipitation_mm,daily_atmospheric_pression_mb,daily_avg_dew_point_temp_c,max_daily_temp_maxima_diaria_c,daily_avg_temp_c,daily_min_temp_c,daily_avg_relative_air_humidity_percent,daily_min_relative_air_humidity_percent,max_gust_wind_ms,avg_wind_velocity_ms,empty,municipality_name,sub_region_1
0,A601,2020-03-13,0.0,1011.9125,20.683332,32.7,25.975,21.2,74.416664,46.0,10.5,1.8625,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO
1,A601,2020-03-14,0.0,1012.37085,21.8,32.9,26.379168,22.3,77.416664,47.0,8.5,1.141667,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO
2,A601,2020-03-15,0.0,1010.29584,21.245832,35.0,27.829166,22.5,70.416664,37.0,9.8,1.933333,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO
3,A601,2020-03-16,0.0,1008.62085,20.470833,34.3,28.270832,24.4,63.75,42.0,10.8,2.379167,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO
4,A601,2020-03-17,0.0,1009.9708,21.75,31.6,25.9875,23.1,78.041664,54.0,10.5,1.795833,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO


In [53]:
inmet = inmet.join(ibge.select('UF', 'sub_region_1'), 'sub_region_1', 'left')
inmet = inmet.withColumn('UF', F.col('UF').cast('string'))

In [54]:
inmet = inmet.na.drop(subset=['sub_region_1', 'UF'])

In [55]:
inmet = inmet.withColumnRenamed('measurement_date', 'date')

In [56]:
# joining using the actual date
epi_weeks_actual = epi_weeks.select(['date', 'epi_week', 'epi_year', 'epi_week_year'])
inmet = inmet.join(epi_weeks_actual, 'date', 'left')

In [57]:
# joining using date from 7 days before
epi_weeks_7 = epi_weeks.select(['date_7_days_early', 'epi_week_7_days_early', 'epi_week_year_7_days_early'])
inmet = inmet.join(epi_weeks_7, F.date_add(inmet.date, 7) == epi_weeks.date_7_days_early, 'left')

In [58]:
# joining using date from 14 days before
epi_weeks_14 = epi_weeks.select(['date_14_days_early', 'epi_week_14_days_early', 'epi_week_year_14_early'])
inmet = inmet.join(epi_weeks_14, F.date_add(inmet.date, 14) == epi_weeks.date_14_days_early, 'left')

In [59]:
inmet1 = inmet.groupBy(['UF', 'epi_week_year']).agg({"total_daily_precipitation_mm":"avg",
                                          "daily_avg_temp_c":"avg",
                                          "daily_avg_relative_air_humidity_percent":"avg"})\
                                    .withColumnRenamed('avg(total_daily_precipitation_mm)', 'inmet_daily_precipt_avg')\
                                    .withColumnRenamed('avg(daily_avg_temp_c)', 'inmet_temp_c_avg')\
                                    .withColumnRenamed('avg(daily_avg_relative_air_humidity_percent)', 'inmet_relative_air_humidity_avg')\
                                    .orderBy('UF')

In [60]:
inmet7 = inmet.groupBy(['UF', 'epi_week_year_7_days_early']).agg({"total_daily_precipitation_mm":"avg",
                                          "daily_avg_temp_c":"avg",
                                          "daily_avg_relative_air_humidity_percent":"avg"})\
                                    .withColumnRenamed('avg(total_daily_precipitation_mm)', 'inmet_daily_precipt_1week_before_avg')\
                                    .withColumnRenamed('avg(daily_avg_temp_c)', 'inmet_temp_c_1week_before_avg')\
                                    .withColumnRenamed('avg(daily_avg_relative_air_humidity_percent)', 'inmet_relative_air_humidity_1week_before_avg')\
                                    .orderBy('UF')

In [61]:
inmet14 = inmet.groupBy(['UF', 'epi_week_year_14_early']).agg({"total_daily_precipitation_mm":"avg",
                                          "daily_avg_temp_c":"avg",
                                          "daily_avg_relative_air_humidity_percent":"avg"})\
                                    .withColumnRenamed('avg(total_daily_precipitation_mm)', 'inmet_daily_precipt_2weeks_before_avg')\
                                    .withColumnRenamed('avg(daily_avg_temp_c)', 'inmet_temp_c_2weeks_before_avg')\
                                    .withColumnRenamed('avg(daily_avg_relative_air_humidity_percent)', 'inmet_relative_air_humidity_2weeks_before_avg')\
                                    .orderBy('UF')

##### some validations

In [62]:
inmet1.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg
0,11,22-2020,21.964815,70.026316,0.1
1,11,23-2020,24.943452,73.357142,0.157143
2,11,24-2020,25.924702,70.943453,0.157143
3,11,25-2020,24.491667,71.193453,1.257143


In [63]:
inmet7.filter((F.col('UF')=='11') &  ((F.col('epi_week_year_7_days_early') == '22-2020') | (F.col('epi_week_year_7_days_early') == '23-2020') | (F.col('epi_week_year_7_days_early') == '24-2020') | (F.col('epi_week_year_7_days_early') == '25-2020')))\
    .orderBy('epi_week_year_7_days_early').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year_7_days_early,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg
0,11,22-2020,25.405357,77.420634,0.571429
1,11,23-2020,21.964815,70.026316,0.1
2,11,24-2020,24.943452,73.357142,0.157143
3,11,25-2020,25.924702,70.943453,0.157143


In [64]:
inmet14.filter((F.col('UF')=='11') &  ((F.col('epi_week_year_14_early') == '22-2020') | (F.col('epi_week_year_14_early') == '23-2020') | (F.col('epi_week_year_14_early') == '24-2020') | (F.col('epi_week_year_14_early') == '25-2020')))\
    .orderBy('epi_week_year_14_early').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year_14_early,inmet_temp_c_2weeks_before_avg,inmet_relative_air_humidity_2weeks_before_avg,inmet_daily_precipt_2weeks_before_avg
0,11,22-2020,25.143981,80.949696,6.033333
1,11,23-2020,25.405357,77.420634,0.571429
2,11,24-2020,21.964815,70.026316,0.1
3,11,25-2020,24.943452,73.357142,0.157143


<hr />

In [65]:
# joining all for 1 week early
inmet7 = inmet7.withColumnRenamed('UF', 'UF7')
cond = [inmet1.UF == inmet7.UF7, inmet1.epi_week_year == inmet7.epi_week_year_7_days_early]
inmet_agg = inmet1.join(inmet7, cond, 'left')

inmet_agg.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg,UF7,epi_week_year_7_days_early,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg
0,11,22-2020,21.964815,70.026316,0.1,11,22-2020,25.405357,77.420634,0.571429
1,11,23-2020,24.943452,73.357142,0.157143,11,23-2020,21.964815,70.026316,0.1
2,11,24-2020,25.924702,70.943453,0.157143,11,24-2020,24.943452,73.357142,0.157143
3,11,25-2020,24.491667,71.193453,1.257143,11,25-2020,25.924702,70.943453,0.157143


In [66]:
# joining all for 2 weeks early
inmet14 = inmet14.withColumnRenamed('UF', 'UF14')
cond = [inmet1.UF == inmet14.UF14, inmet1.epi_week_year == inmet14.epi_week_year_14_early]
inmet_agg = inmet_agg.join(inmet14, cond, 'left')

inmet_agg.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg,UF7,epi_week_year_7_days_early,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg,UF14,epi_week_year_14_early,inmet_temp_c_2weeks_before_avg,inmet_relative_air_humidity_2weeks_before_avg,inmet_daily_precipt_2weeks_before_avg
0,11,22-2020,21.964815,70.026316,0.1,11,22-2020,25.405357,77.420634,0.571429,11,22-2020,25.143981,80.949696,6.033333
1,11,23-2020,24.943452,73.357142,0.157143,11,23-2020,21.964815,70.026316,0.1,11,23-2020,25.405357,77.420634,0.571429
2,11,24-2020,25.924702,70.943453,0.157143,11,24-2020,24.943452,73.357142,0.157143,11,24-2020,21.964815,70.026316,0.1
3,11,25-2020,24.491667,71.193453,1.257143,11,25-2020,25.924702,70.943453,0.157143,11,25-2020,24.943452,73.357142,0.157143


<hr />

In [67]:
# selecting variables of interest
inmet_agg = inmet_agg.select(['UF', 'epi_week_year',
                          'inmet_temp_c_avg', 'inmet_relative_air_humidity_avg', 'inmet_daily_precipt_avg',
                          'inmet_temp_c_1week_before_avg', 'inmet_relative_air_humidity_1week_before_avg', 'inmet_daily_precipt_1week_before_avg',
                          'inmet_temp_c_2weeks_before_avg', 'inmet_relative_air_humidity_2weeks_before_avg', 'inmet_daily_precipt_2weeks_before_avg'])

In [68]:
# padding 'epi_week_year' col
inmet_agg = inmet_agg.withColumn('epi_week_year', F.lpad(F.col('epi_week_year'), 7, '0'))

<hr />

### Joining meteorological data for srags

#### 2019

In [69]:
print("How much distincts before? ", srag_2019.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks before? ", srag_2019.select('epi_week_year').distinct().count())

How much distincts before?  48554
How much distincts epi_weeks before?  53


In [70]:
srag_2019 = srag_2019.join(inmet_agg, ['UF', 'epi_week_year'], 'left')

In [71]:
print("How much distincts after? ", srag_2019.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks after? ", srag_2019.select('epi_week_year').distinct().count())

How much distincts after?  48554
How much distincts epi_weeks after?  53


#### 2020

In [72]:
print("How much distincts before? ", srag_2020.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks before? ", srag_2020.select('epi_week_year').distinct().count())

How much distincts before?  1193735
How much distincts epi_weeks before?  53


In [73]:
srag_2020 = srag_2020.join(inmet_agg, ['UF', 'epi_week_year'], 'left')

In [74]:
print("How much distincts after? ", srag_2020.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks after? ", srag_2020.select('epi_week_year').distinct().count())

How much distincts after?  1193735
How much distincts epi_weeks after?  53


#### 2021

In [75]:
print("How much distincts before? ", srag_2021.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks before? ", srag_2021.select('epi_week_year').distinct().count())

How much distincts before?  868367
How much distincts epi_weeks before?  21


In [76]:
srag_2021 = srag_2021.join(inmet_agg, ['UF', 'epi_week_year'], 'left')

In [77]:
print("How much distincts after? ", srag_2021.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks after? ", srag_2021.select('epi_week_year').distinct().count())

How much distincts after?  868367
How much distincts epi_weeks after?  21


<hr />

#### writing the last temporary version of srags

In [79]:
srag_2019.write.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2019_v4_super-srag/', mode='overwrite')

In [80]:
srag_2020.write.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2020_v4_super-srag', mode='overwrite')

In [None]:
srag_2021.write.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2021_v4_super-srag', mode='overwrite')

<hr />

In [11]:
# # reading temporary files
# srag_2019 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2019_v4_super-srag/')
# srag_2020 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2020_v4_super-srag')
# srag_2021 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2021_v4_super-srag')

In [83]:
srag_2019 = srag_2019.withColumn('AGE_GROUP', F.when(F.col('AGE_AT_NOTIF') < 1, 1)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 1) & (F.col('AGE_AT_NOTIF') <= 5), 2)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 6) & (F.col('AGE_AT_NOTIF') <= 19), 3)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 20) & (F.col('AGE_AT_NOTIF') <= 29), 4)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 30) & (F.col('AGE_AT_NOTIF') <= 39), 5)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 40) & (F.col('AGE_AT_NOTIF') <= 49), 6)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 50) & (F.col('AGE_AT_NOTIF') <= 59), 7)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 60) & (F.col('AGE_AT_NOTIF') <= 69), 8)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 70) & (F.col('AGE_AT_NOTIF') <= 79), 9)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 80) & (F.col('AGE_AT_NOTIF') <= 89), 10)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 90), 11)\
                                                    .otherwise(12))

In [84]:
srag_2020 = srag_2020.withColumn('AGE_GROUP', F.when(F.col('AGE_AT_NOTIF') < 1, 1)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 1) & (F.col('AGE_AT_NOTIF') <= 5), 2)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 6) & (F.col('AGE_AT_NOTIF') <= 19), 3)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 20) & (F.col('AGE_AT_NOTIF') <= 29), 4)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 30) & (F.col('AGE_AT_NOTIF') <= 39), 5)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 40) & (F.col('AGE_AT_NOTIF') <= 49), 6)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 50) & (F.col('AGE_AT_NOTIF') <= 59), 7)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 60) & (F.col('AGE_AT_NOTIF') <= 69), 8)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 70) & (F.col('AGE_AT_NOTIF') <= 79), 9)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 80) & (F.col('AGE_AT_NOTIF') <= 89), 10)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 90), 11)\
                                                    .otherwise(12))

In [85]:
srag_2021 = srag_2021.withColumn('AGE_GROUP', F.when(F.col('AGE_AT_NOTIF') < 1, 1)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 1) & (F.col('AGE_AT_NOTIF') <= 5), 2)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 6) & (F.col('AGE_AT_NOTIF') <= 19), 3)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 20) & (F.col('AGE_AT_NOTIF') <= 29), 4)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 30) & (F.col('AGE_AT_NOTIF') <= 39), 5)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 40) & (F.col('AGE_AT_NOTIF') <= 49), 6)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 50) & (F.col('AGE_AT_NOTIF') <= 59), 7)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 60) & (F.col('AGE_AT_NOTIF') <= 69), 8)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 70) & (F.col('AGE_AT_NOTIF') <= 79), 9)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 80) & (F.col('AGE_AT_NOTIF') <= 89), 10)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 90), 11)\
                                                    .otherwise(12))

In [86]:
# just for check the header later
srag_2019.limit(1).toPandas()

Unnamed: 0,UF,epi_week_year,NU_NOTIFIC,DT_NOTIFIC,SEM_NOT,DT_SIN_PRI,SEM_PRI,SG_UF_NOT,ID_REGIONA,CO_REGIONA,ID_MUNICIP,CO_MUN_NOT,ID_UNIDADE,CO_UNI_NOT,CS_SEXO,DT_NASC,NU_IDADE_N,TP_IDADE,COD_IDADE,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,ID_PAIS,CO_PAIS,SG_UF,CO_RG_RESI,ID_MN_RESI,CO_MUN_RES,CS_ZONA,SURTO_SG,NOSOCOMIAL,AVE_SUINO,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,OUTRO_SIN,OUTRO_DES,PUERPERA,CARDIOPATI,HEMATOLOGI,SIND_DOWN,HEPATICA,ASMA,DIABETES,...,DIST_PRI_NOTIFIC_q,DIST_PRI_INTERNA_q,DIST_PRI_ENTUTI_q,DIST_PRI_SAIDUTI_q,DIST_PRI_EVOLUCA_q,DIST_PRI_ENCERRA_q,DIST_PRI_RAIOX_q,DIST_PRI_TOMO_q,DIST_PRI_COLETA_q,DIST_PRI_SOR_q,DIST_PRI_PCR_q,DIST_PRI_TRA_q,DIST_PRI_IF_q,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,ANO,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg,inmet_temp_c_2weeks_before_avg,inmet_relative_air_humidity_2weeks_before_avg,inmet_daily_precipt_2weeks_before_avg,AGE_GROUP
0,13,07-2019,2577,2019-02-20,8,2019-02-14,7,AM,ENTORNO DE MANAUS E RIO NEGRO,5584,MANAUS,130260,UPA 24HS JOSE RODRIGUES,9634738,F,1910-11-21,108,3,3108,5,4,,0,BRASIL,1,AM,5584,MANAUS,130260,1,1,2,2,1,1,2,1,1,1,2,2,2,,2,2,2,2,2,2,2,...,3,1,6,6,2,3,6,6,6,6,6,6,6,4,8,1,9,1,1,1,1,2019,,,,,,,,,,,,,,,,,,,26.103136,86.341099,15.366667,26.241688,85.106318,7.678571,26.6344,83.187601,7.942169,11


In [87]:
# just for check the header later
srag_2020.limit(1).toPandas()

Unnamed: 0,UF,epi_week_year,NU_NOTIFIC,DT_NOTIFIC,SEM_NOT,DT_SIN_PRI,SEM_PRI,SG_UF_NOT,ID_REGIONA,CO_REGIONA,ID_MUNICIP,CO_MUN_NOT,ID_UNIDADE,CO_UNI_NOT,CS_SEXO,DT_NASC,NU_IDADE_N,TP_IDADE,COD_IDADE,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,ID_PAIS,CO_PAIS,SG_UF,ID_RG_RESI,CO_RG_RESI,ID_MN_RESI,CO_MUN_RES,CS_ZONA,SURTO_SG,NOSOCOMIAL,AVE_SUINO,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,OUTRO_SIN,OUTRO_DES,PUERPERA,FATOR_RISC,CARDIOPATI,HEMATOLOGI,SIND_DOWN,HEPATICA,...,DIST_PRI_NOTIFIC_q,DIST_PRI_INTERNA_q,DIST_PRI_ENTUTI_q,DIST_PRI_SAIDUTI_q,DIST_PRI_EVOLUCA_q,DIST_PRI_ENCERRA_q,DIST_PRI_RAIOX_q,DIST_PRI_TOMO_q,DIST_PRI_COLETA_q,DIST_PRI_SOR_q,DIST_PRI_PCR_q,DIST_PRI_TRA_q,DIST_PRI_IF_q,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,ANO,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg,inmet_temp_c_2weeks_before_avg,inmet_relative_air_humidity_2weeks_before_avg,inmet_daily_precipt_2weeks_before_avg,AGE_GROUP
0,13,11-2020,25769820812,2020-03-27,13,2020-03-13,11,AM,ENTORNO DE MANAUS E RIO NEGRO,5584,MANAUS,130260,HOSPITAL PRONTO SOCORRO 28 DE AGOSTO,2013649,F,1979-02-27,41,3,3041,5,4,,,BRASIL,1,AM,RIO NEGRO E SOLIMOES,5588,COARI,130120,1,1,2,2,1,1,1,2,1,2,,,,,2,S,2,2,2,2,...,5,5,6,6,3,3,5,6,5,6,5,6,6,8,3,1,1,1,1,5,5,2020,0.363636,1.75,-2.172414,19.591837,-0.588235,-15.071429,-0.583333,8.52,2.62069,19.94,-0.722222,-18.148148,-9.1,-3.041667,3.5,-8.810345,3.272727,-14.44,27.860686,78.611595,2.526531,27.182616,81.819703,9.604082,26.866808,82.893669,5.281633,6


In [88]:
# just for check the header later
srag_2021.limit(1).toPandas()

Unnamed: 0,UF,epi_week_year,NU_NOTIFIC,DT_NOTIFIC,SEM_NOT,DT_SIN_PRI,SEM_PRI,SG_UF_NOT,ID_REGIONA,CO_REGIONA,ID_MUNICIP,CO_MUN_NOT,ID_UNIDADE,CO_UNI_NOT,CS_SEXO,DT_NASC,NU_IDADE_N,TP_IDADE,COD_IDADE,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,ID_PAIS,CO_PAIS,SG_UF,ID_RG_RESI,CO_RG_RESI,ID_MN_RESI,CO_MUN_RES,CS_ZONA,SURTO_SG,NOSOCOMIAL,AVE_SUINO,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,OUTRO_SIN,OUTRO_DES,PUERPERA,FATOR_RISC,CARDIOPATI,HEMATOLOGI,SIND_DOWN,HEPATICA,...,DIST_PRI_NOTIFIC_q,DIST_PRI_INTERNA_q,DIST_PRI_ENTUTI_q,DIST_PRI_SAIDUTI_q,DIST_PRI_EVOLUCA_q,DIST_PRI_ENCERRA_q,DIST_PRI_RAIOX_q,DIST_PRI_TOMO_q,DIST_PRI_COLETA_q,DIST_PRI_SOR_q,DIST_PRI_PCR_q,DIST_PRI_TRA_q,DIST_PRI_IF_q,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,ANO,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg,inmet_temp_c_2weeks_before_avg,inmet_relative_air_humidity_2weeks_before_avg,inmet_daily_precipt_2weeks_before_avg,AGE_GROUP
0,29,01-2021,68719675035,2021-01-11,2,2021-01-05,1,BA,NUCLEO REGIONAL DE SAUDE LESTE,1380,SALVADOR,292740,HOSPITAL DA BAHIA,3827992,F,1952-04-19,68,3,3068,5,9,,9,BRASIL,1,BA,NUCLEO REGIONAL DE SAUDE LESTE,1380,SALVADOR,292740,1,2,2,9,2,2,2,2,2,2,2,1,1,CONFUSAO MENTAL,2,S,1,2,2,2,...,3,2,2,3,4,1,6,6,3,6,3,6,6,1,1,3,1,9,9,9,9,2021,-5.063218,25.785714,-16.11828,0.560534,7.260989,-11.461095,-13.384615,26.571429,-19.401434,-12.686145,7.114846,3.63141,-14.966667,30.004695,-16.609155,-7.159544,6.598886,-16.958824,26.274545,62.914755,0.502703,25.558466,62.840933,0.676667,25.456032,68.208723,1.274419,8


# The SUPER SRAG at last

In [89]:
super_srag_cols = ['NU_NOTIFIC', 'CS_SEXO', 'DT_NASC', 'AGE_AT_NOTIF', 'AGE_GROUP', 'CS_GESTANT', 'CS_RACA', 'CS_ETINIA', 'CS_ESCOL_N', 'SG_UF', 'CO_MUN_RES',
                   'SEM_PRI', 'SEM_NOT', 'DIST_PRI_NOTIFIC', 'DT_SIN_PRI', 'SG_UF_NOT', 'CO_MUN_NOT', 'SURTO_SG', 'NOSOCOMIAL', 'AVE_SUINO', 'VACINA', 'HOSPITAL', 'DIST_PRI_INTERNA', 'SUPORT_VEN', 'UTI', 'DIST_PRI_ENTUTI', 'CLASSI_OUT', 'CRITERIO', 'EVOLUCAO', 'CLASSI_FIN',
                   'SYMP_GROUP1', 'SYMP_GROUP2', 'SYMP_GROUP3', 'SYMP_GROUP4', 'OUTRO_SIN', 'OUTRO_DES',
                   'RF_GROUP1', 'RF_GROUP2', 'RF_GROUP3', 'RF_GROUP4', 'OBES_IMC', 'OUT_MORBI', 'MORB_DESC',
                   'RAIOX_RES', 'DIST_PRI_RAIOX', 'TOMO_RES', 'DIST_PRI_TOMO', 'AMOSTRA', 'TP_AMOSTRA', 'DT_COLETA', 'DIST_PRI_COLETA', 'PP_IF_RESUL', 'PP_TRA_RESUL', 'DIST_PRI_TRA', 'PP_PCR_RESUL', 'DIST_PRI_PCR', 'PP_RES_SOR_IGA', 'PP_RES_SOR_IGM', 'PP_RES_SOR_IGG', 'DIST_PRI_SOR', 'DIST_PRI_IF',
                   'DIST_PRI_NOTIFIC_q', 'DIST_PRI_INTERNA_q', 'DIST_PRI_ENTUTI_q', 'DIST_PRI_SAIDUTI_q', 'DIST_PRI_EVOLUCA_q', 'DIST_PRI_ENCERRA_q', 'DIST_PRI_RAIOX_q', 'DIST_PRI_TOMO_q', 'DIST_PRI_COLETA_q', 'DIST_PRI_SOR_q', 'DIST_PRI_PCR_q', 'DIST_PRI_TRA_q', 'DIST_PRI_IF_q',
                   'epi_week_year', 'gmr_transit_stations_avg', 'gmr_grocery_and_pharmacy_avg', 'gmr_retail_and_recreation_avg', 'gmr_workplaces_percent_avg', 'gmr_residential_percent_avg', 'gmr_parks_percent_avg',
                   'gmr_transit_stations_1week_before_avg', 'gmr_grocery_and_pharmacy_1week_before_avg', 'gmr_retail_and_recreation_1week_before_avg', 'gmr_workplaces_percent_1week_before_avg', 'gmr_residential_percent_1week_before_avg',
                   'gmr_parks_percent_1week_before_avg', 'gmr_transit_stations_2weeks_avg', 'gmr_grocery_and_pharmacy_2weeks_avg', 'gmr_retail_and_recreation_2weeks_avg', 'gmr_workplaces_percent_2weeks_avg',
                   'gmr_residential_percent_2weeks_avg', 'gmr_parks_percent_2weeks_avg',
                   'inmet_temp_c_avg', 'inmet_relative_air_humidity_avg', 'inmet_daily_precipt_avg', 'inmet_temp_c_1week_before_avg', 'inmet_relative_air_humidity_1week_before_avg', 
                   'inmet_daily_precipt_1week_before_avg', 'inmet_temp_c_2weeks_before_avg', 'inmet_relative_air_humidity_2weeks_before_avg', 'inmet_daily_precipt_2weeks_before_avg']

In [90]:
srag_2019.printSchema()

root
 |-- UF: string (nullable = true)
 |-- epi_week_year: string (nullable = true)
 |-- NU_NOTIFIC: long (nullable = true)
 |-- DT_NOTIFIC: date (nullable = true)
 |-- SEM_NOT: string (nullable = true)
 |-- DT_SIN_PRI: date (nullable = true)
 |-- SEM_PRI: string (nullable = true)
 |-- SG_UF_NOT: string (nullable = true)
 |-- ID_REGIONA: string (nullable = true)
 |-- CO_REGIONA: string (nullable = true)
 |-- ID_MUNICIP: string (nullable = true)
 |-- CO_MUN_NOT: string (nullable = true)
 |-- ID_UNIDADE: string (nullable = true)
 |-- CO_UNI_NOT: string (nullable = true)
 |-- CS_SEXO: string (nullable = true)
 |-- DT_NASC: date (nullable = true)
 |-- NU_IDADE_N: integer (nullable = true)
 |-- TP_IDADE: integer (nullable = true)
 |-- COD_IDADE: integer (nullable = true)
 |-- CS_GESTANT: integer (nullable = true)
 |-- CS_RACA: integer (nullable = true)
 |-- CS_ETINIA: string (nullable = true)
 |-- CS_ESCOL_N: integer (nullable = true)
 |-- ID_PAIS: string (nullable = true)
 |-- CO_PAIS: int

In [91]:
srag_2020.printSchema()

root
 |-- UF: string (nullable = true)
 |-- epi_week_year: string (nullable = true)
 |-- NU_NOTIFIC: long (nullable = true)
 |-- DT_NOTIFIC: date (nullable = true)
 |-- SEM_NOT: integer (nullable = true)
 |-- DT_SIN_PRI: date (nullable = true)
 |-- SEM_PRI: integer (nullable = true)
 |-- SG_UF_NOT: string (nullable = true)
 |-- ID_REGIONA: string (nullable = true)
 |-- CO_REGIONA: integer (nullable = true)
 |-- ID_MUNICIP: string (nullable = true)
 |-- CO_MUN_NOT: integer (nullable = true)
 |-- ID_UNIDADE: string (nullable = true)
 |-- CO_UNI_NOT: integer (nullable = true)
 |-- CS_SEXO: string (nullable = true)
 |-- DT_NASC: date (nullable = true)
 |-- NU_IDADE_N: integer (nullable = true)
 |-- TP_IDADE: integer (nullable = true)
 |-- COD_IDADE: string (nullable = true)
 |-- CS_GESTANT: integer (nullable = true)
 |-- CS_RACA: integer (nullable = true)
 |-- CS_ETINIA: string (nullable = true)
 |-- CS_ESCOL_N: integer (nullable = true)
 |-- ID_PAIS: string (nullable = true)
 |-- CO_PAIS:

In [92]:
# adding lost columns 
srag_2019 = srag_2019.withColumn('TOMO_RES', F.lit(None).cast('string'))

In [93]:
srag_2019 = srag_2019.select(super_srag_cols)
srag_2020 = srag_2020.select(super_srag_cols)
srag_2021 = srag_2021.select(super_srag_cols)

In [94]:
super_srag = srag_2019.union(srag_2020).union(srag_2021)

In [95]:
# srag 2019 has 48554 records
# srag 2020 has 1193735 records
# srag 2021 has 868367 records
# super srag must contain (48554 + 1193735 + 868367 =) 2110656 records
print('super srag has', super_srag.count(), 'records')

super srag has 2110656 records


In [96]:
# now the are united, lets create the last attributes
suffix = 'avg'
gmr_inmet_cols = {'gmr_transit_stations_': [],
                  'gmr_grocery_and_pharmacy_': [], 
                  'gmr_retail_and_recreation_': [], 
                  'gmr_workplaces_percent_': [], 
                  'gmr_residential_percent_': [], 
                  'gmr_parks_percent_': [], 
                  'gmr_transit_stations_1week_before_': [], 
                  'gmr_grocery_and_pharmacy_1week_before_': [], 
                  'gmr_retail_and_recreation_1week_before_': [], 
                  'gmr_workplaces_percent_1week_before_': [], 
                  'gmr_residential_percent_1week_before_': [], 
                  'gmr_parks_percent_1week_before_': [], 
                  'gmr_transit_stations_2weeks_': [], 
                  'gmr_grocery_and_pharmacy_2weeks_': [], 
                  'gmr_retail_and_recreation_2weeks_': [], 
                  'gmr_workplaces_percent_2weeks_': [], 
                  'gmr_residential_percent_2weeks_': [], 
                  'gmr_parks_percent_2weeks_': [], 
                  'inmet_temp_c_': [], 
                  'inmet_relative_air_humidity_': [], 
                  'inmet_daily_precipt_': [], 
                  'inmet_temp_c_1week_before_': [], 
                  'inmet_relative_air_humidity_1week_before_' : [],
                  'inmet_daily_precipt_1week_before_': [], 
                  'inmet_temp_c_2weeks_before_': [], 
                  'inmet_relative_air_humidity_2weeks_before_': [], 
                  'inmet_daily_precipt_2weeks_before_': []
                 }

In [None]:
# generating quintiles 
for col in list(gmr_inmet_cols.keys()):
    quint_cut = super_srag.select(col+suffix).approxQuantile(col+suffix, [0.2, 0.4, 0.6, 0.8], 0)
    gmr_inmet_cols[col].append(quint_cut)

In [None]:
# Showing the quintiles cuttofs 
for col in pd.DataFrame(gmr_inmet_cols).columns:
    print(col, list(pd.DataFrame(gmr_inmet_cols)[col]))

gmr_transit_stations_ [[-43.09183673469388, -32.61737331954498, -25.20892494929006, -17.24561403508772]]
gmr_grocery_and_pharmacy_ [[0.36936936936936937, 8.107558139534884, 13.838709677419354, 19.86090775988287]]
gmr_retail_and_recreation_ [[-42.607894736842105, -31.163636363636364, -22.735064935064933, -15.647230320699709]]
gmr_workplaces_percent_ [[-15.347786811201445, -7.407114624505929, -4.023725391216558, 0.5605338417540515]]
gmr_residential_percent_ [[5.825, 7.780269058295964, 9.963333333333333, 12.902788844621513]]
gmr_parks_percent_ [[-45.52965235173824, -36.97651663405088, -30.29383886255924, -21.115384615384617]]
gmr_transit_stations_1week_before_ [[-44.09894736842105, -32.61737331954498, -25.263959390862944, -17.85185185185185]]
gmr_grocery_and_pharmacy_1week_before_ [[-0.16783216783216784, 7.644859813084112, 13.094644167278062, 18.64406779661017]]
gmr_retail_and_recreation_1week_before_ [[-42.81818181818182, -31.208955223880597, -22.889212827988338, -15.928205128205128]]
gm

In [None]:
# Showing the quintiles cuttofs
pd.DataFrame(gmr_inmet_cols)

Unnamed: 0,gmr_transit_stations_,gmr_grocery_and_pharmacy_,gmr_retail_and_recreation_,gmr_workplaces_percent_,gmr_residential_percent_,gmr_parks_percent_,gmr_transit_stations_1week_before_,gmr_grocery_and_pharmacy_1week_before_,gmr_retail_and_recreation_1week_before_,gmr_workplaces_percent_1week_before_,gmr_residential_percent_1week_before_,gmr_parks_percent_1week_before_,gmr_transit_stations_2weeks_,gmr_grocery_and_pharmacy_2weeks_,gmr_retail_and_recreation_2weeks_,gmr_workplaces_percent_2weeks_,gmr_residential_percent_2weeks_,gmr_parks_percent_2weeks_,inmet_temp_c_,inmet_relative_air_humidity_,inmet_daily_precipt_,inmet_temp_c_1week_before_,inmet_relative_air_humidity_1week_before_,inmet_daily_precipt_1week_before_,inmet_temp_c_2weeks_before_,inmet_relative_air_humidity_2weeks_before_,inmet_daily_precipt_2weeks_before_
0,"[-43.09183673469388, -32.61737331954498, -25.2...","[0.36936936936936937, 8.107558139534884, 13.83...","[-42.607894736842105, -31.163636363636364, -22...","[-15.347786811201445, -7.407114624505929, -4.0...","[5.825, 7.780269058295964, 9.963333333333333, ...","[-45.52965235173824, -36.97651663405088, -30.2...","[-44.09894736842105, -32.61737331954498, -25.2...","[-0.16783216783216784, 7.644859813084112, 13.0...","[-42.81818181818182, -31.208955223880597, -22....","[-15.347786811201445, -7.351599852887091, -3.9...","[5.746001279590531, 7.714285714285714, 9.98084...","[-45.42248062015504, -36.97651663405088, -30.6...","[-44.16945606694561, -32.86206896551724, -25.2...","[-0.5689655172413793, 7.173590504451038, 11.92...","[-43.526233359436176, -30.847341337907377, -22...","[-17.4364406779661, -7.7555555555555555, -4.39...","[5.692477876106195, 7.639135959339263, 10.0180...","[-45.38961038961039, -36.54710144927536, -29.9...","[19.99995412088011, 22.3166005555556, 23.72160...","[65.59553142063525, 69.93693714285736, 73.8520...","[0.3383720930232559, 1.4355555555555408, 3.355...","[20.05500949193533, 22.322388241935233, 23.761...","[65.65009503030261, 70.04159629710138, 73.8870...","[0.3585585585585601, 1.5249999999999975, 3.567...","[20.174016811110565, 22.320047571428823, 23.82...","[65.91730179032191, 70.25798429936523, 73.8870...","[0.3714285714285674, 1.6878787878788217, 3.640..."


In [None]:
n_suffix = 'q'
for col in list(gmr_inmet_cols.keys()):
    super_srag = super_srag.withColumn(col+n_suffix, F.when(F.col(col+suffix) <= gmr_inmet_cols[col][0][0], '1')\
                                                                  .when((F.col(col+suffix) > gmr_inmet_cols[col][0][0]) &  (F.col(col+suffix) <= gmr_inmet_cols[col][0][1]), '2')\
                                                                  .when((F.col(col+suffix) > gmr_inmet_cols[col][0][1]) &  (F.col(col+suffix) <= gmr_inmet_cols[col][0][2]), '3')\
                                                                  .when((F.col(col+suffix) > gmr_inmet_cols[col][0][2]) &  (F.col(col+suffix) <= gmr_inmet_cols[col][0][3]), '4')\
                                                                  .when(F.col(col+suffix) > gmr_inmet_cols[col][0][3], '5')\
                                                                  .otherwise('6'))

In [None]:
# making all column names uppercase
for col in super_srag.columns:
    super_srag = super_srag.withColumnRenamed(col, col.upper())

#### writing super srag

In [None]:
super_srag.write.parquet('gs://ai-covid19-datalake/standard/super-srag/super_srag_v1.parquet')

In [None]:
super_srag.coalesce(1).write.csv('gs://ai-covid19-datalake/standard/super-srag/super_srag_v1.csv', header=True)