In [81]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [82]:
pd.set_option("display.max_rows", 101)
pd.set_option("display.max_columns", 101)

<hr />

### reading preprocessed dataframes

In [3]:
srag_2021 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2021_v3_new_attr-UPDATE-28-11-2021')

In [4]:
gmr = spark.read.csv('gs://ai-covid19-datalake/standard/google-mobility/pp_google-mobility_report-28-11-2021.csv', header=True, inferSchema=True)

In [5]:
ibge = spark.read.csv('gs://ai-covid19-datalake/standard/ibge-data/pp_ibge-municipality-to-code-28-11-2021.csv', header=True, inferSchema=True)

In [6]:
inmet = spark.read.csv('gs://ai-covid19-datalake/standard/inmet-data/pp_inmet_meteorological_data-25052021-a-28112021', header=True, inferSchema=True)

In [7]:
epi_weeks = spark.read.csv('gs://ai-covid19-datalake/raw/epidemiological-data/epi_weeks.csv', header=True, inferSchema=True)

<hr />

# Preparing srag to receive the new information from UF's and Epidemiological Weeks

In [8]:
# # dropping duplicate columns
# def dropDupeDfCols(df):
#     newcols = []
#     dupcols = []

#     for i in range(len(df.columns)):
#         if df.columns[i] not in newcols:
#             newcols.append(df.columns[i])
#         else:
#             dupcols.append(i)

#     df = df.toDF(*[str(i) for i in range(len(df.columns))])
#     for dupcol in dupcols:
#         df = df.drop(str(dupcol))

#     return df.toDF(*newcols)

In [9]:
# srag_2019 = dropDupeDfCols(srag_2019)
# srag_2020 = dropDupeDfCols(srag_2020)
# srag_2021 = dropDupeDfCols(srag_2021)

In [10]:
srag_2021 = srag_2021.withColumn('ANO', F.lit('2021'))

In [11]:
def get_epi_week_year(epi_week, year):
    return str(epi_week) + '-' + str(year)
udf_get_epi_week_year = F.udf(get_epi_week_year, StringType())

In [12]:
# adding 'epi_week_year' tag
srag_2021 = srag_2021.withColumn('epi_week_year', udf_get_epi_week_year(F.col('SEM_PRI'), F.col('ANO')))

In [13]:
# padding
srag_2021 = srag_2021.withColumn('epi_week_year', F.lpad(F.col('epi_week_year'), 7, '0'))

In [14]:
def get_uf(cod_mun_res):
    return str(cod_mun_res)[:2]
udf_get_uf = F.udf(get_uf, StringType())

In [15]:
srag_2021 = srag_2021.withColumn('UF', udf_get_uf(F.col('CO_MUN_NOT')))

# Google Mobility Report with state codes by epi_weeks

###  adding epidemiological weeks

In [16]:
# converting date type variables
gmr = gmr.withColumn('date', F.to_date(F.col('date'), 'dd/MM/yyyy'))
epi_weeks = epi_weeks.withColumn('date', F.to_date(F.col('date'), 'dd/MM/yyyy'))

In [17]:
# find the date from 7 or 14 days before
epi_weeks = epi_weeks.withColumn('date_7_days_early', F.date_sub(F.col('date'), 7))
epi_weeks = epi_weeks.withColumn('date_14_days_early', F.date_sub(F.col('date'), 14))

In [18]:
# finding the epi_week number from 1 or 2 weeks before
def find_one_or_two_epi_weeks(n_weeks, col):
    return col - n_weeks
udf_find_one_or_two_epi_weeks = F.udf(find_one_or_two_epi_weeks, IntegerType())

In [19]:
epi_weeks = epi_weeks.withColumn('epi_week_7_days_early', udf_find_one_or_two_epi_weeks(F.lit(1), F.col('epi_week')))
epi_weeks = epi_weeks.withColumn('epi_week_14_days_early', udf_find_one_or_two_epi_weeks(F.lit(2), F.col('epi_week')))

In [20]:
# creating the epi_week tag for the new found early epi_weeks
def set_epi_week_year(col1, col2):
    string = str(col1) + '-' + str(col2)
    return string
udf_set_epi_week_year = F.udf(set_epi_week_year, StringType())

In [21]:
epi_weeks = epi_weeks.withColumn('epi_week_year_7_days_early', udf_set_epi_week_year(F.col('epi_week_7_days_early'), F.col('epi_year')))
epi_weeks = epi_weeks.withColumn('epi_week_year_14_early', udf_set_epi_week_year(F.col('epi_week_14_days_early'), F.col('epi_year')))

In [22]:
# make sure to erase inexistent epi_week
epi_weeks = epi_weeks.withColumn('epi_week_year_7_days_early', F.when((F.col('epi_week_7_days_early') < 1), None).otherwise(F.col('epi_week_year_7_days_early')))
epi_weeks = epi_weeks.withColumn('epi_week_7_days_early', F.when((F.col('epi_week_7_days_early') < 1), None).otherwise(F.col('epi_week_7_days_early')))

epi_weeks = epi_weeks.withColumn('epi_week_year_14_early', F.when((F.col('epi_week_14_days_early') < 1), None).otherwise(F.col('epi_week_year_14_early')))
epi_weeks = epi_weeks.withColumn('epi_week_14_days_early', F.when((F.col('epi_week_14_days_early') < 1), None).otherwise(F.col('epi_week_14_days_early')))

In [23]:
# joining using the actual date
epi_weeks_actual = epi_weeks.select(['date', 'epi_week', 'epi_year', 'epi_week_year'])
gmr = gmr.join(epi_weeks_actual, 'date', 'left')

In [24]:
# joining using date from 7 days before
epi_weeks_7 = epi_weeks.select(['date_7_days_early', 'epi_week_7_days_early', 'epi_week_year_7_days_early'])
gmr = gmr.join(epi_weeks_7, F.date_add(gmr.date, 7) == epi_weeks.date_7_days_early, 'left')

In [25]:
# joining using date from 14 days before
epi_weeks_14 = epi_weeks.select(['date_14_days_early', 'epi_week_14_days_early', 'epi_week_year_14_early'])
gmr = gmr.join(epi_weeks_14, F.date_add(gmr.date, 14) == epi_weeks.date_14_days_early, 'left')

In [26]:
epi_weeks.limit(10).toPandas()

Unnamed: 0,date,epi_week,epi_year,epi_week_year,date_7_days_early,date_14_days_early,epi_week_7_days_early,epi_week_14_days_early,epi_week_year_7_days_early,epi_week_year_14_early
0,2018-12-30,1,2019,1-2019,2018-12-23,2018-12-16,,,,
1,2018-12-31,1,2019,1-2019,2018-12-24,2018-12-17,,,,
2,2019-01-01,1,2019,1-2019,2018-12-25,2018-12-18,,,,
3,2019-01-02,1,2019,1-2019,2018-12-26,2018-12-19,,,,
4,2019-01-03,1,2019,1-2019,2018-12-27,2018-12-20,,,,
5,2019-01-04,1,2019,1-2019,2018-12-28,2018-12-21,,,,
6,2019-01-05,1,2019,1-2019,2018-12-29,2018-12-22,,,,
7,2019-01-06,2,2019,2-2019,2018-12-30,2018-12-23,1.0,,1-2019,
8,2019-01-07,2,2019,2-2019,2018-12-31,2018-12-24,1.0,,1-2019,
9,2019-01-08,2,2019,2-2019,2019-01-01,2018-12-25,1.0,,1-2019,


In [27]:
gmr.limit(5).toPandas()

Unnamed: 0,date,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,epi_week,epi_year,epi_week_year,date_7_days_early,epi_week_7_days_early,epi_week_year_7_days_early,date_14_days_early,epi_week_14_days_early,epi_week_year_14_early
0,2021-05-25,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,19,,21,2021,21-2021,2021-06-01,22,22-2021,2021-06-08,23,23-2021
1,2021-05-26,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,15,,21,2021,21-2021,2021-06-02,22,22-2021,2021-06-09,23,23-2021
2,2021-05-27,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,31,,21,2021,21-2021,2021-06-03,22,22-2021,2021-06-10,23,23-2021
3,2021-05-28,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,29,,21,2021,21-2021,2021-06-04,22,22-2021,2021-06-11,23,23-2021
4,2021-05-29,BR,Brazil,ACRE,BRASILEIA,,,,ChIJm4g9Bje3fpERnpwI2L71NRQ,,,,,19,,21,2021,21-2021,2021-06-05,22,22-2021,2021-06-12,23,23-2021


### adding state code

In [28]:
gmr = gmr.join(ibge.select('UF', 'sub_region_1'), 'sub_region_1', 'left')

### aggregating

In [29]:
gmr1 = gmr.groupBy(['UF', 'epi_week_year']).agg({"residential_percent_change_from_baseline":"avg",
                                          "workplaces_percent_change_from_baseline":"avg",
                                          "transit_stations_percent_change_from_baseline":"avg",
                                          "parks_percent_change_from_baseline":"avg",
                                          "grocery_and_pharmacy_percent_change_from_baseline":"avg",
                                          "retail_and_recreation_percent_change_from_baseline":"avg"})\
                                    .withColumnRenamed('avg(transit_stations_percent_change_from_baseline)', 'gmr_transit_stations_avg')\
                                    .withColumnRenamed('avg(grocery_and_pharmacy_percent_change_from_baseline)', 'gmr_grocery_and_pharmacy_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_avg')\
                                    .withColumnRenamed('avg(workplaces_percent_change_from_baseline)', 'gmr_workplaces_percent_avg')\
                                    .withColumnRenamed('avg(residential_percent_change_from_baseline)', 'gmr_residential_percent_avg')\
                                    .withColumnRenamed('avg(parks_percent_change_from_baseline)', 'gmr_parks_percent_avg')\
                                    .orderBy('UF')

In [30]:
gmr7 = gmr.groupBy(['UF', 'epi_week_year_7_days_early']).agg({"residential_percent_change_from_baseline":"avg",
                                          "workplaces_percent_change_from_baseline":"avg",
                                          "transit_stations_percent_change_from_baseline":"avg",
                                          "parks_percent_change_from_baseline":"avg",
                                          "grocery_and_pharmacy_percent_change_from_baseline":"avg",
                                          "retail_and_recreation_percent_change_from_baseline":"avg"})\
                                    .withColumnRenamed('avg(transit_stations_percent_change_from_baseline)', 'gmr_transit_stations_1week_before_avg')\
                                    .withColumnRenamed('avg(grocery_and_pharmacy_percent_change_from_baseline)', 'gmr_grocery_and_pharmacy_1week_before_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_1week_before_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_1week_before_avg')\
                                    .withColumnRenamed('avg(workplaces_percent_change_from_baseline)', 'gmr_workplaces_percent_1week_before_avg')\
                                    .withColumnRenamed('avg(residential_percent_change_from_baseline)', 'gmr_residential_percent_1week_before_avg')\
                                    .withColumnRenamed('avg(parks_percent_change_from_baseline)', 'gmr_parks_percent_1week_before_avg')\
                                    .orderBy('UF')

In [31]:
gmr14 = gmr.groupBy(['UF', 'epi_week_year_14_early']).agg({"residential_percent_change_from_baseline":"avg",
                                          "workplaces_percent_change_from_baseline":"avg",
                                          "transit_stations_percent_change_from_baseline":"avg",
                                          "parks_percent_change_from_baseline":"avg",
                                          "grocery_and_pharmacy_percent_change_from_baseline":"avg",
                                          "retail_and_recreation_percent_change_from_baseline":"avg"})\
                                    .withColumnRenamed('avg(transit_stations_percent_change_from_baseline)', 'gmr_transit_stations_2weeks_avg')\
                                    .withColumnRenamed('avg(grocery_and_pharmacy_percent_change_from_baseline)', 'gmr_grocery_and_pharmacy_2weeks_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_2weeks_avg')\
                                    .withColumnRenamed('avg(retail_and_recreation_percent_change_from_baseline)', 'gmr_retail_and_recreation_2weeks_avg')\
                                    .withColumnRenamed('avg(workplaces_percent_change_from_baseline)', 'gmr_workplaces_percent_2weeks_avg')\
                                    .withColumnRenamed('avg(residential_percent_change_from_baseline)', 'gmr_residential_percent_2weeks_avg')\
                                    .withColumnRenamed('avg(parks_percent_change_from_baseline)', 'gmr_parks_percent_2weeks_avg')\
                                    .orderBy('UF')

##### making some validations

In [32]:
gmr1.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg


In [33]:
gmr7.filter((F.col('UF')=='11') &  ((F.col('epi_week_year_7_days_early') == '22-2020') | (F.col('epi_week_year_7_days_early') == '23-2020') | (F.col('epi_week_year_7_days_early') == '24-2020') | (F.col('epi_week_year_7_days_early') == '25-2020')))\
    .orderBy('epi_week_year_7_days_early').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year_7_days_early,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg


In [34]:
gmr14.filter((F.col('UF')=='11') &  ((F.col('epi_week_year_14_early') == '22-2020') | (F.col('epi_week_year_14_early') == '23-2020') | (F.col('epi_week_year_14_early') == '24-2020') | (F.col('epi_week_year_14_early') == '25-2020')))\
    .orderBy('epi_week_year_14_early').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year_14_early,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg


In [35]:
# joining all for 1 week early
gmr7 = gmr7.withColumnRenamed('UF', 'UF7')
cond = [gmr1.UF == gmr7.UF7, gmr1.epi_week_year == gmr7.epi_week_year_7_days_early]
gmr_agg = gmr1.join(gmr7, cond, 'left')

gmr_agg.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,UF7,epi_week_year_7_days_early,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg


In [36]:
# joining all for 2 weeks early
gmr14 = gmr14.withColumnRenamed('UF', 'UF14')
cond = [gmr1.UF == gmr14.UF14, gmr1.epi_week_year == gmr14.epi_week_year_14_early]
gmr_agg = gmr_agg.join(gmr14, cond, 'left')

gmr_agg.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,UF7,epi_week_year_7_days_early,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg,UF14,epi_week_year_14_early,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg


In [37]:
# casting to string
gmr_agg = gmr_agg.withColumn('UF', F.col('UF').cast('string'))

In [38]:
# selecting variables of interest
gmr_agg = gmr_agg.select(['UF', 'epi_week_year',
                          'gmr_transit_stations_avg', 'gmr_grocery_and_pharmacy_avg', 'gmr_retail_and_recreation_avg', 'gmr_workplaces_percent_avg', 'gmr_residential_percent_avg', 'gmr_parks_percent_avg',
                          'gmr_transit_stations_1week_before_avg', 'gmr_grocery_and_pharmacy_1week_before_avg', 'gmr_retail_and_recreation_1week_before_avg', 'gmr_workplaces_percent_1week_before_avg', 'gmr_residential_percent_1week_before_avg', 'gmr_parks_percent_1week_before_avg',
                          'gmr_transit_stations_2weeks_avg', 'gmr_grocery_and_pharmacy_2weeks_avg', 'gmr_retail_and_recreation_2weeks_avg', 'gmr_workplaces_percent_2weeks_avg', 'gmr_residential_percent_2weeks_avg', 'gmr_parks_percent_2weeks_avg'])

In [39]:
# padding 'epi_week_year' col
gmr_agg = gmr_agg.withColumn('epi_week_year', F.lpad(F.col('epi_week_year'), 7, '0'))

In [40]:
gmr_agg.limit(10).toPandas()

Unnamed: 0,UF,epi_week_year,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg
0,28,26-2021,-16.909091,37.916667,-13.322581,-2.808989,7.859649,-27.980769,-20.391304,30.138889,-19.78125,-11.488889,7.859649,-29.659574,-17.782609,33.638889,-14.125,-0.147727,8.263158,-24.708333
1,22,30-2021,-6.115385,41.037037,-8.285714,7.878613,7.263158,6.380952,-4.807692,39.464286,-8.114286,5.810345,7.333333,10.473684,-8.692308,42.241379,-9.117647,5.5,7.315789,5.75
2,23,40-2021,24.114754,61.022901,10.964706,14.342508,2.008621,-18.592105,14.490566,53.972973,5.910345,14.903339,2.875,-19.489362,14.629032,48.706767,5.914286,12.664122,3.039648,-19.595652
3,15,32-2021,6.714286,78.9375,36.427419,26.889151,3.772727,13.829457,8.694118,84.481013,40.451613,25.551887,3.318182,25.696,4.548387,70.620253,32.096774,18.591224,4.428571,28.702479
4,25,38-2021,-6.121212,48.793103,-0.084507,14.291815,4.45,-19.092308,-6.657143,51.155172,0.557143,13.946996,4.123457,-19.58209,-12.538462,53.067797,0.072464,2.576923,4.864198,-9.246154
5,27,28-2021,7.809524,41.538462,-12.642857,5.381974,4.126582,-31.95122,8.428571,46.025641,-13.116279,4.361702,4.050633,-31.441558,-3.238095,34.025641,-23.243902,-3.84322,5.493506,-40.807229
6,41,37-2021,-6.254658,33.276423,2.903896,21.612961,1.01751,-23.759124,-8.108359,39.413514,5.139842,3.281501,3.031621,-11.568627,-2.782334,38.830189,4.23822,22.233813,0.46184,-24.275362
7,43,37-2021,-4.634921,31.662188,-3.619048,19.290528,2.710863,-24.412731,-11.102894,31.423529,-8.373626,2.97192,6.138486,-28.160083,-4.07619,37.697087,-2.505474,20.618049,2.1888,-20.803419
8,13,21-2021,13.6,56.2,14.058824,29.816901,9.26087,-16.9,,,,,,,,,,,,
9,24,24-2021,-11.815385,36.538462,-9.506329,3.583333,8.102273,-26.980198,-11.469697,45.384615,-7.082192,3.633333,8.329545,-29.633663,-17.523077,42.76,-12.821918,-5.25,10.590909,-27.580952


### Joining mobility data for srags

#### 2021

In [41]:
print("How much distincts before? ", srag_2021.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks before? ", srag_2021.select('epi_week_year').distinct().count())

How much distincts before?  1615809
How much distincts epi_weeks before?  47


In [42]:
srag_2021 = srag_2021.join(gmr_agg, ['UF', 'epi_week_year'], 'left')

In [43]:
print("How much distincts after? ", srag_2021.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks after? ", srag_2021.select('epi_week_year').distinct().count())

How much distincts after?  1615809
How much distincts epi_weeks after?  47


# Meteorological data with state codes by epi_weeks

In [44]:
inmet.limit(5).toPandas()

Unnamed: 0,station_id,measurement_date,total_daily_precipitation_mm,daily_atmospheric_pression_mb,daily_avg_dew_point_temp_c,max_daily_temp_maxima_diaria_c,daily_avg_temp_c,daily_min_temp_c,daily_avg_relative_air_humidity_percent,daily_min_relative_air_humidity_percent,max_gust_wind_ms,avg_wind_velocity_ms,empty,municipality_name,sub_region_1
0,A601,2021-08-23,0.0,1008.8375,10.270833,35.0,26.591667,18.6,39.041668,15.0,10.6,2.466667,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO
1,A601,2021-08-24,0.0,1007.74164,12.804167,35.9,26.6625,20.1,46.125,15.0,8.8,2.383333,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO
2,A601,2021-08-25,0.0,1006.075,10.758333,34.8,28.095833,19.2,36.5,21.0,11.9,3.225,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO
3,A601,2021-08-26,0.0,1006.625,9.070833,35.2,28.670834,25.5,30.791668,21.0,13.6,3.7625,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO
4,A601,2021-08-27,0.0,1014.3542,17.304167,25.8,21.95,19.3,75.416664,57.0,8.0,1.995833,,SEROPEDICA-ECOLOGIA,RIO DE JANEIRO


In [45]:
inmet = inmet.join(ibge.select('UF', 'sub_region_1'), 'sub_region_1', 'left')
inmet = inmet.withColumn('UF', F.col('UF').cast('string'))

In [46]:
inmet = inmet.na.drop(subset=['sub_region_1', 'UF'])

In [47]:
inmet = inmet.withColumnRenamed('measurement_date', 'date')

In [48]:
# joining using the actual date
epi_weeks_actual = epi_weeks.select(['date', 'epi_week', 'epi_year', 'epi_week_year'])
inmet = inmet.join(epi_weeks_actual, 'date', 'left')

In [49]:
# joining using date from 7 days before
epi_weeks_7 = epi_weeks.select(['date_7_days_early', 'epi_week_7_days_early', 'epi_week_year_7_days_early'])
inmet = inmet.join(epi_weeks_7, F.date_add(inmet.date, 7) == epi_weeks.date_7_days_early, 'left')

In [50]:
# joining using date from 14 days before
epi_weeks_14 = epi_weeks.select(['date_14_days_early', 'epi_week_14_days_early', 'epi_week_year_14_early'])
inmet = inmet.join(epi_weeks_14, F.date_add(inmet.date, 14) == epi_weeks.date_14_days_early, 'left')

In [51]:
inmet1 = inmet.groupBy(['UF', 'epi_week_year']).agg({"total_daily_precipitation_mm":"avg",
                                          "daily_avg_temp_c":"avg",
                                          "daily_avg_relative_air_humidity_percent":"avg"})\
                                    .withColumnRenamed('avg(total_daily_precipitation_mm)', 'inmet_daily_precipt_avg')\
                                    .withColumnRenamed('avg(daily_avg_temp_c)', 'inmet_temp_c_avg')\
                                    .withColumnRenamed('avg(daily_avg_relative_air_humidity_percent)', 'inmet_relative_air_humidity_avg')\
                                    .orderBy('UF')

In [52]:
inmet7 = inmet.groupBy(['UF', 'epi_week_year_7_days_early']).agg({"total_daily_precipitation_mm":"avg",
                                          "daily_avg_temp_c":"avg",
                                          "daily_avg_relative_air_humidity_percent":"avg"})\
                                    .withColumnRenamed('avg(total_daily_precipitation_mm)', 'inmet_daily_precipt_1week_before_avg')\
                                    .withColumnRenamed('avg(daily_avg_temp_c)', 'inmet_temp_c_1week_before_avg')\
                                    .withColumnRenamed('avg(daily_avg_relative_air_humidity_percent)', 'inmet_relative_air_humidity_1week_before_avg')\
                                    .orderBy('UF')

In [53]:
inmet14 = inmet.groupBy(['UF', 'epi_week_year_14_early']).agg({"total_daily_precipitation_mm":"avg",
                                          "daily_avg_temp_c":"avg",
                                          "daily_avg_relative_air_humidity_percent":"avg"})\
                                    .withColumnRenamed('avg(total_daily_precipitation_mm)', 'inmet_daily_precipt_2weeks_before_avg')\
                                    .withColumnRenamed('avg(daily_avg_temp_c)', 'inmet_temp_c_2weeks_before_avg')\
                                    .withColumnRenamed('avg(daily_avg_relative_air_humidity_percent)', 'inmet_relative_air_humidity_2weeks_before_avg')\
                                    .orderBy('UF')

##### some validations

In [54]:
inmet1.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg


In [55]:
inmet7.filter((F.col('UF')=='11') &  ((F.col('epi_week_year_7_days_early') == '22-2020') | (F.col('epi_week_year_7_days_early') == '23-2020') | (F.col('epi_week_year_7_days_early') == '24-2020') | (F.col('epi_week_year_7_days_early') == '25-2020')))\
    .orderBy('epi_week_year_7_days_early').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year_7_days_early,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg


In [56]:
inmet14.filter((F.col('UF')=='11') &  ((F.col('epi_week_year_14_early') == '22-2020') | (F.col('epi_week_year_14_early') == '23-2020') | (F.col('epi_week_year_14_early') == '24-2020') | (F.col('epi_week_year_14_early') == '25-2020')))\
    .orderBy('epi_week_year_14_early').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year_14_early,inmet_temp_c_2weeks_before_avg,inmet_relative_air_humidity_2weeks_before_avg,inmet_daily_precipt_2weeks_before_avg


<hr />

In [57]:
# joining all for 1 week early
inmet7 = inmet7.withColumnRenamed('UF', 'UF7')
cond = [inmet1.UF == inmet7.UF7, inmet1.epi_week_year == inmet7.epi_week_year_7_days_early]
inmet_agg = inmet1.join(inmet7, cond, 'left')

inmet_agg.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg,UF7,epi_week_year_7_days_early,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg


In [58]:
# joining all for 2 weeks early
inmet14 = inmet14.withColumnRenamed('UF', 'UF14')
cond = [inmet1.UF == inmet14.UF14, inmet1.epi_week_year == inmet14.epi_week_year_14_early]
inmet_agg = inmet_agg.join(inmet14, cond, 'left')

inmet_agg.filter((F.col('UF')=='11') &  ((F.col('epi_week_year') == '22-2020') | (F.col('epi_week_year') == '23-2020') | (F.col('epi_week_year') == '24-2020') | (F.col('epi_week_year') == '25-2020')))\
    .orderBy('epi_week_year').limit(5).toPandas()

Unnamed: 0,UF,epi_week_year,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg,UF7,epi_week_year_7_days_early,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg,UF14,epi_week_year_14_early,inmet_temp_c_2weeks_before_avg,inmet_relative_air_humidity_2weeks_before_avg,inmet_daily_precipt_2weeks_before_avg


<hr />

In [59]:
# selecting variables of interest
inmet_agg = inmet_agg.select(['UF', 'epi_week_year',
                          'inmet_temp_c_avg', 'inmet_relative_air_humidity_avg', 'inmet_daily_precipt_avg',
                          'inmet_temp_c_1week_before_avg', 'inmet_relative_air_humidity_1week_before_avg', 'inmet_daily_precipt_1week_before_avg',
                          'inmet_temp_c_2weeks_before_avg', 'inmet_relative_air_humidity_2weeks_before_avg', 'inmet_daily_precipt_2weeks_before_avg'])

In [60]:
# padding 'epi_week_year' col
inmet_agg = inmet_agg.withColumn('epi_week_year', F.lpad(F.col('epi_week_year'), 7, '0'))

<hr />

### Joining meteorological data for srags

#### 2021

In [61]:
print("How much distincts before? ", srag_2021.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks before? ", srag_2021.select('epi_week_year').distinct().count())

How much distincts before?  1615809
How much distincts epi_weeks before?  47


In [62]:
srag_2021 = srag_2021.join(inmet_agg, ['UF', 'epi_week_year'], 'left')

In [63]:
print("How much distincts after? ", srag_2021.select('NU_NOTIFIC').distinct().count())
print("How much distincts epi_weeks after? ", srag_2021.select('epi_week_year').distinct().count())

How much distincts after?  1615809
How much distincts epi_weeks after?  47


<hr />

#### writing the last temporary version of srags

In [64]:
srag_2021.write.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2021_v4_super-srag', mode='overwrite')

<hr />

In [65]:
# # reading temporary files
# srag_2019 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2019_v4_super-srag/')
# srag_2020 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2020_v4_super-srag')
# srag_2021 = spark.read.parquet('gs://ai-covid19-datalake/standard/srag/pp_interm_srag_2021_v4_super-srag')

In [66]:
srag_2021 = srag_2021.withColumn('AGE_GROUP', F.when(F.col('AGE_AT_NOTIF') < 1, 1)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 1) & (F.col('AGE_AT_NOTIF') <= 5), 2)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 6) & (F.col('AGE_AT_NOTIF') <= 19), 3)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 20) & (F.col('AGE_AT_NOTIF') <= 29), 4)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 30) & (F.col('AGE_AT_NOTIF') <= 39), 5)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 40) & (F.col('AGE_AT_NOTIF') <= 49), 6)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 50) & (F.col('AGE_AT_NOTIF') <= 59), 7)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 60) & (F.col('AGE_AT_NOTIF') <= 69), 8)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 70) & (F.col('AGE_AT_NOTIF') <= 79), 9)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 80) & (F.col('AGE_AT_NOTIF') <= 89), 10)\
                                                    .when((F.col('AGE_AT_NOTIF') >= 90), 11)\
                                                    .otherwise(12))

In [67]:
# just for check the header later
srag_2021.limit(1).toPandas()

Unnamed: 0,UF,epi_week_year,NU_NOTIFIC,DT_NOTIFIC,SEM_NOT,DT_SIN_PRI,SEM_PRI,SG_UF_NOT,ID_REGIONA,CO_REGIONA,ID_MUNICIP,CO_MUN_NOT,ID_UNIDADE,CO_UNI_NOT,CS_SEXO,DT_NASC,NU_IDADE_N,TP_IDADE,COD_IDADE,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,ID_PAIS,CO_PAIS,SG_UF,ID_RG_RESI,CO_RG_RESI,ID_MN_RESI,CO_MUN_RES,CS_ZONA,SURTO_SG,NOSOCOMIAL,AVE_SUINO,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,OUTRO_SIN,OUTRO_DES,PUERPERA,FATOR_RISC,CARDIOPATI,HEMATOLOGI,SIND_DOWN,HEPATICA,...,DIST_PRI_NOTIFIC_q,DIST_PRI_INTERNA_q,DIST_PRI_ENTUTI_q,DIST_PRI_SAIDUTI_q,DIST_PRI_EVOLUCA_q,DIST_PRI_ENCERRA_q,DIST_PRI_RAIOX_q,DIST_PRI_TOMO_q,DIST_PRI_COLETA_q,DIST_PRI_SOR_q,DIST_PRI_PCR_q,DIST_PRI_TRA_q,DIST_PRI_IF_q,SYMP_GROUP1,SYMP_GROUP2,SYMP_GROUP3,SYMP_GROUP4,RF_GROUP1,RF_GROUP2,RF_GROUP3,RF_GROUP4,ANO,gmr_transit_stations_avg,gmr_grocery_and_pharmacy_avg,gmr_retail_and_recreation_avg,gmr_workplaces_percent_avg,gmr_residential_percent_avg,gmr_parks_percent_avg,gmr_transit_stations_1week_before_avg,gmr_grocery_and_pharmacy_1week_before_avg,gmr_retail_and_recreation_1week_before_avg,gmr_workplaces_percent_1week_before_avg,gmr_residential_percent_1week_before_avg,gmr_parks_percent_1week_before_avg,gmr_transit_stations_2weeks_avg,gmr_grocery_and_pharmacy_2weeks_avg,gmr_retail_and_recreation_2weeks_avg,gmr_workplaces_percent_2weeks_avg,gmr_residential_percent_2weeks_avg,gmr_parks_percent_2weeks_avg,inmet_temp_c_avg,inmet_relative_air_humidity_avg,inmet_daily_precipt_avg,inmet_temp_c_1week_before_avg,inmet_relative_air_humidity_1week_before_avg,inmet_daily_precipt_1week_before_avg,inmet_temp_c_2weeks_before_avg,inmet_relative_air_humidity_2weeks_before_avg,inmet_daily_precipt_2weeks_before_avg,AGE_GROUP
0,11,22-2021,240518771697,2021-06-21,25,2021-06-05,22,RO,VI GRS PORTO VELHO,1483,PORTO VELHO,110020,HOSPITAL DE TRATAMENTO A COVID 19 DE RONDONIA ...,213837,M,1963-12-21,57,3,3057,6,4,,3,BRASIL,1,RO,VI GRS PORTO VELHO,1483,PORTO VELHO,110020,,,2,9,1,1,2,1,1,2,2,2,1,"CEFALEIA, MIALGIA",2,1,2,2,2,2,...,5,5,6,6,4,2,6,5,4,6,6,6,3,4,4,1,5,9,9,9,9,2021,-34.04,49.377358,-0.653061,4.102041,6.714286,8.322581,-32.157895,41.102564,-3.114286,14.196429,5.708333,-4.608696,,,,,,,25.3125,58.684524,0.0,25.645833,60.645834,0.05,,,,7


# The SUPER SRAG at last

In [71]:
# now the are united, lets create the last attributes
suffix = 'avg'

gmr_inmet_cols = {
    'gmr_transit_stations_': [[-43.09183673469388, -32.61737331954498, -25.20892494929006, -17.24561403508772]],
    'gmr_grocery_and_pharmacy_': [[0.36936936936936937, 8.107558139534884, 13.838709677419354, 19.86090775988287]],
    'gmr_retail_and_recreation_': [[-42.607894736842105, -31.163636363636364, -22.735064935064933, -15.647230320699709]],
    'gmr_workplaces_percent_': [[-15.347786811201445, -7.407114624505929, -4.023725391216558, 0.5605338417540515]],
    'gmr_residential_percent_': [[5.825, 7.780269058295964, 9.963333333333333, 12.902788844621513]],
    'gmr_parks_percent_': [[-45.52965235173824, -36.97651663405088, -30.29383886255924, -21.115384615384617]],
    'gmr_transit_stations_1week_before_': [[-44.09894736842105, -32.61737331954498, -25.263959390862944, -17.85185185185185]],
    'gmr_grocery_and_pharmacy_1week_before_': [[-0.16783216783216784, 7.644859813084112, 13.094644167278062, 18.64406779661017]],
    'gmr_retail_and_recreation_1week_before_': [[-42.81818181818182, -31.208955223880597, -22.889212827988338, -15.928205128205128]],
    'gmr_workplaces_percent_1week_before_': [[-15.347786811201445, -7.351599852887091, -3.9318181818181817, 0.33611111111111114]],
    'gmr_residential_percent_1week_before_': [[5.746001279590531, 7.714285714285714, 9.980842911877394, 13.173153296266879]],
    'gmr_parks_percent_1week_before_': [[-45.42248062015504, -36.97651663405088, -30.672727272727272, -21.639327024185068]],
    'gmr_transit_stations_2weeks_': [[-44.16945606694561, -32.86206896551724, -25.244274809160306, -17.40625]],
    'gmr_grocery_and_pharmacy_2weeks_': [[-0.5689655172413793, 7.173590504451038, 11.925925925925926, 18.068483063328426]],
    'gmr_retail_and_recreation_2weeks_': [[-43.526233359436176, -30.847341337907377, -22.798561151079138, -16.060301507537687]],
    'gmr_workplaces_percent_2weeks_': [[-17.4364406779661, -7.7555555555555555, -4.397341211225997, -0.5694656488549619]],
    'gmr_residential_percent_2weeks_': [[5.692477876106195, 7.639135959339263, 10.018041237113403, 13.193877551020408]],
    'gmr_parks_percent_2weeks_': [[-45.38961038961039, -36.54710144927536, -29.9047131147541, -20.95306859205776]],
    'inmet_temp_c_': [[19.99995412088011, 22.3166005555556, 23.721603579832205, 25.526168289473578]],
    'inmet_relative_air_humidity_': [[65.59553142063525, 69.93693714285736, 73.8520828571426, 78.24404857142858]],
    'inmet_daily_precipt_': [[0.3383720930232559, 1.4355555555555408, 3.35503875968993, 5.962886597938155]],
    'inmet_temp_c_1week_before_': [[20.05500949193533, 22.322388241935233, 23.761091968254064, 25.53099282105264]],
    'inmet_relative_air_humidity_1week_before_': [[65.65009503030261, 70.04159629710138, 73.88706227868933, 78.36411010769187]],
    'inmet_daily_precipt_1week_before_': [[0.3585585585585601, 1.5249999999999975, 3.567741935483844, 6.042424242424034]],
    'inmet_temp_c_2weeks_before_': [[20.174016811110565, 22.320047571428823, 23.821744027322836, 25.50684535398227]],
    'inmet_relative_air_humidity_2weeks_before_': [[65.91730179032191, 70.25798429936523, 73.88706227868934, 78.47414770987979]],
    'inmet_daily_precipt_2weeks_before_': [[0.3714285714285674, 1.6878787878788217, 3.640816326530609, 6.274725274725252]]
}

In [None]:
# Showing the quintiles cuttofs 
for col in pd.DataFrame(gmr_inmet_cols).columns:
    print(col, list(pd.DataFrame(gmr_inmet_cols)[col]))

gmr_transit_stations_ [[-43.09183673469388, -32.61737331954498, -25.20892494929006, -17.24561403508772]]
gmr_grocery_and_pharmacy_ [[0.36936936936936937, 8.107558139534884, 13.838709677419354, 19.86090775988287]]
gmr_retail_and_recreation_ [[-42.607894736842105, -31.163636363636364, -22.735064935064933, -15.647230320699709]]
gmr_workplaces_percent_ [[-15.347786811201445, -7.407114624505929, -4.023725391216558, 0.5605338417540515]]
gmr_residential_percent_ [[5.825, 7.780269058295964, 9.963333333333333, 12.902788844621513]]
gmr_parks_percent_ [[-45.52965235173824, -36.97651663405088, -30.29383886255924, -21.115384615384617]]
gmr_transit_stations_1week_before_ [[-44.09894736842105, -32.61737331954498, -25.263959390862944, -17.85185185185185]]
gmr_grocery_and_pharmacy_1week_before_ [[-0.16783216783216784, 7.644859813084112, 13.094644167278062, 18.64406779661017]]
gmr_retail_and_recreation_1week_before_ [[-42.81818181818182, -31.208955223880597, -22.889212827988338, -15.928205128205128]]
gm

In [None]:
# Showing the quintiles cuttofs
pd.DataFrame(gmr_inmet_cols)

Unnamed: 0,gmr_transit_stations_,gmr_grocery_and_pharmacy_,gmr_retail_and_recreation_,gmr_workplaces_percent_,gmr_residential_percent_,gmr_parks_percent_,gmr_transit_stations_1week_before_,gmr_grocery_and_pharmacy_1week_before_,gmr_retail_and_recreation_1week_before_,gmr_workplaces_percent_1week_before_,gmr_residential_percent_1week_before_,gmr_parks_percent_1week_before_,gmr_transit_stations_2weeks_,gmr_grocery_and_pharmacy_2weeks_,gmr_retail_and_recreation_2weeks_,gmr_workplaces_percent_2weeks_,gmr_residential_percent_2weeks_,gmr_parks_percent_2weeks_,inmet_temp_c_,inmet_relative_air_humidity_,inmet_daily_precipt_,inmet_temp_c_1week_before_,inmet_relative_air_humidity_1week_before_,inmet_daily_precipt_1week_before_,inmet_temp_c_2weeks_before_,inmet_relative_air_humidity_2weeks_before_,inmet_daily_precipt_2weeks_before_
0,"[-43.09183673469388, -32.61737331954498, -25.2...","[0.36936936936936937, 8.107558139534884, 13.83...","[-42.607894736842105, -31.163636363636364, -22...","[-15.347786811201445, -7.407114624505929, -4.0...","[5.825, 7.780269058295964, 9.963333333333333, ...","[-45.52965235173824, -36.97651663405088, -30.2...","[-44.09894736842105, -32.61737331954498, -25.2...","[-0.16783216783216784, 7.644859813084112, 13.0...","[-42.81818181818182, -31.208955223880597, -22....","[-15.347786811201445, -7.351599852887091, -3.9...","[5.746001279590531, 7.714285714285714, 9.98084...","[-45.42248062015504, -36.97651663405088, -30.6...","[-44.16945606694561, -32.86206896551724, -25.2...","[-0.5689655172413793, 7.173590504451038, 11.92...","[-43.526233359436176, -30.847341337907377, -22...","[-17.4364406779661, -7.7555555555555555, -4.39...","[5.692477876106195, 7.639135959339263, 10.0180...","[-45.38961038961039, -36.54710144927536, -29.9...","[19.99995412088011, 22.3166005555556, 23.72160...","[65.59553142063525, 69.93693714285736, 73.8520...","[0.3383720930232559, 1.4355555555555408, 3.355...","[20.05500949193533, 22.322388241935233, 23.761...","[65.65009503030261, 70.04159629710138, 73.8870...","[0.3585585585585601, 1.5249999999999975, 3.567...","[20.174016811110565, 22.320047571428823, 23.82...","[65.91730179032191, 70.25798429936523, 73.8870...","[0.3714285714285674, 1.6878787878788217, 3.640..."


In [72]:
n_suffix = 'q'
for col in list(gmr_inmet_cols.keys()):
    srag_2021 = srag_2021.withColumn(col+n_suffix, F.when(F.col(col+suffix) <= gmr_inmet_cols[col][0][0], '1')\
                                                                  .when((F.col(col+suffix) > gmr_inmet_cols[col][0][0]) &  (F.col(col+suffix) <= gmr_inmet_cols[col][0][1]), '2')\
                                                                  .when((F.col(col+suffix) > gmr_inmet_cols[col][0][1]) &  (F.col(col+suffix) <= gmr_inmet_cols[col][0][2]), '3')\
                                                                  .when((F.col(col+suffix) > gmr_inmet_cols[col][0][2]) &  (F.col(col+suffix) <= gmr_inmet_cols[col][0][3]), '4')\
                                                                  .when(F.col(col+suffix) > gmr_inmet_cols[col][0][3], '5')\
                                                                  .otherwise('6'))

In [75]:
# making all column names uppercase
for col in srag_2021.columns:
    srag_2021 = srag_2021.withColumnRenamed(col, col.upper())

<hr />
<hr />
<hr />

In [76]:
super_srag = spark.read.parquet('gs://ai-covid19-datalake/standard/super-srag/super_srag_v1.parquet')

In [77]:
super_srag_cols = super_srag.columns

In [78]:
srag_2021 = srag_2021.select(super_srag_cols)

In [79]:
super_srag = super_srag.union(srag_2021)

In [88]:
super_srag = super_srag.withColumn('ANO', F.year('DT_SIN_PRI'))

In [89]:
# srag 2019 has 48554 records
# srag 2020 has 1193735 records
# srag 2021 has 868367 records
# super srag must contain (48554 + 1193735 + 868367 =) 2110656 records
print('super srag has', super_srag.count(), 'records')

super srag has 3726465 records


#### writing super srag

In [90]:
super_srag.write.parquet('gs://ai-covid19-datalake/standard/super-srag/super_srag_v2.parquet')

In [None]:
super_srag.coalesce(1).write.csv('gs://ai-covid19-datalake/standard/super-srag/super_srag_v2.csv', header=True)