### imports

In [1]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
#from pyspark.sql.functions import min, max

In [2]:
# --- Notebook-wide configuration -------------------------------------------

def _two_decimals(value):
    """Format a float with exactly two decimal places for pandas display."""
    return '%.2f' % value

pd.set_option('display.float_format', _two_decimals)

# Temporary GCS bucket handed to Spark through the "temporaryGcsBucket" setting.
TMP_BUCKET = "ai-covid-tmp"

# Reuse the running Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

spark.conf.set("temporaryGcsBucket", TMP_BUCKET)
sc.setCheckpointDir("hdfs:///tmp/")

### download and read

In [4]:
!hdfs dfs -mkdir -p /googlemobility/data/
!wget 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
!mv -fv 'Global_Mobility_Report.csv' 'Global_Mobility_Report-28-05-2021.csv'
!hdfs dfs -put 'Global_Mobility_Report-28-05-2021.csv'  /googlemobility/data
df = spark.read.csv("hdfs:///googlemobility/data/Global_Mobility_Report-28-05-2021.csv", header=True, inferSchema= True)

--2021-06-01 11:02:28--  https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv
Resolving www.gstatic.com (www.gstatic.com)... 74.125.129.94, 2607:f8b0:4001:c56::5e
Connecting to www.gstatic.com (www.gstatic.com)|74.125.129.94|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 545180722 (520M) [text/csv]
Saving to: ‘Global_Mobility_Report.csv’


2021-06-01 11:02:32 (165 MB/s) - ‘Global_Mobility_Report.csv’ saved [545180722/545180722]

renamed 'Global_Mobility_Report.csv' -> 'Global_Mobility_Report-28-05-2021.csv'
put: `/googlemobility/data/Global_Mobility_Report-28-05-2021.csv': File exists


In [6]:
# puting ibge codes on municipalities and states
# file from ftp://geoftp.ibge.gov.br/organizacao_do_territorio/estrutura_territorial/divisao_territorial/2018/DTB_2018.zip
!hdfs dfs -put /RELATORIO_DTB_BRASIL_MUNICIPIO.csv /googlemobility/data/
ibge = spark.read.csv('hdfs:///googlemobility/data/RELATORIO_DTB_BRASIL_MUNICIPIO.csv', header=True, inferSchema= True)
ibge = ibge.withColumnRenamed('Nome_Município', 'mun_name')
ibge = ibge.withColumnRenamed('Código Município Completo', 'mun_code')

### writing to Google Cloud Storage

In [7]:
# Persist the raw mobility report to the data lake as CSV with a header row,
# replacing any previous export at the same path.
(
    df.write
    .mode('overwrite')
    .option('header', True)
    .csv('gs://ai-covid19-datalake/raw/google-mobility/google-mobility_report-28-05-2021.csv')
)

In [8]:
# Persist the IBGE municipality table to the data lake as CSV with a header
# row, replacing any previous export at the same path.
(
    ibge.write
    .mode('overwrite')
    .option('header', True)
    .csv('gs://ai-covid19-datalake/raw/ibge-data/ibge-municipality-to-code-28-05-2021.csv')
)

In [96]:
# Compare the total row count with the count after de-duplicating on
# (state name, municipality name); equal numbers mean the pair is unique.
ibge_total_rows = ibge.count()
ibge_unique_pairs = ibge.dropDuplicates(['Nome_UF', 'mun_name']).count()
print(ibge_total_rows)
print(ibge_unique_pairs)

5570
5570


In [88]:
# Join the mobility rows to the IBGE table using the city name.
# `ibge` no longer has a 'Nome_Município' column (it was renamed to 'mun_name'
# when the table was loaded), so the join condition must use the new name.
# NOTE(review): matching on the city name alone ignores the state; the IBGE
# uniqueness check uses (Nome_UF, mun_name), so same-named cities in different
# states may produce duplicate matches — consider also matching on the state.
cond = df['sub_region_2'] == ibge['mun_name']
# Left join keeps every mobility row, including those with no IBGE match.
df_joined = df.join(ibge, cond, 'left')

In [None]:
# Echo the mobility DataFrame as the cell output (quick look at what `df` holds).
df

### some investigations

In [89]:
# Total number of rows currently held in `df`.
total_rows = df.count()
print(total_rows)

1009775


In [45]:
# Is there data for Brazil?
# Keep only the rows whose country code is "BR" and report how many remain.
# Note that `df` is rebound to the filtered frame from here on.
is_brazil = F.col('country_region_code') == "BR"
df = df.where(is_brazil)
brazil_rows = df.count()
print(brazil_rows)

904413


In [58]:
# Which period does the data cover?
# Cast the `date` column to a proper DateType so min/max compare as dates.
df = df.withColumn('date', F.col('date').cast(DateType()))

# df.groupBy('date').count().orderBy('date').show()
# Single aggregation pass returning (latest date, earliest date), in that order.
date_bounds = df.agg(F.max("date"), F.min("date")).first()
maxdate, mindate = date_bounds

print(mindate, maxdate)

2020-02-15 2021-05-24


In [None]:
# Print the column names and data types of `df`.
df.printSchema()

In [None]:
# how many states? 

In [70]:
# Distinct values of the state-level region column (`sub_region_1`).
estados = df.select('sub_region_1').dropDuplicates()

In [71]:
# Number of distinct state values, followed by a listing of up to 28 of them.
n_estados = estados.count()
print(n_estados)
estados.show(n=28)

28
+--------------------+
|        sub_region_1|
+--------------------+
|State of Minas Ge...|
|State of Espírito...|
|State of Mato Grosso|
|      State of Goiás|
|State of Rio de J...|
|    State of Roraima|
|      State of Ceará|
|     State of Paraná|
|    State of Paraíba|
|       State of Pará|
|                null|
|    State of Alagoas|
|  State of São Paulo|
|       State of Acre|
|State of Rio Gran...|
| State of Pernambuco|
|  State of Tocantins|
|State of Rio Gran...|
|      State of Amapá|
|   State of Rondônia|
|      State of Bahia|
|State of Santa Ca...|
|   State of Amazonas|
|      State of Piauí|
|State of Mato Gro...|
|    State of Sergipe|
|   State of Maranhão|
|    Federal District|
+--------------------+



In [72]:
# Distinct values of the city-level region column (`sub_region_2`).
# NOTE(review): this exact assignment is repeated in a later cell, so one of
# the two cells is redundant.
cidades = df.select('sub_region_2').distinct()

In [None]:
# how many cities? 

In [75]:
# Distinct values of the city-level region column (`sub_region_2`).
cidades = df.select('sub_region_2').dropDuplicates()

In [76]:
# Number of distinct city values, followed by a sample of 20 of them.
n_cidades = cidades.count()
print(n_cidades)
cidades.show(n=20)

2254
+--------------------+
|        sub_region_2|
+--------------------+
|            Araruama|
|               Apodi|
|Boa Esperança do Sul|
|              Maruim|
|    Senador Guiomard|
|              Itatim|
|  São João dos Patos|
|  Ribas do Rio Pardo|
|           Fronteira|
|         Piranguinho|
| Presidente Olegário|
|          Carlópolis|
|             Ibiporã|
|               Tapes|
|       Guajará-Mirim|
|           Rancharia|
|    Miranda do Norte|
|     Barra do Bugres|
|             Mantena|
|             Aracati|
+--------------------+
only showing top 20 rows



In [None]:
# which states do most of these cities belong to? 