### imports

In [1]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
#from pyspark.sql.functions import min, max

In [2]:
# Render pandas floats with two decimals (affects frames shown via toPandas()).
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# SparkSession must be imported explicitly (see the imports cell); relying on the
# kernel to pre-inject the name raises NameError on a plain Python kernel.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Bucket registered under Spark's "temporaryGcsBucket" setting.
# NOTE(review): presumably consumed by a GCS-staging connector (e.g. BigQuery) — confirm.
TMP_BUCKET = "ai-covid-tmp"
spark.conf.set("temporaryGcsBucket", TMP_BUCKET)

# Checkpoint files are written under hdfs:///tmp/.
sc.setCheckpointDir("hdfs:///tmp/")

### download and read

In [3]:
!hdfs dfs -mkdir -p /googlemobility/data/
!wget 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
!mv -fv 'Global_Mobility_Report.csv' 'Global_Mobility_Report-28-11-2021.csv'
!hdfs dfs -put 'Global_Mobility_Report-28-11-2021.csv'  /googlemobility/data
df = spark.read.csv("hdfs:///googlemobility/data/Global_Mobility_Report-28-11-2021.csv", header=True, inferSchema= True)

--2021-11-28 15:49:09--  https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv
Resolving www.gstatic.com (www.gstatic.com)... 172.217.214.94, 2607:f8b0:4001:c0f::5e
Connecting to www.gstatic.com (www.gstatic.com)|172.217.214.94|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 762170816 (727M) [text/csv]
Saving to: ‘Global_Mobility_Report.csv’


2021-11-28 15:49:15 (120 MB/s) - ‘Global_Mobility_Report.csv’ saved [762170816/762170816]

renamed 'Global_Mobility_Report.csv' -> 'Global_Mobility_Report-28-11-2021.csv'


In [4]:
# puting ibge codes on municipalities and states
# file from ftp://geoftp.ibge.gov.br/organizacao_do_territorio/estrutura_territorial/divisao_territorial/2018/DTB_2018.zip
!hdfs dfs -put /RELATORIO_DTB_BRASIL_MUNICIPIO.csv /googlemobility/data/
ibge = spark.read.csv('hdfs:///googlemobility/data/RELATORIO_DTB_BRASIL_MUNICIPIO.csv', header=True, inferSchema= True)
ibge = ibge.withColumnRenamed('Nome_Município', 'mun_name')
ibge = ibge.withColumnRenamed('Código Município Completo', 'mun_code')

### writing to Google Cloud Storage

In [5]:
# Export the mobility DataFrame to the data lake as CSV with a header row,
# replacing whatever is already stored at the target path.
mobility_target = 'gs://ai-covid19-datalake/raw/google-mobility/google-mobility_report-28-11-2021.csv'
df.write.csv(mobility_target, mode='overwrite', header=True)

In [6]:
# Export the IBGE municipality/code table to the data lake as CSV with a header row,
# replacing whatever is already stored at the target path.
ibge_target = 'gs://ai-covid19-datalake/raw/ibge-data/ibge-municipality-to-code-28-11-2021.csv'
ibge.write.csv(ibge_target, mode='overwrite', header=True)

In [7]:
# Sanity check: if both numbers match, every (state name, municipality name)
# pair appears only once in the IBGE table.
ibge_rows = ibge.count()
ibge_unique_pairs = ibge.dropDuplicates(['Nome_UF', 'mun_name']).count()
print(ibge_rows)
print(ibge_unique_pairs)

5570
5570


In [9]:
# Left-join the mobility rows to the IBGE table by municipality.
#
# The city name alone is not a safe key: municipality names repeat across
# Brazilian states (the IBGE table is only unique on Nome_UF + mun_name), so a
# name-only join duplicates mobility rows, and rows from other countries can
# match a Brazilian municipality by coincidence. The key is therefore
# country + state + city name.
#
# Mobility states look like "State of Minas Gerais" / "Federal District".
# NOTE(review): assumes ibge.Nome_UF holds the bare Portuguese state name
# ("Minas Gerais", "Distrito Federal") — confirm against the IBGE file.
state_name = (
    F.when(df['sub_region_1'] == 'Federal District', F.lit('Distrito Federal'))
    .otherwise(F.regexp_replace(df['sub_region_1'], '^State of ', ''))
)
cond = (
    (df['country_region_code'] == 'BR')
    & (state_name == ibge['Nome_UF'])
    & (df['sub_region_2'] == ibge['mun_name'])
)
# 'left' keeps every mobility row; unmatched rows get nulls in the IBGE columns.
df_joined = df.join(ibge, cond, 'left')

In [12]:
# Peek at two rows of the mobility data, rendered as a pandas frame.
mobility_head = df.limit(2)
mobility_head.toPandas()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-15,0,4,5,0,2,1
1,AE,United Arab Emirates,,,,,,ChIJvRKrsd9IXj4RpwoIwFYv0zM,2020-02-16,1,4,4,1,2,1


In [16]:
# Peek at two joined rows for Brazil, rendered as a pandas frame.
is_brazil = F.col('country_region') == 'Brazil'
brazil_head = df_joined.filter(is_brazil).limit(2)
brazil_head.toPandas()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,...,residential_percent_change_from_baseline,UF,Nome_UF,Mesorregião Geográfica,Nome_Mesorregião,Microrregião Geográfica,Nome_Microrregião,Município,mun_code,mun_name
0,BR,Brazil,,,,,,ChIJzyjM68dZnAARYz4p8gYVWik,2020-02-15,5,...,0,,,,,,,,,
1,BR,Brazil,,,,,,ChIJzyjM68dZnAARYz4p8gYVWik,2020-02-16,2,...,1,,,,,,,,,


In [15]:
# Number of joined rows per country (show() prints the first 20 groups).
(
    df_joined
    .select('country_region')
    .groupBy('country_region')
    .count()
    .show()
)

+--------------+------+
|country_region| count|
+--------------+------+
| Côte d'Ivoire|  7869|
|      Paraguay| 11500|
|        Russia| 11050|
|         Yemen|   650|
|       Senegal|  8103|
|        Sweden|155202|
|   Philippines| 16900|
|      Malaysia| 11045|
|     Singapore|   650|
|          Fiji|   650|
|        Turkey|354962|
|          Iraq|   650|
|       Germany| 11050|
|      Cambodia| 13813|
|   Afghanistan|  1210|
|        Rwanda|  7395|
|        Jordan|  8880|
|        France| 71489|
|        Greece|  5200|
|     Sri Lanka|   650|
+--------------+------+
only showing top 20 rows



### some investigations

In [89]:
# Total number of rows currently held in `df`.
row_count = df.count()
print(row_count)

1009775


In [17]:
# Does the report contain data for Brazil?
# Keep only rows whose country code is "BR" and print how many remain.
# NOTE: this rebinds `df`, so every later cell sees the Brazil-only subset.
is_br = F.col('country_region_code') == "BR"
df = df.filter(is_br)
print(df.count())
# 904413 was the count noted on an earlier run

1301098


In [18]:
# Which period does the data cover?
# Cast `date` to DateType before aggregating.
df = df.withColumn('date', F.col('date').cast(DateType()))

# Optional: per-day row counts.
# df.groupBy('date').count().orderBy('date').show()

# A single aggregation returns one Row holding (max, min), in that order.
date_bounds = df.select(F.max("date"), F.min("date")).first()
maxdate = date_bounds[0]
mindate = date_bounds[1]

print(mindate, maxdate)
# 2020-02-15 2021-05-24 was the range noted on an earlier run

2020-02-15 2021-11-25


In [None]:
# Print the column names and data types of `df`.
df.printSchema()

In [None]:
# how many states?

In [19]:
# "estados" = states: the distinct values of sub_region_1 in `df`.
state_column = df['sub_region_1']
estados = df.select(state_column).distinct()

In [20]:
# Number of distinct sub_region_1 values, followed by up to 28 of them.
state_count = estados.count()
print(state_count)
estados.show(28)

28
+--------------------+
|        sub_region_1|
+--------------------+
|State of Minas Ge...|
|State of Espírito...|
|State of Mato Grosso|
|      State of Goiás|
|State of Rio de J...|
|    State of Roraima|
|      State of Ceará|
|     State of Paraná|
|    State of Paraíba|
|       State of Pará|
|                null|
|    State of Alagoas|
|  State of São Paulo|
|       State of Acre|
|State of Rio Gran...|
| State of Pernambuco|
|  State of Tocantins|
|State of Rio Gran...|
|      State of Amapá|
|   State of Rondônia|
|      State of Bahia|
|State of Santa Ca...|
|   State of Amazonas|
|      State of Piauí|
|State of Mato Gro...|
|    State of Sergipe|
|   State of Maranhão|
|    Federal District|
+--------------------+



In [21]:
# "cidades" = cities: the distinct values of sub_region_2 in `df`.
# NOTE: the same assignment is repeated two cells below; one of them is redundant.
city_column = df['sub_region_2']
cidades = df.select(city_column).distinct()

In [22]:
# how many cities?

In [23]:
# "cidades" = cities: the distinct values of sub_region_2 in `df`.
city_names = df['sub_region_2']
cidades = df.select(city_names).distinct()

In [24]:
# Number of distinct sub_region_2 values, followed by a sample of 20.
city_count = cidades.count()
print(city_count)
cidades.show(20)
# 2254

2254
+--------------------+
|        sub_region_2|
+--------------------+
|             Ibiporã|
|            Araruama|
|               Apodi|
|Boa Esperança do Sul|
|              Maruim|
|    Senador Guiomard|
|              Itatim|
|  São João dos Patos|
|  Ribas do Rio Pardo|
|           Fronteira|
|         Piranguinho|
| Presidente Olegário|
|          Carlópolis|
|    Miranda do Norte|
|     Barra do Bugres|
|             Mantena|
|               Tapes|
|       Guajará-Mirim|
|           Rancharia|
|             Aracati|
+--------------------+
only showing top 20 rows



In [None]:
# which states do most of the cities belong to?