In [4]:
!pip install google-auth google-auth-oauthlib google-auth-httplib2 google-cloud-bigquery



In [5]:
!pip freeze > ..//requirements.txt

In [6]:
# Notebook setup: imports, Spark session and shared configuration.
import os

import findspark

# findspark.init() adds the local Spark installation to sys.path, so it has to
# run before anything imports pyspark. The project modules below presumably
# import pyspark themselves — TODO confirm — so they are imported afterwards.
findspark.init()

import pyspark.sql.functions as f
import pyspark.sql.types as t

# NOTE(review): wildcard import kept so that any name later cells take from
# _spark keeps resolving; only get_spark is used in the cells visible here.
from _spark import *
from transformations import transform

# Spark session shared by every cell below.
spark = get_spark()

# GCS bucket passed to the BigQuery connector as "temporaryGcsBucket".
gcs_bucket = 'tech-challenge'

In [2]:
# Load the CSV data under ../data/raw: comma-delimited, first line is the
# header, column types inferred by Spark.
df = (
    spark.read
    .options(delimiter=',', header=True, inferSchema=True)
    .csv('../data/raw')
)

## BigQuery links

- [BigQuery Table](https://console.cloud.google.com/bigquery?hl=pt-br&project=fiap-tech-challenge-3&ws=!1m0)
- [Storage](https://console.cloud.google.com/storage/browser/tech-challenge;tab=configuration?hl=pt-br&project=fiap-tech-challenge-3&prefix=&forceOnObjectsSortingFiltering=false)
- [IAM e admin](https://console.cloud.google.com/iam-admin/iam?hl=pt-br&project=fiap-tech-challenge-3)

### Raw Data

#### Fato

In [8]:
# Write the raw DataFrame to BigQuery table raw_pnad.tb_f_covid_2020.
# gcs_bucket is passed as the connector's temporaryGcsBucket, the credentials
# file path comes from GOOGLE_APPLICATION_CREDENTIALS, and the save mode is
# "overwrite".
(
    df.write
    .format("bigquery")
    .options(
        temporaryGcsBucket=gcs_bucket,
        credentialsFile=os.environ["GOOGLE_APPLICATION_CREDENTIALS"],
        project="fiap-tech-challenge-3",
        parentProject="fiap-tech-challenge-3",
        dataset="raw_pnad",
        table="tb_f_covid_2020",
    )
    .mode("overwrite")
    .save()
)


#### Dimensão

In [9]:
# Dimension lookup: UF code -> state name, written to raw_pnad.tb_d_uf.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('uf', t.StringType()),
    ]
)

# One (cd, uf) tuple per code. Passing the whole mapping as a single dict made
# Spark look up the schema field names ('cd', 'uf') as dict keys, which yields
# one all-NULL row instead of one row per code.
_uf = spark.createDataFrame(
    data=[
        ("11", "Rondônia"),
        ("12", "Acre"),
        ("13", "Amazonas"),
        ("14", "Roraima"),
        ("15", "Pará"),
        ("16", "Amapá"),
        ("17", "Tocantins"),
        ("21", "Maranhão"),
        ("22", "Piauí"),
        ("23", "Ceará"),
        ("24", "Rio Grande do Norte"),
        ("25", "Paraíba"),
        ("26", "Pernambuco"),
        ("27", "Alagoas"),
        ("28", "Sergipe"),
        ("29", "Bahia"),
        ("31", "Minas Gerais"),
        ("32", "Espírito Santo"),
        ("33", "Rio de Janeiro"),
        ("35", "São Paulo"),
        ("41", "Paraná"),
        ("42", "Santa Catarina"),
        ("43", "Rio Grande do Sul"),
        ("50", "Mato Grosso do Sul"),
        ("51", "Mato Grosso"),
        ("52", "Goiás"),
        ("53", "Distrito Federal"),
    ],
    schema=_schema,
)

(
    _uf.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_uf")
    .mode("overwrite")
    .save()
)

In [10]:
# Dimension lookup: household area code -> label, written to
# raw_pnad.tb_d_area_domicilio.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('area_domicilio', t.StringType()),
    ]
)

# One (cd, area_domicilio) tuple per code. A single dict keyed by code is read
# by Spark via the schema field names, which yields one all-NULL row.
_area_domicilio = spark.createDataFrame(
    data=[
        ('1', 'Urbana'),
        ('2', 'Rural'),
    ],
    schema=_schema,
)

(
    _area_domicilio.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_area_domicilio")
    .mode("overwrite")
    .save()
)

In [11]:
# Dimension lookup: sex code -> label, written to raw_pnad.tb_d_sexo.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('sexo', t.StringType()),
    ]
)

# One (cd, sexo) tuple per code. A single dict keyed by code is read by Spark
# via the schema field names, which yields one all-NULL row.
_sexo = spark.createDataFrame(
    data=[
        ('1', 'Masculino'),
        ('2', 'Feminino'),
    ],
    schema=_schema,
)

(
    _sexo.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_sexo")
    .mode("overwrite")
    .save()
)

In [12]:
# Dimension lookup: colour/race code -> label, written to raw_pnad.tb_d_raca.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('raca', t.StringType()),
    ]
)

# One (cd, raca) tuple per code. A single dict keyed by code is read by Spark
# via the schema field names, which yields one all-NULL row.
_raca = spark.createDataFrame(
    data=[
        ('1', 'Branca'),
        ('2', 'Preta'),
        ('3', 'Amarela'),
        ('4', 'Parda'),
        ('5', 'Indígena'),
        ('9', 'Ignorado'),
    ],
    schema=_schema,
)

(
    _raca.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_raca")
    .mode("overwrite")
    .save()
)

In [13]:
# Dimension lookup: schooling code -> label, written to
# raw_pnad.tb_d_escolaridade.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('escolaridade', t.StringType()),
    ]
)

# One (cd, escolaridade) tuple per code. A single dict keyed by code is read by
# Spark via the schema field names, which yields one all-NULL row.
_escolaridade = spark.createDataFrame(
    data=[
        ('1', 'Sem instrução'),
        ('2', 'Fundamental incompleto'),
        ('3', 'Fundamental completa'),
        ('4', 'Médio incompleto'),
        ('5', 'Médio completo'),
        ('6', 'Superior incompleto'),
        ('7', 'Superior completo'),
        ('8', 'Pós-graduação, mestrado ou doutorado'),
    ],
    schema=_schema,
)

(
    _escolaridade.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_escolaridade")
    .mode("overwrite")
    .save()
)

In [14]:
# Dimension lookup: COVID answer code -> label, written to
# raw_pnad.tb_d_resposta_covid.
# The label column was named 'escolaridade' (copied from the schooling cell);
# it is named 'resposta_covid' here to match the table it belongs to.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('resposta_covid', t.StringType()),
    ]
)

# One (cd, resposta_covid) tuple per code. A single dict keyed by code is read
# by Spark via the schema field names, which yields one all-NULL row.
_resposta_covid = spark.createDataFrame(
    data=[
        ('1', 'Sim'),
        ('2', 'Não '),
        ('3', 'Não sabe'),
        ('9', 'Ignorado'),
    ],
    schema=_schema,
)

(
    _resposta_covid.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_resposta_covid")
    .mode("overwrite")
    .save()
)

In [15]:
# Dimension lookup: hospitalisation answer code -> label, written to
# raw_pnad.tb_d_resposta_internado.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('resposta_internado', t.StringType()),
    ]
)

# One (cd, resposta_internado) tuple per code. A single dict keyed by code is
# read by Spark via the schema field names, which yields one all-NULL row.
_resposta_internado = spark.createDataFrame(
    data=[
        ('1', 'Sim'),
        ('2', 'Não '),
        ('3', 'Não foi atendido'),
        ('9', 'Ignorado'),
    ],
    schema=_schema,
)

(
    _resposta_internado.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_resposta_internado")
    .mode("overwrite")
    .save()
)

In [16]:
# Dimension lookup: income-band code -> band label, written to
# raw_pnad.tb_d_resposta_faixa_rendimento.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('resposta_faixa_rendimento', t.StringType()),
    ]
)

# One (cd, resposta_faixa_rendimento) tuple per code. A single dict keyed by
# code is read by Spark via the schema field names, which yields one all-NULL
# row.
_resposta_faixa_rendimento = spark.createDataFrame(
    data=[
        ('00', '0 - 100'),
        ('01', '101 - 300'),
        ('02', '301 - 600'),
        ('03', '601 - 800'),
        ('04', '801 - 1.600'),
        ('05', '1.601 - 3.000'),
        ('06', '3.001 - 10.000'),
        ('07', '10.001 - 50.000'),
        ('08', '50.001 - 100.000'),
        ('09', 'Mais de 100.000'),
    ],
    schema=_schema,
)

(
    _resposta_faixa_rendimento.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_resposta_faixa_rendimento")
    .mode("overwrite")
    .save()
)

In [17]:
# Dimension lookup: household tenure code -> label, written to
# raw_pnad.tb_d_resposta_situacao_domicilio.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('resposta_situacao_domicilio', t.StringType()),
    ]
)

# One (cd, resposta_situacao_domicilio) tuple per code. A single dict keyed by
# code is read by Spark via the schema field names, which yields one all-NULL
# row. Label text (including trailing spaces) is kept exactly as it was.
_resposta_situacao_domicilio = spark.createDataFrame(
    data=[
        ('1', 'Próprio - já pago '),
        ('2', 'Próprio - ainda pagando'),
        ('3', 'Alugado'),
        ('4', 'Cedido por empregador'),
        ('5', 'Cedido por familiar '),
        ('6', 'Cedido de outra forma '),
        ('7', 'Outra condição'),
    ],
    schema=_schema,
)

(
    _resposta_situacao_domicilio.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_resposta_situacao_domicilio")
    .mode("overwrite")
    .save()
)

In [18]:
# Lookup of survey question code -> readable column name, written to
# raw_pnad.tb_d_questoes.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('questao', t.StringType()),
    ]
)

# One (cd, questao) tuple per question. A single dict keyed by question code is
# read by Spark via the schema field names, which yields one all-NULL row.
# B009B is included so the lookup covers every column that the refined fact
# cell of this notebook renames.
_mapa_questoes = spark.createDataFrame(
    data=[
        ("UF", "uf"),
        ("V1012", "semana_mes"),
        ("V1013", "mes"),
        ("V1022", "area_domicilio"),
        ("A002", "idade"),
        ("A003", "sexo"),
        ("A004", "cor_raca"),
        ("A005", "escolaridade"),
        ("B0011", "teve_febre"),
        ("B0014", "teve_dificuldade_respirar"),
        ("B0015", "teve_dor_cabeca"),
        ("B0019", "teve_fadiga"),
        ("B00111", "teve_perda_cheiro"),
        ("B002", "foi_posto_saude"),
        ("B0031", "ficou_em_casa"),
        ("B005", "ficou_internado"),
        ("B009B", "resultado_covid"),
        ("B007", "tem_plano_saude"),
        ("C007B", "assalariado"),
        ("C01011", "faixa_rendimento"),
        ("F001", "situacao_domicilio"),
    ],
    schema=_schema,
)

(
    _mapa_questoes.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_questoes")
    .mode("overwrite")
    .save()
)

In [19]:
# Dimension lookup: sex code -> label, written to raw_pnad.tb_d_sexo.
# NOTE(review): this cell repeats the earlier raw_pnad.tb_d_sexo cell in this
# section and overwrites the same table; one of the two can be dropped.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('sexo', t.StringType()),
    ]
)

# One (cd, sexo) tuple per code. A single dict keyed by code is read by Spark
# via the schema field names, which yields one all-NULL row.
_sexo = spark.createDataFrame(
    data=[
        ('1', 'Masculino'),
        ('2', 'Feminino'),
    ],
    schema=_schema,
)

(
    _sexo.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "raw_pnad")
    .option("table", "tb_d_sexo")
    .mode("overwrite")
    .save()
)

### Refined Data

#### Fato

In [20]:
# Refined fact table: re-read the raw CSVs, keep only the listed survey
# columns, rename them to readable names and write the result to
# refined_pnad.tb_f_covid_2020.
df = (
    spark.read
    .options(delimiter=',', header=True, inferSchema=True)
    .csv('../data/raw')
)

# Source column -> refined column name, listed in selection order.
_renames = {
    "UF": "uf",
    "V1012": "semana_mes",
    "V1013": "mes",
    "V1022": "area_domicilio",
    "A002": "idade",
    "A003": "sexo",
    "A004": "cor_raca",
    "A005": "escolaridade",
    "B0011": "teve_febre",
    "B0014": "teve_dificuldade_respirar",
    "B0015": "teve_dor_cabeca",
    "B0019": "teve_fadiga",
    "B00111": "teve_perda_cheiro",
    "B002": "foi_posto_saude",
    "B0031": "ficou_em_casa",
    "B005": "ficou_internado",
    "B007": "tem_plano_saude",
    "C007B": "assalariado",
    "C01011": "faixa_rendimento",
    "F001": "situacao_domicilio",
    "B009B": "resultado_covid",
}

# Same list the cell always exposed as `columns`.
columns = list(_renames)

df = df.select(columns)

# Renaming keeps each column in its selected position.
for _src, _dst in _renames.items():
    df = df.withColumnRenamed(_src, _dst)

(
    df.write
    .format("bigquery")
    .options(
        temporaryGcsBucket=gcs_bucket,
        credentialsFile=os.environ["GOOGLE_APPLICATION_CREDENTIALS"],
        project="fiap-tech-challenge-3",
        parentProject="fiap-tech-challenge-3",
        dataset="refined_pnad",
        table="tb_f_covid_2020",
    )
    .mode("overwrite")
    .save()
)

#### Dimensão

In [21]:
# Dimension lookup: UF code -> state name, written to refined_pnad.tb_d_uf.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('uf', t.StringType()),
    ]
)

# One (cd, uf) tuple per code. Passing the whole mapping as a single dict made
# Spark look up the schema field names ('cd', 'uf') as dict keys, which yields
# one all-NULL row instead of one row per code.
_uf = spark.createDataFrame(
    data=[
        ("11", "Rondônia"),
        ("12", "Acre"),
        ("13", "Amazonas"),
        ("14", "Roraima"),
        ("15", "Pará"),
        ("16", "Amapá"),
        ("17", "Tocantins"),
        ("21", "Maranhão"),
        ("22", "Piauí"),
        ("23", "Ceará"),
        ("24", "Rio Grande do Norte"),
        ("25", "Paraíba"),
        ("26", "Pernambuco"),
        ("27", "Alagoas"),
        ("28", "Sergipe"),
        ("29", "Bahia"),
        ("31", "Minas Gerais"),
        ("32", "Espírito Santo"),
        ("33", "Rio de Janeiro"),
        ("35", "São Paulo"),
        ("41", "Paraná"),
        ("42", "Santa Catarina"),
        ("43", "Rio Grande do Sul"),
        ("50", "Mato Grosso do Sul"),
        ("51", "Mato Grosso"),
        ("52", "Goiás"),
        ("53", "Distrito Federal"),
    ],
    schema=_schema,
)

(
    _uf.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_uf")
    .mode("overwrite")
    .save()
)

In [22]:
# Dimension lookup: household area code -> label, written to
# refined_pnad.tb_d_area_domicilio.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('area_domicilio', t.StringType()),
    ]
)

# One (cd, area_domicilio) tuple per code. A single dict keyed by code is read
# by Spark via the schema field names, which yields one all-NULL row.
_area_domicilio = spark.createDataFrame(
    data=[
        ('1', 'Urbana'),
        ('2', 'Rural'),
    ],
    schema=_schema,
)

(
    _area_domicilio.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_area_domicilio")
    .mode("overwrite")
    .save()
)

In [23]:
# Dimension lookup: sex code -> label, written to refined_pnad.tb_d_sexo.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('sexo', t.StringType()),
    ]
)

# One (cd, sexo) tuple per code. A single dict keyed by code is read by Spark
# via the schema field names, which yields one all-NULL row.
_sexo = spark.createDataFrame(
    data=[
        ('1', 'Masculino'),
        ('2', 'Feminino'),
    ],
    schema=_schema,
)

(
    _sexo.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_sexo")
    .mode("overwrite")
    .save()
)

In [24]:
# Dimension lookup: colour/race code -> label, written to
# refined_pnad.tb_d_raca.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('raca', t.StringType()),
    ]
)

# One (cd, raca) tuple per code. A single dict keyed by code is read by Spark
# via the schema field names, which yields one all-NULL row.
_raca = spark.createDataFrame(
    data=[
        ('1', 'Branca'),
        ('2', 'Preta'),
        ('3', 'Amarela'),
        ('4', 'Parda'),
        ('5', 'Indígena'),
        ('9', 'Ignorado'),
    ],
    schema=_schema,
)

(
    _raca.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_raca")
    .mode("overwrite")
    .save()
)

In [25]:
# Dimension lookup: schooling code -> label, written to
# refined_pnad.tb_d_escolaridade.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('escolaridade', t.StringType()),
    ]
)

# One (cd, escolaridade) tuple per code. A single dict keyed by code is read by
# Spark via the schema field names, which yields one all-NULL row.
_escolaridade = spark.createDataFrame(
    data=[
        ('1', 'Sem instrução'),
        ('2', 'Fundamental incompleto'),
        ('3', 'Fundamental completa'),
        ('4', 'Médio incompleto'),
        ('5', 'Médio completo'),
        ('6', 'Superior incompleto'),
        ('7', 'Superior completo'),
        ('8', 'Pós-graduação, mestrado ou doutorado'),
    ],
    schema=_schema,
)

(
    _escolaridade.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_escolaridade")
    .mode("overwrite")
    .save()
)

In [26]:
# Dimension lookup: COVID answer code -> label, written to
# refined_pnad.tb_d_resposta_covid.
# The label column was named 'escolaridade' (copied from the schooling cell);
# it is named 'resposta_covid' here to match the table it belongs to.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('resposta_covid', t.StringType()),
    ]
)

# One (cd, resposta_covid) tuple per code. A single dict keyed by code is read
# by Spark via the schema field names, which yields one all-NULL row.
_resposta_covid = spark.createDataFrame(
    data=[
        ('1', 'Sim'),
        ('2', 'Não '),
        ('3', 'Não sabe'),
        ('9', 'Ignorado'),
    ],
    schema=_schema,
)

(
    _resposta_covid.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_resposta_covid")
    .mode("overwrite")
    .save()
)

In [27]:
# Dimension lookup: hospitalisation answer code -> label, written to
# refined_pnad.tb_d_resposta_internado.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('resposta_internado', t.StringType()),
    ]
)

# One (cd, resposta_internado) tuple per code. A single dict keyed by code is
# read by Spark via the schema field names, which yields one all-NULL row.
_resposta_internado = spark.createDataFrame(
    data=[
        ('1', 'Sim'),
        ('2', 'Não '),
        ('3', 'Não foi atendido'),
        ('9', 'Ignorado'),
    ],
    schema=_schema,
)

(
    _resposta_internado.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_resposta_internado")
    .mode("overwrite")
    .save()
)

In [28]:
# Dimension lookup: income-band code -> band label, written to
# refined_pnad.tb_d_resposta_faixa_rendimento.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('resposta_faixa_rendimento', t.StringType()),
    ]
)

# One (cd, resposta_faixa_rendimento) tuple per code. A single dict keyed by
# code is read by Spark via the schema field names, which yields one all-NULL
# row.
_resposta_faixa_rendimento = spark.createDataFrame(
    data=[
        ('00', '0 - 100'),
        ('01', '101 - 300'),
        ('02', '301 - 600'),
        ('03', '601 - 800'),
        ('04', '801 - 1.600'),
        ('05', '1.601 - 3.000'),
        ('06', '3.001 - 10.000'),
        ('07', '10.001 - 50.000'),
        ('08', '50.001 - 100.000'),
        ('09', 'Mais de 100.000'),
    ],
    schema=_schema,
)

(
    _resposta_faixa_rendimento.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_resposta_faixa_rendimento")
    .mode("overwrite")
    .save()
)

In [29]:
# Dimension lookup: household tenure code -> label, written to
# refined_pnad.tb_d_resposta_situacao_domicilio.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('resposta_situacao_domicilio', t.StringType()),
    ]
)

# One (cd, resposta_situacao_domicilio) tuple per code. A single dict keyed by
# code is read by Spark via the schema field names, which yields one all-NULL
# row. Label text (including trailing spaces) is kept exactly as it was.
_resposta_situacao_domicilio = spark.createDataFrame(
    data=[
        ('1', 'Próprio - já pago '),
        ('2', 'Próprio - ainda pagando'),
        ('3', 'Alugado'),
        ('4', 'Cedido por empregador'),
        ('5', 'Cedido por familiar '),
        ('6', 'Cedido de outra forma '),
        ('7', 'Outra condição'),
    ],
    schema=_schema,
)

(
    _resposta_situacao_domicilio.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_resposta_situacao_domicilio")
    .mode("overwrite")
    .save()
)

In [30]:
# Lookup of survey question code -> readable column name, written to
# refined_pnad.tb_d_questoes.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('questao', t.StringType()),
    ]
)

# One (cd, questao) tuple per question. A single dict keyed by question code is
# read by Spark via the schema field names, which yields one all-NULL row.
# B009B is included so the lookup covers every column that the refined fact
# cell of this notebook renames.
_mapa_questoes = spark.createDataFrame(
    data=[
        ("UF", "uf"),
        ("V1012", "semana_mes"),
        ("V1013", "mes"),
        ("V1022", "area_domicilio"),
        ("A002", "idade"),
        ("A003", "sexo"),
        ("A004", "cor_raca"),
        ("A005", "escolaridade"),
        ("B0011", "teve_febre"),
        ("B0014", "teve_dificuldade_respirar"),
        ("B0015", "teve_dor_cabeca"),
        ("B0019", "teve_fadiga"),
        ("B00111", "teve_perda_cheiro"),
        ("B002", "foi_posto_saude"),
        ("B0031", "ficou_em_casa"),
        ("B005", "ficou_internado"),
        ("B009B", "resultado_covid"),
        ("B007", "tem_plano_saude"),
        ("C007B", "assalariado"),
        ("C01011", "faixa_rendimento"),
        ("F001", "situacao_domicilio"),
    ],
    schema=_schema,
)

(
    _mapa_questoes.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_questoes")
    .mode("overwrite")
    .save()
)

In [31]:
# Dimension lookup: sex code -> label, written to refined_pnad.tb_d_sexo.
# NOTE(review): this cell repeats the earlier refined_pnad.tb_d_sexo cell in
# this section and overwrites the same table; one of the two can be dropped.
_schema = t.StructType(
    [
        t.StructField('cd', t.StringType()),
        t.StructField('sexo', t.StringType()),
    ]
)

# One (cd, sexo) tuple per code. A single dict keyed by code is read by Spark
# via the schema field names, which yields one all-NULL row.
_sexo = spark.createDataFrame(
    data=[
        ('1', 'Masculino'),
        ('2', 'Feminino'),
    ],
    schema=_schema,
)

(
    _sexo.write
    .format("bigquery")
    .option("temporaryGcsBucket", gcs_bucket)
    .option("credentialsFile", os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
    .option("project", "fiap-tech-challenge-3")
    .option("parentProject", "fiap-tech-challenge-3")
    .option("dataset", "refined_pnad")
    .option("table", "tb_d_sexo")
    .mode("overwrite")
    .save()
)

### Trusted Data

#### Fato

In [7]:
# Trusted fact table: re-read the raw CSVs, run them through the project's
# transform() and write the result to trusted_pnad.tb_f_covid_2020.
from transformations import transform
from _spark import get_spark, _display

df = (
    spark.read
    .options(delimiter=',', header=True, inferSchema=True)
    .csv('../data/raw')
)

# NOTE(review): what transform() does to the frame is defined in
# transformations.py, outside this notebook.
df = transform(df)

(
    df.write
    .format("bigquery")
    .options(
        temporaryGcsBucket=gcs_bucket,
        credentialsFile=os.environ["GOOGLE_APPLICATION_CREDENTIALS"],
        project="fiap-tech-challenge-3",
        parentProject="fiap-tech-challenge-3",
        dataset="trusted_pnad",
        table="tb_f_covid_2020",
    )
    .mode("overwrite")
    .save()
)
