In [5]:
!pip install google-cloud-bigquery




[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!pip freeze > ..//requirements.txt

In [1]:
from _spark import get_spark, _display
from transformations import transform
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

import pyspark.sql.functions as f
import pyspark.sql.types as t
import os
import findspark
import pandas as pd

findspark.init()

spark = get_spark()

## BigQuery links

- [BigQuery Table](https://console.cloud.google.com/bigquery?hl=pt-br&project=fiap-tech-challenge-3&ws=!1m0)
- [Storage](https://console.cloud.google.com/storage/browser/tech-challenge;tab=configuration?hl=pt-br&project=fiap-tech-challenge-3&prefix=&forceOnObjectsSortingFiltering=false)
- [IAM e admin](https://console.cloud.google.com/iam-admin/iam?hl=pt-br&project=fiap-tech-challenge-3)

### Refined Data

#### Fato

In [2]:
table_name = "fiap-tech-challenge-3.refined_pnad.tb_f_covid_2020"
df = spark.read \
    .format("bigquery") \
    .option("table", table_name) \
    .load()

_display(df\
            .groupBy('resultado_covid')\
            .agg(f.count('uf'))
            )


Unnamed: 0,resultado_covid,count(uf)
0,3.0,293
1,,1488952
2,9.0,644
3,1.0,100845
4,4.0,1762


### Trusted Data

#### Fato

In [63]:
table_name = "fiap-tech-challenge-3.trusted_pnad.tb_f_covid_2020"
df = spark.read \
    .format("bigquery") \
    .option("table", table_name) \
    .load()
    
_display(df\
            .groupBy('resultado_covid')\
            .agg(f.count('uf'))
            )



Unnamed: 0,resultado_covid,count(uf)
0,,1490714
1,Ignorado,644
2,Sim,100845
3,Não sabe,293
4,Não,327387


In [64]:
# Manter apenas sim/nao
# df = df.filter(f.col('resultado_covid').isin(['Sim','Não']))

# Remover NA
for col in df.columns:
    df = df.withColumn(col, f.when(f.col(col).isin(['NA','Não sabe','Ignorado']), None).otherwise(f.col(col)))
_display(df\
        .groupBy('resultado_covid')\
        .agg(f.count('uf'))
        )

Unnamed: 0,resultado_covid,count(uf)
0,,1491651
1,Sim,100845
2,Não,327387


In [65]:
df = df.select(['uf', 'area_domicilio', 'sexo', 'cor_raca', 'escolaridade',
                'teve_dificuldade_respirar', 'teve_dor_cabeca',
                'teve_fadiga', 'teve_perda_cheiro', 'ficou_em_casa',
                'resultado_covid',])
df = df.toPandas()

In [66]:
df.isna().sum()

uf                                 0
area_domicilio                     0
sexo                               0
cor_raca                         474
escolaridade                       0
teve_dificuldade_respirar       8372
teve_dor_cabeca                 8492
teve_fadiga                     8500
teve_perda_cheiro              10256
ficou_em_casa                1480173
resultado_covid              1491651
dtype: int64

In [67]:
# dropando na (menos de 10% da base)
df = df.dropna()
df.isna().sum()

uf                           0
area_domicilio               0
sexo                         0
cor_raca                     0
escolaridade                 0
teve_dificuldade_respirar    0
teve_dor_cabeca              0
teve_fadiga                  0
teve_perda_cheiro            0
ficou_em_casa                0
resultado_covid              0
dtype: int64

In [68]:
df.shape

(381497, 11)

In [69]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    steps = [
        ('one-hot', OneHotEncoder(handle_unknown='infrequent_if_exist')),
        ('model',RandomForestClassifier(random_state=42))
    ]
)

X = df.drop(columns={'resultado_covid'})
y = df[['resultado_covid']]

In [70]:
y.value_counts() # fazer undersampling

resultado_covid
Não                293935
Sim                 87562
Name: count, dtype: int64

In [71]:

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
pipeline.fit(X_train, y_train)


In [72]:
model = pipeline['model']

In [73]:
y_pred = pipeline.predict(X_test)

In [74]:
y_test['resultado_covid'].values

array(['Não ', 'Não ', 'Não ', ..., 'Não ', 'Sim', 'Não '], dtype=object)

In [75]:
pd.DataFrame([y_test['resultado_covid'].values, y_pred]).T.value_counts()

0     1   
Não   Não     69963
Sim   Não     15551
      Sim      6340
Não   Sim      3521
Name: count, dtype: int64

In [76]:
from sklearn.inspection import permutation_importance

result = permutation_importance(
    pipeline, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)

forest_importances = pd.Series(result.importances_mean, index=X.columns)

In [77]:
forest_importances

uf                           0.010323
area_domicilio               0.004917
sexo                        -0.000124
cor_raca                     0.003639
escolaridade                 0.058604
teve_dificuldade_respirar    0.000370
teve_dor_cabeca              0.000558
teve_fadiga                  0.000118
teve_perda_cheiro            0.000720
ficou_em_casa                0.000651
dtype: float64

In [80]:
pd.DataFrame([model.feature_importances_, X.columns]).T

Unnamed: 0,0,1
0,0.008857,uf
1,0.007405,area_domicilio
2,0.004652,sexo
3,0.006992,cor_raca
4,0.004547,escolaridade
5,0.005949,teve_dificuldade_respirar
6,0.004642,teve_dor_cabeca
7,0.006851,teve_fadiga
8,0.007244,teve_perda_cheiro
9,0.019328,ficou_em_casa
