In [1]:
# Execute the shared setup script before anything else in this notebook.
# NOTE(review): presumably this script creates `spark` and brings the unqualified
# column functions used in later cells (col, lit, when, size, date_format,
# weekofyear) into scope, since no cell here imports or defines them — confirm.
%run ../spark-default.py

In [2]:
import builtins as b
from datetime import datetime, timedelta, date
import sys, traceback
from delta.tables import DeltaTable

from scipy.cluster.hierarchy import linkage, dendrogram
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.ml.functions import vector_to_array
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline, PipelineModel

In [3]:
# Load the source table for this notebook.
# NOTE(review): the name suggests this is the output of an earlier "step 3"
# stage of the CIHA pipeline — confirm against the upstream notebook.
df = spark.table("stage.ciha_step3")

# Print column names and types so the available fields are visible below.
df.printSchema()

root
 |-- pk: string (nullable = true)
 |-- mes_cmpt: string (nullable = true)
 |-- espec: string (nullable = true)
 |-- cgc_hosp: string (nullable = true)
 |-- munic_res: string (nullable = true)
 |-- nasc: date (nullable = true)
 |-- sexo: string (nullable = true)
 |-- uti_mes_to: string (nullable = true)
 |-- uti_int_to: string (nullable = true)
 |-- proc_rea: string (nullable = true)
 |-- qt_proc: string (nullable = true)
 |-- dt_atend: date (nullable = true)
 |-- dt_atend_dow: integer (nullable = true)
 |-- dt_saida: date (nullable = true)
 |-- diag_princ: string (nullable = true)
 |-- diag_secun: string (nullable = true)
 |-- cobranca: string (nullable = true)
 |-- natureza: string (nullable = true)
 |-- gestao: string (nullable = true)
 |-- munic_mov: string (nullable = true)
 |-- cod_idade: string (nullable = true)
 |-- idade: string (nullable = true)
 |-- dias_perm: string (nullable = true)
 |-- morte: string (nullable = true)
 |-- nacional: string (nullable = true)
 |-- car_i

In [4]:
def _int_col(name):
    """Return column `name` cast to integer, keeping its original name."""
    return col(name).cast("integer").alias(name)


def _double_col(name):
    """Return column `name` cast to double, keeping its original name."""
    return col(name).cast("double").alias(name)


def _date_parts(source, prefix):
    """Return `source` itself followed by its derived calendar columns.

    The derived columns are, in order, `<prefix>_dia` (day of month),
    `<prefix>_mes` (month) and `<prefix>_ano` (year), each formatted from the
    date and cast to integer, then `<prefix>_woy` (week of year).
    """
    formatted = [
        date_format(source, pattern).cast("integer").alias(f"{prefix}_{suffix}")
        for suffix, pattern in (("dia", "dd"), ("mes", "MM"), ("ano", "yyyy"))
    ]
    return [col(source), *formatted, weekofyear(source).alias(f"{prefix}_woy")]


# Rows are kept only when the holiday flag, an integer-parsable `uti_int_to`
# and the primary-diagnosis category are all present.
_required_fields_present = (
    col("feriado").isNotNull()
    & col("uti_int_to").cast("integer").isNotNull()
    & col("diag_princ_categoria").isNotNull()
)

# Holiday flag re-encoded as 1/0.
_feriado_flag = (
    when(col("feriado") == lit(True), lit(1))
    .otherwise(lit(0))
    .alias("feriado")
)

# A missing distance to the next holiday is replaced by 30.
_feriados_prox_dist = (
    when(col("feriados_prox_dist").isNull(), lit(30))
    .otherwise(col("feriados_prox_dist"))
    .alias("feriados_prox_dist")
)

# Columns currently left out of the selection (kept here as a reminder):
#   diag_secun, diag_secun_categoria, diag_secun_capitulo_numero,
#   proc_tp_complexidade, proc_tp_sexo, proc_qt_max_exec, proc_qt_dias_perm,
#   proc_qt_pontos, proc_no_grupo, proc_no_sub_grupo
df_prep = df.filter(_required_fields_present).select(
    *_date_parts("dt_atend", "dt_atend"),
    *_date_parts("nasc", "dt_nasc"),
    col("sexo"),
    _int_col("uti_int_to"),
    _int_col("qt_proc"),
    col("dt_atend_dow"),  # 1 -> Sunday, 7 -> Saturday
    *_date_parts("dt_saida", "dt_saida"),
    col("diag_princ"),
    _int_col("idade"),
    _int_col("dias_perm"),
    _feriado_flag,
    size(col("feriados")).alias("feriados_qtd"),
    size(col("feriados_prox")).alias("feriados_prox_qtd"),
    _feriados_prox_dist,
    col("munic_res_sigla_uf"),
    col("munic_mov_sigla_uf"),
    col("munic_res_nome_uf"),
    col("munic_mov_nome_uf"),
    _int_col("munic_res_populacao_residente"),
    _int_col("munic_mov_populacao_residente"),
    _int_col("munic_res_area_unidade_territorial"),
    _int_col("munic_mov_area_unidade_territorial"),
    _double_col("munic_res_idhm"),
    _double_col("munic_mov_idhm"),
    col("diag_princ_categoria"),
    col("diag_princ_capitulo_numero"),
    col("proc_rea"),
    _int_col("morte"),
)

# Name of the label column.
target_col = "morte"

# Feature columns to be treated as categories.
# Not included for now: diag_secun, diag_secun_categoria,
# diag_secun_capitulo_numero, proc_tp_complexidade, proc_tp_sexo,
# proc_no_grupo, proc_no_sub_grupo.
categorical_cols = [
    "sexo", "dt_atend_dow", "diag_princ",
    "munic_res_sigla_uf", "munic_mov_sigla_uf",
    "munic_res_nome_uf", "munic_mov_nome_uf",
    "feriado",
    "diag_princ_categoria", "diag_princ_capitulo_numero",
    "proc_rea",
    "dt_atend_mes", "dt_nasc_mes", "dt_saida_mes",
]

# Feature columns to be treated as numbers.
# Not included for now: proc_qt_max_exec, proc_qt_dias_perm, proc_qt_pontos.
numeric_cols = [
    "dt_atend_dia", "dt_atend_ano", "dt_atend_woy",
    "dt_nasc_dia", "dt_nasc_ano", "dt_nasc_woy",
    "dt_saida_dia", "dt_saida_ano", "dt_saida_woy",
    "uti_int_to", "qt_proc", "idade", "dias_perm",
    "feriados_qtd", "feriados_prox_qtd", "feriados_prox_dist",
    "munic_res_populacao_residente", "munic_mov_populacao_residente",
    "munic_res_area_unidade_territorial", "munic_mov_area_unidade_territorial",
    "munic_res_idhm", "munic_mov_idhm",
]

# Time-based split on the attendance date: 2011-2023 for training, 2024 for testing.
df_treinamento = df_prep.filter("dt_atend between '2011-01-01' and '2023-12-31'")
df_teste = df_prep.filter("dt_atend between '2024-01-01' and '2024-12-31'")

In [5]:
# Export a downsampled copy of the full prepared dataset as CSV with a header row:
# every row with morte = 1 is kept, while rows with morte = 0 are sampled at a
# 0.1 fraction (without replacement, fixed seed).
(
    df_prep.filter("morte = 1")
    .union(
        df_prep.filter("morte = 0").sample(withReplacement=False, fraction=0.1, seed=42)
    )
    .repartition(1)  # single partition so the export is written as one part file
    .write
    # The default save mode raises when the target path already exists, which
    # makes this cell fail on every re-run; overwrite keeps it idempotent.
    .mode("overwrite")
    .option("header", True)
    .csv("s3a://datalake/exports/ciha_sample10/")
)

In [6]:
# Export a downsampled copy of the training split as CSV with a header row:
# every row with morte = 1 is kept, while rows with morte = 0 are sampled at a
# 0.1 fraction (without replacement, fixed seed).
(
    df_treinamento.filter("morte = 1")
    .union(
        df_treinamento.filter("morte = 0").sample(withReplacement=False, fraction=0.1, seed=42)
    )
    .repartition(1)  # single partition so the export is written as one part file
    .write
    # The default save mode raises when the target path already exists, which
    # makes this cell fail on every re-run; overwrite keeps it idempotent.
    .mode("overwrite")
    .option("header", True)
    .csv("s3a://datalake/exports/ciha_treinamento_sample10/")
)

In [7]:
# Export a downsampled copy of the test split as CSV with a header row:
# every row with morte = 1 is kept, while rows with morte = 0 are sampled at a
# 0.1 fraction (without replacement, fixed seed).
(
    df_teste.filter("morte = 1")
    .union(
        df_teste.filter("morte = 0").sample(withReplacement=False, fraction=0.1, seed=42)
    )
    .repartition(1)  # single partition so the export is written as one part file
    .write
    # The default save mode raises when the target path already exists, which
    # makes this cell fail on every re-run; overwrite keeps it idempotent.
    .mode("overwrite")
    .option("header", True)
    .csv("s3a://datalake/exports/ciha_teste_sample10/")
)

In [8]:
# Shut down the Spark session once the exports above have been written.
spark.stop()