# Data Preparation 

In [None]:
import pandas as pd
import json

## Reading Data

In [None]:
# Load the raw CSV into a DataFrame (path is relative to the notebook's working directory).
raw_df = pd.read_csv('../data/01_raw/roraima_with_procedures.csv')

In [None]:
# Preview the first rows of the raw data.
raw_df.head()

In [None]:
# Summarise the raw data: column names, dtypes and non-null counts.
raw_df.info()

## Converting Ids to Strings

In [None]:
# Cast the three id columns to strings in one call so that the later string
# slicing, prefix filtering and dictionary lookups operate on text.
raw_df = raw_df.astype({
    'id_municipio_estabelecimento_aih': str,
    'id_municipio_paciente': str,
    'id_procedimento_principal': str,
})
raw_df.info()

## Removing the Last Digit of id_municipio_estabelecimento_aih


In [None]:
# Drop the final character of every establishment municipality id using the
# vectorised string accessor (the column was converted to strings earlier).
# NOTE: this cell is not idempotent - each re-run strips one more character.
raw_df['id_municipio_estabelecimento_aih'] = (
    raw_df['id_municipio_estabelecimento_aih'].str[:-1]
)
raw_df

## Filtering Cities by States of Roraima and Amazonas

In [None]:
# Keep only the rows whose patient municipality id starts with one of the two
# state prefixes named in this section ('14' and '13' - presumably the IBGE
# state codes for Roraima and Amazonas).
#
# The prefixes are compared with `isin` on the first two characters.  Calling
# `.str.startswith('14', '13')` does NOT test two prefixes: the second
# positional argument of `Series.str.startswith` is `na` (the fill value for
# missing entries), so only ids starting with '14' would be kept.
filtered_df = raw_df[
    raw_df['id_municipio_paciente'].str[:2].isin(['14', '13'])
]

filtered_df



In [None]:
# Percentage of raw rows that were discarded by the state filter.
outliers_percent = 100 * (1 - len(filtered_df) / len(raw_df))
outliers_percent

In [None]:
# Distinct patient municipality ids that remain after filtering.
filtered_df['id_municipio_paciente'].unique()

## Mapping City Names

In [None]:
# Build an id -> city-name lookup from the cities file; ids are cast to
# strings so they compare equal to the string ids held in the data.
city_data = pd.read_json('../data/01_raw/cities.json')
city_data['id'] = city_data['id'].astype(str)
city_names = dict(zip(city_data['id'], city_data['city']))

# `assign` returns a new DataFrame, leaving `filtered_df` untouched.  Ids that
# are absent from the lookup become NaN in the new name columns.
mapping_df = filtered_df.assign(
    municipio_paciente=filtered_df['id_municipio_paciente'].map(city_names),
    municipio_atendimento=filtered_df['id_municipio_estabelecimento_aih'].map(city_names),
)


In [None]:
# Distinct patient city names after mapping (NaN marks ids missing from the lookup).
mapping_df['municipio_paciente'].unique()

In [None]:
# Distinct service (establishment) city names after mapping (NaN marks ids missing from the lookup).
mapping_df['municipio_atendimento'].unique()

## Mapping Procedure Names

In [None]:
# Load the procedure table, reading every column as text.
procedure_names = pd.read_csv('../data/01_raw/procedure_names.csv', dtype=str)

# Lookup from procedure code ('CÓDIGO') to procedure name ('PROCEDIMENTO').
procedure_dict = dict(
    zip(procedure_names['CÓDIGO'], procedure_names['PROCEDIMENTO'])
)

# Translate each main-procedure id into its name; unknown ids become NaN.
# NOTE(review): the lookup keys are strings exactly as written in the CSV;
# confirm that `id_procedimento_principal` uses the same textual format
# (e.g. leading zeros) - otherwise the lookup silently yields NaN.
mapping_df['procedimento_principal'] = (
    mapping_df['id_procedimento_principal'].map(procedure_dict)
)
mapping_df['procedimento_principal']



## Dropping Values That Couldn't be Identified

In [None]:
# Work on an independent copy so `mapping_df` keeps the rows with unmapped
# values, then count the missing values in each column.
processed_df = mapping_df.copy()
processed_df.isna().sum()



In [None]:
# Percentage of rows whose procedure name is missing.
100 * (processed_df['procedimento_principal'].isna().sum() / len(processed_df))

In [None]:
# Discard every row that has a missing value in ANY column.
# NOTE(review): the percentage computed beforehand only counts missing
# `procedimento_principal`; rows with other nulls are dropped here as well -
# confirm that this is intended.
processed_df = processed_df.dropna()

In [None]:
# Renumber the rows from 0, discarding the old index labels.
processed_df.reset_index(drop=True, inplace=True)

In [None]:
# Summarise the cleaned data: column names, dtypes and non-null counts.
processed_df.info()

In [None]:
# Per-column count of missing values still present in the processed data.
null_counts = processed_df.isna().sum()
null_counts

## Dropping converted columns


In [None]:
# Remove the raw id columns now that name columns have been derived from them.
processed_df = processed_df.drop(columns=[
    'id_municipio_estabelecimento_aih',
    'id_municipio_paciente',
    'id_procedimento_principal',
])

processed_df.info()

## Saving Processed Data

In [None]:
# Write the processed data to CSV, omitting the DataFrame index.
processed_df.to_csv('../data/02_processed/processed_data.csv', index=False)