In [1]:
# Data downloaded from the IBGE website

# Year 2013:
#   https://ftp.ibge.gov.br/PNS/2013/Microdados/Dados/PNS_2013.zip
#   https://ftp.ibge.gov.br/PNS/2013/Microdados/Documentacao/Dicionario_e_input_20200930.zip

# Year 2019:
#   https://ftp.ibge.gov.br/PNS/2019/Microdados/Dados/PNS_2019_20220525.zip
#   https://ftp.ibge.gov.br/PNS/2019/Microdados/Documentacao/Dicionario_e_input_20220530.zip

## Parameters definition

In [2]:
# PNS edition to process. The value is interpolated into the schema, raw and
# staged file names used further down, so switching the active line below
# switches every input and output of the notebook.
# pns_year = 2013
pns_year = 2019

## Functions definition

In [3]:
import pandas as pd

In [4]:
def read_schema(file_path):
    """Load a fixed-width layout description from a CSV file.

    The CSV is expected to provide the columns ``column_name``,
    ``start_position`` (treated as 1-based) and ``length``.

    Args:
        file_path: Path of the schema CSV file.

    Returns:
        A list of ``(column_name, start, end)`` tuples, where ``start`` is the
        zero-based offset of the field and ``end`` is the exclusive end offset,
        i.e. values that can be used directly as ``line[start:end]``.
        ``None`` when the file cannot be read or parsed; the error is printed
        instead of being raised.
    """
    try:
        layout = pd.read_csv(file_path)
        return [
            (
                field['column_name'],
                field['start_position'] - 1,  # shift to a zero-based offset
                field['start_position'] + field['length'] - 1,
            )
            for field in layout.to_dict('records')
        ]
    except Exception as e:
        print(f"Error reading schema file: {e}")
        return None

# Read the PNS file with the schema
def read_positional_file_with_schema(file_path, columns, encoding=None):
    """Parse a fixed-width text file into a DataFrame of string fields.

    Args:
        file_path: Path of the positional (fixed-width) text file.
        columns: Iterable of ``(column_name, start, end)`` tuples, with
            ``start`` zero-based and ``end`` exclusive, as used for slicing.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A DataFrame with one row per line of the file and one column per
        schema entry; every value is the raw substring of the line (no type
        conversion, no whitespace stripping). ``None`` when the file cannot be
        read; the error is printed instead of being raised.
    """
    try:
        data = []
        with open(file_path, 'r', encoding=encoding) as file:
            for line in file:
                # Drop the line terminator before slicing; otherwise a record
                # shorter than the layout leaks "\n" into its last field.
                record = line.rstrip('\n')
                data.append(
                    {col_name: record[start_pos:end_pos]
                     for col_name, start_pos, end_pos in columns}
                )
        return pd.DataFrame(data)
    except Exception as e:
        print(f"Error reading positional file with schema: {e}")
        return None

## Data extraction

In [5]:
# Read schema
schema = read_schema(f"../../data/schema/schema_{pns_year}.csv")
if not schema:
    # read_schema reports failures by returning None (an empty schema is
    # equally unusable). Raise instead of calling exit(), which would tear
    # down the notebook kernel rather than fail this cell.
    raise RuntimeError("Error reading schema")

# Read file
df_file = read_positional_file_with_schema(f"../../data/raw/PNS_{pns_year}.txt", schema)
# The reader returns None on failure, so test for that before touching .empty.
if df_file is None or df_file.empty:
    raise RuntimeError("Error reading positional file with schema")

# Show head
df_file.head()

Unnamed: 0,V0001,V0024,UPA_PNS,V0006_PNS,V0015,V0020,V0022,V0026,V0031,V0025A,...,VDE002,VDE014,VDF002,VDF003,VDF004,VDL001,VDM001,VDP001,VDR001,VDDATA
0,11,1110011,110000016,1,1,2019,6,1,1,1,...,1.0,6.0,2098,350,2,,,,,20220504
1,11,1110011,110000016,1,1,2019,6,1,1,0,...,,,2098,350,2,,,,,20220504
2,11,1110011,110000016,1,1,2019,6,1,1,0,...,1.0,4.0,2098,350,2,,,,,20220504
3,11,1110011,110000016,1,1,2019,6,1,1,9,...,,,2098,350,2,,,,,20220504
4,11,1110011,110000016,1,1,2019,6,1,1,9,...,,,2098,350,2,,,,,20220504


## Data staging

In [6]:
# Persist the extracted frame as a Parquet file in the staged-data folder;
# index=False keeps the DataFrame index out of the written file.
df_file.to_parquet(f"../../data/staged/PNS_{pns_year}.parquet", index=False)