## In this notebook we extract Censo 2010 individual data from the census files.


In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
# The population-projection table disagrees with the 2010 census counts.
# Workaround: keep the year-to-year ratios INDEC publishes for the years after 2010,
# but overwrite the 2010 value with the census total and rescale the later years to it.
proy_pop = pd.read_csv('./../data/info/proy_pop200125.csv', encoding='utf-8')

# Column groups reused below.
geo_cols = ['PROV', 'NOMPROV', 'DPTO', 'NOMDPTO']
years_from_2010 = [str(y) for y in range(2010, 2026)]   # columns rescaled to the census
years_all = [str(y) for y in range(2001, 2026)]         # columns kept in the final table
years_to_refill = [str(y) for y in range(2002, 2010)]   # columns re-interpolated

# 2010 population: PERSONA-P02 joined to the radio reference table.
# The join key is implicit (whatever columns both frames share).
P02 = pd.read_csv('./../data/info/PERSONA-P02.csv', encoding='latin-1')
radio_ref = pd.read_csv('./../data/info/radio_ref.csv')
radio_ref['DPTO'] = radio_ref['DPTO'].astype(int)
P02_geo = P02.merge(radio_ref[['DPTO', 'NOMDPTO', 'PROV', 'NOMPROV', 'radio']])
pob2010_DPTO = P02_geo.groupby(geo_cols)['TOTAL'].sum().reset_index()

# NOTE(review): the left key contains a mis-encoded character; it is kept verbatim because
# it has to match the header of the projection CSV as loaded -- confirm against the file.
merged = proy_pop.merge(pob2010_DPTO, left_on='C�digo', right_on='DPTO')
# Original author's note: only Antarctica is lost in this join.

# projected_year * census_2010 / projected_2010, row by row, rounded to whole persons.
rescaled = merged[years_from_2010].mul(merged['TOTAL'], axis=0).div(merged['2010'], axis=0)
merged[years_from_2010] = rescaled.round().astype(int)

proy_pob = merged.set_index(geo_cols)[years_all]

# Redo the linear interpolation 2001 -> 2010: blank the years in between and let pandas
# interpolate along each row between the 2001 value and the census-anchored 2010 value.
proy_pob = proy_pob.assign(**{col: np.nan for col in years_to_refill})
proy_pob = proy_pob.interpolate(axis=1).round().astype(int)

In [None]:
## Hacen falta los datasets PERSONA, VIVIENDA y HOGAR, que se extraen de la base de datos del Censo.

### Calcular columna IX_TOT (n. personas en hogar)

In [4]:

import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# IX_TOT: number of persons in each household. The census extract has no such column,
# so it is built by counting PERSONA rows per HOGAR_REF_ID.
#
# The count needs a full pass over PERSONA.csv, so the result is cached as a CSV and the
# raw extract is only read when that cache is missing. This lets the notebook run on a
# machine that has the cache but not the raw extract.
# Note: PERSONA is therefore only defined by this cell when the cache has to be rebuilt.
IX_TOT_CACHE = Path('./../data/info/IX_TOT.csv')

if not IX_TOT_CACHE.exists():
    # The CSV is too big to load eagerly, so it is read with dask, key column only
    # (original author's note: not sure this is the most efficient way).
    PERSONA = dd.read_csv('./../../extracted_/PERSONA.csv', sep=';', usecols=['HOGAR_REF_ID'])

    with ProgressBar():
        IX_TOT = PERSONA['HOGAR_REF_ID'].value_counts().reset_index().compute()
    # Rename positionally: the labels produced by value_counts().reset_index() are not relied on.
    IX_TOT.columns = ['HOGAR_REF_ID', 'IX_TOT']

    # index=False keeps the meaningless positional index out of the cache file.
    IX_TOT.to_csv(IX_TOT_CACHE, index=False)

# usecols also discards the stray 'Unnamed: 0' column of caches that were written with the index.
IX_TOT = pd.read_csv(IX_TOT_CACHE, usecols=['HOGAR_REF_ID', 'IX_TOT'])
IX_TOT.head()  # original author's note: 12,197,647 households

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:/Users/BCNCPC23/Desktop/extracted_/PERSONA.csv'

In [None]:
# radio_ref = pd.read_csv('./../Censo_individual/data/radio_ref.csv')

## Hogar y Vivienda

In [None]:
# Base sampling fraction for the household draw.
frac = 0.02

# Load dwellings and households lazily with dask, restricted via usecols to the columns
# needed here so that no unnecessary data is read.
# A filter on selected departamentos (seleccion_DPTOS) used to be applied to VIVIENDA at
# this point; it is disabled, so every departamento is kept.
vivienda_cols = ['VIVIENDA_REF_ID', 'RADIO_REF_ID', 'TIPVV', 'V01']
VIVIENDA = dd.read_csv('./../../extracted_/VIVIENDA.csv', sep=';', usecols=vivienda_cols)

# Attach the departamento code of each dwelling through its census radio.
radio_to_dpto = radio_ref[['RADIO_REF_ID', 'DPTO']]
VIVIENDA = VIVIENDA.merge(radio_to_dpto, on='RADIO_REF_ID')

# The CSV is too big to load eagerly, so it is read with dask
# (original author's note: not sure this is the most efficient way).
hogar_cols = ['HOGAR_REF_ID', 'VIVIENDA_REF_ID']
HOGAR = dd.read_csv('./../../extracted_/HOGAR.csv', sep=';', usecols=hogar_cols)

# Household -> departamento lookup, materialised as a pandas DataFrame.
vivienda_dpto = VIVIENDA[['VIVIENDA_REF_ID', 'DPTO']]
with ProgressBar():
    HOGAR_DPTO = HOGAR.merge(vivienda_dpto, on='VIVIENDA_REF_ID').compute()

In [None]:
# Years to generate samples for. The sampling loop iterates range(startyr, endyr),
# which is half-open: with these values only 2020 is processed.
startyr, endyr = 2020, 2021

In [None]:
# Build, for every year in range(startyr, endyr), a person-level sample of the 2010 census
# whose size per departamento follows the projected population of that year, and save it
# as one CSV per year.

# Projected population of every year relative to the 2010 value, per departamento.
ratios = proy_pob.div(proy_pob['2010'], 0).reset_index()

# Seeded generator shared by all draws, so that re-running the cell reproduces the same
# sample files. A single generator (rather than the same integer seed for every group)
# keeps the draws of different departamentos independent.
SAMPLE_SEED = 2010
sample_rng = np.random.RandomState(SAMPLE_SEED)

# The dask readers are lazy and do not depend on the year, so they are built once.
HOGAR = dd.read_csv('./../../extracted_/HOGAR.csv', sep=';',
                    usecols=['HOGAR_REF_ID', 'VIVIENDA_REF_ID', 'H05', 'H06', 'H07', 'H08',
                             'H09', 'H10', 'H11', 'H12', 'H13', 'H14', 'H15', 'H16', 'PROP', 'TOTPERS'])
PERSONA = dd.read_csv('./../../extracted_/PERSONA.csv', sep=';',
                      usecols=['PERSONA_REF_ID', 'HOGAR_REF_ID', 'P01', 'P02', 'P03', 'P05', 'P06',
                               'P07', 'P12', 'P08', 'P09', 'P10', 'CONDACT'])

# Create the output folder up front: a missing folder would otherwise only surface at
# to_csv(), after the expensive dask computations.
SAMPLES_DIR = Path('./../../extracted_/yr_samples')
SAMPLES_DIR.mkdir(parents=True, exist_ok=True)

# Approach: modify the census coding to fit the EPH coding.
# Codes that are absent from a mapping become NaN (behaviour of Series.map).
EPH_RECODES = {
    'V01': {1: 1, 2: 6, 3: 6, 4: 2, 5: 3, 6: 4, 7: 5, 8: 6},
    'H06': {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 9},
    'H09': {1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4},
    'H14': {1: 1, 2: 4, 3: 2, 4: 2, 5: 4, 6: 3, 7: 4, 8: 9},
    'H13': {1: 1, 2: 2, 4: 0},
}

for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)
    out_path = SAMPLES_DIR / f'sample_censo_table_f{frac}_{yr}_ARG.csv'

    # Draw households inside each departamento; the fraction is the base `frac` scaled by
    # the departamento's population ratio for this year (constant within a group, so the
    # mean is just that ratio).
    grouped = HOGAR_DPTO.merge(ratios[['DPTO', yr]], on='DPTO').groupby('DPTO')
    sample = grouped.apply(lambda x: x.sample(frac=frac*x[yr].mean(), random_state=sample_rng))

    # Restrict the lazy dwelling / household tables to the sampled ids before joining.
    VIVIENDA_sample = VIVIENDA.loc[VIVIENDA.VIVIENDA_REF_ID.isin(sample.VIVIENDA_REF_ID)]
    HOGAR_sample = HOGAR.loc[HOGAR.HOGAR_REF_ID.isin(sample.HOGAR_REF_ID)]

    tabla_censo = VIVIENDA_sample.merge(HOGAR_sample, on='VIVIENDA_REF_ID')
    tabla_censo = tabla_censo.merge(IX_TOT, on='HOGAR_REF_ID')

    with ProgressBar():
        table = tabla_censo.compute()

    for column, recode in EPH_RECODES.items():
        table[column] = table[column].map(recode)
    table['H16'] = table['H16'].clip(0, 9)

    # Attach the AGLOMERADO of each record (original author's note: results are used
    # per agglomeration).
    table = table.merge(radio_ref[['RADIO_REF_ID', 'AGLOMERADO']], on='RADIO_REF_ID')

    # The household-level table used to be written to out_path at this point; that write
    # was dropped because the person-level table below overwrites the very same file.

    PERSONA_sample = PERSONA.loc[PERSONA.HOGAR_REF_ID.isin(sample.HOGAR_REF_ID)]

    with ProgressBar():
        table = table.merge(PERSONA_sample.compute(), on='HOGAR_REF_ID')

    table['P07'] = table['P07'].map({1: 1, 2: 2, 0: 2})

    # Sanity check: how many distinct IDPROV / PROV values the sample covers.
    df = table[['RADIO_REF_ID']].merge(radio_ref, on='RADIO_REF_ID', how='left')
    display(df[['IDPROV', 'PROV']].nunique())

    # Saved once so that later runs can load the sample instead of rebuilding it.
    table.to_csv(out_path, index=False)