## Generación de datasets de entrenamiento a partir de los microdatos de la EPH

In [8]:
from IPython import get_ipython

def _str_to_bool(value):
    """
    Convert a yes/no style command-line value into a real boolean.

    Accepts booleans unchanged and the strings y/yes/true/t/1/s/si (True) and
    n/no/false/f/0 (False), case-insensitively. Raises ValueError otherwise,
    which argparse reports as an invalid argument value.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('y', 'yes', 'true', 't', '1', 's', 'si'):
        return True
    if text in ('n', 'no', 'false', 'f', '0'):
        return False
    raise ValueError('Expected a yes/no value, got {!r}'.format(value))


if get_ipython() is None:
    # No IPython shell available: the code runs as a plain script, so the
    # options come from the command line.
    print('ARGUMENTOS TOMADOS DE CLI')
    import argparse

    parser = argparse.ArgumentParser(description='A script to process data for a range of years')

    parser.add_argument('-y', '--years', nargs='+', type=int, default=[2022, 2023], required=False,
        help='Start year and end year to process data for. If only one year is given, the end year is that year plus one. Default is 2022 2023')
    # nargs='?' + type: "-ow False" now yields the boolean False. Previously the
    # value arrived as the one-element list ['False'], which is always truthy,
    # so overwriting could never be switched off from the command line.
    parser.add_argument('-ow', '--overwrite', nargs='?', const=True, default=True, type=_str_to_bool, required=False,
        help='Flag to specify if previous data should be overwritten. Default is True')

    args = parser.parse_args()

    overwrite = args.overwrite
    startyr = args.years[0]
    # A single year used to raise IndexError here; treat it as [year, year + 1].
    endyr = args.years[1] if len(args.years) > 1 else startyr + 1

else:
    # Inside IPython/Jupyter: ask the user interactively, with defaults on empty input.
    print('ARGUMENTOS INTRODUCIDOS POR EL USUARIO')
    startyr = input("Enter the start year [default: 2022]: ") or 2022
    endyr = input("Enter the end year [default: 2023]: ") or 2023
    overwrite = input("Do you want to overwrite previous data? [y/n] [default: y]: ") or "y"

    # Only the answer "y" (any case) enables overwriting.
    overwrite = overwrite.lower() == "y"

    # Convert the input to integers
    startyr = int(startyr)
    endyr = int(endyr)

    print("Start year: ", startyr)
    print("End year: ", endyr)
    print("Overwrite: ", overwrite)


ARGUMENTOS INTRODUCIDOS POR EL USUARIO
Start year:  2015
End year:  2016
Overwrite:  True


In [9]:
import pandas as pd
import numpy as np
import glob

In [10]:
# Reference table read from the project's info folder; only its AGLOMERADO and
# Region columns are used here.
radio_ref = pd.read_csv('./../data/info/radio_ref.csv')

# One row per distinct (AGLOMERADO, Region) pair.
AGLO_Region = radio_ref[['AGLOMERADO', 'Region']].drop_duplicates()

# Decide which single region an agglomerate belongs to. Per the original notes:
# GBA must go to Gran Buenos Aires even though some of its radios (Rodriguez,
# Escobar, etc.) fall in the Pampeana region, and Viedma-Patagones has to be
# assigned to one side, with most of its radios being Patagonia.
# This is fixed by hand because AGLO 0 legitimately spans several regions.
# NOTE(review): codes 33 and 93 presumably are GBA and Viedma-Patagones - confirm.
_aglo_to_fix = AGLO_Region['AGLOMERADO'].isin([33, 93])
_is_pampeana = AGLO_Region['Region'] == 'Pampeana'
AGLO_Region = AGLO_Region.loc[~(_aglo_to_fix & _is_pampeana)]

### Match column names
# Each pair is (EPH column name, census column name); the two parallel lists
# used by the rest of the notebook are derived from it, position by position.
_EPH_CENSO_PAIRS = [
    ('IX_TOT', 'IX_TOT'), ('CH04', 'P02'), ('CH06', 'P03'), ('CONDACT', 'CONDACT'),
    ('AGLOMERADO', 'AGLOMERADO'),
    ('IV1', 'V01'), ('IV3', 'H05'), ('IV4', 'H06'), ('IV5', 'H07'), ('IV6', 'H08'),
    ('IV7', 'H09'), ('IV8', 'H10'), ('IV10', 'H11'), ('IV11', 'H12'),
    ('II1', 'H16'), ('II2', 'H15'), ('II7', 'PROP'), ('II8', 'H14'), ('II9', 'H13'),
    ('CH09', 'P07'), ('CH10', 'P08'), ('CH12', 'P09'), ('CH13', 'P10'), ('CH15', 'P05'),
]

names_censo = [censo_name for _, censo_name in _EPH_CENSO_PAIRS]

names_EPH = [eph_name for eph_name, _ in _EPH_CENSO_PAIRS]

# Columns that get deflated and rounded later in the notebook.
col_mon = ['P21', 'P47T', 'PP08D1', 'TOT_P12', 'T_VI', 'V12_M', 'V2_M', 'V3_M', 'V5_M']



## Cargar IPC

In [None]:
def create_cpi_df(url: str, start_year: int, end_year: int) -> pd.DataFrame:
    """
    Load a CPI table and keep only the rows between two years (both included).

    The first column of the CSV is used as the index and parsed as dates; the
    year bounds are converted to strings and used to slice that date index.

    Parameters:
    url (str): Location of the CSV with the CPI data.
    start_year (int): First year to keep.
    end_year (int): Last year to keep.

    Returns:
    pd.DataFrame: The CPI rows whose date falls within the requested years.
    """
    prices = pd.read_csv(url, index_col=0)
    prices.index = pd.to_datetime(prices.index)

    # Year strings select whole years on a datetime index.
    first_year, last_year = str(start_year), str(end_year)
    return prices[first_year:last_year]

In [44]:

from datetime import datetime

# Dates derived from the current day: the current year (as a string) and the
# first day of the current month.
_today = datetime.today()
ano_actual = _today.strftime("%Y")
mes_actual = _today.replace(day=1).strftime("%Y-%m-%d")

# The three CPI files live in the same folder and differ only in a suffix.
_CPI_URL = 'https://raw.githubusercontent.com/matuteiglesias/IPC-Argentina/main/data/info/indice_precios_{}.csv'

# Quarterly CPI, from 2003 up to the current year.
cpi = create_cpi_df(_CPI_URL.format('Q'), 2003, end_year=ano_actual)
# Original note: force day 15 of the month.
_month_start = cpi.index - pd.offsets.MonthBegin(1)
cpi.index = _month_start + pd.offsets.Day(14)

# Monthly CPI.
cpi_M = create_cpi_df(_CPI_URL.format('M'), 2003, end_year=ano_actual)

# Daily CPI.
cpi_d = create_cpi_df(_CPI_URL.format('d'), 2003, end_year=ano_actual)

# Reference date for the CPI: ix is the index level on that date
# (100 by definition, according to the original note).
ix = cpi_d.loc['2016-01-01']['index']


log_index            1.194752
index             1565.973054
log_index_diff       0.025106
pct_m                5.955085
Name: 2023-01-01 00:00:00, dtype: float64

## Cargar EPHs

Los microdatos de la Encuesta Permanente de Hogares (copias actualizadas de los archivos oficiales) están disponibles en el repositorio:
https://github.com/matuteiglesias/microdatos-EPH-INDEC.git


SI ESTOS DATOS TE RESULTAN ÚTILES, TE PIDO DARLE UNA ESTRELLA (STAR) AL REPOSITORIO.

Haciendo `git pull` en ese mismo repositorio se puede actualizar la copia local con los nuevos microdatos a medida que se publican.

``cd path/to/microdatos-EPH-INDEC``

``git pull``

El INDEC tarda aproximadamente 130 días, una vez terminado un trimestre, en publicar las bases de microdatos.


In [16]:
## 
import os

# Directory where the EPH microdata repository is expected to live.
directorio_microdatos = "./../../microdatos-EPH-INDEC/"

# Clone the repository when the directory is missing, or when it exists but is
# empty (e.g. left behind by an earlier attempt that failed before cloning).
_microdatos_missing = not os.path.exists(directorio_microdatos)
_microdatos_empty = os.path.isdir(directorio_microdatos) and not os.listdir(directorio_microdatos)
if _microdatos_missing or _microdatos_empty:
    os.makedirs(directorio_microdatos, exist_ok=True)

    # Install GitPython on demand. pip is invoked through the running interpreter
    # instead of the IPython-only "!pip" syntax, so this also works when the code
    # is executed as a plain script.
    try:
        import git
    except ImportError:
        import importlib
        import subprocess
        import sys
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'gitpython'])
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        import git

    git.Repo.clone_from("https://github.com/matuteiglesias/microdatos-EPH-INDEC.git",
    directorio_microdatos)

# Ahora, tenemos los microdatos de la EPH en el directorio ./../../microdatos-EPH-INDEC/microdatos/

In [17]:
# Also make sure the directory that will hold the training data exists.
_training_dir = './../data/training/'
_training_dir_exists = os.path.exists(_training_dir)
if not _training_dir_exists:
    os.makedirs(_training_dir)

In [18]:
import glob
import pandas as pd

# Example: df = read_data_from_files(2022, '/path/to/directory', ['column1', 'column2', 'column3'])
def read_data_from_files(year: int, directory: str, columns: list[str]) -> pd.DataFrame:
    """
    Read every ';'-delimited file of a given year from a directory and stack them.

    A file belongs to the year when its name ends with the last two digits of
    the year followed by '.txt' (e.g. '*15.txt' for 2015). Files are read in
    sorted name order so the row order of the result is reproducible.

    Parameters:
        - year (int): The year to match in the file names.
        - directory (str): The directory where the files are located.
        - columns (List[str]): The columns to read from each file.

    Returns:
        - pd.DataFrame: The requested columns of all matching files, concatenated
          row-wise. The per-file row index is kept as read (it is not reset).

    Raises:
        - ValueError: If no file in the directory matches the year.
    """
    pattern = directory + '/*{}.txt'.format(str(year)[2:])
    # glob returns paths in arbitrary order; sort them for a deterministic result.
    matching_files = sorted(glob.glob(pattern))
    if not matching_files:
        # Fail with the searched pattern instead of pandas' generic
        # "No objects to concatenate" message.
        raise ValueError("No files matching '{}' were found for year {}".format(pattern, year))

    frames = [
        pd.read_csv(path, index_col=None, header=0, delimiter=';', usecols=columns)
        for path in matching_files
    ]
    return pd.concat(frames)


def correct_responses_hogar(df: pd.DataFrame) -> pd.DataFrame:
    """
    Recode household answers so they match the census coding.

    Rows with IV1 == 9 are dropped, the codes of IV10, II9 and II7 are collapsed
    through fixed lookup tables, and IX_TOT is limited to the range 0-8.
    Codes that are not present in a lookup table become NaN.

    Parameters:
        - df (pd.DataFrame): Household table with at least the columns
          IV1, IV10, II9, II7 and IX_TOT. It is not modified.

    Returns:
        - pd.DataFrame: A new, filtered and recoded table.
    """
    # Work on an explicit copy of the filtered rows: assigning columns on a
    # filter result triggers SettingWithCopyWarning, and the copy guarantees the
    # caller's frame is left untouched.
    df = df.loc[df.IV1 != 9].copy()
    df['IV10'] = df['IV10'].map({1: 1, 2: 2, 3: 2, 0: 0, 9: 9})
    df['II9'] = df['II9'].map({1: 1, 2: 2, 3: 2, 4: 4, 0: 0})
    df['II7'] = df['II7'].map({1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 6, 8: 6, 9: 6, 0: 0})

    # Cap the value at 8; anything larger is reported as 8.
    df['IX_TOT'] = df['IX_TOT'].clip(0, 8)
    return df

def correct_responses_individual(df_: pd.DataFrame) -> pd.DataFrame:
    """
    Recode individual answers so they match the census coding.

    Steps applied on a copy of the input:
      - ESTADO is renamed to CONDACT.
      - CH15 and CH09 are collapsed through fixed lookup tables (codes missing
        from a table become NaN).
      - Negative CH06 values are raised to 0.
      - CONDACT is set to 0 where CH06 < 14, as in the census.
      - CH12 == 99 becomes 0, and CH13 is set to 0 where CH12 is 0, 1 or 9.

    :param df_: DataFrame to be cleaned. It is not modified.
    :return: A cleaned copy of the input DataFrame.
    """
    # Copy the input DataFrame to prevent modifying the original object
    df = df_.copy()

    # Rename before touching CONDACT. Doing the under-14 override first (as the
    # code used to) created a brand-new CONDACT column, and the later rename of
    # ESTADO then produced two columns called CONDACT, leaving the real
    # activity status without the override.
    df = df.rename(columns = {'ESTADO': 'CONDACT'})

    df['CH15'] = df['CH15'].map({1:1, 2:1, 3:1, 4:2, 5:2, 9:0})
    df['CH06'] = df['CH06'].clip(0)
    df['CH09'] = df['CH09'].map({1:1, 2:2, 0:2, 3:2})
    df.loc[df['CH06'] < 14, 'CONDACT'] = 0 # Under 14 get CONDACT 0, as in the census

    # In the census, kindergarten and special education are not asked whether the
    # level was completed (original note), so CH13 is zeroed for those CH12 codes.
    df['CH12'] = df.CH12.replace(99, 0)
    df.loc[df.CH12.isin([0, 1, 9]), 'CH13'] = 0

    return df

In [26]:
# from pandas.tseries.offsets import MonthEnd

# Build one training CSV per year: load household and individual microdata,
# recode them, merge them, deflate the monetary columns, add the income
# rankings and binary income flags, and save the result.
# NOTE(review): range(startyr, endyr) stops before endyr, so the end year
# itself is not processed - confirm this is intended.
for y in range(startyr, endyr):
    print(y)
    yr = str(y)[2:]  # two-digit year used in the file names
    training_file = './../data/training/EPHARG_train_{}.csv'.format(yr)
    
    # Build the file if it does not exist yet, or if overwriting is enabled.
    if (not os.path.exists(training_file)) or (overwrite): 

        ## Household data
        hogar_df = read_data_from_files(y, directorio_microdatos + 'microdatos/hogar',
            ['CODUSU','ANO4','TRIMESTRE','IX_TOT', 'AGLOMERADO', 'IV1', 'IV3', 'IV4','IV5',
            'IV6','IV7','IV8','IV10','IV11','II1','II2','II7','II8','II9'])
        hogar_df = correct_responses_hogar(hogar_df)
        hogar_df = hogar_df.drop_duplicates()

        ## Individual data
        individual_df = read_data_from_files(y, directorio_microdatos + 'microdatos/individual',
            ['CODUSU','ANO4','TRIMESTRE','CH04','CH06', 'AGLOMERADO', 'CH09','CH10','CH12','CH13','CH15',
            'CH07', 'ESTADO','CAT_INAC','CAT_OCUP','PP07G1', 'PP07G2', 'PP07G3', 'PP07G4', 'PP07G_59', 'PP07H', 'PP07I', 'PP07J', 'PP07K',
            'P47T', 'V3_M', 'T_VI', 'V12_M', 'TOT_P12', 'V5_M','V2_M', 'PP08D1', 'P21'])
        individual_df = correct_responses_individual(individual_df)

        # Keep only individuals with a reported P47T value.
        individual_df = individual_df.dropna(subset = ['P47T'])

        # Individual columns that are not already in the household table, plus the merge keys.
        indiv_table = individual_df[list(individual_df.columns.difference(hogar_df.columns)) + ['CODUSU', 'ANO4', 'TRIMESTRE', 'AGLOMERADO']]

        # Attach the household answers to every individual (pandas' default inner join on the keys).
        EPH = hogar_df.merge(indiv_table, on = ['CODUSU', 'ANO4', 'TRIMESTRE', 'AGLOMERADO'])#, indicator = True)

        # Add the Region of each agglomerate; no key is given, so the merge uses
        # the columns both tables share (presumably just AGLOMERADO - TODO confirm).
        EPH = EPH.merge(AGLO_Region)

        # Duplicate every row with AGLOMERADO set to 0, so each observation appears
        # once under its own agglomerate code and once under code 0.
        EPH_no_aglo = EPH.copy(); 
        EPH_no_aglo['AGLOMERADO'] = 0

        EPH = pd.concat([EPH, EPH_no_aglo]).reset_index(drop = True)

        # Quarters / deflation
        # Q is a date built from the year and month 3*TRIMESTRE, moved back one
        # month and forward 14 days (e.g. TRIMESTRE 1 -> 15 February).
        EPH['Q'] = EPH.ANO4.astype(str) + ':' + (3*EPH.TRIMESTRE).astype(str)
        EPH['Q'] = pd.to_datetime(EPH['Q'], format='%Y:%m') - pd.DateOffset(months=1) + pd.DateOffset(days=14)

        # Divide the monetary columns by the CPI 'index' looked up by Q and rescale by ix.
        # The looked-up values are applied positionally (.values), which assumes cpi
        # has exactly one row per Q - TODO confirm.
        EPH[col_mon] = ix*EPH[col_mon].div(EPH[['Q'] + col_mon].merge(cpi, on = 'Q', how = 'left')['index'].values, 0)

        EPH[col_mon] = EPH[col_mon].round()

        # Switch from EPH column names to the census column names.
        training = EPH.rename(columns = dict(zip(names_EPH, names_censo)))
        
        # remove bad observations: keep rows with P47T >= -0.001, then fill remaining NaN with 0
        training = training.loc[training.P47T >= -0.001].fillna(0)
        
        # Drop rows where any of these columns holds the code 9.
        for col in ['CAT_OCUP', 'CH07', 'PP07G1', 'PP07G_59', 'PP07I', 'PP07J', 'PP07K']:
            training = training.loc[training[col] != 9]

        ### AGGLOMERATE RANKING
        # Mean P47T of rows with CAT_OCUP == 3 and P47T >= 100, per year and
        # agglomerate, turned into a percentile rank rounded to 3 decimals.
        AGLO_rk = training.loc[(training.CAT_OCUP == 3) & (training.P47T >= 100)].groupby(['ANO4', 'AGLOMERADO'])[['P47T']].mean()
        AGLO_rk['AGLO_rk'] = AGLO_rk.rank(pct = True).round(3)
        AGLO_rk = AGLO_rk.sort_values('P47T').reset_index()
        AGLO_rk = AGLO_rk[['ANO4', 'AGLOMERADO', 'AGLO_rk']].drop_duplicates()

        ### REGION RANKING
        # Same computation as above, grouped by year and region.
        Reg_rk = training.loc[(training.CAT_OCUP == 3) & (training.P47T >= 100)].groupby(['ANO4', 'Region'])[['P47T']].mean()
        Reg_rk['Reg_rk'] = Reg_rk.rank(pct = True).round(3)
        Reg_rk = Reg_rk.sort_values('P47T').reset_index()
        Reg_rk = Reg_rk[['ANO4', 'Region', 'Reg_rk']].drop_duplicates()
            
        # Attach both rankings; these are inner joins on the shared columns, so rows
        # whose agglomerate or region has no ranking are dropped.
        training = training.merge(AGLO_rk).merge(Reg_rk)
        
        ## Create binary income columns (1 when the amount is above 100).
        training['INGRESO'] = (training.P47T > 100).astype(int)
        training['INGRESO_NLB'] = (training.T_VI > 100).astype(int)
        training['INGRESO_JUB'] = (training.V2_M > 100).astype(int)
        training['INGRESO_SBS'] = (training.V5_M > 100).astype(int)
        
        ## Sort by household id.
        training = training.sort_values('CODUSU')
        
        training.to_csv(training_file, index = False)
        print('Saved to', training_file)

2015
Hogar - Indiv merged:
Saved to ./../data/training/EPHARG_train_15.csv


### Ranking de AGLOS y Regiones

A continuación se extrae el ranking de aglomerados y regiones para cada uno de los años.

In [1]:
import pandas as pd

In [2]:
# Years whose training files are read; the end year itself is not included
# in the loop below.
startyr, endyr = 2023, 2024

# Collect, for every year, the distinct ranking rows stored in its training file.
aglo_list, regs_list = [], []
for y in range(startyr, endyr):
    print(y)
    yr = str(y)[2:]
    training_file = './../data/training/EPHARG_train_{}.csv'.format(yr)

    # Agglomerate ranking: one row per (ANO4, AGLOMERADO, AGLO_rk) combination.
    aglo_table = pd.read_csv(training_file, usecols=['ANO4', 'AGLOMERADO', 'AGLO_rk'])
    aglo_table = aglo_table.drop_duplicates()
    aglo_list.append(aglo_table)

    # Region ranking: one row per (ANO4, Region, Reg_rk) combination.
    regs_table = pd.read_csv(training_file, usecols=['ANO4', 'Region', 'Reg_rk'])
    regs_table = regs_table.drop_duplicates()
    regs_list.append(regs_table)

# Stack the yearly tables and write them next to the other reference files.
aglo_rk = pd.concat(aglo_list)
regs_rk = pd.concat(regs_list)

aglo_rk.to_csv('./../data/info/AGLO_rk', index=False)
regs_rk.to_csv('./../data/info/Reg_rk', index=False)

2023


## Listo. Salvado el training set.