

### **1. Configuración y Datos Auxiliares.ipynb**
  - **1.1. Configuración del entorno:** Importación de bibliotecas necesarias y configuración.
  - **1.2. Repositorios:**
    - **1.2.1.** Verificación de la existencia de repositorios.
    - **1.2.2.** Clonación de repositorios si es necesario.
  - **1.3. Carga de datos auxiliares:** 
    - **1.3.1.** Importación de datos de encuestas.
    - **1.3.2.** Importación y procesamiento del índice de precios al consumidor (IPC).
    - **1.3.3.** Importación de identificadores geográficos.


In [12]:
# -------------------
# Libraries and Imports
# -------------------
import subprocess

import os
import pandas as pd
from funciones import log_message, transform_censo_data, generate_unique_ids, generate_Qs

# -------------------
# Parameters and Configuration
# -------------------

# Presumably the census sampling fraction (TODO confirm); it is embedded in the
# sample file names and forwarded to samplear.py through its -f option.
FRAC = 0.02
# Years are iterated with range(START_YEAR, END_YEAR), so END_YEAR is exclusive.
START_YEAR = 2003
END_YEAR = 2024
# Tag embedded in the sample file names and forwarded to samplear.py through -n.
EXPERIMENT_TAG = 'ARG'

# Location of the local samplerCensoARG clone, relative to the notebook's
# working directory. The trailing slash matters: file names are appended to it
# by plain string formatting.
REPO_PATH = './../../samplerCensoARG/'
# Census database location, forwarded to samplear.py through -dbp.
DATABASE_PATH = "/media/matias/Elements/suite/ext_CPV2010_basico_radio_pub"
# -------------------
# Repository Operations
# -------------------

# Clone the sampler repository when no local copy exists.
# A bare name after an IPython `!` command is passed to the shell verbatim, so
# `!git clone <url> REPO_PATH` would clone into a directory literally called
# "REPO_PATH". Calling git through subprocess passes the real path and keeps
# this cell valid plain Python; check=True surfaces a failed clone immediately.
if not os.path.exists(REPO_PATH):
    subprocess.run(
        ["git", "clone", "https://github.com/matuteiglesias/samplerCensoARG.git", REPO_PATH],
        check=True,
    )

# -------------------
# File Operations
# -------------------

# path = './../../encuestador-de-hogares/fitted_RF/clf4_'
# allFiles = sorted(glob.glob(path +'*'))
# allqs = [f[-14:-4] for f in allFiles]
# print(allqs)

# # Ensure the results directory exists
# if not os.path.exists('./../data/resultados'):
#     os.makedirs('./../data/resultados')



# Check and clone repository if necessary
# ...



# Load radio reference
# ...

In [13]:


# Load the radio reference table (and the auxiliary columns it carries).
radio_ref = pd.read_csv(filepath_or_buffer='./../data/info/radio_ref.csv')




# Load agglomerate information and merge with radio reference
# ...

In [14]:

# Load the radio -> agglomerate table.
radio_AGLO = pd.read_csv(
    'https://raw.githubusercontent.com/matuteiglesias/Aglomerados-EPH-INDEC/main/result/radios_aglo_EPH.csv'
)
# Derive the join key and the two agglomerate columns:
#   radio      - COD_2010 with 'XX' replaced by '99' so the code parses as an int
#   AGLOMERADO - copy of eph_codagl
#   NOMAGLO    - copy of eph_aglome
radio_AGLO = radio_AGLO.assign(
    radio=lambda df: df['COD_2010'].str.replace('XX', '99').astype(int),
    AGLOMERADO=lambda df: df['eph_codagl'],
    NOMAGLO=lambda df: df['eph_aglome'],
)

# Replace the AGLOMERADO column of radio_ref with the one coming from radio_AGLO.
# The left join keeps every radio_ref row; rows without a match get code 0.
radio_ref = (
    radio_ref
    .drop(columns=['AGLOMERADO'])
    .merge(radio_AGLO[['radio', 'AGLOMERADO', 'NOMAGLO']], how='left')
)
radio_ref['AGLOMERADO'] = radio_ref['AGLOMERADO'].fillna(0).astype(int)

# Load the department -> region mapping and attach it (default merge: inner join
# on whatever columns the two frames share).
dpto_region = pd.read_csv('./../data/info/DPTO_PROV_Region.csv')
radio_ref = pd.merge(radio_ref, dpto_region)


In [15]:
# radio_ref.dtypes/

# Load rankings and map regions
# ...

In [16]:
# Agglomerate rankings. Pivot to one row per ANO4 and one column per AGLOMERADO,
# fill every missing cell with the mean of its column, then return to long format.
AGLO_rk = pd.read_csv('./../../encuestador-de-hogares/data/info/AGLO_rk')
rk_table = AGLO_rk.set_index(['ANO4', 'AGLOMERADO']).unstack()
AGLO_rk = AGLO_rk_filled = (
    rk_table
    .fillna(rk_table.mean())
    .stack()
    .reset_index()
)

# Region rankings.
Reg_rk = pd.read_csv('./../../encuestador-de-hogares/data/info/Reg_rk')

# Display name of each region -> identifier used in the rest of the pipeline.
regiones = {
    'Gran Buenos Aires': 'gran_buenos_aires',
    'Pampeana': 'pampeana',
    'Noroeste': 'noroeste',
    'Noreste': 'noreste',
    'Patagónica': 'patagonia',
    'Cuyo': 'cuyo',
}

# Rewrite the Region column through the mapping; any value that is not a key of
# `regiones` becomes NaN.
Reg_rk = Reg_rk.assign(Region=Reg_rk['Region'].map(regiones))



In [17]:
# file_path = f'{REPO_PATH}data/censo_samples/table_f{FRAC}_{syr}_{EXPERIMENT_TAG}.csv'
# pd.read_csv(file_path, nrows=1000).dtypes

In [18]:
# file_path

In [19]:
# input_path = f'{REPO_PATH}data/censo_samples/table_f{FRAC}_{syr}_{EXPERIMENT_TAG}.csv'
# input_path

In [20]:
# Build one transformed population table per year in [START_YEAR, END_YEAR):
# make sure the census sample exists (running the sampler if it does not),
# transform it, attach region / ranking columns and write it to disk.
log_message("Script started.")

# The notebook's working directory; REPO_PATH and file_path are relative to it.
original_wd = os.getcwd()

# Directory samplear.py is launched from. It is handed to subprocess.run via
# `cwd=` instead of os.chdir(), so the notebook's own working directory is never
# changed and cannot be left pointing at the wrong place if the call raises.
sampler_dir = os.path.join(original_wd, REPO_PATH, "notebooks")

# Destination of the transformed tables. Created once, up front, so an
# unavailable output location fails before any year is processed.
output_directory = '/media/matias/Elements/suite/poblaciones'
os.makedirs(output_directory, exist_ok=True)

for yr in range(START_YEAR, END_YEAR):
    syr = str(yr)
    log_message(f"Processing year: {syr}")

    file_path = f'{REPO_PATH}data/censo_samples/table_f{FRAC}_{syr}_{EXPERIMENT_TAG}.csv'

    if not os.path.exists(file_path):
        log_message(f"File {file_path} not found. Running the sampler script...")

        # -y receives two values: this year and the following one.
        cmd = ["python", "samplear.py", "-dbp", DATABASE_PATH, "-n", EXPERIMENT_TAG, "-f", str(FRAC), "-y", syr, str(yr + 1)]
        completed = subprocess.run(cmd, cwd=sampler_dir)
        if completed.returncode != 0:
            # Not fatal here: if the sample file is still missing, read_csv
            # below raises; this line just records why.
            log_message(f"Sampler exited with code {completed.returncode} for year {syr}.")

    # Extract: load the sample and normalise the column name / year.
    # rename() already returns a new frame, so no explicit copy is needed.
    input_data = pd.read_csv(file_path)
    table = input_data.rename(columns={'TOTPERS': 'IX_TOT'})
    table['ANO4'] = yr

    # Transform.
    table = transform_censo_data(table)

    # Add the region. Default merge: inner join on the shared columns, so rows
    # without a matching DPTO are dropped.
    table = table.merge(dpto_region[['DPTO', 'Region']])

    # Generate unique IDs. NOTE(review): the second argument looks like a digit
    # count (see generate_unique_ids) - confirm.
    table = generate_unique_ids(table, 9)

    # Add the agglomerate and region rankings (inner joins on shared columns).
    table = (
        table
        .merge(AGLO_rk[['AGLOMERADO', 'ANO4', 'AGLO_rk']])
        .merge(Reg_rk[['Region', 'ANO4', 'Reg_rk']])
    )

    # Save the transformed table.
    table.to_csv(f'{output_directory}/table_f{FRAC}_{syr}_{EXPERIMENT_TAG}.csv', index=False)

log_message("Script completed.")


[2023-10-28 18:33:58] Script started. 
[2023-10-28 18:33:58] Processing year: 2004 


[2023-10-28 18:39:29] Processing year: 2005 
[2023-10-28 18:43:06] Processing year: 2006 
[2023-10-28 18:47:14] Processing year: 2007 
[2023-10-28 18:52:52] Processing year: 2008 
[2023-10-28 18:58:59] Processing year: 2009 
[2023-10-28 19:04:47] Processing year: 2010 
[2023-10-28 19:10:59] Processing year: 2011 
[2023-10-28 19:17:20] Processing year: 2012 
[2023-10-28 19:23:53] Processing year: 2013 
[2023-10-28 19:30:55] Processing year: 2014 
[2023-10-28 19:37:24] Processing year: 2015 
[2023-10-28 19:44:21] Processing year: 2016 
[2023-10-28 19:51:39] Processing year: 2017 
[2023-10-28 19:58:51] Processing year: 2018 
[2023-10-28 20:07:42] Processing year: 2019 
[2023-10-28 20:16:06] Processing year: 2020 
[2023-10-28 20:23:22] Processing year: 2021 
[2023-10-28 20:30:21] Processing year: 2022 
[2023-10-28 20:38:29] Processing year: 2023 
[2023-10-28 20:46:48] Script completed. 


In [21]:
    # table.to_csv(f'{output_directory}/table_f{FRAC}_{syr}_{EXPERIMENT_TAG}.csv', index=False)


In [22]:
# Show the dtype of every column of `table`; after the loop above this is the
# table built in the last iteration.
table.dtypes

# ID                   int64
# VIVIENDA_REF_ID      int64
# RADIO_REF_ID         int64
# TIPVV                int64
# V01                float64
# URP                  int64
# DPTO                 int64
# PROV                 int64
# AGLOMERADO           int64
# HOGAR_REF_ID         int64
# H05                  int64
# H06                float64
# H07                  int64
# H08                  int64
# H09                float64
# H10                  int64
# H11                  int64
# H12                  int64
# H13                float64
# H14                float64
# H15                  int64
# H16                  int64
# PROP                 int64
# IX_TOT               int64
# PERSONA_REF_ID       int64
# P01                  int64
# P02                  int64
# P03                  int64
# P05                  int64
# P06                  int64
# P07                  int64
# P12                  int64
# P08                  int64
# P09                  int64
# P10                  int64
# CONDACT              int64
# ANO4                 int64
# Region              object
# AGLO_rk            float64
# Reg_rk             float64
# dtype: object

ID                   int64
VIVIENDA_REF_ID      int64
RADIO_REF_ID         int64
TIPVV                int64
V01                float64
URP                  int64
DPTO                 int64
PROV                 int64
AGLOMERADO           int64
HOGAR_REF_ID         int64
H05                  int64
H06                float64
H07                  int64
H08                  int64
H09                float64
H10                  int64
H11                  int64
H12                  int64
H13                float64
H14                float64
H15                  int64
H16                  int64
PROP                 int64
IX_TOT               int64
PERSONA_REF_ID       int64
P01                  int64
P02                  int64
P03                  int64
P05                  int64
P06                  int64
P07                  int64
P12                  int64
P08                  int64
P09                  int64
P10                  int64
CONDACT              int64
ANO4                 int64
R