# Predicting on CENSO samples.

In [264]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [265]:
## Modulos
import pandas as pd
import numpy as np

## Cargar info empleo:

In [266]:
import matplotlib.pyplot as plt
from datetime import datetime

empleo = pd.read_csv('https://raw.githubusercontent.com/matuteiglesias/empleoARG/main/datos/45.2_ECTDT.csv')
empleo = empleo[['45.2_IT_0_T_13', '45.2_ECTDT_0_T_33']] # ('45.2_ECTDT_0_T_33' es tasa de desocupacion en total aglomerados)
empleo['Q'] = pd.to_datetime(empleo['45.2_IT_0_T_13']) + pd.DateOffset(months=1, days = 14)
empleo = empleo.set_index('Q').drop(['45.2_IT_0_T_13'], axis = 1)
empleo = empleo.replace('s/d', np.nan).astype(float).round(4)
empleo['censo2010_ratio'] = (empleo/empleo.loc['2010-11-15'])

# **Tasa de desempleo en censo 2010**
## notar que la tasa en Aglos, segun el censo, no es igual al valor de la serie de tiempo.
# para oct 2010 el censo da (6.29 %) y la que tenemos en dato (7.5%)
desoc_C2010 = pd.read_csv('./../data/info/desoc_AGLOsi_C2010.csv')
tasa_C2010 = desoc_C2010.loc[desoc_C2010.AGLO_si == True]['Tasa desocupacion'].values[0]
tasa_C2010


0.0628729377307203

# Predicting

## Lista de trimestres con modelos ya calculados

In [267]:
import datetime as dt
## Trimestres con ingresos disponibles (depende de disponibilidad de microdatos EPH)
import glob

path = './../../encuestador-de-hogares/fitted_RF/clf4_' # use your path

allFiles = []

allFiles += glob.glob(path +'*')
allFiles = sorted(allFiles)
# allFiles[-5:]


In [268]:

allqs = [f[-14:-4] for f in allFiles]
print(sorted(allqs)[:10])
print(sorted(allqs)[-10:])

['-02-15_ARG', '-05-15_ARG', '2003-08-15', '2003-11-15', '2004-02-15', '2004-05-15', '2004-08-15', '2004-11-15', '2005-02-15', '2005-05-15']
['2020-11-15', '2021-02-15', '2021-05-15', '2021-08-15', '2021-11-15', '2022-02-15', '2022-05-15', '2022-08-15', '2022-11-15', '2023-02-15']


In [269]:
import os

if not os.path.exists('./../data/resultados'):
    os.makedirs('./../data/resultados')

## Parametros

### Anios a calcular

In [270]:
### IMPORTANTE ELEGIR ANIOS
startyr = 2019
endyr = 2023

## Elegir el dataset usado como X:
experiment_tag = 'ARG'
models_tag = 'ARG'
frac = '0.05'

# 1174037/(18645609 + 1174037)

In [271]:
### Funcion ajustar nivel de empleo


def ajustar_empleo(data, q, verbose = False):

        ratio = empleo.loc[pd.to_datetime(q)].censo2010_ratio
        n_desempleados_ = ratio*(CONDACT_cnts[1] + CONDACT_cnts[2])*tasa_C2010
        desemp_adic = round(n_desempleados_ - CONDACT_cnts.loc[2]) # Desempleados adicionales
        
        print(str(q)[:10])

        if desemp_adic > 0:
            data.loc[
                data.query('CONDACT == 1').sample(desemp_adic).index,
                'CONDACT'
            ] = 2
        elif desemp_adic < 0:
            data.loc[
                data.query('CONDACT == 2').sample(- desemp_adic).index,
                'CONDACT'
            ] = 1

        if verbose:
            desempleo = data.CONDACT.value_counts().loc[2] / (data.CONDACT.value_counts().loc[1] + data.CONDACT.value_counts().loc[2])
            print('desempleo:' + str(desempleo))
        
        return data

In [272]:
import joblib
# import gc

def predict_save(X_data, x_cols, y_cols, model_filename, out_filename, tag, overwrite = False):

    # Si todavia no existe la training data de ese anio, o si la opcion overwrite esta activada:
    if (not os.path.exists(out_filename)) or (overwrite): 

        CLF = joblib.load(model_filename)
        
        y_out = CLF.predict(X_data[x_cols].values)

        ## Listo
        y_censo_fit = pd.DataFrame(y_out, index = X_data.index, columns=y_cols)
        
        # Xy_censo = pd.concat([X_data, y_censo_fit], axis = 1)

#             save
        y_censo_fit.to_csv(out_filename, index = True) #, index_label = 'ID')
        print('File saved at '+ out_filename)
        del X_data; del CLF

    # return y_censo_fit
#             gc.collect()

In [273]:
overwrite = True

In [274]:
import sys
sys.path.append('./../../../repos/encuestador-de-hogares/data/info')
from variables import *  # x_cols1, x_cols2, etc

In [275]:
models_path = './../../encuestador-de-hogares'
adapted_Censo_files_path = '/media/matias/Elements/suite/poblaciones/'

def run_predict_save(iter_dict):
    predict_save(**iter_dict)
    return pd.read_csv(iter_dict['out_filename'], index_col=['ID'])

for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)
    file_ = adapted_Censo_files_path + '/table_f'+str(frac)+'_'+yr+'_'+experiment_tag+'.csv'

    X_censo = pd.read_csv(file_, usecols = x_cols1 + 
    ['ID','AGLOMERADO', 'DPTO', 'HOGAR_REF_ID', 'PERSONA_REF_ID', 'RADIO_REF_ID', 'URP'], 
    index_col=['ID']).fillna(0)

    ## Tratamiento trimestral 
    qs = np.array(allqs)[[i for i, si in enumerate(allqs) if si.startswith(yr)]]
    print(qs)
    
    CONDACT_cnts = X_censo.CONDACT.value_counts()
        
    ### Cargar modelos de la parte no trimestral (anual).
    for q in sorted(qs):

        out_filename1 = '/media/matias/Elements/suite/resultados/RFC1_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'
        out_filename2 = '/media/matias/Elements/suite/resultados/RFC2_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'
        out_filename3 = '/media/matias/Elements/suite/resultados/RFC3_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'
        out_filename4 = '/media/matias/Elements/suite/resultados/RFReg_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'

        ### AJUSTAR NIVEL DE DESEMPLEO
        X_q = X_censo.copy()
        X_q['Q'] = q
        print('Nuevo trimestre.')

        X_q = ajustar_empleo(X_q, q)
        print('Poblacion: ', len(X_q)/float(frac))


        # Define the first iteration separately
        predict_save_iter_dict1 = {
            'X_data': X_q,
            'x_cols': x_cols1, 'y_cols': y_cols1,
            'out_filename': out_filename1,
            'model_filename': models_path + '/fitted_RF/clf1_'+yr+'_'+models_tag,
            'tag': 'clf1_'+yr+'_'+models_tag,
            'overwrite': overwrite
        }
        result1 = run_predict_save(predict_save_iter_dict1)
        print('Poblacion: ', len(result1)/float(frac))

        # Second iteration
        predict_save_iter_dict2 = {
            'X_data': pd.concat([X_q, result1], axis=1),
            'x_cols': x_cols2, 'y_cols': y_cols2,
            'out_filename': out_filename2,
            'model_filename': models_path + '/fitted_RF/clf2_'+yr+'_'+models_tag,
            'tag': 'clf2_'+yr+'_'+models_tag,
            'overwrite': overwrite
        }
        result2 = run_predict_save(predict_save_iter_dict2)
        print('Poblacion: ', len(result2)/float(frac))

        # Third iteration
        predict_save_iter_dict3 = {
            'X_data': pd.concat([X_q, result1, result2], axis=1),
            'x_cols': x_cols3, 'y_cols': y_cols3,
            'out_filename': out_filename3,
            'model_filename': models_path + '/fitted_RF/clf3_'+yr+'_'+models_tag,
            'tag': 'clf3_'+yr+'_'+models_tag,
            'overwrite': overwrite
        }
        result3 = run_predict_save(predict_save_iter_dict3)
        print('Poblacion: ', len(result3)/float(frac))

        # Fourth iteration
        predict_save_iter_dict4 = {
            'X_data': pd.concat([X_q, result1, result2, result3], axis=1),
            'x_cols': x_cols4, 'y_cols': columnas_pesos,
            'out_filename': out_filename4,
            'model_filename': models_path + '/fitted_RF/clf4_'+str(q)[:10]+'_'+models_tag,
            'tag': 'clf4_'+yr+'_'+models_tag,
            'overwrite': True,
        }
        result4 = run_predict_save(predict_save_iter_dict4)
        print('Poblacion: ', len(result4)/float(frac))

                                
    del X_censo; #del clf1; del clf2; del clf3


2019
['2019-02-15' '2019-05-15' '2019-08-15' '2019-11-15']
Nuevo trimestre.
2019-02-15
Poblacion:  44333140.0
File saved at /media/matias/Elements/suite/resultados/RFC1_0.05_2019-02-15_ARG.csv
Poblacion:  44333140.0
File saved at /media/matias/Elements/suite/resultados/RFC2_0.05_2019-02-15_ARG.csv
Poblacion:  44333140.0
File saved at /media/matias/Elements/suite/resultados/RFC3_0.05_2019-02-15_ARG.csv
Poblacion:  44333140.0
File saved at /media/matias/Elements/suite/resultados/RFReg_0.05_2019-02-15_ARG.csv
Poblacion:  44333140.0
Nuevo trimestre.
2019-05-15
Poblacion:  44333140.0
File saved at /media/matias/Elements/suite/resultados/RFC1_0.05_2019-05-15_ARG.csv
Poblacion:  44333140.0
File saved at /media/matias/Elements/suite/resultados/RFC2_0.05_2019-05-15_ARG.csv
Poblacion:  44333140.0
File saved at /media/matias/Elements/suite/resultados/RFC3_0.05_2019-05-15_ARG.csv
Poblacion:  44333140.0
File saved at /media/matias/Elements/suite/resultados/RFReg_0.05_2019-05-15_ARG.csv
Poblacion:  

KeyError: Timestamp('2022-05-15 00:00:00')

In [None]:
X_q.reset_index()['ID'].nunique()

2216657

In [None]:
X_q.index

Index([73807214819, 30423055419, 34416671619, 30203173819, 91440759019,
       84184536919, 78237969419, 79210603819, 57285309219, 26782205619,
       ...
       93311449219, 46821331819, 21243686719, 44173939819, 45149030219,
       65556038619, 51138346319, 25163700219, 73814969119, 42916481319],
      dtype='int64', name='ID', length=2216657)

In [None]:
result1.index

Index([6141843519, 7972274719, 7904497319, 6277325019, 4727717819, 1913777819,
       3575979319, 8164317919, 9899577019, 3599100819,
       ...
       3424829519, 4195493619, 2237885319, 8430669719, 5068540419, 4022512119,
       9657089119, 4054818819, 2001774219, 9351239519],
      dtype='int64', name='ID', length=1285695)

In [None]:
pd.concat([X_q, result1], axis=1)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
X_q.head().reset_index()

Unnamed: 0,ID,RADIO_REF_ID,V01,URP,DPTO,AGLOMERADO,HOGAR_REF_ID,H05,H06,H07,...,P05,P07,P08,P09,P10,CONDACT,IX_TOT,AGLO_rk,Reg_rk,Q
0,6141843519,1,1.0,1,2001,32,2,1,1.0,1,...,1,1,2,4,1,1,1,0.939,0.833,2019-02-15
1,7972274719,1,1.0,1,2001,32,3,1,3.0,1,...,1,1,2,7,2,1,2,0.939,0.833,2019-02-15
2,7904497319,1,1.0,1,2001,32,3,1,3.0,1,...,1,1,2,6,1,3,2,0.939,0.833,2019-02-15
3,6277325019,1,0.0,1,2001,32,40,0,0.0,0,...,1,1,1,4,2,1,0,0.939,0.833,2019-02-15
4,4727717819,1,0.0,1,2001,32,44,0,0.0,0,...,1,1,2,7,2,3,0,0.939,0.833,2019-02-15


In [None]:
# debugger
# import pandas as pd

# # List of filenames
# filenames = [
#     '/media/matias/Elements/suite/resultados/RFC1_0.01_2019-05-15_ARG.csv',
#     '/media/matias/Elements/suite/resultados/RFC2_0.01_2019-05-15_ARG.csv',
#     '/media/matias/Elements/suite/resultados/RFC3_0.01_2019-05-15_ARG.csv',
#     '/media/matias/Elements/suite/resultados/RFReg_0.01_2019-05-15_ARG.csv'
# ]

# # Loop through each filename and print the columns
# for filename in filenames:
#     df = pd.read_csv(filename, nrows=1) # Reading only the first row
#     print(f"Dtypes in {filename}: {df.dtypes}")

In [None]:
# # import sys
# # # These are the usual ipython objects, including this one you are creating
# # ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']


# for yr in [str(s) for s in range(startyr, endyr)]:
#     print(yr)
#     file_ = '/media/matias/Elements/suite/yr_samples/table_f'+str(frac)+'_'+yr+'_'+experiment_tag+'.csv'

#     X_censo = pd.read_csv(file_, usecols = ['DPTO','RADIO_REF_ID','PERSONA_REF_ID', 'HOGAR_REF_ID','IX_TOT', 'P02', 'P03', 'CONDACT', 'AGLOMERADO', 'URP', 'V01', 'H05', 'H06',
#            'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14', 'AGLO_rk', 'Reg_rk',
#            'H13', 'P07', 'P08', 'P09', 'P10', 'P05']).fillna(0)

#     ## Tratamiento trimestral 
#     qs = np.array(allqs)[[i for i, si in enumerate(allqs) if si.startswith(yr)]]
#     print(qs)
    
#     CONDACT_cnts = X_censo.CONDACT.value_counts()
    
# #     print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])
    
#     ### Cargar modelos de la parte no trimestral (anual).
#     for q in sorted(qs):
        
#         ### AJUSTAR NIVEL DE DESEMPLEO
#         X_q = X_censo.copy()
#         X_q['Q'] = q
#         print('Nuevo trimestre.')

#         X_q = ajustar_empleo(X_q)

# # import sys
# # # These are the usual ipython objects, including this one you are creating
# # ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']


# for yr in [str(s) for s in range(startyr, endyr)]:
#     print(yr)
#     file_ = '/media/matias/Elements/suite/yr_samples/table_f'+str(frac)+'_'+yr+'_'+experiment_tag+'.csv'

#     X_censo = pd.read_csv(file_, usecols = ['DPTO','RADIO_REF_ID','PERSONA_REF_ID', 'HOGAR_REF_ID','IX_TOT', 'P02', 'P03', 'CONDACT', 'AGLOMERADO', 'URP', 'V01', 'H05', 'H06',
#            'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14', 'AGLO_rk', 'Reg_rk',
#            'H13', 'P07', 'P08', 'P09', 'P10', 'P05']).fillna(0)

#     ## Tratamiento trimestral 
#     qs = np.array(allqs)[[i for i, si in enumerate(allqs) if si.startswith(yr)]]
#     print(qs)
    
#     CONDACT_cnts = X_censo.CONDACT.value_counts()
    
# #     print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])
    
#     ### Cargar modelos de la parte no trimestral (anual).
#     for q in sorted(qs):
        
#         ### AJUSTAR NIVEL DE DESEMPLEO
#         X_q = X_censo.copy()
#         X_q['Q'] = q
#         print('Nuevo trimestre.')

#         X_q = ajustar_empleo(X_q)

#         #################################    #################################    #################################
        
#         print('C1')
#         ## CLASIF 1
#         X_data = X_q;
#         y_cols1 = ['CAT_OCUP', 'CAT_INAC', 'CH07']
#         x_cols1 = ['IX_TOT', 'P02', 'P03', 'AGLO_rk', 'Reg_rk', 'V01', 'H05', 'H06',
#        'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14',
#        'H13', 'P07', 'P08', 'P09', 'P10', 'P05', 'CONDACT']
#         out_filename1 = '/media/miglesia/Elements/suite/yr_samples/RFC1_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'

#         predict_save(X_data,
#                      x_cols = x_cols1,
#                      y_cols = y_cols1,
#                      out_filename = out_filename1,
#                      model_filename = models_path + '/fitted_RF/clf1_'+yr+'_'+models_tag,
#                      tag = 'clf1_'+yr+'_'+models_tag,
#                     overwrite = overwrite)
        
#         del X_q; del X_data
# #         print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])

#         #################################    #################################    #################################

#         print('C2')
#         ## CLASIF 2
#         X_data = pd.read_csv(out_filename1)
#         y_cols2 = ['INGRESO', 'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS']
#         x_cols2 = x_cols1 + y_cols1
#         out_filename2 = '/media/miglesia/Elements/suite/yr_samples/RFC2_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'

#         predict_save(X_data,
#                      x_cols = x_cols2,
#                      y_cols = y_cols2,
#                      out_filename = out_filename2,
#                      model_filename = models_path + '/fitted_RF/clf2_'+yr+'_'+models_tag,
#                      tag = 'clf2_'+yr+'_'+models_tag,
#                     overwrite = overwrite)

#         del X_data
# #         print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])

#         #################################    #################################    #################################

#         print('C3')

#         ## CLASIF 3
#         X_data = pd.read_csv(out_filename2)
#         y_cols3 = ['PP07G1','PP07G_59', 'PP07I', 'PP07J', 'PP07K']
#         x_cols3 = x_cols2 + y_cols2
#         out_filename3 = '/media/miglesia/Elements/suite/yr_samples/RFC3_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'

#         predict_save(X_data,
#                      x_cols = x_cols3,
#                      y_cols = y_cols3,
#                      out_filename = out_filename3,
#                      model_filename = models_path + '/fitted_RF/clf3_'+yr+'_'+models_tag,
#                      tag = 'clf3_'+yr+'_'+models_tag,
#                     overwrite = overwrite)
#         del X_data
# #         print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])

#         #################################    #################################    #################################

# #         print('reg')
# #         # REGRESION
# #     
#         # Columnas de ingresos. Necesitan una regresion...
#         columnas_pesos = [u'P21', u'P47T', u'PP08D1', u'TOT_P12', u'T_VI', u'V12_M', u'V2_M', u'V3_M', u'V5_M']

#         x_cols4 = x_cols3 + y_cols3
#         y_cols4 = columnas_pesos

#         X_data = pd.read_csv(out_filename3)

#         predict_save(X_data,
#                     x_cols = x_cols4,
#                     y_cols = columnas_pesos,
#                     out_filename = '/media/miglesia/Elements/suite/yr_samples/RFReg_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv',
#                     model_filename = models_path + '/fitted_RF/clf4_'+str(q)[:10]+'_'+models_tag,
#                     tag = 'clf4_'+yr+'_'+models_tag,
#                     overwrite = overwrite)
                                
#     del X_censo; #del clf1; del clf2; del clf3


2005
['2005-02-15' '2005-05-15' '2005-08-15' '2005-11-15']
Nuevo trimestre.
2005-02-15
C1
File saved at /media/miglesia/Elements/suite/yr_samples/RFC1_0.01_2005-02-15_ARG.csv
C2
File saved at /media/miglesia/Elements/suite/yr_samples/RFC2_0.01_2005-02-15_ARG.csv
C3
File saved at /media/miglesia/Elements/suite/yr_samples/RFC3_0.01_2005-02-15_ARG.csv
reg
Nuevo trimestre.
2005-05-15
C1
File saved at /media/miglesia/Elements/suite/yr_samples/RFC1_0.01_2005-05-15_ARG.csv
C2
File saved at /media/miglesia/Elements/suite/yr_samples/RFC2_0.01_2005-05-15_ARG.csv
C3
File saved at /media/miglesia/Elements/suite/yr_samples/RFC3_0.01_2005-05-15_ARG.csv
reg
Nuevo trimestre.
2005-08-15
C1
File saved at /media/miglesia/Elements/suite/yr_samples/RFC1_0.01_2005-08-15_ARG.csv
C2
File saved at /media/miglesia/Elements/suite/yr_samples/RFC2_0.01_2005-08-15_ARG.csv
C3
File saved at /media/miglesia/Elements/suite/yr_samples/RFC3_0.01_2005-08-15_ARG.csv
reg
Nuevo trimestre.
2005-11-15
C1
File saved at /media/m