# Predicting on CENSO samples.

In [1]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
## Modulos
import pandas as pd
import numpy as np

## Cargar info empleo:

In [5]:
import matplotlib.pyplot as plt
from datetime import datetime

# Columns of interest in the employment time-series CSV.
FECHA_COL = '45.2_IT_0_T_13'      # date column (parsed with pd.to_datetime below)
DESOC_COL = '45.2_ECTDT_0_T_33'   # unemployment rate, total agglomerates

# Load only the two needed columns and build the series in a single chain, so no
# column is ever assigned onto a column-subset frame (SettingWithCopyWarning).
empleo = (
    pd.read_csv('https://raw.githubusercontent.com/matuteiglesias/empleoARG/main/datos/45.2_ECTDT.csv',
                usecols = [FECHA_COL, DESOC_COL])
    # Shift every date by 1 month + 14 days; presumably this lands on the mid-quarter
    # dates (YYYY-MM-15) used in the model file names -- TODO confirm.
    .assign(Q = lambda df: pd.to_datetime(df[FECHA_COL]) + pd.DateOffset(months=1, days = 14))
    .set_index('Q')
    .drop(columns = [FECHA_COL])
    # 's/d' marks missing values in the source file.
    .replace('s/d', np.nan)
    .astype(float)
    .round(4)
)

# Unemployment rate of each period relative to the 2010-11-15 period.
# Series / scalar division with a Timestamp key (always an exact-match lookup),
# instead of assigning a one-column DataFrame into a column.
empleo['censo2010_ratio'] = empleo[DESOC_COL] / empleo.loc[pd.Timestamp('2010-11-15'), DESOC_COL]

# **Unemployment rate in the 2010 census**
## Note that the agglomerate rate according to the census is not equal to the
## time-series value: for Oct 2010 the census gives 6.29 % while the series gives 7.5 %.
desoc_C2010 = pd.read_csv('./../data/info/desoc_AGLOsi_C2010.csv')
tasa_C2010 = desoc_C2010.loc[desoc_C2010.AGLO_si == True, 'Tasa desocupacion'].iloc[0]
tasa_C2010


0.0628729377307203

# Predicting

## Lista de trimestres con modelos ya calculados

In [13]:
import datetime as dt
## Trimestres con ingresos disponibles (depende de disponibilidad de microdatos EPH)
import glob

# Common prefix of the fitted quarterly (clf4) model files -- point this at your own checkout.
path = './../../encuestador-de-hogares/fitted_RF/clf4_'

# Every file whose name starts with that prefix, in lexicographic order.
allFiles = sorted(glob.glob(path + '*'))
# allFiles[-5:]


In [14]:

# Characters [-14:-4] of each model path hold the quarter date (YYYY-MM-DD);
# this relies on the name ending in a 4-character suffix such as '_ARG'.
allqs = [model_file[-14:-4] for model_file in allFiles]

# Show the first and last ten available quarters.
allqs_sorted = sorted(allqs)
print(allqs_sorted[:10])
print(allqs_sorted[-10:])

['2003-08-15', '2003-11-15', '2004-02-15', '2004-05-15', '2004-08-15', '2004-11-15', '2005-02-15', '2005-05-15', '2005-08-15', '2005-11-15']
['2020-02-15', '2020-05-15', '2020-08-15', '2020-11-15', '2021-02-15', '2021-05-15', '2021-08-15', '2021-11-15', '2022-02-15', '2022-05-15']


In [15]:
import os

# Make sure the local output folder exists. exist_ok=True makes the call idempotent
# and removes the gap between checking for the folder and creating it.
os.makedirs('./../data/yr_samples', exist_ok=True)

## Parametros

### Anios a calcular

In [8]:
### IMPORTANT: choose the years to process.
### The prediction loop iterates over range(startyr, endyr), so endyr itself is NOT processed.
startyr, endyr = 2015, 2016

## Choose the dataset used as X:
experiment_tag = 'ARG'   # suffix of the census sample file and of the output file names
models_tag = 'ARG'       # suffix of the fitted model file names
frac = '0.01'            # kept as a string: it is concatenated into the file names

# 1174037/(18645609 + 1174037)

0.059236022681737104

In [10]:
### Funcion ajustar nivel de empleo


def ajustar_empleo(data, q, verbose = False, random_state = None):
    """Relabel rows of ``data`` so its unemployment level matches quarter ``q``.

    The target number of unemployed people is::

        empleo.censo2010_ratio[q] * (employed + unemployed) * tasa_C2010

    where employed / unemployed are the counts of ``CONDACT == 1`` and
    ``CONDACT == 2`` rows in ``data``. Randomly chosen rows are then switched
    between those two categories until the ``CONDACT == 2`` count equals the
    (rounded) target.

    Reads the notebook-level globals ``empleo`` and ``tasa_C2010``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``CONDACT`` column. It is modified in place.
    q : str or datetime-like
        Quarter date; it must be a label of ``empleo``'s index.
    verbose : bool, default False
        If True, print the unemployment rate obtained after the adjustment.
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample`` so the relabelling can be reproduced.
        None keeps the previous (unseeded) behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after the in-place adjustment.

    Raises
    ------
    KeyError
        If ``q`` is not in ``empleo``'s index.
    ValueError
        If the employment ratio for ``q`` is missing (NaN), or if more rows
        have to be relabelled than are available (raised by ``sample``).
    """
    ratio = empleo.loc[pd.to_datetime(q)].censo2010_ratio
    if pd.isna(ratio):
        # The source series marks gaps with 's/d' (loaded as NaN); fail with a clear
        # message instead of the opaque error round(nan) would raise.
        raise ValueError('No employment ratio available for quarter ' + str(q)[:10])

    # Count from the frame being adjusted rather than from a notebook-level global,
    # so the function does not depend on state set up in another cell.
    condact_counts = data.CONDACT.value_counts()
    n_ocupados = condact_counts.get(1, 0)
    n_desocupados = condact_counts.get(2, 0)

    n_desempleados_ = ratio*(n_ocupados + n_desocupados)*tasa_C2010
    # Additional unemployed rows needed (negative: too many unemployed).
    # int() guarantees a plain Python int for sample(), whatever type round() returns.
    desemp_adic = int(round(n_desempleados_ - n_desocupados))

    print(str(q)[:10])

    if desemp_adic > 0:
        # Move randomly chosen employed rows to unemployed.
        chosen = data.query('CONDACT == 1').sample(desemp_adic, random_state = random_state).index
        data.loc[chosen, 'CONDACT'] = 2
    elif desemp_adic < 0:
        # Move randomly chosen unemployed rows to employed.
        chosen = data.query('CONDACT == 2').sample(- desemp_adic, random_state = random_state).index
        data.loc[chosen, 'CONDACT'] = 1

    if verbose:
        final_counts = data.CONDACT.value_counts()
        activos = final_counts.get(1, 0) + final_counts.get(2, 0)
        desempleo = final_counts.get(2, 0) / activos if activos else float('nan')
        print('desempleo:' + str(desempleo))

    return data

In [12]:
import joblib
# import gc

def predict_save(X_data, x_cols, y_cols, model_filename, out_filename, tag, overwrite = False):
    """Predict ``y_cols`` from ``X_data[x_cols]`` with a stored model and save the result.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Input rows; every column is kept in the output.
    x_cols : list
        Columns of ``X_data`` passed (as a plain array) to the model.
    y_cols : list
        Names given to the predicted columns.
    model_filename : str
        Path of the fitted model, loaded with ``joblib.load``.
    out_filename : str
        Path of the CSV to write (without the index).
    tag : str
        Not used inside the function; accepted so existing calls keep working.
    overwrite : bool, default False
        If False and ``out_filename`` already exists, nothing is done.

    Returns
    -------
    None
    """
    # Nothing to do when the output is already on disk and a redo was not requested.
    if os.path.exists(out_filename) and not overwrite:
        return

    # joblib.load unpickles the file: only load model files you trust.
    CLF = joblib.load(model_filename)

    # Predictions keep the input index so they line up row by row with X_data.
    predicted = pd.DataFrame(
        CLF.predict(X_data[x_cols].values),
        index = X_data.index,
        columns = y_cols,
    )

    # Inputs and predictions side by side, written without the index column.
    combined = pd.concat([X_data, predicted], axis = 1)
    combined.to_csv(out_filename, index = False)
    print('File saved at '+ out_filename)

In [13]:
# Passed to predict_save: when True, output files that already exist are recomputed and overwritten.
overwrite = True

In [1]:
import sys
# Make the `variables` module importable from the encuestador-de-hogares checkout.
# NOTE(review): this path ('./../../../Repos/...') differs from the models_path used by the
# prediction loop ('./../../encuestador-de-hogares') -- confirm both point at the same repo.
sys.path.append('./../../../Repos/encuestador-de-hogares/data/info')
# NOTE(review): the star import hides where names come from. The prediction loop uses
# x_cols1..x_cols4, y_cols1..y_cols3 and columnas_pesos, none of which are defined in this
# notebook, so presumably they come from this module -- confirm and import them explicitly.
from variables import *

In [None]:
models_path = './../../encuestador-de-hogares'
adapted_Censo_files_path = '/media/matias/Elements/suite/yr_samples'
# Folder where the per-quarter prediction files are written.
# NOTE(review): outputs go under /media/miglesia while inputs are read from /media/matias --
# confirm both mount points are intended.
predictions_path = '/media/miglesia/Elements/suite/yr_samples'

# NOTE: range() excludes endyr, so only startyr .. endyr-1 are processed.
for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)
    file_ = adapted_Censo_files_path + '/table_f'+str(frac)+'_'+yr+'_'+experiment_tag+'.csv'

    X_censo = pd.read_csv(
        file_,
        usecols = x_cols1 + ['AGLOMERADO', 'DPTO', 'HOGAR_REF_ID', 'PERSONA_REF_ID', 'RADIO_REF_ID', 'URP'],
    ).fillna(0)

    ## Quarterly treatment: quarters of this year for which a model file was found.
    qs = np.array([si for si in allqs if si.startswith(yr)])
    print(qs)

    # Kept as a notebook-level name: ajustar_empleo may read it as a global.
    CONDACT_cnts = X_censo.CONDACT.value_counts()

    for q in sorted(qs):

        q_str = str(q)[:10]
        file_suffix = '_'+str(frac)+'_'+q_str+'_'+experiment_tag+'.csv'

        # One output file per stage. The regression output gets its own name
        # (out_filename4); it used to overwrite out_filename3, leaving out_filename4 undefined.
        out_filename1 = predictions_path + '/RFC1' + file_suffix
        out_filename2 = predictions_path + '/RFC2' + file_suffix
        out_filename3 = predictions_path + '/RFC3' + file_suffix
        out_filename4 = predictions_path + '/RFReg' + file_suffix

        ### ADJUST THE UNEMPLOYMENT LEVEL
        X_q = X_censo.copy()
        X_q['Q'] = q
        print('Nuevo trimestre.')

        X_q = ajustar_empleo(X_q, q)

        # Each stage consumes the file written by the previous one. Only the input *path*
        # is stored here ('in_filename'); the CSV is read inside the loop below, once the
        # previous stage has actually written it. None means "use X_q directly".
        predict_save_iter_list = [
            # Stage 1: classifier 1, fitted per year.
            {'in_filename': None,
             'x_cols': x_cols1, 'y_cols': y_cols1,
             'out_filename': out_filename1,
             'model_filename': models_path + '/fitted_RF/clf1_'+yr+'_'+models_tag,
             'tag': 'clf1_'+yr+'_'+models_tag, 'overwrite': overwrite},

            # Stage 2: classifier 2, fitted per year.
            {'in_filename': out_filename1,
             'x_cols': x_cols2, 'y_cols': y_cols2,
             'out_filename': out_filename2,
             'model_filename': models_path + '/fitted_RF/clf2_'+yr+'_'+models_tag,
             'tag': 'clf2_'+yr+'_'+models_tag, 'overwrite': overwrite},

            # Stage 3: classifier 3, fitted per year. Writes out_filename3, which stage 4 reads.
            {'in_filename': out_filename2,
             'x_cols': x_cols3, 'y_cols': y_cols3,
             'out_filename': out_filename3,
             'model_filename': models_path + '/fitted_RF/clf3_'+yr+'_'+models_tag,
             'tag': 'clf3_'+yr+'_'+models_tag, 'overwrite': overwrite},

            # Stage 4: model for the peso (income) columns, fitted per quarter.
            {'in_filename': out_filename3,
             'x_cols': x_cols4, 'y_cols': columnas_pesos,
             'out_filename': out_filename4,
             'model_filename': models_path + '/fitted_RF/clf4_'+q_str+'_'+models_tag,
             'tag': 'clf4_'+yr+'_'+models_tag, 'overwrite': overwrite},
        ]

        for predict_save_iter_dict in predict_save_iter_list:
            in_filename = predict_save_iter_dict.pop('in_filename')

            # predict_save would skip this stage anyway; checking here avoids reading
            # the input CSV for nothing.
            if os.path.exists(predict_save_iter_dict['out_filename']) and not overwrite:
                continue

            X_data = X_q if in_filename is None else pd.read_csv(in_filename)
            predict_save(X_data = X_data, **predict_save_iter_dict)
            del X_data  # drop the reference before the next stage loads its input

        del X_q

    del X_censo


In [17]:
# # import sys
# # # These are the usual ipython objects, including this one you are creating
# # ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']


# for yr in [str(s) for s in range(startyr, endyr)]:
#     print(yr)
#     file_ = '/media/matias/Elements/suite/yr_samples/table_f'+str(frac)+'_'+yr+'_'+experiment_tag+'.csv'

#     X_censo = pd.read_csv(file_, usecols = ['DPTO','RADIO_REF_ID','PERSONA_REF_ID', 'HOGAR_REF_ID','IX_TOT', 'P02', 'P03', 'CONDACT', 'AGLOMERADO', 'URP', 'V01', 'H05', 'H06',
#            'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14', 'AGLO_rk', 'Reg_rk',
#            'H13', 'P07', 'P08', 'P09', 'P10', 'P05']).fillna(0)

#     ## Tratamiento trimestral 
#     qs = np.array(allqs)[[i for i, si in enumerate(allqs) if si.startswith(yr)]]
#     print(qs)
    
#     CONDACT_cnts = X_censo.CONDACT.value_counts()
    
# #     print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])
    
#     ### Cargar modelos de la parte no trimestral (anual).
#     for q in sorted(qs):
        
#         ### AJUSTAR NIVEL DE DESEMPLEO
#         X_q = X_censo.copy()
#         X_q['Q'] = q
#         print('Nuevo trimestre.')

#         X_q = ajustar_empleo(X_q)

# # import sys
# # # These are the usual ipython objects, including this one you are creating
# # ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']


# for yr in [str(s) for s in range(startyr, endyr)]:
#     print(yr)
#     file_ = '/media/matias/Elements/suite/yr_samples/table_f'+str(frac)+'_'+yr+'_'+experiment_tag+'.csv'

#     X_censo = pd.read_csv(file_, usecols = ['DPTO','RADIO_REF_ID','PERSONA_REF_ID', 'HOGAR_REF_ID','IX_TOT', 'P02', 'P03', 'CONDACT', 'AGLOMERADO', 'URP', 'V01', 'H05', 'H06',
#            'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14', 'AGLO_rk', 'Reg_rk',
#            'H13', 'P07', 'P08', 'P09', 'P10', 'P05']).fillna(0)

#     ## Tratamiento trimestral 
#     qs = np.array(allqs)[[i for i, si in enumerate(allqs) if si.startswith(yr)]]
#     print(qs)
    
#     CONDACT_cnts = X_censo.CONDACT.value_counts()
    
# #     print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])
    
#     ### Cargar modelos de la parte no trimestral (anual).
#     for q in sorted(qs):
        
#         ### AJUSTAR NIVEL DE DESEMPLEO
#         X_q = X_censo.copy()
#         X_q['Q'] = q
#         print('Nuevo trimestre.')

#         X_q = ajustar_empleo(X_q)

#         #################################    #################################    #################################
        
#         print('C1')
#         ## CLASIF 1
#         X_data = X_q;
#         y_cols1 = ['CAT_OCUP', 'CAT_INAC', 'CH07']
#         x_cols1 = ['IX_TOT', 'P02', 'P03', 'AGLO_rk', 'Reg_rk', 'V01', 'H05', 'H06',
#        'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14',
#        'H13', 'P07', 'P08', 'P09', 'P10', 'P05', 'CONDACT']
#         out_filename1 = '/media/miglesia/Elements/suite/yr_samples/RFC1_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'

#         predict_save(X_data,
#                      x_cols = x_cols1,
#                      y_cols = y_cols1,
#                      out_filename = out_filename1,
#                      model_filename = models_path + '/fitted_RF/clf1_'+yr+'_'+models_tag,
#                      tag = 'clf1_'+yr+'_'+models_tag,
#                     overwrite = overwrite)
        
#         del X_q; del X_data
# #         print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])

#         #################################    #################################    #################################

#         print('C2')
#         ## CLASIF 2
#         X_data = pd.read_csv(out_filename1)
#         y_cols2 = ['INGRESO', 'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS']
#         x_cols2 = x_cols1 + y_cols1
#         out_filename2 = '/media/miglesia/Elements/suite/yr_samples/RFC2_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'

#         predict_save(X_data,
#                      x_cols = x_cols2,
#                      y_cols = y_cols2,
#                      out_filename = out_filename2,
#                      model_filename = models_path + '/fitted_RF/clf2_'+yr+'_'+models_tag,
#                      tag = 'clf2_'+yr+'_'+models_tag,
#                     overwrite = overwrite)

#         del X_data
# #         print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])

#         #################################    #################################    #################################

#         print('C3')

#         ## CLASIF 3
#         X_data = pd.read_csv(out_filename2)
#         y_cols3 = ['PP07G1','PP07G_59', 'PP07I', 'PP07J', 'PP07K']
#         x_cols3 = x_cols2 + y_cols2
#         out_filename3 = '/media/miglesia/Elements/suite/yr_samples/RFC3_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv'

#         predict_save(X_data,
#                      x_cols = x_cols3,
#                      y_cols = y_cols3,
#                      out_filename = out_filename3,
#                      model_filename = models_path + '/fitted_RF/clf3_'+yr+'_'+models_tag,
#                      tag = 'clf3_'+yr+'_'+models_tag,
#                     overwrite = overwrite)
#         del X_data
# #         print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])

#         #################################    #################################    #################################

# #         print('reg')
# #         # REGRESION
# #     
#         # Columnas de ingresos. Necesitan una regresion...
#         columnas_pesos = [u'P21', u'P47T', u'PP08D1', u'TOT_P12', u'T_VI', u'V12_M', u'V2_M', u'V3_M', u'V5_M']

#         x_cols4 = x_cols3 + y_cols3
#         y_cols4 = columnas_pesos

#         X_data = pd.read_csv(out_filename3)

#         predict_save(X_data,
#                     x_cols = x_cols4,
#                     y_cols = columnas_pesos,
#                     out_filename = '/media/miglesia/Elements/suite/yr_samples/RFReg_'+str(frac)+'_'+str(q)[:10]+'_'+experiment_tag+'.csv',
#                     model_filename = models_path + '/fitted_RF/clf4_'+str(q)[:10]+'_'+models_tag,
#                     tag = 'clf4_'+yr+'_'+models_tag,
#                     overwrite = overwrite)
                                
#     del X_censo; #del clf1; del clf2; del clf3


2005
['2005-02-15' '2005-05-15' '2005-08-15' '2005-11-15']
Nuevo trimestre.
2005-02-15
C1
File saved at /media/miglesia/Elements/suite/yr_samples/RFC1_0.01_2005-02-15_ARG.csv
C2
File saved at /media/miglesia/Elements/suite/yr_samples/RFC2_0.01_2005-02-15_ARG.csv
C3
File saved at /media/miglesia/Elements/suite/yr_samples/RFC3_0.01_2005-02-15_ARG.csv
reg
Nuevo trimestre.
2005-05-15
C1
File saved at /media/miglesia/Elements/suite/yr_samples/RFC1_0.01_2005-05-15_ARG.csv
C2
File saved at /media/miglesia/Elements/suite/yr_samples/RFC2_0.01_2005-05-15_ARG.csv
C3
File saved at /media/miglesia/Elements/suite/yr_samples/RFC3_0.01_2005-05-15_ARG.csv
reg
Nuevo trimestre.
2005-08-15
C1
File saved at /media/miglesia/Elements/suite/yr_samples/RFC1_0.01_2005-08-15_ARG.csv
C2
File saved at /media/miglesia/Elements/suite/yr_samples/RFC2_0.01_2005-08-15_ARG.csv
C3
File saved at /media/miglesia/Elements/suite/yr_samples/RFC3_0.01_2005-08-15_ARG.csv
reg
Nuevo trimestre.
2005-11-15
C1
File saved at /media/m

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# from IPython.core.display import display, HTML
