# Predicting on CENSO samples.

In [1]:
# Year window for the per-year loops below. They iterate over
# range(startyr, endyr), so endyr is exclusive: the last year processed
# is endyr - 1.
startyr, endyr = 2003, 2021

In [2]:
# Load modules
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from IPython.core.display import display, HTML

import pickle
from IPython.core.display import display, HTML
# Inject a CSS rule that forces the `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import os

In [3]:
# Column names
# Response-side column names.
y_cols = [
    'CAT_OCUP', 'P47T', 'PP10E', 'PP10D', 'PP07K', 'PP07I', 'V3_M', 'PP07G4',
    'CH16', 'T_VI', 'V12_M', 'TOT_P12', 'PP07G3', 'V5_M', 'PP07H', 'V2_M',
    'PP10C', 'PP08D1', 'PP07J', 'CAT_INAC', 'CH07', 'CH08', 'P21', 'PP07G1',
    'PP07G_59', 'PP07G2',
]

# Base predictor columns.
x_cols = [
    'IX_TOT', 'P02', 'P03', 'AGLO_rk', 'Reg_rk', 'V01', 'H05', 'H06', 'H07',
    'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14', 'H13',
    'P07', 'P08', 'P09', 'P10', 'P05', 'CONDACT',
]

# Stage 1: base predictors -> predecir1.
# Note: x_cols1 is the same list object as x_cols, not a copy.
x_cols1 = x_cols
predecir1 = ['CAT_OCUP', 'CAT_INAC', 'CH07']

# Stage 2: stage-1 inputs plus stage-1 targets -> predecir2.
x_cols2 = x_cols1 + predecir1
predecir2 = ['INGRESO', 'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS']

# Stage 3: stage-2 inputs plus stage-2 targets -> predecir3.
x_cols3 = x_cols2 + predecir2
# The PP07G section asks whether the job is formally registered ("en blanco")
# and which benefits it carries. It may help the income regression.
# Fuller candidate list, currently disabled:
# predecir3 = ['PP07G1', 'PP07G2', 'PP07G3', 'PP07G4', 'PP07G_59', 'PP07H', 'PP07I', 'PP07J', 'PP07K']
predecir3 = ['PP07G1', 'PP07G_59', 'PP07I', 'PP07J', 'PP07K']

# Income (peso-amount) columns; these need a regression rather than a classifier.
#   P21:     income from the main occupation
#   P47T:    total income (labor and non-labor)
#   PP08D1:  wages, daily pay, etc.
#   TOT_P12: total income from other occupations (secondary, previous...)
#   T_VI:    total non-labor income
#   V12_M:   alimony or cash support from people who do not live in the household
#   V2_M:    retirement or pension income
#   V3_M:    severance pay
#   V5_M:    subsidy or social aid from government, churches, etc.
columnas_pesos = [
    'P21', 'P47T', 'PP08D1', 'TOT_P12', 'T_VI',
    'V12_M', 'V2_M', 'V3_M', 'V5_M',
]

# Stage 4 (regression): stage-3 inputs plus stage-3 targets -> income columns.
# predecir4 and y_cols4 are aliases of columnas_pesos (same list object).
x_cols4 = x_cols3 + predecir3
predecir4 = columnas_pesos
y_cols4 = predecir4

### Load info

In [4]:
# Read the DPTO_PROV_Region table and join it onto radio_ref. merge() is called
# with defaults, i.e. an inner join on the column names both tables share.
dpto_region = pd.read_csv('./../data/info/DPTO_PROV_Region.csv')
radio_ref = pd.read_csv('./../data/info/radio_ref.csv').merge(dpto_region)
# One-off export (disabled), originally run on radio_ref before the merge:
# radio_ref[['PROV','NOMPROV','DPTO', 'NOMDPTO']].drop_duplicates().to_csv('./../data/DPTO_PROV.csv', index = False)

# Lookup tables that supply the AGLO_rk and Reg_rk columns.
Reg_rk = pd.read_csv('./../data/info/Reg_rk')
AGLO_rk = pd.read_csv('./../data/info/AGLO_rk')

In [5]:
# Tag embedded (as str(frac)) in the sample, fitted-model and output file names
# used below; presumably the census sampling fraction -- TODO confirm.
frac = 0.02

# Predicting

## Classification 1
Columnas ['CAT_OCUP', 'CAT_INAC', 'CH07']

In [11]:
# Stage 1: for every year, load the census sample, attach the region /
# agglomerate rank columns, predict `predecir1` with the fitted model and write
# the enriched table to RFC1_<frac>_<yr>_ARG.csv.

# The DPTO -> Region lookup does not depend on the year, so build it once.
DPTO_Region = radio_ref[['DPTO', 'Region']].drop_duplicates()

# Columns read from each yearly sample: identifiers, merge keys and raw predictors.
censo_usecols = ['DPTO','RADIO_REF_ID','PERSONA_REF_ID', 'HOGAR_REF_ID','IX_TOT', 'P02', 'P03', 'CONDACT', 'AGLOMERADO', 'V01', 'H05', 'H06',
                 'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14',
                 'H13', 'P07', 'P08', 'P09', 'P10', 'P05']

for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)
    file_ = './../data/yr_samples/sample_censo_table_f'+str(frac)+'_'+yr+'_ARG.csv'

    # Missing values are replaced with 0 before prediction.
    X_censo = pd.read_csv(file_, usecols = censo_usecols).fillna(0)

    # Attach Region, then the AGLO_rk and Reg_rk columns.
    # NOTE(review): these are default (inner) merges, so rows whose DPTO,
    # AGLOMERADO or Region has no match in the lookup tables are silently dropped.
    X_censo = X_censo.merge(DPTO_Region)
    # Disabled sanity check on the number of provinces covered:
    #     print(X_censo.merge(radio_ref[['RADIO_REF_ID', 'PROV']], how = 'left')['PROV'].nunique())
    X_censo = X_censo.merge(AGLO_rk[['AGLOMERADO', 'AGLO_rk']]).merge(Reg_rk[['Region', 'Reg_rk']])

    ## Load the fitted model
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files produced by this project.
    filename = './../fitted_RF/clf1_'+str(frac)+'_'+yr+'_ARG.sav'
    # `with` guarantees the file handle is closed (the bare open() leaked it).
    with open(filename, 'rb') as model_file:
        clf1 = pickle.load(model_file)

    y_out1 = clf1.predict(X_censo[x_cols1].values)
    y_censo_fit1 = pd.DataFrame(y_out1, index = X_censo.index, columns=predecir1)
    Xy1_censo = pd.concat([X_censo, y_censo_fit1], axis = 1)

    # save
    Xy1_censo.to_csv('./../data/yr_samples/RFC1_'+str(frac)+'_'+yr+'_ARG.csv', index = False)


2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


## Classification 2
Columnas ['INGRESO', 'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS']

In [12]:
# Stage 2: for every year, read the stage-1 output, predict `predecir2` with
# the fitted model and write the enriched table to RFC2_<frac>_<yr>_ARG.csv.
for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)
    Xy1_censo = pd.read_csv('./../data/yr_samples/RFC1_'+str(frac)+'_'+yr+'_ARG.csv')

    ## Load the fitted model
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files produced by this project.
    filename = './../fitted_RF/clf2_'+str(frac)+'_'+yr+'_ARG.sav'
    # `with` guarantees the file handle is closed (the bare open() leaked it).
    with open(filename, 'rb') as model_file:
        clf2 = pickle.load(model_file)

    y_out2 = clf2.predict(Xy1_censo[x_cols2].values)
    y_censo_fit2 = pd.DataFrame(y_out2, index = Xy1_censo.index, columns=predecir2)

    # Append the predicted columns next to the inputs.
    Xy2_censo = pd.concat([Xy1_censo, y_censo_fit2], axis = 1)

    # save
    Xy2_censo.to_csv('./../data/yr_samples/RFC2_'+str(frac)+'_'+yr+'_ARG.csv', index = False)

2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


## Classification 3
Columnas ['PP07G1', 'PP07G_59', 'PP07I', 'PP07J', 'PP07K']

In [13]:
# Stage 3: for every year, read the stage-2 output, predict `predecir3` with
# the fitted model and write the enriched table to RFC3_<frac>_<yr>_ARG.csv.
for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)
    Xy2_censo = pd.read_csv('./../data/yr_samples/RFC2_'+str(frac)+'_'+yr+'_ARG.csv')

    ## Load the fitted model
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files produced by this project.
    filename = './../fitted_RF/clf3_'+str(frac)+'_'+yr+'_ARG.sav'
    # `with` guarantees the file handle is closed (the bare open() leaked it).
    with open(filename, 'rb') as model_file:
        clf3 = pickle.load(model_file)

    y_out3 = clf3.predict(Xy2_censo[x_cols3].values)
    y_censo_fit3 = pd.DataFrame(y_out3, index = Xy2_censo.index, columns=predecir3)

    # Append the predicted columns next to the inputs.
    Xy3_censo = pd.concat([Xy2_censo, y_censo_fit3], axis = 1)

    # save
    Xy3_censo.to_csv('./../data/yr_samples/RFC3_'+str(frac)+'_'+yr+'_ARG.csv', index = False)

2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


## Regression

In [14]:
# Stage 4 (regression): for every year, read the stage-3 output and, for each
# distinct value of the `Q` column in that year's training file, predict the
# `predecir4` income columns with the model fitted for that Q and write one
# output file per Q.
for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)
    Xy3_censo = pd.read_csv('./../data/yr_samples/RFC3_'+str(frac)+'_'+yr+'_ARG.csv')
    # The feature matrix is the same for every Q of this year; build it once.
    X4_values = Xy3_censo[x_cols4].values

    # Training files are keyed by two-digit year (yr[2:]); only Q is needed here.
    qs = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv', usecols=['Q'])
    for q in qs.Q.unique():
        # Use the same string form of Q for both the model path and the output
        # path (the model path previously concatenated `q` raw, which raises
        # TypeError if Q is not parsed as a string).
        q_tag = str(q)

        ## Load the fitted model
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model files produced by this project.
        filename = './../fitted_RF/clf4_'+str(frac)+'_'+q_tag+'_ARG.sav'
        # `with` guarantees the file handle is closed (the bare open() leaked it).
        with open(filename, 'rb') as model_file:
            clf4 = pickle.load(model_file)

        y_out4 = clf4.predict(X4_values)
        y_censo_fit4 = pd.DataFrame(y_out4, index = Xy3_censo.index, columns=predecir4)

        Xy4_censo = pd.concat([Xy3_censo, y_censo_fit4], axis = 1)
        # NOTE(review): unlike the RFC* outputs, this name has no '_' around
        # 'ARG' or before Q; left unchanged because downstream readers may
        # depend on it -- confirm.
        Xy4_censo.to_csv('./../data/yr_samples/RFReg_'+str(frac)+'ARG'+q_tag+'.csv', index = False)


2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
