# Training ML algos on EPH. Predicting on CENSO.

In [1]:
# Years of EPH training data to process. The training loops iterate over
# range(startyr, endyr), so endyr is exclusive: with these values only 2020 is used.
startyr = 2020
endyr = 2021

In [2]:
# Load modules
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from IPython.core.display import display, HTML

import pickle
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import os

In [3]:
# Column names

# EPH columns listed as prediction candidates. The training cells visible in this
# notebook use the staged predecir*/x_cols* lists defined further down instead.
y_cols = [
    'CAT_OCUP', 'P47T', 'PP10E', 'PP10D', 'PP07K', 'PP07I', 'V3_M', 'PP07G4',
    'CH16', 'T_VI', 'V12_M', 'TOT_P12', 'PP07G3', 'V5_M', 'PP07H', 'V2_M',
    'PP10C', 'PP08D1', 'PP07J', 'CAT_INAC', 'CH07', 'CH08', 'P21', 'PP07G1',
    'PP07G_59', 'PP07G2',
]

# Base predictor columns. Column order matters: models are fitted on X.values.
x_cols = [
    'IX_TOT', 'P02', 'P03', 'AGLO_rk', 'Reg_rk',
    'V01', 'H05', 'H06', 'H07', 'H08', 'H09', 'H10', 'H11', 'H12',
    'H16', 'H15', 'PROP', 'H14', 'H13',
    'P07', 'P08', 'P09', 'P10', 'P05', 'CONDACT',
]

# Stage 1: the base predictors are used to classify these three columns.
x_cols1 = x_cols
predecir1 = ['CAT_OCUP', 'CAT_INAC', 'CH07']

# Stage 2: stage-1 targets become predictors for the income indicator flags.
x_cols2 = [*x_cols1, *predecir1]
predecir2 = ['INGRESO', 'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS']

# Stage 3: stage-2 targets become predictors for the PP07* columns.
# Section PP07G asks whether the job is formally registered ("en blanco") and which
# benefits it has; this may help the income regression.
# Full list that was considered:
#   ['PP07G1', 'PP07G2', 'PP07G3', 'PP07G4', 'PP07G_59', 'PP07H', 'PP07I', 'PP07J', 'PP07K']
x_cols3 = [*x_cols2, *predecir2]
predecir3 = ['PP07G1', 'PP07G_59', 'PP07I', 'PP07J', 'PP07K']

# Income (peso-valued) columns. These need a regression rather than a classifier.
#   P21:     income from the main occupation
#   P47T:    total income (labor and non-labor)
#   PP08D1:  wages, daily pay, etc.
#   TOT_P12: total income from other occupations (secondary, previous, ...)
#   T_VI:    total non-labor income
#   V12_M:   alimony or cash help from people who do not live in the household
#   V2_M:    retirement or pension income
#   V3_M:    severance pay
#   V5_M:    subsidy or social aid from government, churches, etc.
columnas_pesos = ['P21', 'P47T', 'PP08D1', 'TOT_P12', 'T_VI', 'V12_M', 'V2_M', 'V3_M', 'V5_M']

# Stage 4: stage-3 targets become predictors for the income regression.
x_cols4 = [*x_cols3, *predecir3]
predecir4 = columnas_pesos
y_cols4 = predecir4

### Load info

In [4]:
# Reference tables read from ./../data/info/.
# radio_ref is joined with the DPTO_PROV_Region lookup; merge() is called without
# `on`, so pandas joins on whatever columns the two tables have in common.
dpto_region = pd.read_csv('./../data/info/DPTO_PROV_Region.csv')
radio_ref = pd.read_csv('./../data/info/radio_ref.csv').merge(dpto_region)
# One-off export kept for reference (disabled):
# radio_ref[['PROV','NOMPROV','DPTO', 'NOMDPTO']].drop_duplicates().to_csv('./../data/DPTO_PROV.csv', index = False)

# Ranking tables; the merges that would use them in the training cells are commented out.
Reg_rk = pd.read_csv('./../data/info/Reg_rk')
AGLO_rk = pd.read_csv('./../data/info/AGLO_rk')

In [5]:
# Tag embedded in the saved model file names (clf*_<frac>_..._ARG.sav). The training
# cells visible in this notebook do not sample the data by this fraction.
frac = 0.02

# Training

## Classification 1
Columnas ['CAT_OCUP', 'CAT_INAC', 'CH07']

In [6]:
# Classifier 1: one random forest per EPH year, predicting predecir1
# (CAT_OCUP, CAT_INAC, CH07) from the predictor columns in x_cols.

# Fitted models are written here. exist_ok=True replaces the racy exists()/makedirs() pair.
os.makedirs('./../fitted_RF/', exist_ok=True)

for yr in [str(s) for s in range(startyr, endyr)]:
    # Load the EPH training file for this year (the file name carries the 2-digit year).
    train = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
    # Keep rows whose total income (P47T) is not negative; missing values become 0.
    train = train.loc[train.P47T >= -0.001].fillna(0)
    # Households repeat across quarters, so rows are ordered by CODUSU
    # (presumably the household identifier).
    # NOTE(review): train_test_split shuffles rows by default, so this ordering does
    # not by itself keep all rows of a household on one side of the split.
    train = train.sort_values('CODUSU')
    print(train.shape)

    # Incomes must NOT be deflated here: they were already deflated in notebook 2.

    ### STEP 1
    x_cols1 = x_cols
    y_cols1 = predecir1

    X = train[x_cols1]
    y = train[y_cols1]

    # Hold out 10% of the rows; only the remaining 90% is used to fit the model.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    X = X_train; y = y_train

    forest = RandomForestClassifier(n_estimators=70, max_depth = 35, n_jobs = -1)
    clf1 = forest.fit(X.values, y.values)

    # Save the model to disk; the context manager closes the file even if dump() fails.
    filename = './../fitted_RF/clf1_'+str(frac)+'_'+yr+'_ARG.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(forest, model_file)

(169218, 53)


In [7]:
# X_train.groupby('AGLO_rk').size()/X_test.groupby('AGLO_rk').size()

Cuando predecimos, usamos entradas de la base de datos del censo como X.



## Classification 2
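Columnas ['INGRESO', 'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS'] (indicadores de si cada tipo de ingreso supera 100)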

In [8]:
# Classifier 2: one random forest per EPH year, predicting the income indicator
# flags in predecir2 from x_cols2 (base predictors + stage-1 targets).

# Make sure the output directory exists so this cell does not depend on an
# earlier cell having created it.
os.makedirs('./../fitted_RF/', exist_ok=True)

for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)

    # Load the EPH training file for this year (the file name carries the 2-digit year).
    # AGLO_rk / Reg_rk are selected below via x_cols2, so they must already be
    # columns of this file; the merge that would add them is disabled.
    train = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
    # Keep rows whose total income (P47T) is not negative; missing values become 0.
    train = train.loc[train.P47T >= -0.001].fillna(0)
    # Households repeat across quarters, so rows are ordered by CODUSU.
    # NOTE(review): train_test_split shuffles rows by default, so this ordering does
    # not by itself keep all rows of a household on one side of the split.
    train = train.sort_values('CODUSU')

    # Binary flags: 1 when the corresponding income column exceeds 100, else 0.
    train['INGRESO'] = (train.P47T > 100).astype(int)       # total income
    train['INGRESO_NLB'] = (train.T_VI > 100).astype(int)   # non-labor income
    train['INGRESO_JUB'] = (train.V2_M > 100).astype(int)   # retirement / pension
    train['INGRESO_SBS'] = (train.V5_M > 100).astype(int)   # subsidies / social aid

    ### STEP 2
    y_cols2 = predecir2

    X = train[x_cols2]
    y = train[y_cols2]

    # Hold out 10% of the rows; only the remaining 90% is used to fit the model.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    X = X_train; y = y_train

    forest = RandomForestClassifier(n_estimators=50, max_depth = 35, n_jobs = -1)
    clf2 = forest.fit(X.values, y.values)

    # Save the model to disk; the context manager closes the file even if dump() fails.
    filename = './../fitted_RF/clf2_'+str(frac)+'_'+yr+'_ARG.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(forest, model_file)

2020


## Classification 3
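Columnas ['PP07G1', 'PP07G_59', 'PP07I', 'PP07J', 'PP07K'] (de la lista completa ['PP07G1', 'PP07G2', 'PP07G3', 'PP07G4', 'PP07G_59', 'PP07H', 'PP07I', 'PP07J', 'PP07K'] solo se predicen estas cinco)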

In [9]:
# Classifier 3: one random forest per EPH year, predicting the PP07* columns in
# predecir3 from x_cols3 (base predictors + stage-1 and stage-2 targets).

# Make sure the output directory exists so this cell does not depend on an
# earlier cell having created it.
os.makedirs('./../fitted_RF/', exist_ok=True)

for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)

    # Load the EPH training file for this year (the file name carries the 2-digit year).
    # AGLO_rk / Reg_rk are selected below via x_cols3, so they must already be
    # columns of this file; the merge that would add them is disabled.
    train = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
    # Keep rows whose total income (P47T) is not negative; missing values become 0.
    train = train.loc[train.P47T >= -0.001].fillna(0)
    # Households repeat across quarters, so rows are ordered by CODUSU.
    # NOTE(review): train_test_split shuffles rows by default, so this ordering does
    # not by itself keep all rows of a household on one side of the split.
    train = train.sort_values('CODUSU')

    # Binary flags: 1 when the corresponding income column exceeds 100, else 0.
    # They are predictors at this stage (part of x_cols3).
    train['INGRESO'] = (train.P47T > 100).astype(int)       # total income
    train['INGRESO_NLB'] = (train.T_VI > 100).astype(int)   # non-labor income
    train['INGRESO_JUB'] = (train.V2_M > 100).astype(int)   # retirement / pension
    train['INGRESO_SBS'] = (train.V5_M > 100).astype(int)   # subsidies / social aid

    ### STEP 3
    y_cols3 = predecir3

    X = train[x_cols3]
    y = train[y_cols3]

    # Hold out 10% of the rows; only the remaining 90% is used to fit the model.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    X = X_train; y = y_train

    forest = RandomForestClassifier(n_estimators=50, max_depth = 35, n_jobs = -1)
    clf3 = forest.fit(X.values, y.values)

    # Save the model to disk; the context manager closes the file even if dump() fails.
    filename = './../fitted_RF/clf3_'+str(frac)+'_'+yr+'_ARG.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(forest, model_file)

2020


## Regresión
Usamos la info de los varios trimestres, deflactada, y ajustamos un modelo de regresión por cada trimestre (columna Q).

In [10]:
# startyr = 2003
# endyr = 2004

In [11]:
# Regression (step 4): for every EPH year and every quarter value in column Q, fit a
# random forest regressor that predicts the log-transformed income columns
# (y_cols4 / columnas_pesos) from x_cols4.

# Make sure the output directory exists so this cell does not depend on an
# earlier cell having created it.
os.makedirs('./../fitted_RF/', exist_ok=True)

for yr in [str(s) for s in range(startyr, endyr)]:
    print(yr)
    # Load the EPH training file for this year (the file name carries the 2-digit year).
    # AGLO_rk / Reg_rk are selected below via x_cols4, so they must already be
    # columns of this file; the merge that would add them is disabled.
    train = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')

    # Keep rows whose total income (P47T) is not negative; missing values become 0.
    train = train.loc[train.P47T >= -0.001].fillna(0)
    # Households repeat across quarters, so rows are ordered by CODUSU.
    # NOTE(review): train_test_split shuffles rows by default, so this ordering does
    # not by itself keep all rows of a household on one side of the split.
    train = train.sort_values('CODUSU')

    # Binary flags: 1 when the corresponding income column exceeds 100, else 0.
    # They are predictors at this stage (part of x_cols4).
    train['INGRESO'] = (train.P47T > 100).astype(int)       # total income
    train['INGRESO_NLB'] = (train.T_VI > 100).astype(int)   # non-labor income
    train['INGRESO_JUB'] = (train.V2_M > 100).astype(int)   # retirement / pension
    train['INGRESO_SBS'] = (train.V5_M > 100).astype(int)   # subsidies / social aid

    # Regress on log10(income + 1). Values are clipped at -0.9 first so the
    # argument of the logarithm is never below 0.1.
    train[columnas_pesos] = np.log10(train[columnas_pesos].clip(-.9) + 1)

    for q in train.Q.unique():
        print(q)
        ### STEP 4 (Regression)
        train_q = train.loc[train.Q == q]

        X = train_q[x_cols4]
        # Re-index by X so targets stay aligned with the predictors row by row.
        y = train_q[y_cols4].loc[X.index].fillna(0)

        # Hold out 10% of the rows; only the remaining 90% is used to fit the model.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        X = X_train; y = y_train

        # NOTE(review): n_estimators=1 means the "forest" is a single tree;
        # confirm this is intentional (e.g. to keep the pickled model small).
        forest = RandomForestRegressor(n_estimators=1, max_depth = 45, n_jobs = -1)
        clf4 = forest.fit(X.values, y.values)

        # Save the model to disk. str(q) keeps the file name valid even when Q is
        # not parsed as a string; the context manager closes the file even if
        # dump() fails.
        filename = './../fitted_RF/clf4_'+str(frac)+'_'+str(q)+'_ARG.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(forest, model_file)

2020
2020-06-30
2020-03-31
