# Training ML algos on EPH. Predicting on CENSO.

In [1]:
# Survey years to process. The loops below iterate over range(startyr, endyr),
# so endyr itself is excluded.
startyr, endyr = 2010, 2011

In [2]:
# Load modules
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from IPython.core.display import display, HTML

# import pickle
import joblib

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import os
import json

In [3]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Print the five largest globals as (name, sys.getsizeof(value)) pairs, largest first.
# Names starting with an underscore, names that are keys of sys.modules and the
# IPython names listed above are skipped. sys.getsizeof measures the object itself,
# not the objects it refers to.
print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])


[('HTML', 1064), ('RandomForestClassifier', 1064), ('RandomForestRegressor', 1064), ('display', 136), ('train_test_split', 136)]


In [4]:
# Column names
# Candidate target columns (this list is not referenced again in the visible cells).
y_cols = ['CAT_OCUP', 'P47T', 'PP10E', 'PP10D', 'PP07K', 'PP07I', 'V3_M', 'PP07G4', 'CH16', 'T_VI', 
          'V12_M', 'TOT_P12', 'PP07G3', 'V5_M', 'PP07H', 'V2_M', 'PP10C', 
          'PP08D1', 'PP07J', 'CAT_INAC', 'CH07', 'CH08', 'P21', 'PP07G1', 'PP07G_59', 'PP07G2']

# Base feature columns shared by every model stage.
x_cols = ['IX_TOT', 'P02', 'P03', 'AGLO_rk', 'Reg_rk', 'V01', 'H05', 'H06',
       'H07', 'H08', 'H09', 'H10', 'H11', 'H12', 'H16', 'H15', 'PROP', 'H14',
       'H13', 'P07', 'P08', 'P09', 'P10', 'P05', 'CONDACT']

# The stages are chained: the features of stage k+1 are the features of stage k
# plus the targets predicted in stage k.
# Stage 1: base features -> categorical targets (x_cols1 is the same list object as x_cols).
x_cols1 = x_cols
predecir1 = ['CAT_OCUP', 'CAT_INAC', 'CH07']

# Stage 2: targets are the INGRESO* columns.
x_cols2 = x_cols1 + predecir1
predecir2 = ['INGRESO', 'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS']

x_cols3 = x_cols2 + predecir2
# Section PP07G asks whether the job is registered ("en blanco") and which benefits it has. It may help the income regression.
# predecir3 = ['PP07G1', 'PP07G2', 'PP07G3', 'PP07G4', 'PP07G_59', 'PP07H', 'PP07I', 'PP07J', 'PP07K']
predecir3 = ['PP07G1','PP07G_59', 'PP07I', 'PP07J', 'PP07K']

# Income columns. They need a regression...
columnas_pesos = [u'P21', u'P47T', u'PP08D1', u'TOT_P12', u'T_VI', u'V12_M', u'V2_M', u'V3_M', u'V5_M']
# P21: income from the main occupation
# P47T: total income (labour and non-labour)
# PP08D1: wages, daily pay, etc.
# TOT_P12: total income from other occupations (secondary, previous...)
# T_VI: total non-labour income
# V12_M: alimony payments or cash help from people who do not live in the household
# V2_M: retirement or pension income
# V3_M: severance pay
# V5_M: subsidy or social aid from the government, churches, etc.

# Stage 4 (regression): features are all previous features and targets; the targets are
# the income columns (predecir4 and y_cols4 are the same list object as columnas_pesos).
x_cols4 = x_cols3 + predecir3
# Income columns. They need a regression...
predecir4 = columnas_pesos
y_cols4 = predecir4

In [5]:
def prepend_index_level(index, key, name=None):
    """Return a MultiIndex equal to `index` with one constant outer level added.

    Parameters
    ----------
    index : pandas.Index or pandas.MultiIndex
        Index to extend.
    key : hashable
        Value repeated on every row of the new outermost level.
    name : str, optional
        Name of the new outermost level.

    Returns
    -------
    pandas.MultiIndex
        One more level than `index`; the existing level names are kept after `name`.
    """
    level_names = [name] + list(index.names)

    if index.nlevels == 1:
        # A flat index yields scalars, so each label is paired with the key directly.
        rows = [(key, label) for label in index]
    else:
        # A MultiIndex already yields tuples; just put the key in front.
        rows = [(key,) + tuple(labels) for labels in index]

    return pd.MultiIndex.from_tuples(rows, names=level_names)

### Load info

In [6]:
# DPTO_PROV_Region.csv is merged onto radio_ref.csv. merge() is called without keys,
# so pandas joins on the columns the two files have in common.
dpto_region = pd.read_csv('./../data/info/DPTO_PROV_Region.csv')
radio_ref = pd.read_csv('./../data/info/radio_ref.csv').merge(dpto_region)
# radio_ref[['PROV','NOMPROV','DPTO', 'NOMDPTO']].drop_duplicates().to_csv('./../data/DPTO_PROV.csv', index = False)

# Lookup tables stored without a file extension; read as CSV.
Reg_rk = pd.read_csv('./../data/info/Reg_rk')
AGLO_rk = pd.read_csv('./../data/info/AGLO_rk')

## Set up Google sheet connection

In [7]:
from oauth2client.service_account import ServiceAccountCredentials
import gspread
from numpy import array, nan

# OAuth scopes requested for the service account (Sheets feed and Drive).
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

# Service-account key file. Set the GSHEETS_KEYFILE environment variable to point at
# your own json key; the original Desktop location is kept as the fallback so existing
# setups keep working.
keyfile_path = os.environ.get(
    'GSHEETS_KEYFILE',
    './../../../Desktop/newgsheets-349817-e11b8c6a66ec.json')

credentials = ServiceAccountCredentials.from_json_keyfile_name(keyfile_path, scope)

gc = gspread.authorize(credentials)

# Spreadsheet that receives the confusion matrices (one worksheet per target column).
sheet_id = '1qlSesmbEnR0NHi_pe8NDJEG80v-fbumJBviy8yb9Ojc'

sh = gc.open_by_key(sheet_id)#.get_worksheet('Sheet1')

# # Get a list of all worksheets
# worksheet_list = sh.worksheets()


## Funcion para subir confusion matrices a Gsheets

In [13]:
from sklearn.metrics import confusion_matrix  

def compute_upload_conf_matrix(y_true, y_pred, col):
    """Display the row-normalised confusion matrix of one target and upload it to Google Sheets.

    Parameters
    ----------
    y_true : pandas.DataFrame
        True labels; must contain column `col`.
    y_pred : pandas.DataFrame
        Predicted labels; must contain column `col`.
    col : str
        Target column to evaluate. It is also the title of the worksheet that is
        overwritten in the global spreadsheet `sh`; the worksheet is fetched with
        `sh.worksheet(col)`, so it is expected to exist already.

    Rows are the true classes ("Verdadero"), columns are the predicted classes
    ("Prediccion"); every cell is the percentage of that row's observations.
    """
    # Use one explicit label set for both axes. confusion_matrix() builds its matrix
    # over the union of true and predicted classes, so labelling rows and columns from
    # only one of the two series breaks as soon as a class is missing from either.
    labels = sorted(set(y_true[col].unique()) | set(y_pred[col].unique()))
    conf_vals = confusion_matrix(y_true[col], y_pred[col], labels=labels)

    confusion = pd.DataFrame(conf_vals, index=labels, columns=labels)
    # A class that never occurs in y_true has an all-zero row; avoid 0/0 so that no
    # NaN is sent to the spreadsheet.
    row_totals = confusion.sum(axis=1).replace(0, np.nan)
    confusion = (100*confusion.div(row_totals, axis=0).round(3)).fillna(0.0)

    confusion.columns.name = "Prediccion"
    confusion.index.name = "Verdadero"

    display(confusion)

    ## UPDATE G SHEET
    df = confusion.reset_index()
    n_cols = df.shape[1]

    worksheet = sh.worksheet(col)
    # Layout: header row, one blank row (as wide as the table), then the data rows.
    worksheet.update([df.columns.values.tolist()] + [n_cols*['']] + df.values.tolist())
    worksheet.update('A1', 'Verdadero')
    worksheet.update('A2', 'Prediccion')
    worksheet.format('A', {'textFormat': {'bold': True}})
    worksheet.format('1', {'textFormat': {'bold': True}})

### Funcion entrenar modelo y guardarlo

In [None]:
def fit_save_clf(data, x_cols, y_cols, filename, random_state=None):
    """Fit a random-forest classifier on a fixed train split and save it to disk.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table. Its index must be numeric: rows whose index is a multiple
        of 6 are held out for testing, all other rows are used for fitting.
        The frame is not modified.
    x_cols : list of str
        Feature columns.
    y_cols : list of str
        Target columns (fitted jointly as a multi-output problem).
    filename : str
        Path of the joblib file to write; its directory is created when missing.
    random_state : int or None, optional
        Seed forwarded to RandomForestClassifier. The default (None) keeps the
        original unseeded behaviour.

    Returns
    -------
    clf, X_, y_
        The fitted classifier plus the held-out features and targets
        (with a fresh 0..n-1 index) for evaluation.
    """
    # Deterministic hold-out computed on the side, so the caller's frame is not
    # given an extra 'split' column.
    is_test = (data.index.values % 6) == 0

    test = data.loc[is_test].reset_index(drop=True)
    train = data.loc[~is_test]

    X, y = train[x_cols], train[y_cols]
    X_, y_ = test[x_cols], test[y_cols]
    del train, test

    forest = RandomForestClassifier(n_estimators=100, max_depth=20, n_jobs=-1,
                                    random_state=random_state)
    clf = forest.fit(X.values, y.values)

    # Save the model to disk, creating the directory that `filename` actually points to.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    joblib.dump(forest, filename, compress=3)

    del X, y  # release the heaviest frames

    return clf, X_, y_  # model plus hold-out data for testing

# Training

## Classification 1
Columnas ['CAT_OCUP', 'CAT_INAC', 'CH07']

In [9]:
# Stage 1: features and targets of the first classifier.
# (No data is loaded in this cell; it only binds the column lists.)
x_cols1 = x_cols
y_cols1 = predecir1


### Para cada anio. 
- Calcular Factores en Cross Validation y guardar.

In [None]:
def compute_factors_wCV(data, x_cols, y_cols):
    """Estimate per-class probability cut-offs ("factors") by cross-validation.

    Rows are assigned to folds by `index % 6`. Fold 0 is never used here (it is the
    hold-out set of the final model). For each evaluated fold a random forest is
    fitted on the other training folds and, for every target column and class, the
    factor is the highest predicted probability among the rows ranked below the
    top-`count` rows, where `count` is how often the class really occurs in the
    fold. Dividing probabilities by these factors before taking the arg-max
    therefore pushes the predicted class frequencies towards the observed ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table with a numeric index. The frame is not modified.
    x_cols : list of str
        Feature columns.
    y_cols : list of str
        Target columns (fitted jointly as a multi-output problem).

    Returns
    -------
    pandas.Series
        Mean factor over the evaluated folds, indexed by ('variable', 'valor').
    """
    # Fold ids are kept in a side array so the caller's frame is left untouched.
    fold = data.index.values % 6
    in_train = fold != 0
    train = data.loc[in_train]
    train_fold = fold[in_train]

    ## For each cross-validation partition
    factors_df_parts = []

    # Only the first two of the five training folds are evaluated
    # (presumably to limit run time).
    for i in range(1, 6)[:2]:
        print(i)
        # Split into fitting part and validation fold
        held_out = train_fold == i
        cv_part = train.loc[~held_out]
        cv_test = train.loc[held_out].reset_index(drop=True)

        # Fit on the fitting part, using the columns passed by the caller
        # (not the notebook-level x_cols1 / y_cols1).
        X, y = cv_part[x_cols], cv_part[y_cols]
        del cv_part

        forest = RandomForestClassifier(n_estimators=100, max_depth=20, n_jobs=-1)
        clf = forest.fit(X.values, y.values)

        X_, y_ = cv_test[x_cols], cv_test[y_cols]
        del cv_test

        # One probability matrix per target column.
        proba_values = clf.predict_proba(X_)

        for j, y_col in enumerate(y_cols):
            y_probas = pd.DataFrame(proba_values[j], columns=sorted(y_[y_col].unique()))

            # Observed number of rows per class in the validation fold.
            counts = y_[y_col].value_counts().sort_index()

            # Rank 1 = most confident row for that class.
            y_probas_rk = y_probas.rank(method='first', ascending=False)

            # Highest probability among the rows that fall outside the top-`count`.
            factors = ((y_probas_rk > counts)*y_probas).max()
            factors_df_part = pd.DataFrame(factors.reset_index())
            factors_df_part.columns = ['valor', 'factor']
            factors_df_part['variable'] = y_col
            factors_df_part['part'] = i

            factors_df_parts.append(factors_df_part)

    factors_df = pd.concat(factors_df_parts)

    factors_mean = factors_df.groupby(['variable', 'valor'])['factor'].mean()

    return factors_mean

In [10]:
for yr in [str(s) for s in range(startyr, endyr)]:
    # Load EPH
    print(yr)
    data = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
    print(data.shape)

    # Compute the factors
    factors_mean = compute_factors_wCV(data, x_cols = x_cols1, y_cols = y_cols1)

    ## Save the factors to a json file.
    # The training cell reads them back with info[filename], i.e. under the full model
    # path, so that path (not the bare tag) must be the key of the stored dict.
    filename = './../fitted_RF/clf1_'+yr+'_ARG'
    tag = filename.split('/')[-1]
    factors_dict = {filename: factors_mean.unstack().to_json()}

    factors_dir = './../data/training/factors/'
    os.makedirs(factors_dir, exist_ok=True)  # first run: the folder may not exist yet
    with open(factors_dir+tag+'.json', 'w') as file:
        json.dump(factors_dict, file)


2010
(361058, 56)
1
2


### Para cada anio. 
- Ajustar modelo y guardar

In [66]:
# Fit the stage-1 model on the training split, one model per year
for yr in [str(s) for s in range(startyr, endyr)]:
    # Load EPH
    print(yr)
    data = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')

    filename = './../fitted_RF/clf1_'+yr+'_ARG'
    clf, X_, y_ = fit_save_clf(data, x_cols = x_cols1, y_cols = y_cols1, filename = filename)

    # The hold-out evaluation and the spreadsheet upload only run for 2010.
    if yr == '2010':
        
        y_pred = y_.copy()
        proba_values = clf.predict_proba(X_)

        ## Read the factors from the json file; they are looked up under the key `filename`.
        with open('./../data/training/factors/'+filename.split('/')[-1]+'.json', 'r') as file:
            info = json.load(file)
        # The stored value is itself a JSON string, hence the second json.loads.
        factors_mean = pd.DataFrame(json.loads(info[filename])).stack()
        factors_mean.index.names = ['variable', 'valor']
        # JSON object keys are strings; cast the class labels back to int.
        factors_mean.index = factors_mean.index.set_levels(factors_mean.index.levels[1].astype(int), level='valor')

        # Predicted class = the class with the largest probability / factor ratio.
        for j, y_col in enumerate(y_cols1):
            y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))
            y_pred[y_col] = (y_probas/factors_mean.loc[y_col]).idxmax(1)#.value_counts().sort_index()

        # One confusion matrix (displayed and uploaded) per target column.
        for col in y_.columns:
            print(col)
            compute_upload_conf_matrix(y_true = y_, y_pred = y_pred, col = col)

2010
CAT_OCUP


Prediccion,0,1,2,3,4
Verdadero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,99.9,0.0,0.0,0.1,0.0
1,0.1,46.9,14.5,38.5,0.0
2,0.2,2.6,63.0,33.9,0.3
3,0.4,1.6,7.6,90.1,0.3
4,1.4,1.4,7.7,30.2,59.5


CAT_INAC


Prediccion,0,1,2,3,4,5,6,7
Verdadero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,99.9,0.0,0.0,0.1,0.0,0.0,0.0,0.0
1,0.0,93.4,0.1,0.5,5.2,0.0,0.2,0.6
2,0.0,6.6,69.2,0.0,24.2,0.0,0.0,0.0
3,0.0,0.0,0.0,99.9,0.0,0.0,0.0,0.0
4,0.0,6.6,0.1,0.0,89.1,0.0,0.3,3.9
5,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
6,0.0,12.1,0.0,0.0,7.1,0.0,72.9,7.9
7,0.0,7.3,0.2,0.0,14.6,0.0,0.7,77.2


CH07


Prediccion,1,2,3,4,5,9
Verdadero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,68.5,19.7,1.0,0.4,10.4,0.0
2,9.4,83.9,1.7,1.8,3.2,0.0
3,5.8,21.6,62.9,3.2,6.5,0.0
4,0.8,15.3,5.2,77.9,0.9,0.0
5,2.0,2.7,1.2,0.6,93.5,0.0
9,0.0,0.0,0.0,0.0,0.0,100.0


In [None]:
# y_pred[y_col] = 
# Scratch check: shows the factor-adjusted arg-max for whatever `y_probas`, `factors_mean`
# and `y_col` currently hold in the kernel (presumably left over from the loop above).
(y_probas/factors_mean.loc[y_col]).idxmax(1)

In [63]:
# print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])

## Classification 2
Columnas ['INGRESO', 'INGRESO_NLB', 'INGRESO_JUB', 'INGRESO_SBS']

In [58]:
# Stage 2 targets: the INGRESO* columns listed in predecir2 (same list object).
y_cols2 = predecir2

### Para cada anio. 
- Calcular Factores en Cross Validation y guardar.

In [59]:
for yr in [str(s) for s in range(startyr, endyr)]:
    # Load EPH
    print(yr)
    data = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
    print(data.shape)

    # Fold id = index % 6; fold 0 is the hold-out set, folds 1-5 are used for training.
    data['split'] = data.index.values % 6

    test  = data.loc[data.split == 0].reset_index(drop = True)
    train = data.loc[data.split != 0]
    
    ## For each cross-validation partition
    factors_df_parts = []

    # Only the first two of the five training folds are evaluated.
    for i in range(1, 6)[:2]:
        print(i)
        # Split into fitting part and validation fold
        cv_part = train.loc[train.split != i]
        cv_test = train.loc[train.split == i].reset_index(drop = True)

        # Fit the model on the fitting part
        X, y = cv_part[x_cols2], cv_part[y_cols2]
        del cv_part

        forest = RandomForestClassifier(n_estimators=100, max_depth = 20, n_jobs = -1)
        clf = forest.fit(X.values, y.values)

        X_, y_ = cv_test[x_cols2], cv_test[y_cols2]
        del cv_test

        # # For each of the columns, get the proba so that the correct number of observations are above.
        proba_values = clf.predict_proba(X_)
        #     y_pred = y_.copy()

        for j, y_col in enumerate(y_cols2):
            y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))

            # Observed number of rows per class in the validation fold.
            counts = y_[y_col].value_counts().sort_index()

            ## Get the highest
            # Rank 1 = most confident row for that class.
            y_probas_rk = y_probas.rank(method = 'first', ascending = False)

            # Per class: highest probability among the rows ranked below the top-`count`.
            factors = ((y_probas_rk > counts)*y_probas).max()
            factors_df_part = pd.DataFrame(factors.reset_index()); 
            factors_df_part.columns = ['valor', 'factor']
            factors_df_part['variable'] = y_col
            factors_df_part['part'] = i
            #         y_pred[y_col] = (y_probas/factors).idxmax(1)#.value_counts().sort_index()

            factors_df_parts += [factors_df_part]

    factors_df = pd.concat(factors_df_parts)
    
    # Average the factors over the evaluated folds.
    factors_mean = factors_df.groupby(['variable', 'valor'])['factor'].mean()

    ## Save the factors to a json file, keyed by the model path `filename`
    filename = './../fitted_RF/clf2_'+yr+'_ARG'
    factors_dict = dict()
    factors_dict[filename] = factors_mean.unstack().to_json()

    with open('./../data/training/factors/'+filename.split('/')[-1]+'.json', 'w') as file: 
        json.dump(factors_dict, file)


2010
(361058, 56)
1
2


### Para cada anio. 
- Ajustar modelo y guardar

In [60]:
# Fit the stage-2 model on the training split, one model per year
for yr in [str(s) for s in range(startyr, endyr)]:
    # Load EPH
    print(yr)
    data = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')

    filename = './../fitted_RF/clf2_'+yr+'_ARG'
    clf, X_, y_ = fit_save_clf(data, x_cols = x_cols2, y_cols = y_cols2, filename = filename)
    
    # The hold-out evaluation and the spreadsheet upload only run for 2010.
    if yr == '2010':
        
        y_pred = y_.copy()
        proba_values = clf.predict_proba(X_)

        ## Read the factors from the json file; they are looked up under the key `filename`.
        with open('./../data/training/factors/'+filename.split('/')[-1]+'.json', 'r') as file:
            info = json.load(file)
        # The stored value is itself a JSON string, hence the second json.loads.
        factors_mean = pd.DataFrame(json.loads(info[filename])).stack()
        factors_mean.index.names = ['variable', 'valor']
        # JSON object keys are strings; cast the class labels back to int.
        factors_mean.index = factors_mean.index.set_levels(factors_mean.index.levels[1].astype(int), level='valor')

        # Predicted class = the class with the largest probability / factor ratio.
        for j, y_col in enumerate(y_cols2):
            y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))
            y_pred[y_col] = (y_probas/factors_mean.loc[y_col]).idxmax(1)#.value_counts().sort_index()

        # One confusion matrix (displayed and uploaded) per target column.
        for col in y_.columns:
            print(col)
            compute_upload_conf_matrix(y_true = y_, y_pred = y_pred, col = col)

2010


Prediccion,0,1
Verdadero,Unnamed: 1_level_1,Unnamed: 2_level_1
0,97.2,2.8
1,2.3,97.7


Prediccion,0,1
Verdadero,Unnamed: 1_level_1,Unnamed: 2_level_1
0,96.8,3.2
1,12.9,87.1


Prediccion,0,1
Verdadero,Unnamed: 1_level_1,Unnamed: 2_level_1
0,99.5,0.5
1,3.4,96.6


Prediccion,0,1
Verdadero,Unnamed: 1_level_1,Unnamed: 2_level_1
0,98.5,1.5
1,31.0,69.0


In [61]:
# NOTE(review): `xx` is undefined, so this cell raises NameError (see the recorded output).
# Presumably a deliberate stop so that "Run All" does not reach the scratch cells below; confirm.
xx

NameError: name 'xx' is not defined

In [None]:
for yr in [str(s) for s in range(startyr, endyr)]:
    # Load EPH
    print(yr)
    data = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
    print(data.shape)

# NOTE(review): the split below is outside the loop, so only the last year loaded
# is split and used by the following cells.
# Fold id = index % 6; fold 0 is the hold-out set, folds 1-5 are used for training.
data['split'] = data.index.values % 6

test  = data.loc[data.split == 0].reset_index(drop = True)
train = data.loc[data.split != 0]

In [None]:
## For each cross-validation partition (all five training folds)
factors_df_parts = []

for i in range(1, 6):
    print(i)
    # Split into fitting part and validation fold
    cv_part = train.loc[train.split != i]
    cv_test = train.loc[train.split == i].reset_index(drop = True)
    
    # Fit the model on the fitting part
    X, y = cv_part[x_cols2], cv_part[y_cols2]
    
    forest = RandomForestClassifier(n_estimators=100, max_depth = 20, n_jobs = -1)
    clf2 = forest.fit(X.values, y.values)
    
    X_, y_ = cv_test[x_cols2], cv_test[y_cols2]
    # clf1.predict_proba(X_)

    # # For each of the columns, get the proba so that the correct number of observations are above.
    proba_values = clf2.predict_proba(X_)
    #     y_pred = y_.copy()

    for j, y_col in enumerate(y_cols2):
        y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))

        # Observed number of rows per class in the validation fold.
        counts = y_[y_col].value_counts().sort_index()

        ## Get the highest
        # Rank 1 = most confident row for that class.
        y_probas_rk = y_probas.rank(method = 'first', ascending = False)

        # Per class: highest probability among the rows ranked below the top-`count`.
        factors = ((y_probas_rk > counts)*y_probas).max()
        factors_df_part = pd.DataFrame(factors.reset_index()); 
        factors_df_part.columns = ['valor', 'factor']
        factors_df_part['variable'] = y_col
        factors_df_part['part'] = i
        #         y_pred[y_col] = (y_probas/factors).idxmax(1)#.value_counts().sort_index()

        factors_df_parts += [factors_df_part]
    
# One row per (fold, target column, class).
factors_df = pd.concat(factors_df_parts)

In [None]:
# Average each (target column, class) factor over the cross-validation folds.
factors_mean = factors_df.groupby(['variable', 'valor'])['factor'].mean()


In [None]:
# Fit the stage-2 model on the training split.
# `train`, `test` and `factors_mean` come from the previous cells; the per-year load is
# disabled here (the original note says the data must not be deflated again because
# that was already done in notebook 2).
for yr in [str(s) for s in range(startyr, endyr)]:
#     # Load EPH
#     print(yr)
#     data = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
#     print(data.shape)

    X, y = train[x_cols2], train[y_cols2]
    X_, y_ = test[x_cols2], test[y_cols2]

#     X, X_test, y, y_test = train_test_split(X, y, test_size=0.1) # less memory used

    forest = RandomForestClassifier(n_estimators=100, max_depth = 20, n_jobs = -1)
    clf2 = forest.fit(X.values, y.values)

    # save the model to disk
    os.makedirs('./../fitted_RF/', exist_ok=True)
    filename = './../fitted_RF/clf2_'+yr+'_ARG'
    joblib.dump(forest, filename, compress=3)

    # Release the heaviest frames.
    # NOTE(review): `train` is deleted inside the loop, so this cell only works for a
    # single-year range and cannot be re-run without re-running the split cell.
    del train; del X; del y;

    y_pred = y_.copy()
    # Predict once for all targets instead of once per target column.
    proba_values = clf2.predict_proba(X_)

    # Predicted class = the class with the largest probability / factor ratio.
    for j, y_col in enumerate(y_cols2):
        y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))
        y_pred[y_col] = (y_probas/factors_mean.loc[y_col]).idxmax(1)

In [None]:

# Factor-adjusted predictions on the hold-out set: start from a copy of the true
# labels and overwrite every target column.
y_pred = y_.copy()
proba_values = clf2.predict_proba(X_)

# Predicted class = the class with the largest probability / factor ratio.
for j, y_col in enumerate(y_cols2):
    y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))
    y_pred[y_col] = (y_probas/factors_mean.loc[y_col]).idxmax(1)#.value_counts().sort_index()
    

In [None]:
from sklearn.metrics import confusion_matrix  

# For every target column: display the row-normalised confusion matrix (rows = true
# class, columns = predicted class, values in percent) and write it to a freshly
# created worksheet named after the column.
for col in y_.columns:
    print(col)
    # Use one explicit label set for both axes. confusion_matrix() builds its matrix
    # over the union of true and predicted classes, so labelling rows and columns from
    # only one of the two series breaks as soon as a class is missing from either.
    labels = sorted(set(y_[col].unique()) | set(y_pred[col].unique()))
    conf_vals = confusion_matrix(y_[col], y_pred[col], labels=labels)

    confusion = pd.DataFrame(conf_vals, index=labels, columns=labels)
    # A class that never occurs in the true labels has an all-zero row; avoid 0/0 so
    # that no NaN is sent to the spreadsheet.
    row_totals = confusion.sum(axis=1).replace(0, np.nan)
    confusion = (100*confusion.div(row_totals, axis=0).round(3)).fillna(0.0)

#     ## Option 1: MultiIndex
#     confusion.columns = prepend_index_level(confusion.columns, key=col, name="Prediccion")
#     confusion.index = prepend_index_level(confusion.index, key=col, name="Verdadero")

    ## Option 2: plain axis names
    confusion.columns.name = "Prediccion"
    confusion.index.name = "Verdadero"

    display(confusion)

    ## UPDATE G SHEET
    df = confusion.reset_index()
    rows, cols = df.shape

    # Replace an existing worksheet of the same name. Only "worksheet not found" is
    # expected here; any other API failure should surface instead of being swallowed.
    try:
        sh.del_worksheet(sh.worksheet(col))
    except gspread.exceptions.WorksheetNotFound:
        pass
    worksheet = sh.add_worksheet(title=col, rows=3+rows, cols= 2+cols)

    # Layout: header row, one blank row (as wide as the table), then the data rows.
    worksheet.update([df.columns.values.tolist()] + [cols*['']] + df.values.tolist())
    worksheet.update('A1', 'Verdadero')
    worksheet.update('A2', 'Prediccion')
    worksheet.format('A', {'textFormat': {'bold': True}})
    worksheet.format('1', {'textFormat': {'bold': True}})

## Classification 3
Columnas ['PP07G1', 'PP07G_59', 'PP07I', 'PP07J', 'PP07K']

In [None]:
# Stage 3 targets: the PP07* columns listed in predecir3 (same list object).
y_cols3 = predecir3

In [None]:
for yr in [str(s) for s in range(startyr, endyr)]:
    # Load EPH
    print(yr)
    data = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
    print(data.shape)

# NOTE(review): the split below is outside the loop, so only the last year loaded
# is split and used by the following cells.
# Fold id = index % 6; fold 0 is the hold-out set, folds 1-5 are used for training.
data['split'] = data.index.values % 6

test  = data.loc[data.split == 0].reset_index(drop = True)
train = data.loc[data.split != 0]

In [None]:
## For each cross-validation partition (all five training folds)
factors_df_parts = []

for i in range(1, 6):
    print(i)
    # Split into fitting part and validation fold
    cv_part = train.loc[train.split != i]
    cv_test = train.loc[train.split == i].reset_index(drop = True)
    
    # Fit the model on the fitting part
    X, y = cv_part[x_cols3], cv_part[y_cols3]
    
    forest = RandomForestClassifier(n_estimators=100, max_depth = 20, n_jobs = -1)
    clf3 = forest.fit(X.values, y.values)
    
    X_, y_ = cv_test[x_cols3], cv_test[y_cols3]
    # clf1.predict_proba(X_)

    # # For each of the columns, get the proba so that the correct number of observations are above.
    proba_values = clf3.predict_proba(X_)
    #     y_pred = y_.copy()

    for j, y_col in enumerate(y_cols3):
        y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))

        # Observed number of rows per class in the validation fold.
        counts = y_[y_col].value_counts().sort_index()

        ## Get the highest
        # Rank 1 = most confident row for that class.
        y_probas_rk = y_probas.rank(method = 'first', ascending = False)

        # Per class: highest probability among the rows ranked below the top-`count`.
        factors = ((y_probas_rk > counts)*y_probas).max()
        factors_df_part = pd.DataFrame(factors.reset_index()); 
        factors_df_part.columns = ['valor', 'factor']
        factors_df_part['variable'] = y_col
        factors_df_part['part'] = i
        #         y_pred[y_col] = (y_probas/factors).idxmax(1)#.value_counts().sort_index()

        factors_df_parts += [factors_df_part]
    
# One row per (fold, target column, class).
factors_df = pd.concat(factors_df_parts)

In [None]:
# Average each (target column, class) factor over the cross-validation folds.
factors_mean = factors_df.groupby(['variable', 'valor'])['factor'].mean()


In [None]:
# Fit the stage-3 model on the training split.
# `train`, `test` and `factors_mean` come from the previous cells; the per-year load is
# disabled here (the original note says the data must not be deflated again because
# that was already done in notebook 2).
for yr in [str(s) for s in range(startyr, endyr)]:
#     # Load EPH
#     print(yr)
#     data = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
#     print(data.shape)

    X, y = train[x_cols3], train[y_cols3]
    X_, y_ = test[x_cols3], test[y_cols3]

#     X, X_test, y, y_test = train_test_split(X, y, test_size=0.1) # less memory used

    forest = RandomForestClassifier(n_estimators=100, max_depth = 20, n_jobs = -1)
    clf3 = forest.fit(X.values, y.values)

    # save the model to disk
    os.makedirs('./../fitted_RF/', exist_ok=True)
    filename = './../fitted_RF/clf3_'+yr+'_ARG'
    joblib.dump(forest, filename, compress=3)

    # Release the heaviest frames.
    # NOTE(review): `train` is deleted inside the loop, so this cell only works for a
    # single-year range and cannot be re-run without re-running the split cell.
    del train; del X; del y;

    y_pred = y_.copy()
    # Predict once for all targets instead of once per target column.
    proba_values = clf3.predict_proba(X_)

    # Predicted class = the class with the largest probability / factor ratio.
    for j, y_col in enumerate(y_cols3):
        y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))
        y_pred[y_col] = (y_probas/factors_mean.loc[y_col]).idxmax(1)

In [None]:

# Factor-adjusted predictions on the hold-out set: start from a copy of the true
# labels and overwrite every target column.
y_pred = y_.copy()
proba_values = clf3.predict_proba(X_)

# Predicted class = the class with the largest probability / factor ratio.
for j, y_col in enumerate(y_cols3):
    y_probas = pd.DataFrame(proba_values[j], columns = sorted(y_[y_col].unique()))
    y_pred[y_col] = (y_probas/factors_mean.loc[y_col]).idxmax(1)#.value_counts().sort_index()
    

In [None]:
from sklearn.metrics import confusion_matrix  

# For every target column: display the row-normalised confusion matrix (rows = true
# class, columns = predicted class, values in percent) and write it to a freshly
# created worksheet named after the column.
for col in y_.columns:
    print(col)
    # Use one explicit label set for both axes. confusion_matrix() builds its matrix
    # over the union of true and predicted classes, so labelling rows and columns from
    # only one of the two series breaks as soon as a class is missing from either.
    labels = sorted(set(y_[col].unique()) | set(y_pred[col].unique()))
    conf_vals = confusion_matrix(y_[col], y_pred[col], labels=labels)

    confusion = pd.DataFrame(conf_vals, index=labels, columns=labels)
    # A class that never occurs in the true labels has an all-zero row; avoid 0/0 so
    # that no NaN is sent to the spreadsheet.
    row_totals = confusion.sum(axis=1).replace(0, np.nan)
    confusion = (100*confusion.div(row_totals, axis=0).round(3)).fillna(0.0)

#     ## Option 1: MultiIndex
#     confusion.columns = prepend_index_level(confusion.columns, key=col, name="Prediccion")
#     confusion.index = prepend_index_level(confusion.index, key=col, name="Verdadero")

    ## Option 2: plain axis names
    confusion.columns.name = "Prediccion"
    confusion.index.name = "Verdadero"

    display(confusion)

    ## UPDATE G SHEET
    df = confusion.reset_index()
    rows, cols = df.shape

    # Replace an existing worksheet of the same name. Only "worksheet not found" is
    # expected here; any other API failure should surface instead of being swallowed.
    try:
        sh.del_worksheet(sh.worksheet(col))
    except gspread.exceptions.WorksheetNotFound:
        pass
    worksheet = sh.add_worksheet(title=col, rows=3+rows, cols= 2+cols)

    # Layout: header row, one blank row (as wide as the table), then the data rows.
    worksheet.update([df.columns.values.tolist()] + [cols*['']] + df.values.tolist())
    worksheet.update('A1', 'Verdadero')
    worksheet.update('A2', 'Prediccion')
    worksheet.format('A', {'textFormat': {'bold': True}})
    worksheet.format('1', {'textFormat': {'bold': True}})

In [None]:
# NOTE(review): `xx` is undefined, so this cell raises NameError when run.
# Presumably a deliberate stop so that "Run All" halts here; confirm.
xx

In [None]:
# # Load Census

# for yr in [str(s) for s in range(startyr, endyr)]:
#     # Load EPH
#     print(yr)

#     train = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
# #     train = train.merge(AGLO_rk[['AGLOMERADO', 'AGLO_rk']]).merge(Reg_rk[['Region', 'Reg_rk']])
# #     train = train.loc[train.P47T >= -0.001].fillna(0)#.sample(400000)
# #     train = train.sort_values('CODUSU') ## Los hogares se repiten en cada cuatrimestre. Esto hace que haya hogares solo en test set. 
# #     train['INGRESO'] = (train.P47T > 100).astype(int)
# #     train['INGRESO_NLB'] = (train.T_VI > 100).astype(int)
# #     train['INGRESO_JUB'] = (train.V2_M > 100).astype(int)
# #     train['INGRESO_SBS'] = (train.V5_M > 100).astype(int)
    
#     ### STEP 2
#     y_cols3 = predecir3

#     X = train[x_cols3]
#     y = train[y_cols3]#.loc[X.index]
#     X, X_test, y, y_test = train_test_split(X, y, test_size=0.2) # less memory used

    
#     forest = RandomForestClassifier(n_estimators=100, max_depth = 20, n_jobs = -1)
#     clf3 = forest.fit(X.values, y.values)

#     # save the model to disk
#     filename = './../fitted_RF/clf3_'+yr+'_ARG'
# #     pickle.dump(forest, open(filename+'.sav', 'wb'))
#     joblib.dump(forest, filename, compress=3)

# #     print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])
# #     del clf3 # liberar memoria eliminando el modelo (pesado)
# #     del train; del X; del y # liberar memoria eliminando los dataframes mas pesados

In [None]:

# y_pred = pd.DataFrame(clf3.predict(X_test), columns = y_test.columns)

# for col in y_test.columns:
#     print(col)
#     conf_vals = confusion_matrix(y_test[col], y_pred[col])
    
#     confusion = pd.DataFrame(conf_vals, columns = sorted(y_test[col].unique()), index = sorted(y_pred[col].unique()))
#     confusion = 100*confusion.div(confusion.sum(1), 0).round(3)#.sum(1)
#     # confusion = 100*confusion.div(confusion.sum(), 1).round(3)#.sum(0)
    
#     #     ## Opcion 1: Multiindex
# #     confusion.columns = prepend_index_level(confusion.columns, key=col, name="Prediccion")
# #     confusion.index = prepend_index_level(confusion.index, key=col, name="Verdadero")
    
#     ## Opcion 2: 
#     confusion.columns.name = "Prediccion"
#     confusion.index.name = "Verdadero"
    
#     display(confusion)
    
#     ## UPDATE G SHEET
#     df = confusion.reset_index()
#     rows, cols = df.shape

#     try:
#         sh.del_worksheet(sh.worksheet(col))
#     except:
#         pass
#     worksheet = sh.add_worksheet(title=col, rows=3+rows, cols= 2+cols)

#     worksheet.update([df.columns.values.tolist()] + [6*['']] + df.values.tolist())
#     worksheet.update('A1', 'Verdadero')
#     worksheet.update('A2', 'Prediccion')
#     worksheet.format('A', {'textFormat': {'bold': True}})
#     worksheet.format('1', {'textFormat': {'bold': True}})

In [None]:
# NOTE(review): `xx` is undefined, so this cell raises NameError when run.
# Presumably a deliberate stop so that "Run All" does not reach the regression section; confirm.
xx

## Regresion
Combinamos la info de los varios trimestres, deflactada.

In [None]:
# startyr = 2003
# endyr = 2004

In [None]:
# Stage 4 (regression): for every year, fit one random-forest regressor per value of
# the `Q` column on the log-transformed income columns and save it to disk.
for yr in [str(s) for s in range(startyr, endyr)]:
# for yr in [str(s) for s in range(2017, endyr)]:
    print(yr)
    train = pd.read_csv('./../data/training/EPHARG_train_'+yr[2:]+'.csv')
#     train = train.merge(AGLO_rk[['AGLOMERADO', 'AGLO_rk']]).merge(Reg_rk[['Region', 'Reg_rk']])

#     train = train.loc[train.P47T >= -0.001].fillna(0)
#     train = train.sort_values('CODUSU') ## Households repeat in every period, so some households would end up only in the test set.
#     train['INGRESO'] = (train.P47T > 100).astype(int)
#     train['INGRESO_NLB'] = (train.T_VI > 100).astype(int)
#     train['INGRESO_JUB'] = (train.V2_M > 100).astype(int)
#     train['INGRESO_SBS'] = (train.V5_M > 100).astype(int)

    # Log-scale the income columns: values are floored at -0.9 so that value + 1 stays
    # positive, then log10(value + 1) is taken.
    train[columnas_pesos] = np.log10(train[columnas_pesos].clip(-.9) + 1)

    # The other training cells create this folder before saving; do the same here so
    # the regression can run on its own.
    os.makedirs('./../fitted_RF/', exist_ok=True)

    for q in train.Q.unique():
        print(q)
        ### STEP 3 (Regression)
        train_q = train.loc[train.Q == q]

        X = train_q[x_cols4]#.sample(frac = 1) #PBA_train_reg
        y = train_q[y_cols4].loc[X.index].fillna(0) #PBA_train_reg

        # Keep a random 90% for fitting; the 10% test part is not used in this cell.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        X = X_train; y = y_train

        forest = RandomForestRegressor(n_estimators=1, max_depth = 40, n_jobs = -1)
        clf4 = forest.fit(X.values, y.values)

        # save the model to disk; str() keeps the path valid when Q is not stored as text
        filename = './../fitted_RF/clf4_'+str(q)+'_ARG'
#     pickle.dump(forest, open(filename+'.sav', 'wb'))
        joblib.dump(forest, filename, compress=3)

        del clf4;
        del train_q;
    # Show the five largest globals before releasing the heaviest frames.
    print(sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)[:5])
    del train; del X # release the heaviest dataframes