<a href="https://colab.research.google.com/github/mlaricobar/solution-datathon-interbank-2020/blob/desarrollo/%5B03%5D%20Feature%20Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering

In [1]:
from google.colab import drive, files

import pandas as pd

from datetime import datetime
from dateutil import relativedelta

In [2]:
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
%cd /gdrive/'My Drive'/Competencias/'[02] Interbank-2020'
!ls -lh '[00] Data'

/gdrive/My Drive/Competencias/[02] Interbank-2020
total 5.1G
-rw------- 1 root root 979M Dec 12 01:33 censo_test.csv
-rw------- 1 root root 748M Dec 12 01:34 censo_train.csv
-rw------- 1 root root  998 Dec 12 01:34 productos.csv
-rw------- 1 root root 1.7G Dec 12 01:34 rcc_test.csv
-rw------- 1 root root 1.4G Dec 12 01:35 rcc_train.csv
-rw------- 1 root root 3.3M Dec 12 01:35 sample_submission.csv
-rw------- 1 root root  30M Dec 12 01:35 se_test.csv
-rw------- 1 root root  28M Dec 12 01:35 se_train.csv
-rw------- 1 root root 122M Dec 12 01:35 sunat_test.csv
-rw------- 1 root root 106M Dec 12 01:35 sunat_train.csv
-rw------- 1 root root 3.0M Dec 12 01:35 y_train.csv


In [4]:
def optimizar_df(df):
  for col_ in df.select_dtypes(include=[int]).columns:
    df[col_] = pd.to_numeric(df[col_], downcast="unsigned")

  for col_ in df.select_dtypes(include=[float]).columns:
    df[col_] = pd.to_numeric(df[col_], downcast="float")
  
  return df

## Dataset Base

In [5]:
df_base_train = optimizar_df(pd.read_csv("[00] Data/y_train.csv"))
df_base_test = optimizar_df(pd.read_csv("[00] Data/sample_submission.csv"))

## Dataset Socio-Demográfico

In [6]:
df_se_train = optimizar_df(pd.read_csv("[00] Data/se_train.csv"))
df_se_test = optimizar_df(pd.read_csv("[00] Data/se_test.csv"))

In [7]:
df_se_train.head()

Unnamed: 0,key_value,edad,sexo,est_cvl,sit_lab,cod_ocu,ctd_hijos,flg_sin_email,ctd_veh,cod_ubi,lgr_vot,prv,dto,rgn,tip_lvledu
0,0,-0.983651,1,4,2,22,0.0,1,0.0,-0.070167,17,47,687,4,0
1,1,0.000953,1,4,1,22,0.0,1,1.0,-0.738566,10,186,849,6,0
2,2,0.531124,0,4,1,22,0.0,1,0.0,-0.297248,15,84,1405,1,7
3,3,-1.135129,0,4,1,22,0.0,1,0.0,-1.203435,4,9,977,6,7
4,4,-1.438084,0,4,1,22,0.0,1,0.0,0.043424,18,111,391,3,0


In [8]:
print(df_se_train["edad"].min(), df_se_train["edad"].max())
print(df_se_train["cod_ubi"].min(), df_se_train["cod_ubi"].max())
print(df_se_test["edad"].min(), df_se_test["edad"].max())
print(df_se_test["cod_ubi"].min(), df_se_test["cod_ubi"].max())

-2.4226877689361572 5.302666187286377
-1.549017310142517 9.250945091247559
-2.1197328567504883 4.242323398590088
-1.549017310142517 9.250945091247559


In [14]:
df_se_train.to_csv("[01] Intermediate/ds_se.csv", index=False)
df_se_test.to_csv("[01] Intermediate/ds_se_test.csv", index=False)

## Dataset RCC

In [9]:
df_rcc_train = optimizar_df(pd.read_csv("[00] Data/rcc_train.csv"))
df_rcc_test = optimizar_df(pd.read_csv("[00] Data/rcc_test.csv"))

In [16]:
df_rcc_train["saldo"] = df_rcc_train["saldo"] + 1
df_rcc_test["saldo"] = df_rcc_test["saldo"] + 1

In [None]:
#round(df_rcc_train[["saldo"]].describe(percentiles=[.25, .5, .75, .9, .95, .99]), 2)
#round(df_rcc_test[["saldo"]].describe(percentiles=[.25, .5, .75, .9, .95, .99]), 2)

In [17]:
def diff_months(codmes_ini, codmes_fin):
  r = relativedelta.relativedelta(datetime.strptime(str(int(codmes_fin)), '%Y%m'), datetime.strptime(str(int(codmes_ini)), '%Y%m'))
  return r.years*12 + r.months

In [18]:
def hist_feat_generator(cat_col, numeric_value, agg_func, df_base, df_base_test):

  df_rcc_crosstab = (pd.crosstab(index=[df_rcc_train.key_value, df_rcc_train.codmes], 
                                columns=df_rcc_train[cat_col], 
                                values=df_rcc_train[numeric_value],
                                aggfunc=agg_func)
                      .reset_index(drop=False)
                      .reset_index(drop=True))
  df_rcc_crosstab.columns = ["{0}_{1}_FOR_{2}_{3}".format(numeric_value.upper(), agg_func.upper(), cat_col, c) if c not in ["key_value", "codmes"] else c for c in df_rcc_crosstab.columns]

  feat_cols = [c for c in df_rcc_crosstab.columns if "{0}_".format(cat_col) in c]
  df_rcc_crosstab["MEAN_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].mean(axis=1)
  df_rcc_crosstab["SUM_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].sum(axis=1)
  df_rcc_crosstab["COUNT_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].count(axis=1)

  df_rcc_test_crosstab = (pd.crosstab(index=[df_rcc_test.key_value, df_rcc_test.codmes], 
                                columns=df_rcc_test[cat_col], 
                                values=df_rcc_test[numeric_value],
                                aggfunc=agg_func)
                      .reset_index(drop=False)
                      .reset_index(drop=True))
  df_rcc_test_crosstab.columns = ["{0}_{1}_FOR_{2}_{3}".format(numeric_value.upper(), agg_func.upper(), cat_col, c) if c not in ["key_value", "codmes"] else c for c in df_rcc_test_crosstab.columns]

  feat_cols = [c for c in df_rcc_test_crosstab.columns if "{0}_".format(cat_col) in c]
  df_rcc_test_crosstab["MEAN_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].mean(axis=1)
  df_rcc_test_crosstab["SUM_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].sum(axis=1)
  df_rcc_test_crosstab["COUNT_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].count(axis=1)

  agg_col_dict = {"codmes": ["last", "first"]}
  agg_col_dict.update({c: ["mean", "sum", "count"] for c in df_rcc_crosstab.columns if c not in ["key_value", "codmes"]})

  for month_level in ["U03M", "U06M", "U12M"]:
    print("\t\t\tFeatures at month level: {0}".format(month_level))
    df_rcc_agg = (df_rcc_crosstab.loc[df_rcc_crosstab["codmes"].isin(month_dict[month_level])]
                                .groupby(["key_value"], as_index=False)
                                .agg(agg_col_dict)
                    )
    df_rcc_agg.columns = [month_level + "_" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_rcc_agg.columns]

    df_rcc_agg["{0}_RECENCY_FOR_{1}".format(month_level, cat_col)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_LAST".format(month_level)], 201803), axis=1)
    df_rcc_agg["{0}_LONG_RECENCY_FOR_{1}".format(month_level, cat_col)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], 201803), axis=1)
    df_rcc_agg["{0}_DIFF_MONTHS_FIRST_LAST_FOR_{1}".format(month_level, cat_col)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], row["{0}_CODMES_LAST".format(month_level)]), axis=1)
    del df_rcc_agg["{0}_CODMES_FIRST".format(month_level, cat_col)]
    del df_rcc_agg["{0}_CODMES_LAST".format(month_level, cat_col)]

    df_rcc_test_agg = (df_rcc_test_crosstab.loc[df_rcc_test_crosstab["codmes"].isin(month_dict_test[month_level])]
                                .groupby(["key_value"], as_index=False)
                                .agg(agg_col_dict)
                       )
    df_rcc_test_agg.columns = [month_level + "_" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_rcc_test_agg.columns]
    
    df_rcc_test_agg["{0}_RECENCY_FOR_{1}".format(month_level, cat_col)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_LAST".format(month_level)], 201903), axis=1)
    df_rcc_test_agg["{0}_LONG_RECENCY_FOR_{1}".format(month_level, cat_col)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], 201903), axis=1)
    df_rcc_test_agg["{0}_DIFF_MONTHS_FIRST_LAST_FOR_{1}".format(month_level, cat_col)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], row["{0}_CODMES_LAST".format(month_level)]), axis=1)
    del df_rcc_test_agg["{0}_CODMES_FIRST".format(month_level, cat_col)]
    del df_rcc_test_agg["{0}_CODMES_LAST".format(month_level, cat_col)]

    df_base = df_base.merge(df_rcc_agg, how="left", on="key_value")
    df_base_test = df_base_test.merge(df_rcc_test_agg, how="left", on="key_value")

  return df_base, df_base_test

In [19]:
month_dict = {"U12M": [201802, 201801, 201712, 201711, 201710, 201709, 201708, 201707, 201706, 201705, 201704, 201703],
              "U06M": [201802, 201801, 201712, 201711, 201710, 201709],
              "U03M": [201802, 201801, 201712],
              "U01M": [201802]
              }

month_dict_test = {"U12M": [201902, 201901, 201812, 201811, 201810, 201809, 201808, 201807, 201806, 201805, 201804, 201803],
              "U06M": [201902, 201901, 201812, 201811, 201810, 201809],
              "U03M": [201902, 201901, 201812],
              "U01M": [201902]
              }

#### **Variable Producto**

In [20]:
top_freq_products = [1, 2, 3, 8, 255, 4, 0, 6]
top_propensity_products = [12, 17, 18, 25, 20]
low_propensity_products = [32, 33, 35, 36, 37, 38, 40, 41]
list_product = top_freq_products + top_propensity_products + low_propensity_products
print(list_product)

[1, 2, 3, 8, 255, 4, 0, 6, 12, 17, 18, 25, 20, 32, 33, 35, 36, 37, 38, 40, 41]


In [21]:
df_rcc_test["PRODUCTO"] = df_rcc_test["PRODUCTO"].fillna(255)
df_rcc_test["PRODUCTO"] = df_rcc_test["PRODUCTO"].astype(int)

In [22]:
cat_col = "PRODUCTO"
df_rcc_train.loc[~df_rcc_train[cat_col].isin(list_product), cat_col] = "OTROS"
df_rcc_train.loc[df_rcc_train[cat_col].isin(top_propensity_products), cat_col] = "TOP_PROPENSITY"
df_rcc_train.loc[df_rcc_train[cat_col].isin(low_propensity_products), cat_col] = "LOW_PROPENSITY"

df_rcc_test.loc[~df_rcc_test[cat_col].isin(list_product), cat_col] = "OTROS"
df_rcc_test.loc[df_rcc_test[cat_col].isin(top_propensity_products), cat_col] = "TOP_PROPENSITY"
df_rcc_test.loc[df_rcc_test[cat_col].isin(low_propensity_products), cat_col] = "LOW_PROPENSITY"

#### **Variable Institucion Financiera**

In [23]:
top_freq_instits = [34, 1, 10, 28, 11, 37, 0, 2]
top_propensity_instits = [9, 26, 89, 105, 99, 39, 97, 90, 108, 106, 107, 95, 49, 94, 100, 77, 91, 96, 6, 69, 104, 85, 70, 88, 93, 101, 75, 80, 73, 76, 52, 46, 68, 71, 92, 63, 60, 55, 65] # [74, 5]
low_propensity_instits = [12, 86, 47, 14, 78, 84] #[67, 66, 87, 98, 72, 31, 81]
list_instit = top_freq_instits + top_propensity_instits + low_propensity_instits
print(list_instit)

[34, 1, 10, 28, 11, 37, 0, 2, 9, 26, 89, 105, 99, 39, 97, 90, 108, 106, 107, 95, 49, 94, 100, 77, 91, 96, 6, 69, 104, 85, 70, 88, 93, 101, 75, 80, 73, 76, 52, 46, 68, 71, 92, 63, 60, 55, 65, 12, 86, 47, 14, 78, 84]


In [24]:
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"].fillna(255)
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"].astype(int)

In [25]:
df_rcc_train["cod_instit_financiera"] = df_rcc_train["cod_instit_financiera"]
df_rcc_train.loc[df_rcc_train["cod_instit_financiera"].isin(top_propensity_instits), "cod_instit_financiera"] = "TOP_PROPENSITY"
df_rcc_train.loc[df_rcc_train["cod_instit_financiera"].isin(low_propensity_instits), "cod_instit_financiera"] = "LOW_PROPENSITY"
df_rcc_train.loc[~df_rcc_train["cod_instit_financiera"].isin(list_instit), "cod_instit_financiera"] = "OTROS"

In [26]:
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"]
df_rcc_test.loc[df_rcc_test["cod_instit_financiera"].isin(top_propensity_instits), "cod_instit_financiera"] = "TOP_PROPENSITY"
df_rcc_test.loc[df_rcc_test["cod_instit_financiera"].isin(low_propensity_instits), "cod_instit_financiera"] = "LOW_PROPENSITY"
df_rcc_test.loc[~df_rcc_test["cod_instit_financiera"].isin(list_instit), "cod_instit_financiera"] = "OTROS"

#### **Variable Condición como categórica**

In [27]:
df_rcc_train["condicion_cat"] = df_rcc_train["condicion"].apply(lambda c: 0 if c == 0 else "00_03" if c < 90 else  "03_06" if c < 180 else "06_12" if c < 365 else "12-32" if c < 960 else ">32")
df_rcc_test["condicion_cat"] = df_rcc_test["condicion"].apply(lambda c: 0 if c == 0 else "00_03" if c < 90 else  "03_06" if c < 180 else "06_12" if c < 365 else "12-32" if c < 960 else ">32")

In [28]:
df_rcc_train["condicion_cat"].value_counts(normalize=True, dropna=False)

0        0.966859
00_03    0.022446
>32      0.005332
12-32    0.002580
03_06    0.001442
06_12    0.001340
Name: condicion_cat, dtype: float64

In [29]:
df_rcc_test["condicion_cat"].value_counts(normalize=True, dropna=False)

0        0.967486
00_03    0.019706
>32      0.006578
12-32    0.002869
06_12    0.001725
03_06    0.001636
Name: condicion_cat, dtype: float64

#### **Generando los features**

In [31]:
df_base = df_base_train[["key_value"]].copy()
df_base_test = df_base_test[["key_value"]].copy()

In [32]:
for cat_col in ["PRODUCTO", "cod_instit_financiera", "condicion_cat"]:
  print("Cat Feature: {0}".format(cat_col))
  for numeric_value in ["saldo"]:
    print("\tNumeric Feature: {0}".format(numeric_value))
    for agg_func in ["mean"]:
      print("\t\tAgg func: {0}".format(agg_func))
      df_base, df_base_test = hist_feat_generator(cat_col, numeric_value, agg_func, df_base, df_base_test)

Cat Feature: PRODUCTO
	Numeric Feature: saldo
		Agg func: mean
			Features at month level: U03M
			Features at month level: U06M
			Features at month level: U12M
Cat Feature: cod_instit_financiera
	Numeric Feature: saldo
		Agg func: mean
			Features at month level: U03M
			Features at month level: U06M
			Features at month level: U12M
Cat Feature: condicion_cat
	Numeric Feature: saldo
		Agg func: mean
			Features at month level: U03M
			Features at month level: U06M
			Features at month level: U12M


In [33]:
df_base.shape, df_base_test.shape

((358487, 343), (396609, 343))

In [34]:
df_base.shape, df_base_test.shape

((358487, 343), (396609, 343))

#### **Exportando el dataset de RCC**

In [35]:
df_base.to_csv("[01] Intermediate/ds_rcc.csv", index=False)
df_base_test.to_csv("[01] Intermediate/ds_rcc_test.csv", index=False)

## SUNAT

In [6]:
df_sunat_train = optimizar_df(pd.read_csv("[00] Data/sunat_train.csv")).drop_duplicates()
df_sunat_test = optimizar_df(pd.read_csv("[00] Data/sunat_test.csv")).drop_duplicates()

In [10]:
df_sunat_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292479 entries, 0 to 1462035
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   key_value            292479 non-null  uint32 
 1   tipcontribuyente     292479 non-null  uint8  
 2   tippersona           292479 non-null  uint8  
 3   ciiu                 292479 non-null  uint16 
 4   ubigeo               292479 non-null  uint16 
 5   condiciondomicilio   292479 non-null  uint8  
 6   estadocontribuyente  292479 non-null  uint8  
 7   fecalta              272027 non-null  float32
 8   fecbaja              46802 non-null   float32
 9   codvia               292479 non-null  uint8  
 10  codzona              292479 non-null  uint8  
 11  contabilidad         292479 non-null  uint8  
 12  facturacion          292479 non-null  uint8  
 13  domiciliado          292479 non-null  uint8  
 14  comercioexterior     292479 non-null  uint8  
 15  cargorele       

In [7]:
df_sunat_train.head()

Unnamed: 0,key_value,tipcontribuyente,tippersona,ciiu,ubigeo,condiciondomicilio,estadocontribuyente,fecalta,fecbaja,codvia,codzona,contabilidad,facturacion,domiciliado,comercioexterior,cargorele,codentidadtributo,estadotributo
0,190597,9,0,178,442,0,0,-0.785832,,0,0,6,11,0,1,635,8,2
1,162813,0,0,253,449,0,0,0.971584,,20,0,6,11,0,1,635,8,2
2,190620,9,0,173,473,0,2,0.025283,-1.561407,11,0,6,11,0,1,635,8,2
3,43886,0,0,285,154,0,0,0.566027,,11,3,6,11,0,1,635,8,2
4,255211,0,0,285,456,0,0,0.836399,,11,3,6,11,0,1,635,8,2


In [None]:
from scipy import stats

In [20]:
df_sunat_train_agg = df_sunat_train.groupby("key_value", as_index=False).agg({c: stats.mode for c in df_sunat_train.columns if c not in ["key_value", "fecalta", "fecbaja"]})

In [21]:
df_sunat_test_agg = df_sunat_test.groupby("key_value", as_index=False).agg({c: stats.mode for c in df_sunat_test.columns if c not in ["key_value", "fecalta", "fecbaja"]})

In [36]:
for col in [c for c in df_sunat_train.columns if c not in ["key_value", "fecalta", "fecbaja"]]:
  df_sunat_train_agg[col] = df_sunat_train_agg[col].apply(lambda t: t[0][0])
  df_sunat_test_agg[col] = df_sunat_test_agg[col].apply(lambda t: t[0][0])

In [37]:
df_sunat_train_fecha = df_sunat_train.groupby("key_value", as_index=False).agg({"fecalta": ["min", "max", "mean"], "fecbaja": ["min", "max", "mean"]})
df_sunat_test_fecha = df_sunat_test.groupby("key_value", as_index=False).agg({"fecalta": ["min", "max", "mean"], "fecbaja": ["min", "max", "mean"]})

In [38]:
df_sunat_train_fecha.columns = ["SUNAT" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_sunat_train_fecha.columns]
df_sunat_test_fecha.columns = ["SUNAT" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_sunat_test_fecha.columns]

In [40]:
df_sunat_train_agg = df_sunat_train_agg.merge(df_sunat_train_fecha, how="left", on="key_value")
df_sunat_test_agg = df_sunat_test_agg.merge(df_sunat_test_fecha, how="left", on="key_value")

In [43]:
df_sunat_train_agg.to_csv("[01] Intermediate/ds_sunat.csv", index=False)
df_sunat_test_agg.to_csv("[01] Intermediate/ds_sunat_test.csv", index=False)

In [42]:
df_sunat_test_agg.head()

Unnamed: 0,key_value,tipcontribuyente,tippersona,ciiu,ubigeo,condiciondomicilio,estadocontribuyente,codvia,codzona,contabilidad,facturacion,domiciliado,comercioexterior,cargorele,codentidadtributo,estadotributo,SUNATFECALTA_MIN,SUNATFECALTA_MAX,SUNATFECALTA_MEAN,SUNATFECBAJA_MIN,SUNATFECBAJA_MAX,SUNATFECBAJA_MEAN
0,0,0,0,255,473,0,0,3,3,6,11,0,2,635,8,2,0.566027,0.566027,0.566027,,,
1,2,0,0,244,473,0,0,0,3,6,11,0,1,635,8,2,1.377142,1.377142,1.377142,,,
2,4,0,0,285,1109,0,0,3,3,6,11,0,2,635,8,2,1.377142,1.377142,1.377142,,,
3,5,9,0,177,674,0,3,3,3,2,11,0,2,635,8,2,1.10677,1.10677,1.10677,1.422525,1.422525,1.422525
4,7,0,0,285,500,0,0,3,10,6,11,0,2,635,8,2,1.7827,1.7827,1.7827,,,


In [8]:
df_sunat_train.groupby("key_value", as_index=False).agg({"tipcontribuyente": "count"}).sort_values(by="tipcontribuyente", ascending=False)

Unnamed: 0,key_value,tipcontribuyente
172663,262090,96
80409,121911,63
57921,87603,51
149929,227523,47
157189,238557,45
...,...,...
83372,126457,1
83373,126458,1
83376,126461,1
83377,126463,1


In [9]:
df_sunat_test.groupby("key_value", as_index=False).agg({"tipcontribuyente": "count"}).sort_values(by="tipcontribuyente", ascending=False)


Unnamed: 0,key_value,tipcontribuyente
16321,24767,655
143253,220827,93
227474,352446,65
10113,15340,62
142130,219118,49
...,...,...
90676,139485,1
90677,139487,1
90678,139489,1
90679,139490,1


## Censo

In [45]:
df_censo_train = optimizar_df(pd.read_csv("[00] Data/censo_train.csv")).drop_duplicates()
df_censo_test = optimizar_df(pd.read_csv("[00] Data/censo_test.csv")).drop_duplicates()

In [46]:
df_censo_train.head()

Unnamed: 0,key_value,lon_x,lat_y,zon_sis,zon_der,zon_inu,zon_com,score,pob,pob_a,pob_b,pob_c,pob_d,pob_e,hog,a,b,c,d,e,pred,ing_a_18,gas_ali_a_18,gas_ves_a_18,gas_alq_a_18,gas_mue_a_18,gas_sal_a_18,gas_trans_a_18,gas_esp_a_18,gas_otrs_a_18,ing_b_18,gas_ali_b_18,gas_ves_b_18,gas_alq_b_18,gas_mue_b_18,gas_sal_b_18,gas_trans_b_18,gas_esp_b_18,gas_otrs_b_18,ing_c_18,...,estud_9,estud_10,profe_1,profe_2,profe_3,profe_4,profe_5,profe_6,profe_7,profe_8,profe_9,profe_10,acteco1,acteco2,acteco3,acteco4,acteco5,acteco6,acteco7,acteco8,acteco9,acteco10,acteco11,acteco12,acteco13,acteco14,acteco15,acteco16,acteco17,acteco18,acteco19,acteco20,acteco21,ocup_1,ocup_2,ocup_3,ocup_4,ocup_5,ocup_6,flag_censo
0,106695,1.827118,-0.711359,-0.931515,-0.242328,-0.32701,-0.280674,-0.714361,-0.544996,-0.312923,-0.425825,-0.565415,-0.089094,-0.165019,-0.537031,-0.302006,-0.418318,-0.547005,-0.110158,-0.187616,4,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.535443,...,-0.199376,-0.308,-0.340184,-0.217179,-0.355359,-0.206882,-0.225646,-0.151014,-0.339847,-0.419854,-0.733954,-0.205903,-0.237047,-0.368741,-0.555645,-0.273762,-0.338135,0.041469,-0.511477,-0.457707,0.639525,-0.202316,-0.173108,-0.331307,-0.224216,-0.155999,-0.232125,-0.378461,-0.270605,-0.543045,-0.583661,-0.644919,-0.11823,-0.63972,-0.329153,-0.248796,-0.623164,-0.605414,-0.644919,0
1,271120,-0.480514,0.198531,0.054361,-0.242328,-0.32701,-0.280674,-0.104283,-0.06042,-0.312923,-0.485131,0.448644,-0.059923,-0.308719,-0.078552,-0.302006,-0.457537,0.46281,-0.162021,-0.31726,3,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,0.334792,...,-0.008364,-0.308,0.017784,-0.139198,-0.500427,0.000321,-0.076505,0.239948,0.184462,0.143793,-0.041876,-0.205903,0.215731,-0.368741,-0.130826,-0.273762,-0.338135,-0.127766,-0.076081,0.214074,0.059442,0.017362,-0.402767,-0.331307,-0.171162,-0.658023,-0.00767,0.049331,-0.270605,-0.162191,-0.215123,-0.199299,-0.11823,-0.234777,-0.113894,-0.135995,0.032167,0.062284,-0.199299,0
2,63080,-0.192205,-0.21657,1.040236,-0.242328,-0.32701,-0.280674,-0.124272,0.60751,-0.312923,1.092421,-0.23026,0.961048,0.733106,0.572192,-0.302006,0.993563,-0.2203,0.927114,0.849535,2,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,1.106274,1.106274,1.106274,1.106274,1.106274,1.106274,1.106274,1.106274,1.106274,0.004703,...,-0.180274,0.137688,-0.340184,-0.035223,0.176559,0.069388,0.44549,-0.151014,0.097077,0.566528,-0.041876,-0.01101,-0.237047,0.360282,0.378958,-0.273762,-0.338135,-0.466236,0.1779,0.438002,0.494504,0.237039,-0.173108,-0.331307,-0.224216,0.513367,-0.232125,0.370175,-0.270605,1.742079,0.521952,0.24632,-0.11823,0.035186,0.224369,0.117805,-0.131666,1.063831,0.24632,0
3,107681,-0.803171,0.74864,1.040236,-0.242328,-0.32701,-0.280674,-0.540244,-0.658501,-0.131975,-0.437686,-0.59979,-0.439141,-0.308719,-0.625768,-0.136362,-0.418318,-0.576705,-0.421339,-0.31726,4,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.565451,...,-0.485894,-0.382281,0.017784,-0.451123,-0.500427,-0.621288,-0.598499,-0.151014,-0.514617,-0.84259,-0.657057,-0.205903,0.064805,-0.368741,-0.470681,-0.273762,-0.338135,-0.804707,-0.54776,-0.793597,-0.665663,-0.421994,-0.402767,-0.331307,-0.542542,-0.658023,-0.45658,-0.699305,-0.48739,-0.543045,-0.215123,-0.644919,-0.11823,-0.369758,-0.698168,-0.530797,-0.70508,-0.605414,-0.644919,0
4,107336,1.983624,-1.649749,-0.931515,-0.242328,-0.32701,-0.280674,-0.104283,-0.470782,0.669365,0.048627,-0.694321,-0.643335,-0.452419,-0.403924,0.609035,0.130747,-0.665807,-0.628794,-0.446903,2,0.804502,0.804502,0.804502,0.804502,0.804502,0.804502,0.804502,0.804502,0.804502,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.655475,...,0.010738,0.509094,0.017784,0.042758,-0.258646,-0.345017,-0.710355,-0.151014,-0.776772,-0.84259,-0.733954,-0.205903,-0.086121,0.360282,-0.555645,-0.273762,-0.338135,-0.297001,-0.692892,-0.793597,-0.520642,-0.421994,0.056551,-0.331307,-0.011999,-0.658023,-0.232125,-0.271513,-0.05382,-0.162191,-0.583661,-0.199299,-0.11823,-0.099795,-0.667416,-0.276996,-0.786996,-0.605414,-0.199299,0


In [52]:
df_censo_train.shape, df_censo_train["key_value"].nunique()

((153139, 256), 153139)

In [53]:
df_censo_test.shape, df_censo_test["key_value"].nunique()

((200499, 256), 200499)

## Todos los Datasets

In [44]:
!ls -lh '[01] Intermediate'

total 2.0G
-rw------- 1 root root 521M Dec 20 17:16 ds_rcc.csv
-rw------- 1 root root 185M Dec 18 16:51 ds_rcc_instit.csv
-rw------- 1 root root 204M Dec 18 16:51 ds_rcc_instit_test.csv
-rw------- 1 root root 576M Dec 20 17:18 ds_rcc_test.csv
-rw------- 1 root root  22M Dec 20 13:20 ds_se.csv
-rw------- 1 root root  24M Dec 20 13:20 ds_se_test.csv
-rw------- 1 root root  18M Dec 31 19:34 ds_sunat.csv
-rw------- 1 root root  20M Dec 31 19:34 ds_sunat_test.csv
-rw------- 1 root root 192M Dec 31 13:48 rcc_product_level.csv
-rw------- 1 root root 212M Dec 31 17:27 rcc_product_level_test.csv


---