<a href="https://colab.research.google.com/github/mlaricobar/solution-datathon-interbank-2020/blob/desarrollo/%5B03%5D%20Feature%20Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering

In [1]:
from google.colab import drive, files

import pandas as pd

from datetime import datetime
from dateutil import relativedelta

In [2]:
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
%cd /gdrive/'My Drive'/Competencias/'[02] Interbank-2020'
!ls -lh '[00] Data'

/gdrive/My Drive/Competencias/[02] Interbank-2020
total 5.1G
-rw------- 1 root root 979M Dec 12 01:33 censo_test.csv
-rw------- 1 root root 748M Dec 12 01:34 censo_train.csv
-rw------- 1 root root  998 Dec 12 01:34 productos.csv
-rw------- 1 root root 1.7G Dec 12 01:34 rcc_test.csv
-rw------- 1 root root 1.4G Dec 12 01:35 rcc_train.csv
-rw------- 1 root root 3.3M Dec 12 01:35 sample_submission.csv
-rw------- 1 root root  30M Dec 12 01:35 se_test.csv
-rw------- 1 root root  28M Dec 12 01:35 se_train.csv
-rw------- 1 root root 122M Dec 12 01:35 sunat_test.csv
-rw------- 1 root root 106M Dec 12 01:35 sunat_train.csv
-rw------- 1 root root 3.0M Dec 12 01:35 y_train.csv


In [4]:
def optimizar_df(df):
  for col_ in df.select_dtypes(include=[int]).columns:
    df[col_] = pd.to_numeric(df[col_], downcast="unsigned")

  for col_ in df.select_dtypes(include=[float]).columns:
    df[col_] = pd.to_numeric(df[col_], downcast="float")
  
  return df

## Dataset Base

In [5]:
df_base_train = optimizar_df(pd.read_csv("[00] Data/y_train.csv"))
df_base_test = optimizar_df(pd.read_csv("[00] Data/sample_submission.csv"))

## Dataset Socio-Demográfico

In [8]:
df_se_train = optimizar_df(pd.read_csv("[00] Data/se_train.csv"))
df_se_test = optimizar_df(pd.read_csv("[00] Data/se_test.csv"))

In [None]:
df_se_train.to_csv("[01] Intermediate/ds_se.csv", index=False)
df_se_test.to_csv("[01] Intermediate/ds_se_test.csv", index=False)

## Dataset RCC

In [6]:
df_rcc_train = optimizar_df(pd.read_csv("[00] Data/rcc_train.csv"))
df_rcc_test = optimizar_df(pd.read_csv("[00] Data/rcc_test.csv"))

In [7]:
df_rcc_train["saldo"] = df_rcc_train["saldo"] + 1
df_rcc_test["saldo"] = df_rcc_test["saldo"] + 1

In [11]:
#round(df_rcc_train[["saldo"]].describe(percentiles=[.25, .5, .75, .9, .95, .99]), 2)
#round(df_rcc_test[["saldo"]].describe(percentiles=[.25, .5, .75, .9, .95, .99]), 2)

In [12]:
def diff_months(codmes_ini, codmes_fin):
  r = relativedelta.relativedelta(datetime.strptime(str(int(codmes_fin)), '%Y%m'), datetime.strptime(str(int(codmes_ini)), '%Y%m'))
  return r.years*12 + r.months

In [13]:
df_rcc_train.head(3)

Unnamed: 0,codmes,key_value,condicion,tipo_credito,cod_instit_financiera,saldo,PRODUCTO,RIESGO_DIRECTO,COD_CLASIFICACION_DEUDOR
0,201711,4,0,12,33,0.927399,6,-1,0
1,201711,4,0,12,33,0.929392,7,-1,0
2,201711,4,0,12,61,0.947946,5,-1,0


In [1]:
df_rcc_train["COD_CLASIFICACION_DEUDOR"].unique()

NameError: ignored

### Historical Features

In [14]:
from scipy.stats import hmean, gmean, kurtosis, skew, entropy

#### Count, sum, min, max, mean

In [15]:
df_rcc_train_agg = (df_rcc_train.groupby(["key_value", "codmes"], as_index=False)
                                .agg({
                                        "saldo": ["count", "sum", "min", "max", "mean"],
                                        "condicion": ["sum", "min", "max", "mean"]
                                }))

In [16]:
df_rcc_test_agg = (df_rcc_test.groupby(["key_value", "codmes"], as_index=False)
                                .agg({
                                        "saldo": ["count", "sum", "min", "max", "mean"],
                                        "condicion": ["sum", "min", "max", "mean"]
                                }))

In [17]:
df_rcc_train_agg.columns = ["_".join(c) if c[-1] != "" else c[0] for c in df_rcc_train_agg.columns]
df_rcc_test_agg.columns = ["_".join(c) if c[-1] != "" else c[0] for c in df_rcc_test_agg.columns]

In [18]:
df_rcc_train_agg.head()

Unnamed: 0,key_value,codmes,saldo_count,saldo_sum,saldo_min,saldo_max,saldo_mean,condicion_sum,condicion_min,condicion_max,condicion_mean
0,0,201703,4,3.777921,0.927186,0.961819,0.94448,0,0,0,0.0
1,0,201704,5,4.705365,0.927186,0.961819,0.941073,0,0,0,0.0
2,0,201705,5,4.705208,0.927186,0.961819,0.941042,0,0,0,0.0
3,0,201706,5,4.705231,0.927186,0.961819,0.941046,0,0,0,0.0
4,0,201707,5,4.705533,0.927186,0.961819,0.941107,0,0,0,0.0


In [21]:
col_num = ['saldo_count', 'saldo_sum', 'saldo_min', 'saldo_max', 'saldo_mean', 'condicion_sum', 'condicion_min', 'condicion_max', 'condicion_mean']

In [22]:
import time

In [23]:
for col_ in col_num:
    print("Numeric Feature: {0}".format(col_))
    ti = time.time()
    df_rcc_train_agg["last{0}".format(col_)] = df_rcc_train_agg.groupby("key_value")[col_].shift(1)
    df_rcc_test_agg["last{0}".format(col_)] = df_rcc_test_agg.groupby("key_value")[col_].shift(1)
  
    if col_ == "codmes":
        df_rcc_train_agg["difflast{0}".format(col_)] = df_rcc_train_agg.apply(lambda row: diff_months(row["last{0}".format(col_)], row[col_]), axis=1)
        df_rcc_test_agg["difflast{0}".format(col_)] = df_rcc_test_agg.apply(lambda row: diff_months(row["last{0}".format(col_)], row[col_]), axis=1)
    else:
        df_rcc_train_agg["difflast{0}".format(col_)] = df_rcc_train_agg.apply(lambda row: row[col_] - row["last{0}".format(col_)], axis=1)
        df_rcc_test_agg["difflast{0}".format(col_)] = df_rcc_test_agg.apply(lambda row: row[col_] - row["last{0}".format(col_)], axis=1)
  
    df_rcc_train_agg["absdifflast{0}".format(col_)] = abs(df_rcc_train_agg["difflast{0}".format(col_)])
    df_rcc_test_agg["absdifflast{0}".format(col_)] = abs(df_rcc_test_agg["difflast{0}".format(col_)])

    df_rcc_train_agg.loc[pd.notnull(df_rcc_train_agg["difflast{0}".format(col_)]), "incrflag{0}".format(col_)] = df_rcc_train_agg.loc[pd.notnull(df_rcc_train_agg["difflast{0}".format(col_)]), "difflast{0}".format(col_)].apply(lambda c: 1 if c > 0 else 0)
    df_rcc_train_agg.loc[pd.notnull(df_rcc_train_agg["difflast{0}".format(col_)]), "decrflag{0}".format(col_)] = df_rcc_train_agg.loc[pd.notnull(df_rcc_train_agg["difflast{0}".format(col_)]), "difflast{0}".format(col_)].apply(lambda c: 1 if c < 0 else 0)
    df_rcc_train_agg.loc[pd.notnull(df_rcc_train_agg["difflast{0}".format(col_)]), "sameflag{0}".format(col_)] = df_rcc_train_agg.loc[pd.notnull(df_rcc_train_agg["difflast{0}".format(col_)]), "difflast{0}".format(col_)].apply(lambda c: 1 if c == 0 else 0)

    df_rcc_test_agg.loc[pd.notnull(df_rcc_test_agg["difflast{0}".format(col_)]), "incrflag{0}".format(col_)] = df_rcc_test_agg.loc[pd.notnull(df_rcc_test_agg["difflast{0}".format(col_)]), "difflast{0}".format(col_)].apply(lambda c: 1 if c > 0 else 0)
    df_rcc_test_agg.loc[pd.notnull(df_rcc_test_agg["difflast{0}".format(col_)]), "decrflag{0}".format(col_)] = df_rcc_test_agg.loc[pd.notnull(df_rcc_test_agg["difflast{0}".format(col_)]), "difflast{0}".format(col_)].apply(lambda c: 1 if c < 0 else 0)
    df_rcc_test_agg.loc[pd.notnull(df_rcc_test_agg["difflast{0}".format(col_)]), "sameflag{0}".format(col_)] = df_rcc_test_agg.loc[pd.notnull(df_rcc_test_agg["difflast{0}".format(col_)]), "difflast{0}".format(col_)].apply(lambda c: 1 if c == 0 else 0)

    tf = time.time()
    print("\tCalculo de variables en {0:.2f} segundos.".format(tf-ti))

Numeric Feature: saldo_count
	Calculo de variables en 105.61 segundos.
Numeric Feature: saldo_sum
	Calculo de variables en 102.37 segundos.
Numeric Feature: saldo_min
	Calculo de variables en 104.33 segundos.
Numeric Feature: saldo_max
	Calculo de variables en 104.74 segundos.
Numeric Feature: saldo_mean
	Calculo de variables en 104.79 segundos.
Numeric Feature: condicion_sum
	Calculo de variables en 108.59 segundos.
Numeric Feature: condicion_min
	Calculo de variables en 110.92 segundos.
Numeric Feature: condicion_max
	Calculo de variables en 110.67 segundos.
Numeric Feature: condicion_mean
	Calculo de variables en 115.29 segundos.


In [25]:
df_rcc_train_agg.head(5)

Unnamed: 0,key_value,codmes,saldo_count,saldo_sum,saldo_min,saldo_max,saldo_mean,condicion_sum,condicion_min,condicion_max,condicion_mean,lastsaldo_count,difflastsaldo_count,absdifflastsaldo_count,incrflagsaldo_count,decrflagsaldo_count,sameflagsaldo_count,lastsaldo_sum,difflastsaldo_sum,absdifflastsaldo_sum,incrflagsaldo_sum,decrflagsaldo_sum,sameflagsaldo_sum,lastsaldo_min,difflastsaldo_min,absdifflastsaldo_min,incrflagsaldo_min,decrflagsaldo_min,sameflagsaldo_min,lastsaldo_max,difflastsaldo_max,absdifflastsaldo_max,incrflagsaldo_max,decrflagsaldo_max,sameflagsaldo_max,lastsaldo_mean,difflastsaldo_mean,absdifflastsaldo_mean,incrflagsaldo_mean,decrflagsaldo_mean,sameflagsaldo_mean,lastcondicion_sum,difflastcondicion_sum,absdifflastcondicion_sum,incrflagcondicion_sum,decrflagcondicion_sum,sameflagcondicion_sum,lastcondicion_min,difflastcondicion_min,absdifflastcondicion_min,incrflagcondicion_min,decrflagcondicion_min,sameflagcondicion_min,lastcondicion_max,difflastcondicion_max,absdifflastcondicion_max,incrflagcondicion_max,decrflagcondicion_max,sameflagcondicion_max,lastcondicion_mean,difflastcondicion_mean,absdifflastcondicion_mean,incrflagcondicion_mean,decrflagcondicion_mean,sameflagcondicion_mean
0,0,201703,4,3.777921,0.927186,0.961819,0.94448,0,0,0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0,201704,5,4.705365,0.927186,0.961819,0.941073,0,0,0,0.0,4.0,1.0,1.0,1.0,0.0,0.0,3.777921,0.927444,0.927444,1.0,0.0,0.0,0.927186,0.0,0.0,0.0,0.0,1.0,0.961819,0.0,0.0,0.0,0.0,1.0,0.94448,-0.003407,0.003407,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,201705,5,4.705208,0.927186,0.961819,0.941042,0,0,0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,4.705365,-0.000157,0.000157,0.0,1.0,0.0,0.927186,0.0,0.0,0.0,0.0,1.0,0.961819,0.0,0.0,0.0,0.0,1.0,0.941073,-3.1e-05,3.1e-05,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,201706,5,4.705231,0.927186,0.961819,0.941046,0,0,0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,4.705208,2.2e-05,2.2e-05,1.0,0.0,0.0,0.927186,0.0,0.0,0.0,0.0,1.0,0.961819,0.0,0.0,0.0,0.0,1.0,0.941042,5e-06,5e-06,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,201707,5,4.705533,0.927186,0.961819,0.941107,0,0,0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,4.705231,0.000302,0.000302,1.0,0.0,0.0,0.927186,0.0,0.0,0.0,0.0,1.0,0.961819,0.0,0.0,0.0,0.0,1.0,0.941046,6e-05,6e-05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [26]:
c_month = {"codmes": ["count", "first", "last"]}
c_dict = {c: ["sum", "mean", "std", "min", "max", "first", "last"] for c in col_num}
c_grow_dict = {c: ["sum", "mean", "std", "min", "max", "first", "last"] for c in df_rcc_train_agg.columns if (('incrflag' in c) or ('decrflag' in c))}
c_abs_diff_dict = {c: ["sum", "mean", "std", "min", "max", "first", "last"] for c in df_rcc_train_agg.columns if 'absdiff' in c}

c_dict.update(c_month)
c_dict.update(c_grow_dict)
c_dict.update(c_abs_diff_dict)

In [34]:
n_months = 12
prefix_ = "U{0}M_".format(n_months)

df_rcc_train_agg_all = df_rcc_train_agg.groupby("key_value", as_index=False).agg(c_dict)
df_rcc_train_agg_all.columns = [prefix_ + "_".join(c) if c[-1] != '' else c[0] for c in df_rcc_train_agg_all.columns]

df_rcc_test_agg_all = df_rcc_test_agg.groupby("key_value", as_index=False).agg(c_dict)
df_rcc_test_agg_all.columns = [prefix_ + "_".join(c) if c[-1] != '' else c[0] for c in df_rcc_test_agg_all.columns]

In [33]:
df_rcc_train_agg_all.head()

Unnamed: 0,key_value,U12M_saldo_count_sum,U12M_saldo_count_mean,U12M_saldo_count_std,U12M_saldo_count_min,U12M_saldo_count_max,U12M_saldo_count_first,U12M_saldo_count_last,U12M_saldo_sum_sum,U12M_saldo_sum_mean,U12M_saldo_sum_std,U12M_saldo_sum_min,U12M_saldo_sum_max,U12M_saldo_sum_first,U12M_saldo_sum_last,U12M_saldo_min_sum,U12M_saldo_min_mean,U12M_saldo_min_std,U12M_saldo_min_min,U12M_saldo_min_max,U12M_saldo_min_first,U12M_saldo_min_last,U12M_saldo_max_sum,U12M_saldo_max_mean,U12M_saldo_max_std,U12M_saldo_max_min,U12M_saldo_max_max,U12M_saldo_max_first,U12M_saldo_max_last,U12M_saldo_mean_sum,U12M_saldo_mean_mean,U12M_saldo_mean_std,U12M_saldo_mean_min,U12M_saldo_mean_max,U12M_saldo_mean_first,U12M_saldo_mean_last,U12M_condicion_sum_sum,U12M_condicion_sum_mean,U12M_condicion_sum_std,U12M_condicion_sum_min,...,U12M_absdifflastsaldo_max_std,U12M_absdifflastsaldo_max_min,U12M_absdifflastsaldo_max_max,U12M_absdifflastsaldo_max_first,U12M_absdifflastsaldo_max_last,U12M_absdifflastsaldo_mean_sum,U12M_absdifflastsaldo_mean_mean,U12M_absdifflastsaldo_mean_std,U12M_absdifflastsaldo_mean_min,U12M_absdifflastsaldo_mean_max,U12M_absdifflastsaldo_mean_first,U12M_absdifflastsaldo_mean_last,U12M_absdifflastcondicion_sum_sum,U12M_absdifflastcondicion_sum_mean,U12M_absdifflastcondicion_sum_std,U12M_absdifflastcondicion_sum_min,U12M_absdifflastcondicion_sum_max,U12M_absdifflastcondicion_sum_first,U12M_absdifflastcondicion_sum_last,U12M_absdifflastcondicion_min_sum,U12M_absdifflastcondicion_min_mean,U12M_absdifflastcondicion_min_std,U12M_absdifflastcondicion_min_min,U12M_absdifflastcondicion_min_max,U12M_absdifflastcondicion_min_first,U12M_absdifflastcondicion_min_last,U12M_absdifflastcondicion_max_sum,U12M_absdifflastcondicion_max_mean,U12M_absdifflastcondicion_max_std,U12M_absdifflastcondicion_max_min,U12M_absdifflastcondicion_max_max,U12M_absdifflastcondicion_max_first,U12M_absdifflastcondicion_max_last,U12M_absdifflastcondicion_mean_sum,U12M_absdifflastcondicion_mean_mean,U12M_absdifflastcondicion_mean_std,U12M_absdifflastcondicion_mean_min,U12M_absdifflastcondicion_mean_max,U12M_absdifflastcondicion_mean_first,U12M_absdifflastcondicion_mean_last
0,0,53,4.416667,1.1645,3,7,4,7,50.948174,4.245681,1.082779,2.984509,6.829334,3.777921,6.829334,11.232015,0.936001,0.015573,0.927141,0.961819,0.927186,0.927141,12.172789,1.014399,0.055023,0.961819,1.072672,0.961819,1.062438,11.576217,0.964685,0.024947,0.941042,0.998158,0.94448,0.975619,0.0,0.0,0.0,0,...,0.033085,0.0,0.110853,0.0,0.001568,0.113586,0.010326,0.012779,5e-06,0.039933,0.003407,0.019217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,8,2.666667,1.154701,2,4,2,4,7.459759,2.486586,1.076292,1.864763,3.729381,1.865614,3.729381,2.781715,0.927238,0.000104,0.927119,0.927309,0.927309,0.927119,2.814154,0.938051,0.000499,0.937476,0.938372,0.938305,0.938372,2.797534,0.932511,0.000257,0.932345,0.932807,0.932807,0.932345,0.0,0.0,0.0,0,...,4.8e-05,0.000829,0.000896,0.000829,0.000896,0.000462,0.000231,0.000275,3.6e-05,0.000426,0.000426,3.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,6,3.0,0.0,3,3,3,3,5.636907,2.818454,0.04407,2.787292,2.849616,2.787292,2.849616,1.854238,0.927119,0.0,0.927119,0.927119,0.927119,0.927119,1.92823,0.964115,0.044038,0.932975,0.995254,0.932975,0.995254,1.878969,0.939484,0.01469,0.929097,0.949872,0.929097,0.949872,0.0,0.0,0.0,0,...,,0.062279,0.062279,0.062279,0.062279,0.020775,0.020775,,0.020775,0.020775,0.020775,0.020775,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
3,3,7,3.5,0.707107,3,4,4,3,7.193806,3.596903,0.678201,3.117342,4.076464,4.076464,3.117342,1.85603,0.928015,0.00114,0.927209,0.928821,0.928821,0.927209,2.209337,1.104669,0.013579,1.095067,1.11427,1.11427,1.095067,2.05823,1.029115,0.014141,1.019116,1.039114,1.019116,1.039114,0.0,0.0,0.0,0,...,,0.019203,0.019203,0.019203,0.019203,0.019998,0.019998,,0.019998,0.019998,0.019998,0.019998,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
4,4,58,7.25,3.918819,3,12,3,12,54.38044,6.797555,3.661081,2.823033,11.229775,2.826796,11.229775,7.430041,0.928755,0.001938,0.92722,0.931094,0.931094,0.927365,7.622668,0.952834,0.004231,0.947946,0.957766,0.947946,0.953645,7.510547,0.938818,0.002615,0.935815,0.942265,0.942265,0.935815,157.0,19.625,31.622494,0,...,0.003486,0.0,0.00982,0.0,0.00112,0.006451,0.000922,0.000724,0.000325,0.002358,0.000627,0.000325,82.0,11.714286,15.019036,0.0,33.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,11.428571,14.558584,0.0,31.0,0.0,28.0,6.833333,0.97619,1.247201,0.0,2.809091,0.0,1.924242


In [38]:
df_rcc_train_agg_all["U12M_codmes_last"]

0         201802
1         201802
2         201802
3         201802
4         201802
           ...  
358482    201802
358483    201802
358484    201802
358485    201802
358486    201802
Name: U12M_codmes_last, Length: 358487, dtype: uint64

In [36]:
df_rcc_test_agg_all.head()

Unnamed: 0,key_value,U12M_saldo_count_sum,U12M_saldo_count_mean,U12M_saldo_count_std,U12M_saldo_count_min,U12M_saldo_count_max,U12M_saldo_count_first,U12M_saldo_count_last,U12M_saldo_sum_sum,U12M_saldo_sum_mean,U12M_saldo_sum_std,U12M_saldo_sum_min,U12M_saldo_sum_max,U12M_saldo_sum_first,U12M_saldo_sum_last,U12M_saldo_min_sum,U12M_saldo_min_mean,U12M_saldo_min_std,U12M_saldo_min_min,U12M_saldo_min_max,U12M_saldo_min_first,U12M_saldo_min_last,U12M_saldo_max_sum,U12M_saldo_max_mean,U12M_saldo_max_std,U12M_saldo_max_min,U12M_saldo_max_max,U12M_saldo_max_first,U12M_saldo_max_last,U12M_saldo_mean_sum,U12M_saldo_mean_mean,U12M_saldo_mean_std,U12M_saldo_mean_min,U12M_saldo_mean_max,U12M_saldo_mean_first,U12M_saldo_mean_last,U12M_condicion_sum_sum,U12M_condicion_sum_mean,U12M_condicion_sum_std,U12M_condicion_sum_min,...,U12M_absdifflastsaldo_max_std,U12M_absdifflastsaldo_max_min,U12M_absdifflastsaldo_max_max,U12M_absdifflastsaldo_max_first,U12M_absdifflastsaldo_max_last,U12M_absdifflastsaldo_mean_sum,U12M_absdifflastsaldo_mean_mean,U12M_absdifflastsaldo_mean_std,U12M_absdifflastsaldo_mean_min,U12M_absdifflastsaldo_mean_max,U12M_absdifflastsaldo_mean_first,U12M_absdifflastsaldo_mean_last,U12M_absdifflastcondicion_sum_sum,U12M_absdifflastcondicion_sum_mean,U12M_absdifflastcondicion_sum_std,U12M_absdifflastcondicion_sum_min,U12M_absdifflastcondicion_sum_max,U12M_absdifflastcondicion_sum_first,U12M_absdifflastcondicion_sum_last,U12M_absdifflastcondicion_min_sum,U12M_absdifflastcondicion_min_mean,U12M_absdifflastcondicion_min_std,U12M_absdifflastcondicion_min_min,U12M_absdifflastcondicion_min_max,U12M_absdifflastcondicion_min_first,U12M_absdifflastcondicion_min_last,U12M_absdifflastcondicion_max_sum,U12M_absdifflastcondicion_max_mean,U12M_absdifflastcondicion_max_std,U12M_absdifflastcondicion_max_min,U12M_absdifflastcondicion_max_max,U12M_absdifflastcondicion_max_first,U12M_absdifflastcondicion_max_last,U12M_absdifflastcondicion_mean_sum,U12M_absdifflastcondicion_mean_mean,U12M_absdifflastcondicion_mean_std,U12M_absdifflastcondicion_mean_min,U12M_absdifflastcondicion_mean_max,U12M_absdifflastcondicion_mean_first,U12M_absdifflastcondicion_mean_last
0,0,208,17.333333,3.420083,11,22,11,15,201.109909,16.759159,3.238173,10.688168,21.142704,10.688168,14.645761,11.127208,0.927267,0.000259,0.927119,0.927959,0.927959,0.927119,12.737703,1.061475,0.0,1.061475,1.061475,1.061475,1.061475,11.611557,0.96763,0.005013,0.961032,0.976384,0.971652,0.976384,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.050091,0.004554,0.003691,2.2e-05,0.012165,0.001888,0.012165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,59,4.916667,1.240112,4,7,4,6,55.991013,4.665917,1.20872,3.7722,6.703029,3.775212,5.835804,11.130399,0.927533,0.000287,0.927186,0.927836,0.927836,0.927488,11.900627,0.991719,0.055229,0.962805,1.130898,0.962805,1.130898,11.372097,0.947675,0.009133,0.940894,0.972634,0.943803,0.972634,72.0,6.0,20.78461,0.0,...,0.054286,0.0,0.160099,0.0,0.160099,0.063059,0.005733,0.008886,0.0,0.028768,0.0,0.028768,144.0,13.090909,29.125434,0.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,4.363636,9.708478,0.0,24.0,0.0,0.0,28.8,2.618182,5.825087,0.0,14.4,0.0,0.0
2,2,90,7.5,1.445998,6,9,6,8,84.988724,7.082393,1.333476,5.71493,8.528098,5.715031,7.484953,11.127477,0.92729,0.000193,0.927108,0.927701,0.92722,0.927108,11.729946,0.977496,0.024818,0.942784,0.994291,0.994291,0.945023,11.342265,0.945189,0.008341,0.932663,0.952522,0.952505,0.935619,0.0,0.0,0.0,0.0,...,0.015477,0.0,0.051508,0.0,0.0,0.022917,0.002083,0.004017,7e-06,0.013313,7e-06,0.001025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,180,15.0,1.758098,12,17,17,13,170.475616,14.206302,1.637311,11.417912,16.07991,16.07991,12.360068,11.125998,0.927167,0.000102,0.927108,0.927488,0.927153,0.927488,12.052428,1.004369,0.007017,0.997651,1.011087,0.997651,1.011087,11.36759,0.947299,0.00216,0.944915,0.951493,0.945877,0.950774,243.0,20.25,54.077596,0.0,...,0.004051,0.0,0.013437,0.0,0.0,0.019063,0.001733,0.001345,0.000308,0.003646,0.000308,0.003646,366.0,33.272727,63.277312,0.0,183.0,0.0,183.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,122.0,11.090909,20.564312,0.0,61.0,0.0,61.0,26.142857,2.376623,4.54996,0.0,13.071429,0.0,13.071429
4,4,30,4.285714,1.380131,2,6,2,6,27.926773,3.989539,1.288093,1.854238,5.594201,1.854238,5.594201,6.490348,0.927193,6e-05,0.927119,0.927276,0.927119,0.927276,6.57117,0.938739,0.005974,0.927119,0.944508,0.927119,0.944508,6.51384,0.930548,0.001736,0.927119,0.932367,0.927119,0.932367,22.0,3.142857,5.398412,0.0,...,0.005723,0.001848,0.016841,0.016841,0.006203,0.009374,0.001562,0.001652,0.000395,0.004745,0.004745,0.001907,44.0,7.333333,5.750362,0.0,12.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,3.666667,2.875181,0.0,6.0,0.0,5.0,9.333333,1.555556,1.344399,0.0,3.0,0.0,1.666667


In [37]:
df_rcc_train_agg_all.to_csv("[01] Intermediate/ds_rcc_feat_hist_train.csv", index=False)
df_rcc_test_agg_all.to_csv("[01] Intermediate/ds_rcc_feat_hist_test.csv", index=False)

In [12]:
def hist_feat_generator_per_cat(cat_col, numeric_value, agg_func, df_base, df_base_test):

  df_rcc_crosstab = (pd.crosstab(index=[df_rcc_train.key_value, df_rcc_train.codmes], 
                                columns=df_rcc_train[cat_col], 
                                values=df_rcc_train[numeric_value],
                                aggfunc=agg_func)
                      .reset_index(drop=False)
                      .reset_index(drop=True))
  df_rcc_crosstab.columns = ["{0}_{1}_FOR_{2}_{3}".format(numeric_value.upper(), agg_func.upper(), cat_col, c) if c not in ["key_value", "codmes"] else c for c in df_rcc_crosstab.columns]

  feat_cols = [c for c in df_rcc_crosstab.columns if "{0}_".format(cat_col) in c]
  df_rcc_crosstab["MEAN_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].mean(axis=1)
  df_rcc_crosstab["SUM_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].sum(axis=1)
  df_rcc_crosstab["COUNT_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].count(axis=1)

  df_rcc_test_crosstab = (pd.crosstab(index=[df_rcc_test.key_value, df_rcc_test.codmes], 
                                columns=df_rcc_test[cat_col], 
                                values=df_rcc_test[numeric_value],
                                aggfunc=agg_func)
                      .reset_index(drop=False)
                      .reset_index(drop=True))
  df_rcc_test_crosstab.columns = ["{0}_{1}_FOR_{2}_{3}".format(numeric_value.upper(), agg_func.upper(), cat_col, c) if c not in ["key_value", "codmes"] else c for c in df_rcc_test_crosstab.columns]

  feat_cols = [c for c in df_rcc_test_crosstab.columns if "{0}_".format(cat_col) in c]
  df_rcc_test_crosstab["MEAN_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].mean(axis=1)
  df_rcc_test_crosstab["SUM_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].sum(axis=1)
  df_rcc_test_crosstab["COUNT_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].count(axis=1)

  agg_col_dict = {"codmes": ["last", "first"]}
  agg_col_dict.update({c: ["mean", "sum", "count"] for c in df_rcc_crosstab.columns if c not in ["key_value", "codmes"]})

  for month_level in ["U03M", "U06M", "U12M"]:
    print("\t\t\tFeatures at month level: {0}".format(month_level))
    df_rcc_agg = (df_rcc_crosstab.loc[df_rcc_crosstab["codmes"].isin(month_dict[month_level])]
                                .groupby(["key_value"], as_index=False)
                                .agg(agg_col_dict)
                    )
    df_rcc_agg.columns = [month_level + "_" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_rcc_agg.columns]

    df_rcc_agg["{0}_RECENCY_FOR_{1}".format(month_level, cat_col)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_LAST".format(month_level)], 201803), axis=1)
    df_rcc_agg["{0}_LONG_RECENCY_FOR_{1}".format(month_level, cat_col)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], 201803), axis=1)
    df_rcc_agg["{0}_DIFF_MONTHS_FIRST_LAST_FOR_{1}".format(month_level, cat_col)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], row["{0}_CODMES_LAST".format(month_level)]), axis=1)
    del df_rcc_agg["{0}_CODMES_FIRST".format(month_level, cat_col)]
    del df_rcc_agg["{0}_CODMES_LAST".format(month_level, cat_col)]

    df_rcc_test_agg = (df_rcc_test_crosstab.loc[df_rcc_test_crosstab["codmes"].isin(month_dict_test[month_level])]
                                .groupby(["key_value"], as_index=False)
                                .agg(agg_col_dict)
                       )
    df_rcc_test_agg.columns = [month_level + "_" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_rcc_test_agg.columns]
    
    df_rcc_test_agg["{0}_RECENCY_FOR_{1}".format(month_level, cat_col)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_LAST".format(month_level)], 201903), axis=1)
    df_rcc_test_agg["{0}_LONG_RECENCY_FOR_{1}".format(month_level, cat_col)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], 201903), axis=1)
    df_rcc_test_agg["{0}_DIFF_MONTHS_FIRST_LAST_FOR_{1}".format(month_level, cat_col)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], row["{0}_CODMES_LAST".format(month_level)]), axis=1)
    del df_rcc_test_agg["{0}_CODMES_FIRST".format(month_level, cat_col)]
    del df_rcc_test_agg["{0}_CODMES_LAST".format(month_level, cat_col)]

    df_base = df_base.merge(df_rcc_agg, how="left", on="key_value")
    df_base_test = df_base_test.merge(df_rcc_test_agg, how="left", on="key_value")

  return df_base, df_base_test

In [None]:
month_dict = {"U12M": [201802, 201801, 201712, 201711, 201710, 201709, 201708, 201707, 201706, 201705, 201704, 201703],
              "U06M": [201802, 201801, 201712, 201711, 201710, 201709],
              "U03M": [201802, 201801, 201712],
              "U01M": [201802]
              }

month_dict_test = {"U12M": [201902, 201901, 201812, 201811, 201810, 201809, 201808, 201807, 201806, 201805, 201804, 201803],
              "U06M": [201902, 201901, 201812, 201811, 201810, 201809],
              "U03M": [201902, 201901, 201812],
              "U01M": [201902]
              }

#### **Variable Producto**

In [None]:
top_freq_products = [1, 2, 3, 8, 255, 4, 0, 6]
top_propensity_products = [12, 17, 18, 25, 20]
low_propensity_products = [32, 33, 35, 36, 37, 38, 40, 41]
list_product = top_freq_products + top_propensity_products + low_propensity_products
print(list_product)

[1, 2, 3, 8, 255, 4, 0, 6, 12, 17, 18, 25, 20, 32, 33, 35, 36, 37, 38, 40, 41]


In [None]:
df_rcc_test["PRODUCTO"] = df_rcc_test["PRODUCTO"].fillna(255)
df_rcc_test["PRODUCTO"] = df_rcc_test["PRODUCTO"].astype(int)

In [None]:
cat_col = "PRODUCTO"
df_rcc_train.loc[~df_rcc_train[cat_col].isin(list_product), cat_col] = "OTROS"
df_rcc_train.loc[df_rcc_train[cat_col].isin(top_propensity_products), cat_col] = "TOP_PROPENSITY"
df_rcc_train.loc[df_rcc_train[cat_col].isin(low_propensity_products), cat_col] = "LOW_PROPENSITY"

df_rcc_test.loc[~df_rcc_test[cat_col].isin(list_product), cat_col] = "OTROS"
df_rcc_test.loc[df_rcc_test[cat_col].isin(top_propensity_products), cat_col] = "TOP_PROPENSITY"
df_rcc_test.loc[df_rcc_test[cat_col].isin(low_propensity_products), cat_col] = "LOW_PROPENSITY"

#### **Variable Institucion Financiera**

In [None]:
top_freq_instits = [34, 1, 10, 28, 11, 37, 0, 2]
top_propensity_instits = [9, 26, 89, 105, 99, 39, 97, 90, 108, 106, 107, 95, 49, 94, 100, 77, 91, 96, 6, 69, 104, 85, 70, 88, 93, 101, 75, 80, 73, 76, 52, 46, 68, 71, 92, 63, 60, 55, 65] # [74, 5]
low_propensity_instits = [12, 86, 47, 14, 78, 84] #[67, 66, 87, 98, 72, 31, 81]
list_instit = top_freq_instits + top_propensity_instits + low_propensity_instits
print(list_instit)

[34, 1, 10, 28, 11, 37, 0, 2, 9, 26, 89, 105, 99, 39, 97, 90, 108, 106, 107, 95, 49, 94, 100, 77, 91, 96, 6, 69, 104, 85, 70, 88, 93, 101, 75, 80, 73, 76, 52, 46, 68, 71, 92, 63, 60, 55, 65, 12, 86, 47, 14, 78, 84]


In [None]:
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"].fillna(255)
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"].astype(int)

In [None]:
df_rcc_train["cod_instit_financiera"] = df_rcc_train["cod_instit_financiera"]
df_rcc_train.loc[df_rcc_train["cod_instit_financiera"].isin(top_propensity_instits), "cod_instit_financiera"] = "TOP_PROPENSITY"
df_rcc_train.loc[df_rcc_train["cod_instit_financiera"].isin(low_propensity_instits), "cod_instit_financiera"] = "LOW_PROPENSITY"
df_rcc_train.loc[~df_rcc_train["cod_instit_financiera"].isin(list_instit), "cod_instit_financiera"] = "OTROS"

In [None]:
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"]
df_rcc_test.loc[df_rcc_test["cod_instit_financiera"].isin(top_propensity_instits), "cod_instit_financiera"] = "TOP_PROPENSITY"
df_rcc_test.loc[df_rcc_test["cod_instit_financiera"].isin(low_propensity_instits), "cod_instit_financiera"] = "LOW_PROPENSITY"
df_rcc_test.loc[~df_rcc_test["cod_instit_financiera"].isin(list_instit), "cod_instit_financiera"] = "OTROS"

#### **Variable Condición como categórica**

In [None]:
df_rcc_train["condicion_cat"] = df_rcc_train["condicion"].apply(lambda c: 0 if c == 0 else "00_03" if c < 90 else  "03_06" if c < 180 else "06_12" if c < 365 else "12-32" if c < 960 else ">32")
df_rcc_test["condicion_cat"] = df_rcc_test["condicion"].apply(lambda c: 0 if c == 0 else "00_03" if c < 90 else  "03_06" if c < 180 else "06_12" if c < 365 else "12-32" if c < 960 else ">32")

In [None]:
df_rcc_train["condicion_cat"].value_counts(normalize=True, dropna=False)

0        0.966859
00_03    0.022446
>32      0.005332
12-32    0.002580
03_06    0.001442
06_12    0.001340
Name: condicion_cat, dtype: float64

In [None]:
df_rcc_test["condicion_cat"].value_counts(normalize=True, dropna=False)

0        0.967486
00_03    0.019706
>32      0.006578
12-32    0.002869
06_12    0.001725
03_06    0.001636
Name: condicion_cat, dtype: float64

#### **Generando los features**

In [None]:
df_base = df_base_train[["key_value"]].copy()
df_base_test = df_base_test[["key_value"]].copy()

In [None]:
for cat_col in ["PRODUCTO", "cod_instit_financiera", "condicion_cat"]:
  print("Cat Feature: {0}".format(cat_col))
  for numeric_value in ["saldo"]:
    print("\tNumeric Feature: {0}".format(numeric_value))
    for agg_func in ["mean"]:
      print("\t\tAgg func: {0}".format(agg_func))
      df_base, df_base_test = hist_feat_generator(cat_col, numeric_value, agg_func, df_base, df_base_test)

Cat Feature: PRODUCTO
	Numeric Feature: saldo
		Agg func: mean
			Features at month level: U03M
			Features at month level: U06M
			Features at month level: U12M
Cat Feature: cod_instit_financiera
	Numeric Feature: saldo
		Agg func: mean
			Features at month level: U03M
			Features at month level: U06M
			Features at month level: U12M
Cat Feature: condicion_cat
	Numeric Feature: saldo
		Agg func: mean
			Features at month level: U03M
			Features at month level: U06M
			Features at month level: U12M


In [None]:
df_base.shape, df_base_test.shape

((358487, 343), (396609, 343))

In [None]:
df_base.shape, df_base_test.shape

((358487, 343), (396609, 343))

#### **Exportando el dataset de RCC**

In [None]:
df_base.to_csv("[01] Intermediate/ds_rcc.csv", index=False)
df_base_test.to_csv("[01] Intermediate/ds_rcc_test.csv", index=False)

## SUNAT

In [None]:
df_sunat_train = optimizar_df(pd.read_csv("[00] Data/sunat_train.csv")).drop_duplicates()
df_sunat_test = optimizar_df(pd.read_csv("[00] Data/sunat_test.csv")).drop_duplicates()

In [None]:
df_sunat_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292479 entries, 0 to 1462035
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   key_value            292479 non-null  uint32 
 1   tipcontribuyente     292479 non-null  uint8  
 2   tippersona           292479 non-null  uint8  
 3   ciiu                 292479 non-null  uint16 
 4   ubigeo               292479 non-null  uint16 
 5   condiciondomicilio   292479 non-null  uint8  
 6   estadocontribuyente  292479 non-null  uint8  
 7   fecalta              272027 non-null  float32
 8   fecbaja              46802 non-null   float32
 9   codvia               292479 non-null  uint8  
 10  codzona              292479 non-null  uint8  
 11  contabilidad         292479 non-null  uint8  
 12  facturacion          292479 non-null  uint8  
 13  domiciliado          292479 non-null  uint8  
 14  comercioexterior     292479 non-null  uint8  
 15  cargorele       

In [None]:
df_sunat_train.head()

Unnamed: 0,key_value,tipcontribuyente,tippersona,ciiu,ubigeo,condiciondomicilio,estadocontribuyente,fecalta,fecbaja,codvia,codzona,contabilidad,facturacion,domiciliado,comercioexterior,cargorele,codentidadtributo,estadotributo
0,190597,9,0,178,442,0,0,-0.785832,,0,0,6,11,0,1,635,8,2
1,162813,0,0,253,449,0,0,0.971584,,20,0,6,11,0,1,635,8,2
2,190620,9,0,173,473,0,2,0.025283,-1.561407,11,0,6,11,0,1,635,8,2
3,43886,0,0,285,154,0,0,0.566027,,11,3,6,11,0,1,635,8,2
4,255211,0,0,285,456,0,0,0.836399,,11,3,6,11,0,1,635,8,2


In [None]:
from scipy import stats

In [None]:
df_sunat_train_agg = df_sunat_train.groupby("key_value", as_index=False).agg({c: stats.mode for c in df_sunat_train.columns if c not in ["key_value", "fecalta", "fecbaja"]})

In [None]:
df_sunat_test_agg = df_sunat_test.groupby("key_value", as_index=False).agg({c: stats.mode for c in df_sunat_test.columns if c not in ["key_value", "fecalta", "fecbaja"]})

In [None]:
for col in [c for c in df_sunat_train.columns if c not in ["key_value", "fecalta", "fecbaja"]]:
  df_sunat_train_agg[col] = df_sunat_train_agg[col].apply(lambda t: t[0][0])
  df_sunat_test_agg[col] = df_sunat_test_agg[col].apply(lambda t: t[0][0])

In [None]:
df_sunat_train_fecha = df_sunat_train.groupby("key_value", as_index=False).agg({"fecalta": ["min", "max", "mean"], "fecbaja": ["min", "max", "mean"]})
df_sunat_test_fecha = df_sunat_test.groupby("key_value", as_index=False).agg({"fecalta": ["min", "max", "mean"], "fecbaja": ["min", "max", "mean"]})

In [None]:
df_sunat_train_fecha.columns = ["SUNAT" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_sunat_train_fecha.columns]
df_sunat_test_fecha.columns = ["SUNAT" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_sunat_test_fecha.columns]

In [None]:
df_sunat_train_agg = df_sunat_train_agg.merge(df_sunat_train_fecha, how="left", on="key_value")
df_sunat_test_agg = df_sunat_test_agg.merge(df_sunat_test_fecha, how="left", on="key_value")

In [None]:
df_sunat_train_agg.to_csv("[01] Intermediate/ds_sunat.csv", index=False)
df_sunat_test_agg.to_csv("[01] Intermediate/ds_sunat_test.csv", index=False)

In [None]:
df_sunat_test_agg.head()

Unnamed: 0,key_value,tipcontribuyente,tippersona,ciiu,ubigeo,condiciondomicilio,estadocontribuyente,codvia,codzona,contabilidad,facturacion,domiciliado,comercioexterior,cargorele,codentidadtributo,estadotributo,SUNATFECALTA_MIN,SUNATFECALTA_MAX,SUNATFECALTA_MEAN,SUNATFECBAJA_MIN,SUNATFECBAJA_MAX,SUNATFECBAJA_MEAN
0,0,0,0,255,473,0,0,3,3,6,11,0,2,635,8,2,0.566027,0.566027,0.566027,,,
1,2,0,0,244,473,0,0,0,3,6,11,0,1,635,8,2,1.377142,1.377142,1.377142,,,
2,4,0,0,285,1109,0,0,3,3,6,11,0,2,635,8,2,1.377142,1.377142,1.377142,,,
3,5,9,0,177,674,0,3,3,3,2,11,0,2,635,8,2,1.10677,1.10677,1.10677,1.422525,1.422525,1.422525
4,7,0,0,285,500,0,0,3,10,6,11,0,2,635,8,2,1.7827,1.7827,1.7827,,,


In [None]:
df_sunat_train.groupby("key_value", as_index=False).agg({"tipcontribuyente": "count"}).sort_values(by="tipcontribuyente", ascending=False)

Unnamed: 0,key_value,tipcontribuyente
172663,262090,96
80409,121911,63
57921,87603,51
149929,227523,47
157189,238557,45
...,...,...
83372,126457,1
83373,126458,1
83376,126461,1
83377,126463,1


In [None]:
df_sunat_test.groupby("key_value", as_index=False).agg({"tipcontribuyente": "count"}).sort_values(by="tipcontribuyente", ascending=False)


Unnamed: 0,key_value,tipcontribuyente
16321,24767,655
143253,220827,93
227474,352446,65
10113,15340,62
142130,219118,49
...,...,...
90676,139485,1
90677,139487,1
90678,139489,1
90679,139490,1


## Censo

In [None]:
df_censo_train = optimizar_df(pd.read_csv("[00] Data/censo_train.csv")).drop_duplicates()
df_censo_test = optimizar_df(pd.read_csv("[00] Data/censo_test.csv")).drop_duplicates()

In [None]:
df_censo_train.head()

Unnamed: 0,key_value,lon_x,lat_y,zon_sis,zon_der,zon_inu,zon_com,score,pob,pob_a,pob_b,pob_c,pob_d,pob_e,hog,a,b,c,d,e,pred,ing_a_18,gas_ali_a_18,gas_ves_a_18,gas_alq_a_18,gas_mue_a_18,gas_sal_a_18,gas_trans_a_18,gas_esp_a_18,gas_otrs_a_18,ing_b_18,gas_ali_b_18,gas_ves_b_18,gas_alq_b_18,gas_mue_b_18,gas_sal_b_18,gas_trans_b_18,gas_esp_b_18,gas_otrs_b_18,ing_c_18,...,estud_9,estud_10,profe_1,profe_2,profe_3,profe_4,profe_5,profe_6,profe_7,profe_8,profe_9,profe_10,acteco1,acteco2,acteco3,acteco4,acteco5,acteco6,acteco7,acteco8,acteco9,acteco10,acteco11,acteco12,acteco13,acteco14,acteco15,acteco16,acteco17,acteco18,acteco19,acteco20,acteco21,ocup_1,ocup_2,ocup_3,ocup_4,ocup_5,ocup_6,flag_censo
0,106695,1.827118,-0.711359,-0.931515,-0.242328,-0.32701,-0.280674,-0.714361,-0.544996,-0.312923,-0.425825,-0.565415,-0.089094,-0.165019,-0.537031,-0.302006,-0.418318,-0.547005,-0.110158,-0.187616,4,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.452057,-0.535443,...,-0.199376,-0.308,-0.340184,-0.217179,-0.355359,-0.206882,-0.225646,-0.151014,-0.339847,-0.419854,-0.733954,-0.205903,-0.237047,-0.368741,-0.555645,-0.273762,-0.338135,0.041469,-0.511477,-0.457707,0.639525,-0.202316,-0.173108,-0.331307,-0.224216,-0.155999,-0.232125,-0.378461,-0.270605,-0.543045,-0.583661,-0.644919,-0.11823,-0.63972,-0.329153,-0.248796,-0.623164,-0.605414,-0.644919,0
1,271120,-0.480514,0.198531,0.054361,-0.242328,-0.32701,-0.280674,-0.104283,-0.06042,-0.312923,-0.485131,0.448644,-0.059923,-0.308719,-0.078552,-0.302006,-0.457537,0.46281,-0.162021,-0.31726,3,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,0.334792,...,-0.008364,-0.308,0.017784,-0.139198,-0.500427,0.000321,-0.076505,0.239948,0.184462,0.143793,-0.041876,-0.205903,0.215731,-0.368741,-0.130826,-0.273762,-0.338135,-0.127766,-0.076081,0.214074,0.059442,0.017362,-0.402767,-0.331307,-0.171162,-0.658023,-0.00767,0.049331,-0.270605,-0.162191,-0.215123,-0.199299,-0.11823,-0.234777,-0.113894,-0.135995,0.032167,0.062284,-0.199299,0
2,63080,-0.192205,-0.21657,1.040236,-0.242328,-0.32701,-0.280674,-0.124272,0.60751,-0.312923,1.092421,-0.23026,0.961048,0.733106,0.572192,-0.302006,0.993563,-0.2203,0.927114,0.849535,2,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,-0.29862,1.106274,1.106274,1.106274,1.106274,1.106274,1.106274,1.106274,1.106274,1.106274,0.004703,...,-0.180274,0.137688,-0.340184,-0.035223,0.176559,0.069388,0.44549,-0.151014,0.097077,0.566528,-0.041876,-0.01101,-0.237047,0.360282,0.378958,-0.273762,-0.338135,-0.466236,0.1779,0.438002,0.494504,0.237039,-0.173108,-0.331307,-0.224216,0.513367,-0.232125,0.370175,-0.270605,1.742079,0.521952,0.24632,-0.11823,0.035186,0.224369,0.117805,-0.131666,1.063831,0.24632,0
3,107681,-0.803171,0.74864,1.040236,-0.242328,-0.32701,-0.280674,-0.540244,-0.658501,-0.131975,-0.437686,-0.59979,-0.439141,-0.308719,-0.625768,-0.136362,-0.418318,-0.576705,-0.421339,-0.31726,4,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.128909,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.4121,-0.565451,...,-0.485894,-0.382281,0.017784,-0.451123,-0.500427,-0.621288,-0.598499,-0.151014,-0.514617,-0.84259,-0.657057,-0.205903,0.064805,-0.368741,-0.470681,-0.273762,-0.338135,-0.804707,-0.54776,-0.793597,-0.665663,-0.421994,-0.402767,-0.331307,-0.542542,-0.658023,-0.45658,-0.699305,-0.48739,-0.543045,-0.215123,-0.644919,-0.11823,-0.369758,-0.698168,-0.530797,-0.70508,-0.605414,-0.644919,0
4,107336,1.983624,-1.649749,-0.931515,-0.242328,-0.32701,-0.280674,-0.104283,-0.470782,0.669365,0.048627,-0.694321,-0.643335,-0.452419,-0.403924,0.609035,0.130747,-0.665807,-0.628794,-0.446903,2,0.804502,0.804502,0.804502,0.804502,0.804502,0.804502,0.804502,0.804502,0.804502,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.012528,-0.655475,...,0.010738,0.509094,0.017784,0.042758,-0.258646,-0.345017,-0.710355,-0.151014,-0.776772,-0.84259,-0.733954,-0.205903,-0.086121,0.360282,-0.555645,-0.273762,-0.338135,-0.297001,-0.692892,-0.793597,-0.520642,-0.421994,0.056551,-0.331307,-0.011999,-0.658023,-0.232125,-0.271513,-0.05382,-0.162191,-0.583661,-0.199299,-0.11823,-0.099795,-0.667416,-0.276996,-0.786996,-0.605414,-0.199299,0


In [None]:
df_censo_train.shape, df_censo_train["key_value"].nunique()

((153139, 256), 153139)

In [None]:
df_censo_test.shape, df_censo_test["key_value"].nunique()

((200499, 256), 200499)

## Todos los Datasets

In [None]:
!ls -lh '[01] Intermediate'

total 2.0G
-rw------- 1 root root 521M Dec 20 17:16 ds_rcc.csv
-rw------- 1 root root 185M Dec 18 16:51 ds_rcc_instit.csv
-rw------- 1 root root 204M Dec 18 16:51 ds_rcc_instit_test.csv
-rw------- 1 root root 576M Dec 20 17:18 ds_rcc_test.csv
-rw------- 1 root root  22M Dec 20 13:20 ds_se.csv
-rw------- 1 root root  24M Dec 20 13:20 ds_se_test.csv
-rw------- 1 root root  18M Dec 31 19:34 ds_sunat.csv
-rw------- 1 root root  20M Dec 31 19:34 ds_sunat_test.csv
-rw------- 1 root root 192M Dec 31 13:48 rcc_product_level.csv
-rw------- 1 root root 212M Dec 31 17:27 rcc_product_level_test.csv


---