<a href="https://colab.research.google.com/github/mlaricobar/solution-datathon-interbank-2020/blob/desarrollo/%5B03%5D%20Feature%20Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering

In [1]:
from google.colab import drive, files

import pandas as pd

from datetime import datetime
from dateutil import relativedelta

In [2]:
# Mount Google Drive at /gdrive so the competition files can be read from this runtime.
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
# Switch to the competition folder; the relative paths used by later cells
# ("[00] Data/...", "[01] Intermediate/...") are resolved from this directory.
%cd /gdrive/'My Drive'/Competencias/'[02] Interbank-2020'
# List the raw input files with their sizes.
!ls -lh '[00] Data'

/gdrive/My Drive/Competencias/[02] Interbank-2020
total 5.1G
-rw------- 1 root root 979M Dec 12 01:33 censo_test.csv
-rw------- 1 root root 748M Dec 12 01:34 censo_train.csv
-rw------- 1 root root  998 Dec 12 01:34 productos.csv
-rw------- 1 root root 1.7G Dec 12 01:34 rcc_test.csv
-rw------- 1 root root 1.4G Dec 12 01:35 rcc_train.csv
-rw------- 1 root root 3.3M Dec 12 01:35 sample_submission.csv
-rw------- 1 root root  30M Dec 12 01:35 se_test.csv
-rw------- 1 root root  28M Dec 12 01:35 se_train.csv
-rw------- 1 root root 122M Dec 12 01:35 sunat_test.csv
-rw------- 1 root root 106M Dec 12 01:35 sunat_train.csv
-rw------- 1 root root 3.0M Dec 12 01:35 y_train.csv


## Dataset RCC

In [4]:
def optimizar_df(df):
  """Downcast the numeric columns of ``df`` to smaller dtypes to save memory.

  Integer columns are downcast to the narrowest unsigned type when every
  value is non-negative, and to the narrowest signed type otherwise.
  Float columns are downcast with ``downcast="float"``.

  The frame is modified in place and also returned, so the call can wrap
  ``pd.read_csv(...)`` directly.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose numeric columns are downcast; other columns are untouched.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object that was passed in.
  """
  for col_ in df.select_dtypes(include=[int]).columns:
    # downcast="unsigned" leaves a column unchanged when it holds negative
    # values, so those columns need the signed variant to shrink at all.
    modo = "unsigned" if (df[col_] >= 0).all() else "integer"
    df[col_] = pd.to_numeric(df[col_], downcast=modo)

  for col_ in df.select_dtypes(include=[float]).columns:
    df[col_] = pd.to_numeric(df[col_], downcast="float")

  return df

In [5]:
# Read the RCC train/test CSVs and downcast their numeric columns right after loading.
df_rcc_train = optimizar_df(pd.read_csv("[00] Data/rcc_train.csv"))
df_rcc_test = optimizar_df(pd.read_csv("[00] Data/rcc_test.csv"))

In [6]:
# Three groups of PRODUCTO codes. The names suggest they were picked by
# frequency and by propensity; the selection itself is not shown in this notebook.
top_freq_products = [1, 2, 3, 8, 255, 4, 0, 6]
top_propensity_products = [12, 17, 18, 25, 20]
low_propensity_products = [32, 33, 35, 36, 37, 38, 40, 41]

# Every code that belongs to one of the groups, in group order.
list_product = [
    *top_freq_products,
    *top_propensity_products,
    *low_propensity_products,
]
print(list_product)

[1, 2, 3, 8, 255, 4, 0, 6, 12, 17, 18, 25, 20, 32, 33, 35, 36, 37, 38, 40, 41]


In [7]:
# Missing PRODUCTO codes in the test table are replaced by 255 (a code listed in
# top_freq_products) and the column is then cast to int.
df_rcc_test["PRODUCTO"] = df_rcc_test["PRODUCTO"].fillna(255).astype(int)

In [8]:
cat_col = "PRODUCTO"


def _agrupar_productos(serie):
  """Collapse PRODUCTO codes into the categories used as crosstab columns.

  Codes outside ``list_product`` become "OTROS", codes in
  ``top_propensity_products`` become "TOP_PROPENSITY" and codes in
  ``low_propensity_products`` become "LOW_PROPENSITY". Every other code
  is kept as is.

  Values that already carry one of the three labels are left untouched, so
  running this cell more than once gives the same result instead of turning
  every grouped label into "OTROS".

  Parameters
  ----------
  serie : pandas.Series
      PRODUCTO column (numeric codes, possibly already partly grouped).

  Returns
  -------
  pandas.Series
      Object-dtype series with the grouped categories.
  """
  etiquetas = ["OTROS", "TOP_PROPENSITY", "LOW_PROPENSITY"]
  # Cast to object up front: writing string labels into an integer column is
  # an incompatible-dtype assignment that recent pandas versions warn about.
  serie = serie.astype(object)
  sin_grupo = ~serie.isin(list_product) & ~serie.isin(etiquetas)
  serie = serie.mask(sin_grupo, "OTROS")
  serie = serie.mask(serie.isin(top_propensity_products), "TOP_PROPENSITY")
  serie = serie.mask(serie.isin(low_propensity_products), "LOW_PROPENSITY")
  return serie


# Apply exactly the same grouping to train and test.
df_rcc_train[cat_col] = _agrupar_productos(df_rcc_train[cat_col])
df_rcc_test[cat_col] = _agrupar_productos(df_rcc_test[cat_col])

In [9]:
agg_func = "mean"
numeric_value = "saldo"

# One row per (key_value, codmes) and one column per PRODUCTO category, each
# cell holding the aggregated `numeric_value` for that category.
df_rcc_crosstab = pd.crosstab(
    index=[df_rcc_train.key_value, df_rcc_train.codmes],
    columns=df_rcc_train[cat_col],
    values=df_rcc_train[numeric_value],
    aggfunc=agg_func,
).reset_index(drop=False)

# Category columns get a descriptive prefix; the two key columns keep their names.
nuevos_nombres = []
for c in df_rcc_crosstab.columns:
  if c in ["key_value", "codmes"]:
    nuevos_nombres.append(c)
  else:
    nuevos_nombres.append("{0}_{1}_FOR_{2}_{3}".format(numeric_value.upper(), agg_func.upper(), cat_col, c))
df_rcc_crosstab.columns = nuevos_nombres

# Row-wise summaries across the per-category columns. feat_cols is fixed before
# the summaries are added, so they never feed into each other.
feat_cols = [c for c in df_rcc_crosstab.columns if "{0}_".format(cat_col) in c]
for plantilla, reductor in [
    ("MEAN_OF_{0}_{1}_FOR_ALL_{2}", "mean"),
    ("SUM_OF_{0}_{1}_FOR_ALL_{2}", "sum"),
    ("COUNT_OF_{0}_{1}_FOR_ALL_{2}", "count"),
]:
  nombre = plantilla.format(numeric_value.upper(), agg_func.upper(), cat_col)
  df_rcc_crosstab[nombre] = getattr(df_rcc_crosstab[feat_cols], reductor)(axis=1)

In [10]:
# Same crosstab as for train, built from the test table: one row per
# (key_value, codmes), one column per PRODUCTO category.
df_rcc_test_crosstab = pd.crosstab(
    index=[df_rcc_test.key_value, df_rcc_test.codmes],
    columns=df_rcc_test[cat_col],
    values=df_rcc_test[numeric_value],
    aggfunc=agg_func,
).reset_index(drop=False)

# Category columns get a descriptive prefix; the two key columns keep their names.
nuevos_nombres = []
for c in df_rcc_test_crosstab.columns:
  if c in ["key_value", "codmes"]:
    nuevos_nombres.append(c)
  else:
    nuevos_nombres.append("{0}_{1}_FOR_{2}_{3}".format(numeric_value.upper(), agg_func.upper(), cat_col, c))
df_rcc_test_crosstab.columns = nuevos_nombres

# Row-wise summaries across the per-category columns (feat_cols is fixed before
# the summary columns are appended).
feat_cols = [c for c in df_rcc_test_crosstab.columns if "{0}_".format(cat_col) in c]
for plantilla, reductor in [
    ("MEAN_OF_{0}_{1}_FOR_ALL_{2}", "mean"),
    ("SUM_OF_{0}_{1}_FOR_ALL_{2}", "sum"),
    ("COUNT_OF_{0}_{1}_FOR_ALL_{2}", "count"),
]:
  nombre = plantilla.format(numeric_value.upper(), agg_func.upper(), cat_col)
  df_rcc_test_crosstab[nombre] = getattr(df_rcc_test_crosstab[feat_cols], reductor)(axis=1)

In [11]:
#df_rcc_crosstab.head(10)
#df_rcc_test_crosstab.head(10)

In [23]:
def _ultimos_meses(codmes_fin, n_meses):
  """Return ``n_meses`` consecutive YYYYMM codes going backwards from ``codmes_fin`` (inclusive)."""
  anio, mes = divmod(codmes_fin, 100)
  meses = []
  for _ in range(n_meses):
    meses.append(anio * 100 + mes)
    mes -= 1
    if mes == 0:
      # Stepping back from January lands on December of the previous year.
      anio, mes = anio - 1, 12
  return meses


# Window label -> number of months it covers.
_ventanas = {"U12M": 12, "U06M": 6, "U03M": 3, "U01M": 1}

# Train windows end at 201802; test windows end one year later, at 201902.
month_dict = {etiqueta: _ultimos_meses(201802, n) for etiqueta, n in _ventanas.items()}

month_dict_test = {etiqueta: _ultimos_meses(201902, n) for etiqueta, n in _ventanas.items()}

In [24]:
def diff_months(codmes_ini, codmes_fin):
  """Return the number of months from ``codmes_ini`` to ``codmes_fin``.

  Both arguments are month codes in YYYYMM form (e.g. 201803). They may be
  ints or integer-valued floats; each is passed through ``int()`` first.
  The result is negative when ``codmes_fin`` is earlier than ``codmes_ini``.

  The difference is computed with integer arithmetic, which keeps the
  function cheap when it is called once per row of a large frame.

  Parameters
  ----------
  codmes_ini : int or float
      Start month as YYYYMM.
  codmes_fin : int or float
      End month as YYYYMM.

  Returns
  -------
  int
      ``(year_fin - year_ini) * 12 + (month_fin - month_ini)``.

  Raises
  ------
  ValueError
      If a code is not a valid YYYYMM value (month outside 1-12 or year
      outside 1-9999), or cannot be converted with ``int()`` (e.g. NaN).
  """
  def _separar(codmes):
    anio, mes = divmod(int(codmes), 100)
    if not 1 <= anio <= 9999 or not 1 <= mes <= 12:
      raise ValueError("codmes must be in YYYYMM format, got {0!r}".format(codmes))
    return anio, mes

  anio_ini, mes_ini = _separar(codmes_ini)
  anio_fin, mes_fin = _separar(codmes_fin)
  return (anio_fin - anio_ini) * 12 + (mes_fin - mes_ini)

In [25]:
# Aggregations applied per key_value: codmes keeps its "last" and "first"
# values, every other non-key crosstab column gets mean, sum and count.
agg_col_dict = {"codmes": ["last", "first"]}
for c in df_rcc_crosstab.columns:
  if c not in ["key_value", "codmes"]:
    agg_col_dict[c] = ["mean", "sum", "count"]

In [26]:
month_level = "U03M"


def _agregar_ventana(df_crosstab, meses):
  """Aggregate the crosstab rows whose codmes is in ``meses`` to one row per key_value.

  The two-level column labels produced by ``agg`` are flattened to
  ``<month_level>_<COLUMN>_<STAT>``; the grouping key keeps its plain name.
  """
  en_ventana = df_crosstab["codmes"].isin(meses)
  resumen = (df_crosstab.loc[en_ventana]
                        .groupby(["key_value"], as_index=False)
                        .agg(agg_col_dict))
  resumen.columns = [month_level + "_" + "_".join(c).upper() if c[-1] != "" else c[0] for c in resumen.columns]
  return resumen


df_rcc_agg = _agregar_ventana(df_rcc_crosstab, month_dict[month_level])

# NOTE(review): agg_col_dict is built from the train crosstab columns, so any
# column that exists only in the test crosstab is not aggregated here, and a
# train column missing from test would presumably make .agg fail. Confirm this
# is intended.
df_rcc_test_agg = _agregar_ventana(df_rcc_test_crosstab, month_dict_test[month_level])

In [27]:
def _agregar_recencia(df, codmes_ref):
  """Add the recency features to ``df`` in place and drop the helper codmes columns.

  Adds, for the current ``month_level``:
    * ``<level>_RECENCY``: months from the last observed codmes to ``codmes_ref``.
    * ``<level>_LONG_RECENCY``: months from the first observed codmes to ``codmes_ref``.
    * ``<level>_DIFF_MONTHS_FIRST_LAST``: months between first and last codmes.

  Only the two codmes columns are iterated, which avoids building a full row
  Series per record as ``DataFrame.apply(axis=1)`` would.
  """
  col_first = "{0}_CODMES_FIRST".format(month_level)
  col_last = "{0}_CODMES_LAST".format(month_level)

  df["{0}_RECENCY".format(month_level)] = [diff_months(ultimo, codmes_ref) for ultimo in df[col_last]]
  df["{0}_LONG_RECENCY".format(month_level)] = [diff_months(primero, codmes_ref) for primero in df[col_first]]
  df["{0}_DIFF_MONTHS_FIRST_LAST".format(month_level)] = [
      diff_months(primero, ultimo) for primero, ultimo in zip(df[col_first], df[col_last])
  ]

  # The raw first/last month codes are only needed to derive the features above.
  del df[col_first]
  del df[col_last]


# The reference month is the month right after the latest month of each window
# (201802 for train, 201902 for test).
_agregar_recencia(df_rcc_agg, 201803)

_agregar_recencia(df_rcc_test_agg, 201903)

In [28]:
# Preview the first rows of the aggregated train features.
df_rcc_agg.head()

Unnamed: 0,key_value,U03M_SALDO_MEAN_FOR_PRODUCTO_0_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_0_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_0_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_1_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_1_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_1_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_2_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_2_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_2_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_3_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_3_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_3_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_4_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_4_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_4_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_6_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_6_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_6_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_8_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_8_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_8_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_COUNT,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_RECENCY,U03M_LONG_RECENCY,U03M_DIFF_MONTHS_FIRST_LAST
0,0,0.062248,0.186743,3,-0.072355,-0.072355,1,-0.038181,-0.114542,3,-0.038442,-0.115326,3,,0.0,0,,0.0,0,-0.07212,-0.07212,1,,0.0,0,,0.0,0,,0.0,0,-0.013735,-0.041204,3,-0.062533,-0.187599,3,3.666667,11,3,1,3,2
1,1,,0.0,0,-0.072717,-0.21815,3,,0.0,0,,0.0,0,-0.062238,-0.186714,3,,0.0,0,,0.0,0,,0.0,0,-0.072881,-0.072881,1,,0.0,0,-0.068062,-0.204186,3,-0.159249,-0.477746,3,2.333333,7,3,1,3,2
2,2,,0.0,0,-0.072758,-0.072758,1,,0.0,0,,0.0,0,-0.004746,-0.004746,1,,0.0,0,,0.0,0,,0.0,0,-0.072881,-0.072881,1,,0.0,0,-0.050128,-0.050128,1,-0.150384,-0.150384,1,3.0,3,1,1,1,0
3,3,,0.0,0,-0.072791,-0.072791,1,,0.0,0,,0.0,0,0.095067,0.095067,1,,0.0,0,,0.0,0,,0.0,0,0.095067,0.095067,1,,0.0,0,0.039114,0.039114,1,0.117343,0.117343,1,3.0,3,1,1,1,0
4,4,-0.056247,-0.168741,3,-0.072403,-0.21721,3,-0.058336,-0.175007,3,-0.067921,-0.203762,3,,0.0,0,-0.070507,-0.211522,3,-0.063319,-0.189956,3,,0.0,0,-0.061331,-0.183993,3,,0.0,0,-0.064295,-0.192884,3,-0.450063,-1.35019,3,7.0,21,3,1,3,2


In [29]:
# Preview the first rows of the aggregated test features.
df_rcc_test_agg.head()

Unnamed: 0,key_value,U03M_SALDO_MEAN_FOR_PRODUCTO_0_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_0_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_0_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_1_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_1_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_1_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_2_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_2_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_2_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_3_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_3_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_3_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_4_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_4_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_4_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_6_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_6_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_6_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_8_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_8_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_8_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_COUNT,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_RECENCY,U03M_LONG_RECENCY,U03M_DIFF_MONTHS_FIRST_LAST
0,0,,0.0,0,-0.072818,-0.218455,3,-0.011041,-0.033123,3,-0.012827,-0.038481,3,,0.0,0,-0.072853,-0.145706,2,-0.068034,-0.204102,3,,0.0,0,-0.072833,-0.145665,2,,0.0,0,-0.048169,-0.144506,3,-0.261844,-0.785531,3,5.333333,16,3,1,3,2
1,1,,0.0,0,-0.072517,-0.217551,3,,0.0,0,,0.0,0,-0.064181,-0.192542,3,,0.0,0,,0.0,0,,0.0,0,-0.029526,-0.088578,3,,0.0,0,-0.055408,-0.166224,3,-0.166224,-0.498672,3,3.0,9,3,1,3,2
2,2,,0.0,0,-0.07267,-0.21801,3,-0.057589,-0.172768,3,-0.062227,-0.18668,3,,0.0,0,,0.0,0,-0.065416,-0.196249,3,,0.0,0,-0.072836,-0.218509,3,,0.0,0,-0.066148,-0.198443,3,-0.330739,-0.992216,3,5.0,15,3,1,3,2
3,3,-0.051192,-0.153576,3,-0.072344,-0.217031,3,-0.016682,-0.050046,3,-0.020091,-0.060274,3,-0.059594,-0.178781,3,-0.069279,-0.207838,3,-0.071466,-0.214399,3,,0.0,0,-0.066812,-0.200436,3,,0.0,0,-0.053433,-0.160298,3,-0.42746,-1.282381,3,8.0,24,3,1,3,2
4,4,-0.062281,-0.186843,3,-0.072672,-0.218016,3,,0.0,0,,0.0,0,,0.0,0,-0.072791,-0.218374,3,,0.0,0,,0.0,0,-0.070933,-0.212798,3,,0.0,0,-0.069669,-0.209008,3,-0.278677,-0.836031,3,4.0,12,3,1,3,2


In [30]:
# Compare the (rows, columns) of the train and test feature tables.
df_rcc_agg.shape, df_rcc_test_agg.shape

((358487, 43), (396666, 43))

In [31]:
# Summary statistics of the train features.
df_rcc_agg.describe()

Unnamed: 0,key_value,U03M_SALDO_MEAN_FOR_PRODUCTO_0_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_0_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_0_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_1_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_1_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_1_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_2_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_2_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_2_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_3_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_3_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_3_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_4_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_4_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_4_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_6_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_6_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_6_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_8_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_8_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_8_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_COUNT,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_RECENCY,U03M_LONG_RECENCY,U03M_DIFF_MONTHS_FIRST_LAST
count,358487.0,173199.0,358487.0,358487.0,344464.0,358487.0,358487.0,178066.0,358487.0,358487.0,175838.0,358487.0,358487.0,168237.0,358487.0,358487.0,146373.0,358487.0,358487.0,148939.0,358487.0,358487.0,65.0,358487.0,358487.0,308142.0,358487.0,358487.0,15381.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0,358487.0
mean,179243.0,-0.005414,-0.003218,1.057238,-0.071768,-0.173114,2.409702,-0.002358,-8.2e-05,1.407005,-0.019099,-0.023168,1.379057,-0.024487,-0.032922,1.098561,-0.070448,-0.061592,0.880342,-0.05676,-0.063427,1.119695,1.883162,0.000909,0.000491,0.047931,0.131425,2.121837,-0.043102,-0.004769,0.111985,-0.022338,-0.053182,2.643873,-0.0908,-0.22977,2.643873,4.20573,11.585912,2.643873,1.0,2.659039,1.659039
std,103486.427313,0.22191,0.427376,1.273644,0.003944,0.06772,0.933587,0.098101,0.20402,1.461523,0.094457,0.196091,1.453483,0.131075,0.199055,1.322599,0.031835,0.103705,1.215199,0.030064,0.095197,1.391007,2.798165,0.121672,0.037492,1.117769,2.956732,1.159436,0.117394,0.072488,0.542394,0.261583,0.722897,0.745895,1.085736,3.14957,0.745895,1.518427,5.708658,0.745895,0.0,0.742809,0.742809
min,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218643,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218677,0.0,0.034624,0.0,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218677,0.0,-0.072255,-0.216764,1.0,-0.56999,-1.70997,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,89621.5,-0.059131,-0.057642,0.0,-0.072683,-0.21768,2.0,-0.054977,-0.084309,0.0,-0.063468,-0.124349,0.0,-0.061885,-0.088525,0.0,-0.07275,-0.139451,0.0,-0.067725,-0.171467,0.0,0.511785,0.0,0.0,-0.069387,-0.164258,1.0,-0.066678,0.0,0.0,-0.062807,-0.178153,3.0,-0.251674,-0.736582,3.0,3.0,8.0,3.0,1.0,3.0,2.0
50%,179243.0,-0.048482,0.0,0.0,-0.072452,-0.215597,3.0,-0.035244,0.0,0.0,-0.049158,0.0,0.0,-0.051994,0.0,0.0,-0.072586,0.0,0.0,-0.063834,0.0,0.0,1.046837,0.0,0.0,-0.054036,-0.06605,3.0,-0.0602,0.0,0.0,-0.05187,-0.123461,3.0,-0.183702,-0.457714,3.0,4.0,12.0,3.0,1.0,3.0,2.0
75%,268864.5,-0.013233,0.0,3.0,-0.071909,-0.143993,3.0,0.007712,0.0,3.0,-0.01474,0.0,3.0,-0.024223,0.0,3.0,-0.071952,0.0,2.0,-0.057526,0.0,3.0,1.982579,0.0,0.0,0.010463,0.0,3.0,-0.047891,0.0,0.0,-0.026752,-0.055671,3.0,-0.096621,-0.165278,3.0,5.333333,16.0,3.0,1.0,3.0,2.0
max,358486.0,46.067295,138.201889,3.0,1.030873,3.092619,3.0,7.596439,22.789318,3.0,7.51065,22.53195,3.0,21.687918,65.063751,3.0,8.655417,25.966253,3.0,0.919344,2.128751,3.0,17.973169,53.919506,3.0,262.7854,788.356201,3.0,11.011289,33.033867,3.0,72.915085,218.74527,3.0,262.676117,788.02832,3.0,9.0,27.0,3.0,1.0,3.0,2.0


In [32]:
# Summary statistics of the test features, for comparison with the train ones.
df_rcc_test_agg.describe()

Unnamed: 0,key_value,U03M_SALDO_MEAN_FOR_PRODUCTO_0_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_0_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_0_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_1_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_1_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_1_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_2_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_2_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_2_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_3_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_3_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_3_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_4_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_4_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_4_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_6_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_6_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_6_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_8_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_8_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_8_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_COUNT,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_RECENCY,U03M_LONG_RECENCY,U03M_DIFF_MONTHS_FIRST_LAST
count,396666.0,185631.0,396666.0,396666.0,381278.0,396666.0,396666.0,188115.0,396666.0,396666.0,185587.0,396666.0,396666.0,192713.0,396666.0,396666.0,159519.0,396666.0,396666.0,162104.0,396666.0,396666.0,57.0,396666.0,396666.0,347664.0,396666.0,396666.0,16273.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0,396666.0
mean,198332.5,-0.002964,-0.000228,1.033151,-0.071752,-0.172878,2.405981,0.004623,0.009414,1.35209,-0.014229,-0.016046,1.327149,-0.027263,-0.037197,1.147353,-0.069438,-0.060926,0.884233,-0.059175,-0.065864,1.112233,4.193369,0.001372,0.000361,0.046015,0.125539,2.213593,-0.040158,-0.004735,0.116698,-0.020374,-0.04823,2.64907,-0.086805,-0.221374,2.64907,4.197491,11.592841,2.64907,1.0,2.664506,1.664506
std,114507.755277,0.195199,0.353896,1.273367,0.010439,0.073256,0.932751,0.10925,0.221913,1.462813,0.102024,0.205414,1.455466,0.101367,0.148045,1.336001,0.046786,0.120663,1.225148,0.021816,0.090391,1.395654,6.971385,0.191388,0.031714,1.163133,3.258498,1.124414,0.086292,0.053272,0.573122,0.332878,0.994224,0.742262,1.141316,3.416477,0.742262,1.535553,5.739905,0.742262,0.0,0.738994,0.738994
min,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218643,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.216001,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218677,0.0,-0.049669,-0.099338,0.0,-0.072892,-0.218677,0.0,-0.072892,-0.218677,0.0,-0.072149,-0.216446,1.0,-0.570061,-1.710182,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,99166.25,-0.058563,-0.055996,0.0,-0.072691,-0.217671,2.0,-0.054977,-0.068073,0.0,-0.063322,-0.114795,0.0,-0.062217,-0.112193,0.0,-0.072747,-0.142648,0.0,-0.067719,-0.175206,0.0,0.598945,0.0,0.0,-0.068977,-0.169174,1.0,-0.066247,0.0,0.0,-0.06197,-0.17544,3.0,-0.248777,-0.727375,3.0,3.0,8.0,3.0,1.0,3.0,2.0
50%,198332.5,-0.046078,0.0,0.0,-0.072458,-0.215517,3.0,-0.033142,0.0,0.0,-0.048346,0.0,0.0,-0.054622,0.0,0.0,-0.072556,0.0,0.0,-0.06402,0.0,0.0,2.168805,0.0,0.0,-0.050498,-0.065473,3.0,-0.06071,0.0,0.0,-0.050019,-0.118967,3.0,-0.17675,-0.437461,3.0,4.0,12.0,3.0,1.0,3.0,2.0
75%,297498.75,-0.008116,0.0,3.0,-0.07191,-0.14406,3.0,0.015566,0.0,3.0,-0.008949,0.0,3.0,-0.028103,0.0,3.0,-0.071806,0.0,2.0,-0.058431,0.0,3.0,4.507526,0.0,0.0,0.022,0.016553,3.0,-0.048325,0.0,0.0,-0.023576,-0.050498,3.0,-0.084273,-0.13901,3.0,5.333333,16.0,3.0,1.0,3.0,2.0
max,396665.0,29.346394,88.039177,3.0,6.01475,18.04425,3.0,3.163124,9.489372,3.0,3.193342,9.580026,3.0,11.108355,33.325062,3.0,14.844953,44.534859,3.0,0.825052,1.953715,3.0,45.279797,56.648254,3.0,391.515564,1174.546753,3.0,3.382555,6.976365,3.0,113.679047,341.03714,3.0,392.867004,1178.601074,3.0,9.0,27.0,3.0,1.0,3.0,2.0


In [33]:
# Persist the RCC feature tables (without the index) in the intermediate folder.
df_rcc_agg.to_csv("[01] Intermediate/ds_rcc.csv", index=False)
df_rcc_test_agg.to_csv("[01] Intermediate/ds_rcc_test.csv", index=False)

## Dataset Socio-Demográfico

In [34]:
# Read the se_* train/test tables and the y_train table, downcasting numeric columns on load.
df_se_train = optimizar_df(pd.read_csv("[00] Data/se_train.csv"))
df_se_test = optimizar_df(pd.read_csv("[00] Data/se_test.csv"))
df_y_train = optimizar_df(pd.read_csv("[00] Data/y_train.csv"))

In [35]:
# Attach the y_train columns to the train table by key_value.
# A left join keeps only the key_value rows present in df_se_train.
# NOTE(review): the shape check later in this notebook reports more rows in
# df_y_train than in df_se_train, so some y_train rows do not end up in this
# table - confirm that is acceptable downstream.
df_se_train = df_se_train.merge(df_y_train, how="left", on="key_value")

In [36]:
# Display the merged train table.
df_se_train

Unnamed: 0,key_value,edad,sexo,est_cvl,sit_lab,cod_ocu,ctd_hijos,flg_sin_email,ctd_veh,cod_ubi,lgr_vot,prv,dto,rgn,tip_lvledu,target
0,0,-0.983651,1,4,2,22,0.0,1,0.0,-0.070167,17,47,687,4,0,0
1,1,0.000953,1,4,1,22,0.0,1,1.0,-0.738566,10,186,849,6,0,0
2,2,0.531124,0,4,1,22,0.0,1,0.0,-0.297248,15,84,1405,1,7,0
3,3,-1.135129,0,4,1,22,0.0,1,0.0,-1.203435,4,9,977,6,7,0
4,4,-1.438084,0,4,1,22,0.0,1,0.0,0.043424,18,111,391,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358423,358482,1.742944,0,4,3,22,0.0,1,0.0,0.047914,18,92,521,3,7,0
358424,358483,-0.302002,0,0,3,22,3.0,1,0.0,0.043367,18,111,768,3,7,0
358425,358484,-1.513823,1,4,3,22,0.0,1,0.0,-0.523795,13,127,949,5,7,0
358426,358485,0.076691,1,4,2,22,0.0,1,0.0,0.725428,26,111,1313,3,0,0


In [37]:
# Column dtypes, non-null counts and memory footprint of the merged train table.
df_se_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 358428 entries, 0 to 358427
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   key_value      358428 non-null  uint32 
 1   edad           354663 non-null  float32
 2   sexo           358428 non-null  uint8  
 3   est_cvl        358428 non-null  uint8  
 4   sit_lab        358428 non-null  uint8  
 5   cod_ocu        358428 non-null  uint8  
 6   ctd_hijos      358428 non-null  float32
 7   flg_sin_email  358428 non-null  uint8  
 8   ctd_veh        358428 non-null  float32
 9   cod_ubi        354255 non-null  float32
 10  lgr_vot        358428 non-null  uint8  
 11  prv            358428 non-null  uint8  
 12  dto            358428 non-null  uint16 
 13  rgn            358428 non-null  uint8  
 14  tip_lvledu     358428 non-null  uint8  
 15  target         358428 non-null  uint8  
dtypes: float32(4), uint16(1), uint32(1), uint8(10)
memory usage: 13.7 MB


In [39]:
# For each table, show its shape next to the number of distinct key_value values,
# which makes duplicated keys or differing row counts visible.
df_se_train.shape, df_se_train["key_value"].nunique(), df_y_train.shape, df_y_train["key_value"].nunique(), df_se_test.shape, df_se_test["key_value"].nunique()

((358428, 16), 358428, (358487, 2), 358487, (396609, 15), 396609)

In [41]:
# Persist the train table (which now includes the merged y_train columns) and
# the test table, without the index, in the intermediate folder.
df_se_train.to_csv("[01] Intermediate/ds_se.csv", index=False)
df_se_test.to_csv("[01] Intermediate/ds_se_test.csv", index=False)

## Todos los Datasets

In [43]:
# List the intermediate files written by this notebook.
!ls -lh '[01] Intermediate'

total 216M
-rw------- 1 root root 81M Dec 17 14:57 ds_rcc.csv
-rw------- 1 root root 89M Dec 17 14:57 ds_rcc_test.csv
-rw------- 1 root root 23M Dec 17 15:01 ds_se.csv
-rw------- 1 root root 24M Dec 17 15:01 ds_se_test.csv


---