<a href="https://colab.research.google.com/github/mlaricobar/solution-datathon-interbank-2020/blob/desarrollo/%5B03%5D%20Feature%20Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering

In [1]:
from google.colab import drive, files

import pandas as pd

from datetime import datetime
from dateutil import relativedelta

In [2]:
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
%cd /gdrive/'My Drive'/Competencias/'[02] Interbank-2020'
!ls -lh '[00] Data'

/gdrive/My Drive/Competencias/[02] Interbank-2020
total 5.1G
-rw------- 1 root root 979M Dec 12 01:33 censo_test.csv
-rw------- 1 root root 748M Dec 12 01:34 censo_train.csv
-rw------- 1 root root  998 Dec 12 01:34 productos.csv
-rw------- 1 root root 1.7G Dec 12 01:34 rcc_test.csv
-rw------- 1 root root 1.4G Dec 12 01:35 rcc_train.csv
-rw------- 1 root root 3.3M Dec 12 01:35 sample_submission.csv
-rw------- 1 root root  30M Dec 12 01:35 se_test.csv
-rw------- 1 root root  28M Dec 12 01:35 se_train.csv
-rw------- 1 root root 122M Dec 12 01:35 sunat_test.csv
-rw------- 1 root root 106M Dec 12 01:35 sunat_train.csv
-rw------- 1 root root 3.0M Dec 12 01:35 y_train.csv


In [4]:
def optimizar_df(df):
  for col_ in df.select_dtypes(include=[int]).columns:
    df[col_] = pd.to_numeric(df[col_], downcast="unsigned")

  for col_ in df.select_dtypes(include=[float]).columns:
    df[col_] = pd.to_numeric(df[col_], downcast="float")
  
  return df

## Dataset Socio-Demográfico

In [5]:
df_se_train = optimizar_df(pd.read_csv("[00] Data/se_train.csv"))
df_se_test = optimizar_df(pd.read_csv("[00] Data/se_test.csv"))
df_y_train = optimizar_df(pd.read_csv("[00] Data/y_train.csv"))

In [6]:
df_se_train = df_se_train.merge(df_y_train, how="left", on="key_value")

In [7]:
df_se_train

Unnamed: 0,key_value,edad,sexo,est_cvl,sit_lab,cod_ocu,ctd_hijos,flg_sin_email,ctd_veh,cod_ubi,lgr_vot,prv,dto,rgn,tip_lvledu,target
0,0,-0.983651,1,4,2,22,0.0,1,0.0,-0.070167,17,47,687,4,0,0
1,1,0.000953,1,4,1,22,0.0,1,1.0,-0.738566,10,186,849,6,0,0
2,2,0.531124,0,4,1,22,0.0,1,0.0,-0.297248,15,84,1405,1,7,0
3,3,-1.135129,0,4,1,22,0.0,1,0.0,-1.203435,4,9,977,6,7,0
4,4,-1.438084,0,4,1,22,0.0,1,0.0,0.043424,18,111,391,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358423,358482,1.742944,0,4,3,22,0.0,1,0.0,0.047914,18,92,521,3,7,0
358424,358483,-0.302002,0,0,3,22,3.0,1,0.0,0.043367,18,111,768,3,7,0
358425,358484,-1.513823,1,4,3,22,0.0,1,0.0,-0.523795,13,127,949,5,7,0
358426,358485,0.076691,1,4,2,22,0.0,1,0.0,0.725428,26,111,1313,3,0,0


In [8]:
df_se_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 358428 entries, 0 to 358427
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   key_value      358428 non-null  uint32 
 1   edad           354663 non-null  float32
 2   sexo           358428 non-null  uint8  
 3   est_cvl        358428 non-null  uint8  
 4   sit_lab        358428 non-null  uint8  
 5   cod_ocu        358428 non-null  uint8  
 6   ctd_hijos      358428 non-null  float32
 7   flg_sin_email  358428 non-null  uint8  
 8   ctd_veh        358428 non-null  float32
 9   cod_ubi        354255 non-null  float32
 10  lgr_vot        358428 non-null  uint8  
 11  prv            358428 non-null  uint8  
 12  dto            358428 non-null  uint16 
 13  rgn            358428 non-null  uint8  
 14  tip_lvledu     358428 non-null  uint8  
 15  target         358428 non-null  uint8  
dtypes: float32(4), uint16(1), uint32(1), uint8(10)
memory usage: 13.7 MB


In [9]:
df_se_train.shape, df_se_train["key_value"].nunique(), df_y_train.shape, df_y_train["key_value"].nunique(), df_se_test.shape, df_se_test["key_value"].nunique()

((358428, 16), 358428, (358487, 2), 358487, (396609, 15), 396609)

In [41]:
df_se_train.to_csv("[01] Intermediate/ds_se.csv", index=False)
df_se_test.to_csv("[01] Intermediate/ds_se_test.csv", index=False)

## Dataset RCC

In [10]:
df_rcc_train = optimizar_df(pd.read_csv("[00] Data/rcc_train.csv"))
df_rcc_test = optimizar_df(pd.read_csv("[00] Data/rcc_test.csv"))

In [11]:
def diff_months(codmes_ini, codmes_fin):
  r = relativedelta.relativedelta(datetime.strptime(str(int(codmes_fin)), '%Y%m'), datetime.strptime(str(int(codmes_ini)), '%Y%m'))
  return r.years*12 + r.months

In [12]:
def hist_feat_generator(cat_col, numeric_value, agg_func, df_base, df_base_test):

  df_rcc_crosstab = (pd.crosstab(index=[df_rcc_train.key_value, df_rcc_train.codmes], 
                                columns=df_rcc_train[cat_col], 
                                values=df_rcc_train[numeric_value],
                                aggfunc=agg_func)
                      .reset_index(drop=False)
                      .reset_index(drop=True))
  df_rcc_crosstab.columns = ["{0}_{1}_FOR_{2}_{3}".format(numeric_value.upper(), agg_func.upper(), cat_col, c) if c not in ["key_value", "codmes"] else c for c in df_rcc_crosstab.columns]

  feat_cols = [c for c in df_rcc_crosstab.columns if "{0}_".format(cat_col) in c]
  df_rcc_crosstab["MEAN_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].mean(axis=1)
  df_rcc_crosstab["SUM_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].sum(axis=1)
  df_rcc_crosstab["COUNT_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_crosstab[feat_cols].count(axis=1)

  df_rcc_test_crosstab = (pd.crosstab(index=[df_rcc_test.key_value, df_rcc_test.codmes], 
                                columns=df_rcc_test[cat_col], 
                                values=df_rcc_test[numeric_value],
                                aggfunc=agg_func)
                      .reset_index(drop=False)
                      .reset_index(drop=True))
  df_rcc_test_crosstab.columns = ["{0}_{1}_FOR_{2}_{3}".format(numeric_value.upper(), agg_func.upper(), cat_col, c) if c not in ["key_value", "codmes"] else c for c in df_rcc_test_crosstab.columns]

  feat_cols = [c for c in df_rcc_test_crosstab.columns if "{0}_".format(cat_col) in c]
  df_rcc_test_crosstab["MEAN_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].mean(axis=1)
  df_rcc_test_crosstab["SUM_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].sum(axis=1)
  df_rcc_test_crosstab["COUNT_OF_{0}_{1}_FOR_ALL_{2}".format(numeric_value.upper(), agg_func.upper(), cat_col)] = df_rcc_test_crosstab[feat_cols].count(axis=1)

  agg_col_dict = {"codmes": ["last", "first"]}
  agg_col_dict.update({c: ["mean", "sum", "count"] for c in df_rcc_crosstab.columns if c not in ["key_value", "codmes"]})

  #month_level = "U03M"
  for month_level in ["U03M", "U06M", "U12M"]:
    print("Features at month level: {0}".format(month_level))
    df_rcc_agg = (df_rcc_crosstab.loc[df_rcc_crosstab["codmes"].isin(month_dict[month_level])]
                                .groupby(["key_value"], as_index=False)
                                .agg(agg_col_dict)
                    )
    df_rcc_agg.columns = [month_level + "_" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_rcc_agg.columns]

    df_rcc_agg["{0}_RECENCY".format(month_level)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_LAST".format(month_level)], 201803), axis=1)
    df_rcc_agg["{0}_LONG_RECENCY".format(month_level)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], 201803), axis=1)
    df_rcc_agg["{0}_DIFF_MONTHS_FIRST_LAST".format(month_level)] = df_rcc_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], row["{0}_CODMES_LAST".format(month_level)]), axis=1)
    del df_rcc_agg["{0}_CODMES_FIRST".format(month_level)]
    del df_rcc_agg["{0}_CODMES_LAST".format(month_level)]

    df_rcc_test_agg = (df_rcc_test_crosstab.loc[df_rcc_test_crosstab["codmes"].isin(month_dict_test[month_level])]
                                .groupby(["key_value"], as_index=False)
                                .agg(agg_col_dict)
                       )
    df_rcc_test_agg.columns = [month_level + "_" + "_".join(c).upper() if c[-1] != "" else c[0] for c in df_rcc_test_agg.columns]
    
    df_rcc_test_agg["{0}_RECENCY".format(month_level)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_LAST".format(month_level)], 201903), axis=1)
    df_rcc_test_agg["{0}_LONG_RECENCY".format(month_level)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], 201903), axis=1)
    df_rcc_test_agg["{0}_DIFF_MONTHS_FIRST_LAST".format(month_level)] = df_rcc_test_agg.apply(lambda row: diff_months(row["{0}_CODMES_FIRST".format(month_level)], row["{0}_CODMES_LAST".format(month_level)]), axis=1)
    del df_rcc_test_agg["{0}_CODMES_FIRST".format(month_level)]
    del df_rcc_test_agg["{0}_CODMES_LAST".format(month_level)]

    df_base = df_base.merge(df_rcc_agg, how="left", on="key_value")
    df_base_test = df_base_test.merge(df_rcc_test_agg, how="left", on="key_value")

  return df_base, df_base_test



In [13]:
month_dict = {"U12M": [201802, 201801, 201712, 201711, 201710, 201709, 201708, 201707, 201706, 201705, 201704, 201703],
              "U06M": [201802, 201801, 201712, 201711, 201710, 201709],
              "U03M": [201802, 201801, 201712],
              "U01M": [201802]
              }

month_dict_test = {"U12M": [201902, 201901, 201812, 201811, 201810, 201809, 201808, 201807, 201806, 201805, 201804, 201803],
              "U06M": [201902, 201901, 201812, 201811, 201810, 201809],
              "U03M": [201902, 201901, 201812],
              "U01M": [201902]
              }

In [17]:
df_base = df_se_train[["key_value"]].copy()
df_base_test = df_se_test[["key_value"]].copy()

#### **Variable Producto**

In [14]:
top_freq_products = [1, 2, 3, 8, 255, 4, 0, 6]
top_propensity_products = [12, 17, 18, 25, 20]
low_propensity_products = [32, 33, 35, 36, 37, 38, 40, 41]
list_product = top_freq_products + top_propensity_products + low_propensity_products
print(list_product)

[1, 2, 3, 8, 255, 4, 0, 6, 12, 17, 18, 25, 20, 32, 33, 35, 36, 37, 38, 40, 41]


In [15]:
df_rcc_test["PRODUCTO"] = df_rcc_test["PRODUCTO"].fillna(255)
df_rcc_test["PRODUCTO"] = df_rcc_test["PRODUCTO"].astype(int)

In [16]:
cat_col = "PRODUCTO"
df_rcc_train.loc[~df_rcc_train[cat_col].isin(list_product), cat_col] = "OTROS"
df_rcc_train.loc[df_rcc_train[cat_col].isin(top_propensity_products), cat_col] = "TOP_PROPENSITY"
df_rcc_train.loc[df_rcc_train[cat_col].isin(low_propensity_products), cat_col] = "LOW_PROPENSITY"

df_rcc_test.loc[~df_rcc_test[cat_col].isin(list_product), cat_col] = "OTROS"
df_rcc_test.loc[df_rcc_test[cat_col].isin(top_propensity_products), cat_col] = "TOP_PROPENSITY"
df_rcc_test.loc[df_rcc_test[cat_col].isin(low_propensity_products), cat_col] = "LOW_PROPENSITY"

In [18]:
cat_col = "PRODUCTO"
agg_func = "mean"
numeric_value = "saldo"

In [19]:
df_base, df_base_test = hist_feat_generator(cat_col, numeric_value, agg_func, df_base, df_base_test)

Features at month level: U03M
Features at month level: U06M
Features at month level: U12M


In [20]:
df_base.shape, df_base_test.shape

((358428, 136), (396609, 136))

In [22]:
df_base_test.head()

Unnamed: 0,key_value,U03M_SALDO_MEAN_FOR_PRODUCTO_0_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_0_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_0_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_1_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_1_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_1_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_2_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_2_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_2_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_3_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_3_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_3_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_4_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_4_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_4_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_6_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_6_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_6_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_8_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_8_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_8_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_255_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_255_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_255_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_OTROS_COUNT,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_MEAN,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_SUM,U03M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_COUNT,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,...,U12M_SALDO_MEAN_FOR_PRODUCTO_1_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_2_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_2_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_2_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_3_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_3_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_3_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_4_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_4_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_4_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_6_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_6_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_6_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_8_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_8_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_8_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_255_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_255_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_255_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_LOW_PROPENSITY_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_OTROS_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_OTROS_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_OTROS_COUNT,U12M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_MEAN,U12M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_SUM,U12M_SALDO_MEAN_FOR_PRODUCTO_TOP_PROPENSITY_COUNT,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_MEAN,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_SUM,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_PRODUCTO_COUNT,U12M_RECENCY,U12M_LONG_RECENCY,U12M_DIFF_MONTHS_FIRST_LAST
0,0,,0.0,0,-0.072818,-0.218455,3,-0.011041,-0.033123,3,-0.012827,-0.038481,3,,0.0,0,-0.072853,-0.145706,2,-0.068034,-0.204102,3,-0.072833,-0.145665,2,,0.0,0,,0.0,0,,0.0,0,-0.048169,-0.144506,3,-0.261844,-0.785531,3,...,10,-0.01354,-0.162483,12,-0.016538,-0.198462,12,,0.0,0,-0.072836,-0.291345,4,-0.066761,-0.801128,12,-0.072833,-0.58266,8,,0.0,0,,0.0,0,,0.0,0,-0.04673,-0.560766,12,-0.230238,-2.762861,12,4.833333,58,12,1,12,11
1,1,,0.0,0,-0.072517,-0.217551,3,,0.0,0,,0.0,0,-0.064181,-0.192542,3,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.029526,-0.088578,3,,0.0,0,-0.055408,-0.166224,3,-0.166224,-0.498672,3,...,12,,0.0,0,,0.0,0,-0.06387,-0.25548,4,-0.072153,-0.072153,1,,0.0,0,,0.0,0,,0.0,0,-0.043328,-0.519941,12,,0.0,0,-0.059573,-0.714872,12,-0.142755,-1.713055,12,2.416667,29,12,1,12,11
2,2,,0.0,0,-0.07267,-0.21801,3,-0.057589,-0.172768,3,-0.062227,-0.18668,3,,0.0,0,,0.0,0,-0.065416,-0.196249,3,-0.072836,-0.218509,3,,0.0,0,,0.0,0,,0.0,0,-0.066148,-0.198443,3,-0.330739,-0.992216,3,...,12,-0.044526,-0.534309,12,-0.048521,-0.582252,12,,0.0,0,,0.0,0,-0.066413,-0.796953,12,-0.072836,-0.218509,3,,0.0,0,,0.0,0,,0.0,0,-0.058448,-0.701376,12,-0.250329,-3.003948,12,4.25,51,12,1,12,11
3,3,-0.051192,-0.153576,3,-0.072344,-0.217031,3,-0.016682,-0.050046,3,-0.020091,-0.060274,3,-0.059594,-0.178781,3,-0.069279,-0.207838,3,-0.071466,-0.214399,3,,0.0,0,,0.0,0,-0.066812,-0.200436,3,,0.0,0,-0.053433,-0.160298,3,-0.42746,-1.282381,3,...,12,-0.028734,-0.34481,12,-0.03006,-0.360723,12,-0.063275,-0.759296,12,-0.069279,-0.207838,3,-0.071911,-0.862927,12,,0.0,0,,0.0,0,-0.065995,-0.791937,12,,0.0,0,-0.054689,-0.656265,12,-0.39618,-4.754155,12,7.25,87,12,1,12,11
4,4,-0.062281,-0.186843,3,-0.072672,-0.218016,3,,0.0,0,,0.0,0,,0.0,0,-0.072791,-0.218374,3,,0.0,0,,0.0,0,,0.0,0,-0.070933,-0.212798,3,,0.0,0,-0.069669,-0.209008,3,-0.278677,-0.836031,3,...,6,-0.072881,-0.072881,1,-0.072881,-0.072881,1,,0.0,0,-0.072769,-0.436614,6,,0.0,0,,0.0,0,,0.0,0,-0.070933,-0.425596,6,,0.0,0,-0.069685,-0.487795,7,-0.257917,-1.805418,7,3.714286,26,7,1,12,11


#### **Variable Institucion Financiera**

In [19]:
top_freq_instits = [34, 1, 10, 28, 11, 37, 0, 2]
top_propensity_instits = [9, 26, 89, 105, 99, 39, 97, 90, 108, 106, 107, 95, 49, 94, 100, 77, 91, 96, 6, 69, 104, 85, 70, 88, 93, 101, 75, 80, 73, 76, 52, 46, 68, 71, 92, 63, 60, 55, 65] # [74, 5]
low_propensity_instits = [12, 86, 47, 14, 78, 84] #[67, 66, 87, 98, 72, 31, 81]
list_instit = top_freq_instits + top_propensity_instits + low_propensity_instits
print(list_instit)

[34, 1, 10, 28, 11, 37, 0, 2, 9, 26, 89, 105, 99, 39, 97, 90, 108, 106, 107, 95, 49, 94, 100, 77, 91, 96, 6, 69, 104, 85, 70, 88, 93, 101, 75, 80, 73, 76, 52, 46, 68, 71, 92, 63, 60, 55, 65, 12, 86, 47, 14, 78, 84]


In [20]:
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"].fillna(255)
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"].astype(int)

In [21]:
df_rcc_train["cod_instit_financiera"] = df_rcc_train["cod_instit_financiera"]
df_rcc_train.loc[df_rcc_train["cod_instit_financiera"].isin(top_propensity_instits), "cod_instit_financiera"] = "TOP_PROPENSITY"
df_rcc_train.loc[df_rcc_train["cod_instit_financiera"].isin(low_propensity_instits), "cod_instit_financiera"] = "LOW_PROPENSITY"
df_rcc_train.loc[~df_rcc_train["cod_instit_financiera"].isin(list_instit), "cod_instit_financiera"] = "OTROS"

In [22]:
df_rcc_test["cod_instit_financiera"] = df_rcc_test["cod_instit_financiera"]
df_rcc_test.loc[df_rcc_test["cod_instit_financiera"].isin(top_propensity_instits), "cod_instit_financiera"] = "TOP_PROPENSITY"
df_rcc_test.loc[df_rcc_test["cod_instit_financiera"].isin(low_propensity_instits), "cod_instit_financiera"] = "LOW_PROPENSITY"
df_rcc_test.loc[~df_rcc_test["cod_instit_financiera"].isin(list_instit), "cod_instit_financiera"] = "OTROS"

In [25]:
cat_col = "cod_instit_financiera"
agg_func = "mean"
numeric_value = "saldo"

In [26]:
df_base, df_base_test = hist_feat_generator(cat_col, numeric_value, agg_func, df_base, df_base_test)

Features at month level: U03M
Features at month level: U06M
Features at month level: U12M


In [27]:
df_base.head()

Unnamed: 0,key_value,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_COUNT,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U03M_RECENCY,U03M_LONG_RECENCY,U03M_DIFF_MONTHS_FIRST_LAST,...,U06M_DIFF_MONTHS_FIRST_LAST,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_COUNT,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U12M_RECENCY,U12M_LONG_RECENCY,U12M_DIFF_MONTHS_FIRST_LAST
0,0,-0.00521,-0.00521,1,,0.0,0,,0.0,0,,0.0,0,-0.013753,-0.04126,3,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.00928,-0.027841,3,-0.01549,-0.04647,3,1.333333,4,3,1,3,2,...,5,-0.00521,-0.00521,1,,0.0,0,,0.0,0,,0.0,0,-0.035954,-0.431452,12,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.034836,-0.418032,12,-0.036389,-0.436662,12,1.083333,13,12,1,12,11
1,1,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.067254,-0.067254,1,,0.0,0,,0.0,0,-0.067622,-0.202866,3,-0.067489,-0.202466,3,-0.09004,-0.270121,3,1.333333,4,3,1,3,2,...,2,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.067254,-0.067254,1,,0.0,0,,0.0,0,-0.067622,-0.202866,3,-0.067489,-0.202466,3,-0.09004,-0.270121,3,1.333333,4,3,1,3,2
2,2,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.050128,-0.050128,1,,0.0,0,,0.0,0,,0.0,0,-0.050128,-0.050128,1,-0.050128,-0.050128,1,1.0,1,1,1,1,0,...,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.060516,-0.121031,2,,0.0,0,,0.0,0,,0.0,0,-0.060516,-0.121031,2,-0.060516,-0.121031,2,1.0,2,2,1,12,11
3,3,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,0.039114,0.039114,1,0.039114,0.039114,1,0.039114,0.039114,1,1.0,1,1,1,1,0,...,5,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,0.029115,0.05823,2,0.029115,0.05823,2,0.029115,0.05823,2,1.0,2,2,1,6,5
4,4,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.063737,-0.19121,3,-0.063737,-0.19121,3,-0.063737,-0.19121,3,1.0,3,3,1,3,2,...,5,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.061182,-0.489453,8,-0.061182,-0.489453,8,-0.061182,-0.489453,8,1.0,8,8,1,8,7


In [28]:
df_base_test.head()

Unnamed: 0,key_value,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_COUNT,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_MEAN,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_SUM,U03M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_COUNT,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U03M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U03M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U03M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U03M_RECENCY,U03M_LONG_RECENCY,U03M_DIFF_MONTHS_FIRST_LAST,...,U06M_DIFF_MONTHS_FIRST_LAST,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_0_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_1_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_2_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_10_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_11_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_28_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_34_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_37_COUNT,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_MEAN,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_SUM,U12M_SALDO_MEAN_FOR_COD_INSTIT_FINANCIERA_OTROS_COUNT,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U12M_MEAN_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U12M_SUM_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_MEAN,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_SUM,U12M_COUNT_OF_SALDO_MEAN_FOR_ALL_COD_INSTIT_FINANCIERA_COUNT,U12M_RECENCY,U12M_LONG_RECENCY,U12M_DIFF_MONTHS_FIRST_LAST
0,0,,0.0,0,,0.0,0,-0.029477,-0.08843,3,,0.0,0,-0.017087,-0.05126,3,,0.0,0,-0.070136,-0.210408,3,-0.057216,-0.171648,3,0.011184,0.033553,3,-0.032546,-0.097639,3,-0.162731,-0.488193,3,5.0,15,3,1,3,2,...,5,,0.0,0,,0.0,0,-0.029439,-0.176632,6,,0.0,0,-0.018019,-0.216226,12,,0.0,0,-0.068861,-0.826327,12,-0.057216,-0.686592,12,-0.008887,-0.10664,12,-0.037402,-0.448821,12,-0.167701,-2.012417,12,4.5,54,12,1,12,11
1,1,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.043833,-0.131499,3,-0.043833,-0.131499,3,-0.043833,-0.131499,3,1.0,3,3,1,3,2,...,5,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.052325,-0.627904,12,-0.052325,-0.627904,12,-0.052325,-0.627904,12,1.0,12,12,1,12,11
2,2,,0.0,0,-0.064024,-0.192072,3,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.066907,-0.200721,3,,0.0,0,,0.0,0,-0.065465,-0.196396,3,-0.130931,-0.392793,3,2.0,6,3,1,3,2,...,5,-0.005709,-0.045668,8,-0.06456,-0.451919,7,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.067529,-0.810349,12,,0.0,0,,0.0,0,-0.048764,-0.585162,12,-0.108995,-1.307936,12,2.25,27,12,1,12,11
3,3,,0.0,0,,0.0,0,,0.0,0,-0.049106,-0.147319,3,,0.0,0,,0.0,0,,0.0,0,-0.028543,-0.08563,3,-0.063639,-0.190917,3,-0.047096,-0.141289,3,-0.141289,-0.423867,3,3.0,9,3,1,3,2,...,5,,0.0,0,-0.067148,-0.470036,7,,0.0,0,-0.045615,-0.547379,12,,0.0,0,,0.0,0,,0.0,0,-0.038991,-0.467897,12,-0.062987,-0.75585,12,-0.051672,-0.620069,12,-0.186764,-2.241163,12,3.583333,43,12,1,12,11
4,4,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.069124,-0.207373,3,-0.069124,-0.207373,3,-0.069124,-0.207373,3,1.0,3,3,1,3,2,...,5,,0.0,0,,0.0,0,,0.0,0,-0.072881,-0.072881,1,,0.0,0,,0.0,0,,0.0,0,,0.0,0,-0.06888,-0.413279,6,-0.069451,-0.48616,7,-0.069451,-0.48616,7,1.0,7,7,1,12,11


In [29]:
df_base.shape, df_base_test.shape

((358428, 118), (396609, 118))

In [30]:
df_base.to_csv("[01] Intermediate/ds_rcc_instit.csv", index=False)
df_base_test.to_csv("[01] Intermediate/ds_rcc_instit_test.csv", index=False)

## Todos los Datasets

In [43]:
!ls -lh '[01] Intermediate'

total 216M
-rw------- 1 root root 81M Dec 17 14:57 ds_rcc.csv
-rw------- 1 root root 89M Dec 17 14:57 ds_rcc_test.csv
-rw------- 1 root root 23M Dec 17 15:01 ds_se.csv
-rw------- 1 root root 24M Dec 17 15:01 ds_se_test.csv


---