In [1]:
import os
import re
import pandas as pd
import numpy as np

In [2]:
def change_dtype_ser(ser):
    """
    Convert a series to a more compact dtype where it is safe to do so.

    * int64 becomes int32, but only when every value fits in the int32 range
    * float64 becomes float32
    * object becomes category
    * every other dtype (datetime, bool, already-small numerics, ...) is left alone

    :param ser: pandas Series
    :return: the converted Series, or ``ser`` itself when no conversion applies
    """
    # Spell the widths out: the builtin `int` maps to a platform-dependent dtype.
    if ser.dtype == np.int64:
        int32_limits = np.iinfo(np.int32)
        # An empty series has no min/max (NaN), and can always be downcast.
        if ser.empty or (ser.min() >= int32_limits.min and ser.max() <= int32_limits.max):
            return ser.astype(np.int32)
        # Values outside the int32 range would overflow; keep the wide dtype.
        return ser

    if ser.dtype == np.float64:
        return ser.astype(np.float32)

    # `np.object` was removed in NumPy 1.24; the builtin `object` is the supported spelling.
    if ser.dtype == object:
        return ser.astype("category")

    return ser
    

def change_dtype_df(df):
    """
    Shrink the memory footprint of a dataframe by converting column dtypes.

    The "fecha_dato" and "fecha_alta" columns are parsed to datetimes, then every
    column is passed through ``change_dtype_ser``. Memory usage (in MB) is printed
    before and after the per-column conversion.

    :param df: dataframe containing "fecha_dato" and "fecha_alta"; modified in place
    :return df: the same dataframe, with converted columns
    """
    def _report_memory(template):
        # memory_usage() is in bytes; 10**6 converts to megabytes
        megabytes = df.memory_usage().sum() / 10**6
        print(template % megabytes)

    for date_col in ("fecha_dato", "fecha_alta"):
        df[date_col] = pd.to_datetime(df[date_col])

    _report_memory("Memory usage before changing types %0.2f MB")

    for name in df.columns:
        df[name] = change_dtype_ser(df[name])

    _report_memory("Memory usage after changing types %0.2f MB")
    return df


def load_csv(filename):
    """
    Read a CSV file and return it with memory-reduced column dtypes.

    :param filename: path of the CSV file to read
    :return: dataframe produced by ``change_dtype_df``
    """
    return change_dtype_df(pd.read_csv(filename))

In [3]:
INP_DIR = "data/data_/"

In [4]:
# Load the train split first, then the test split, from INP_DIR.
train_path = os.path.join(INP_DIR, "train_cleaned.csv")
test_path = os.path.join(INP_DIR, "test_cleaned.csv")
df_train = load_csv(train_path)
df_test = load_csv(test_path)

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 6946.48 MB
Memory usage after changing types 3302.66 MB
Memory usage before changing types 116.20 MB
Memory usage after changing types 49.28 MB


In [5]:
# Drop this column because it is too imbalanced.
# errors="ignore" keeps the cell safe to re-run once the column is already gone.
df_train = df_train.drop(columns=["ind_empleado"], errors="ignore")
df_test = df_test.drop(columns=["ind_empleado"], errors="ignore")

In [6]:
# Replace "fecha_alta" with its offset in whole days from the record date "fecha_dato"
# (fecha_alta minus fecha_dato).
for frame in (df_train, df_test):
    offset = frame["fecha_alta"] - frame["fecha_dato"]
    frame["fecha_alta"] = offset.dt.days

In [7]:
df_train.isnull().sum().sum(), df_test.isnull().sum().sum()

(0, 0)

In [8]:
df_train.shape, df_test.shape

((13647309, 67), (929615, 19))

In [9]:
df_train.columns

Index(['fecha_dato', 'ncodpers', 'pais_residencia', 'sexo', 'age',
       'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel', 'indrel_1mes',
       'tiprel_1mes', 'indresi', 'indext', 'canal_entrada', 'indfall',
       'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
       'ind_ahor_fin_ult1_NEW_PUR', 'ind_aval_fin_ult1_NEW_PUR',
       'ind_cco_fin_ult1_NEW_PUR', 'ind_cder_fin_ult1_NEW_PUR',
       'ind_cno_fin_ult1_NEW_PUR', 'ind_ctj

In [10]:
df_test.columns

Index(['fecha_dato', 'ncodpers', 'pais_residencia', 'sexo', 'age',
       'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel', 'indrel_1mes',
       'tiprel_1mes', 'indresi', 'indext', 'canal_entrada', 'indfall',
       'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento'],
      dtype='object')

In [11]:
# Product-holding indicator columns: names of the form "ind_..._ult1" (no further suffix).
prod_pattern = re.compile(r"^ind_.*_ult1$")
PROD_COLS = [col for col in df_train.columns if prod_pattern.match(col)]
print(PROD_COLS)

['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']


In [12]:
# New-purchase indicator columns: product columns carrying the "_NEW_PUR" suffix.
new_pur_pattern = re.compile(r"^ind_.*_ult1_NEW_PUR$")
NEW_PURCH_COLS = [col for col in df_train.columns if new_pur_pattern.match(col)]
print(NEW_PURCH_COLS, "\n")

# Share (in %) of all new purchases accounted for by each product, largest first.
new_pur_counts = df_train[NEW_PURCH_COLS].sum(axis=0)
prod_popul = new_pur_counts.sort_values(ascending=False) / new_pur_counts.sum() * 100
print(prod_popul, "\n")

# Keep the names of the 16 products with the largest share.
ranked_products = prod_popul.sort_values(ascending=False).index
SIXTEEN_MOST_POP_PROD = ranked_products[:16].to_list()
print(SIXTEEN_MOST_POP_PROD)

['ind_ahor_fin_ult1_NEW_PUR', 'ind_aval_fin_ult1_NEW_PUR', 'ind_cco_fin_ult1_NEW_PUR', 'ind_cder_fin_ult1_NEW_PUR', 'ind_cno_fin_ult1_NEW_PUR', 'ind_ctju_fin_ult1_NEW_PUR', 'ind_ctma_fin_ult1_NEW_PUR', 'ind_ctop_fin_ult1_NEW_PUR', 'ind_ctpp_fin_ult1_NEW_PUR', 'ind_deco_fin_ult1_NEW_PUR', 'ind_deme_fin_ult1_NEW_PUR', 'ind_dela_fin_ult1_NEW_PUR', 'ind_ecue_fin_ult1_NEW_PUR', 'ind_fond_fin_ult1_NEW_PUR', 'ind_hip_fin_ult1_NEW_PUR', 'ind_plan_fin_ult1_NEW_PUR', 'ind_pres_fin_ult1_NEW_PUR', 'ind_reca_fin_ult1_NEW_PUR', 'ind_tjcr_fin_ult1_NEW_PUR', 'ind_valo_fin_ult1_NEW_PUR', 'ind_viv_fin_ult1_NEW_PUR', 'ind_nomina_ult1_NEW_PUR', 'ind_nom_pens_ult1_NEW_PUR', 'ind_recibo_ult1_NEW_PUR'] 

ind_recibo_ult1_NEW_PUR      27.197280
ind_nom_pens_ult1_NEW_PUR    15.048197
ind_nomina_ult1_NEW_PUR      13.101134
ind_cco_fin_ult1_NEW_PUR     12.426018
ind_tjcr_fin_ult1_NEW_PUR    12.304237
ind_cno_fin_ult1_NEW_PUR      6.601516
ind_ecue_fin_ult1_NEW_PUR     4.682679
ind_dela_fin_ult1_NEW_PUR     2.2557

In [13]:
# Everything that is neither a product column, a new-purchase column,
# the record date nor the customer id is treated as a demographic column.
non_demog_cols = set(PROD_COLS) | set(NEW_PURCH_COLS) | {"fecha_dato", "ncodpers"}
DEMOG_COLS = [col for col in df_train.columns if col not in non_demog_cols]
print(DEMOG_COLS)

['pais_residencia', 'sexo', 'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext', 'canal_entrada', 'indfall', 'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento']


In [44]:
def _is_unique(x):
    return len(x) == x.nunique()


def extract_subset(df, row_filter, cols):
    """
    Return an independent copy of the selected rows and columns of ``df``.

    Only the selection is copied. Copying the whole input frame first (as was
    done previously) duplicates every row and column just to discard most of them.

    :param df: source dataframe (not modified)
    :param row_filter: row selector accepted by ``df.loc`` (e.g. a boolean mask)
    :param cols: column label(s) to keep
    :return: new dataframe holding ``df.loc[row_filter, cols]``
    """
    return df.loc[row_filter, cols].copy()


def extract_y(df, timestamp, y_cols=SIXTEEN_MOST_POP_PROD):
    """
    Build the target frame for one snapshot date.

    Keeps the rows of ``df`` dated ``timestamp`` whose ``y_cols`` sum to exactly 1,
    i.e. customers with exactly one new purchase among those products.

    :param df: dataframe with "fecha_dato", "ncodpers" and the ``y_cols`` columns
    :param timestamp: value compared against "fecha_dato"
    :param y_cols: list of new-purchase indicator columns
    :return: dataframe with "ncodpers" followed by ``y_cols``, the "_NEW_PUR"
             suffix removed from the column names
    """
    marker = "_NEW_PUR"

    bought_exactly_one = df[y_cols].sum(axis=1) == 1
    at_timestamp = df["fecha_dato"] == timestamp

    targets = extract_subset(df, at_timestamp & bought_exactly_one, ["ncodpers"] + y_cols)
    assert _is_unique(targets["ncodpers"]), "ncodpers must be unique"

    renamed = []
    for name in targets.columns:
        renamed.append(name.replace(marker, "") if name.endswith(marker) else name)
    targets.columns = renamed
    return targets


def extract_x_demog(df, timestamp, customer_ids, demog_cols=DEMOG_COLS):
    """
    Select the demographic columns of the given customers at one snapshot date.

    :param df: dataframe with "fecha_dato", "ncodpers" and the ``demog_cols`` columns
    :param timestamp: value compared against "fecha_dato"
    :param customer_ids: collection of "ncodpers" values to keep
    :param demog_cols: list of demographic column names
    :return: dataframe with "ncodpers" followed by ``demog_cols``
    """
    at_timestamp = df["fecha_dato"] == timestamp
    is_requested = df["ncodpers"].isin(customer_ids)

    demog = extract_subset(df, at_timestamp & is_requested, ["ncodpers"] + demog_cols)
    assert _is_unique(demog["ncodpers"]), "ncodpers must be unique"
    return demog


# This function can return a dataframe with fewer rows than len(customer_ids).
# This is because the set of rows at timestamp_lag is not the same as at timestamp.
def extract_y_prod_lag(df, timestamp_lag, customer_ids, suffix="_lag", prod_cols=PROD_COLS):
    """
    Select the product columns of the given customers at a lagged snapshot date.

    :param df: dataframe with "fecha_dato", "ncodpers" and the ``prod_cols`` columns
    :param timestamp_lag: value compared against "fecha_dato"
    :param customer_ids: collection of "ncodpers" values to keep
    :param suffix: text appended to every product column name
    :param prod_cols: list of product column names
    :return: dataframe with "ncodpers" followed by the suffixed product columns;
             customers without a row at ``timestamp_lag`` are absent
    """
    at_lag = df["fecha_dato"] == timestamp_lag
    is_requested = df["ncodpers"].isin(customer_ids)

    lagged = extract_subset(df, at_lag & is_requested, ["ncodpers"] + prod_cols)
    assert _is_unique(lagged["ncodpers"]), "ncodpers must be unique"

    # "ncodpers" is the first selected column; only the remaining names get the suffix
    product_names = [name for name in lagged.columns if name != "ncodpers"]
    lagged.columns = ["ncodpers"] + [name + suffix for name in product_names]
    return lagged



def extract_X_y_train(df, timestamp, timestamp_lags):
    """
    Assemble the training feature frame for one snapshot date.

    Targets are extracted with ``extract_y``; for the customers found there the
    demographic columns at ``timestamp`` and the product columns at each lag date
    are left-joined on "ncodpers", in the row order of the target frame.
    Shapes and null counts are printed along the way.

    :param df: full training dataframe
    :param timestamp: snapshot date of the targets
    :param timestamp_lags: iterable of earlier dates; the i-th one (1-based) yields
                           columns suffixed "LAG<i>"
    :return: feature dataframe ("ncodpers", demographics, lagged products).
             Note that only the features are returned, not the targets.
             Customers without a row at a lag date get nulls in that lag's columns.
    :raises AssertionError: if a lag is not strictly before ``timestamp`` or the
                            demographic frame does not match the target frame in length
    """
    y_train = extract_y(df, timestamp)
    print("y_train.shape", y_train.shape)

    customer_ids = y_train["ncodpers"]

    x_demog = extract_x_demog(df, timestamp, customer_ids=customer_ids)
    print("x_demog.shape:", x_demog.shape)
    # Compare against the local y_train (the previous code read a global `y`).
    assert y_train.shape[0] == x_demog.shape[0], "x_demog must have the same number of rows as y_train"

    x_lags = []
    for t, lag in enumerate(timestamp_lags):
        # %-formatting keeps the message valid for non-string timestamps as well
        assert pd.to_datetime(lag) < pd.to_datetime(timestamp), \
            "%s lag is not before timestamp %s" % (lag, timestamp)
        lag_label = "LAG%d" % (t + 1)
        print(lag, lag_label)

        x_lag = extract_y_prod_lag(df, lag, customer_ids=customer_ids, suffix=lag_label)
        x_lags.append(x_lag)

    x_train = y_train[["ncodpers"]].merge(x_demog, how="left", on="ncodpers")
    print("Nulls after merging y and x_demog:", x_train.isnull().sum().sum())

    for t, x_lag in enumerate(x_lags):
        x_train = x_train.merge(x_lag, how="left", on="ncodpers")
        print("Nulls at %d:" % (t + 1), x_train.isnull().sum().sum())

    return x_train

In [40]:
pd.to_datetime("2015-04-28") - pd.to_datetime("2015-05-28") 

Timedelta('-30 days +00:00:00')

In [21]:
# Requires `y` from extract_y(df_train, "2015-05-28").
# extract_y keeps ncodpers as a regular column, so match on that column;
# y.index holds df_train row labels, not customer ids.
row_filter = (df_train["fecha_dato"] == "2015-05-28") & df_train["ncodpers"].isin(y["ncodpers"])
row_filter.sum()

17300

In [15]:
y = extract_y(df_train, "2015-05-28")

In [16]:
y

Unnamed: 0_level_0,ind_recibo_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_cco_fin_ult1,ind_tjcr_fin_ult1,ind_cno_fin_ult1,ind_ecue_fin_ult1,ind_dela_fin_ult1,ind_reca_fin_ult1,ind_ctma_fin_ult1,ind_valo_fin_ult1,ind_ctop_fin_ult1,ind_fond_fin_ult1,ind_deco_fin_ult1,ind_ctpp_fin_ult1,ind_plan_fin_ult1
ncodpers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
15889,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
15929,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15949,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15967,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
16022,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1394751,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1394766,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1394772,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1394775,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
# extract_y keeps ncodpers as a regular column, so pass that column as the customer ids;
# y.index holds df_train row labels, not customer ids.
x_demog = extract_x_demog(df_train, "2015-05-28", customer_ids=y["ncodpers"])
x_demog.head(10)

Unnamed: 0_level_0,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,indresi,indext,canal_entrada,indfall,cod_prov,ind_actividad_cliente,renta,segmento
ncodpers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
15889,ES,V,56.0,-7437,False,245.0,A,1,A,True,False,KAT,False,28,True,326124.90625,01 - TOP
15929,ES,H,70.0,-7437,False,246.0,A,1,A,True,False,KAT,False,28,True,289565.0625,02 - PARTICULARES
15949,ES,H,50.0,-7437,False,246.0,A,1,A,True,False,KAT,False,28,True,233576.125,02 - PARTICULARES
15967,ES,V,50.0,-7437,False,246.0,A,1,A,True,False,KAT,False,28,True,419440.46875,02 - PARTICULARES
16022,ES,H,50.0,-7420,False,246.0,A,1,A,True,False,KAT,False,28,True,139070.96875,02 - PARTICULARES
16026,ES,H,53.0,-7416,False,245.0,A,1,A,True,False,KAT,False,28,True,151911.546875,02 - PARTICULARES
16050,ES,H,49.0,-7402,False,245.0,A,1,A,True,False,KAT,False,28,True,180294.8125,02 - PARTICULARES
16196,ES,H,49.0,-7338,False,243.0,A,1,A,True,False,KAT,False,28,True,129717.929688,01 - TOP
16222,ES,V,51.0,-7386,False,244.0,A,1,A,True,False,KAT,False,28,True,251314.046875,01 - TOP
16268,ES,V,46.0,-7378,False,244.0,A,1,A,True,False,KAT,False,28,True,132599.046875,02 - PARTICULARES


In [24]:
x_demog.shape

(17300, 17)

In [30]:
x_demog.isnull().sum().sum()

0

In [42]:
# extract_y keeps ncodpers as a regular column, so pass that column as the customer ids;
# y.index holds df_train row labels, not customer ids.
x_prod_lag1 = extract_y_prod_lag(df_train, "2015-04-28", customer_ids=y["ncodpers"])
x_prod_lag1.head(10)

Unnamed: 0,ncodpers,ind_ahor_fin_ult1_lag,ind_aval_fin_ult1_lag,ind_cco_fin_ult1_lag,ind_cder_fin_ult1_lag,ind_cno_fin_ult1_lag,ind_ctju_fin_ult1_lag,ind_ctma_fin_ult1_lag,ind_ctop_fin_ult1_lag,ind_ctpp_fin_ult1_lag,...,ind_hip_fin_ult1_lag,ind_plan_fin_ult1_lag,ind_pres_fin_ult1_lag,ind_reca_fin_ult1_lag,ind_tjcr_fin_ult1_lag,ind_valo_fin_ult1_lag,ind_viv_fin_ult1_lag,ind_nomina_ult1_lag,ind_nom_pens_ult1_lag,ind_recibo_ult1_lag
3,15889,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
577,15929,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
867,15949,0,0,1,0,0,0,0,1,1,...,0,1,0,1,1,0,0,0,0,0
1105,15967,0,0,0,0,1,0,0,0,0,...,0,0,0,1,1,0,0,1,1,1
1828,16022,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,1
1873,16026,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2167,16050,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,1,1
3491,16196,0,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,1
3745,16222,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4259,16268,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
