In [32]:
import os
import re
import pandas as pd
import numpy as np

In [14]:
def change_dtype_ser(ser):
    
    if ser.dtype == int:
        return ser.astype(np.int32)
    
    if ser.dtype == float:
        return ser.astype(np.float32)
    
    if ser.dtype == np.object:
        return ser.astype("category")
    
    return ser
    

def change_dtype_df(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    df["fecha_dato"] = pd.to_datetime(df["fecha_dato"])
    df["fecha_alta"] = pd.to_datetime(df["fecha_alta"])
    
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        df[col] = change_dtype_ser(df[col])

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtype_df(df)
    return df

In [124]:
def is_prod_purchased(ser):
    new_ser = (ser - ser.shift(1)).fillna(0).astype("int")
    return new_ser
    

def new_prod_purchase(df):
    df = df.copy()
    df = df.sort_values(by=["ncodpers", "fecha_dato"])
    
    regex = re.compile(r"^ind_.*_ult1$")
    prod_cols = [col for col in df.columns.to_list() if regex.match(col)]
    
    df_new = df[prod_cols].groupby(df["ncodpers"]).transform(is_prod_purchased)
    df_new.columns = [col + "_added" for col in df_new.columns]
    
    return pd.concat([df, df_new], axis=1)

In [65]:
df_train.sort_values?

In [3]:
INP_DIR = "data/data_/"

In [15]:
df_train = load_csv(os.path.join(INP_DIR, "train_cleaned.csv"))

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 4326.20 MB
Memory usage after changing types 1992.52 MB


In [20]:
df_train.isnull().sum().sum()

0

In [None]:
df_tmp = new_prod_purchase(df_train)

In [108]:
df_tmp = df_train.copy()
df_tmp = df_tmp.sort_values(by="fecha_dato")

In [112]:
df_tmp["ind_ahor_fin_ult1"].groupby(df_tmp["ncodpers"]).transform(is_prod_purchased)

0           0
416967      0
416968      0
416969      0
416970      0
           ..
13026344    0
13026345    0
13026346    0
13026320    0
13647308    0
Name: ind_ahor_fin_ult1, Length: 13647309, dtype: int32

In [63]:
df_tmp.filter(regex=r"^ind_.*_added$|ncodpers").head(20)

Unnamed: 0,ncodpers,ind_ahor_fin_ult1_added,ind_aval_fin_ult1_added,ind_cco_fin_ult1_added,ind_cder_fin_ult1_added,ind_cno_fin_ult1_added,ind_ctju_fin_ult1_added,ind_ctma_fin_ult1_added,ind_ctop_fin_ult1_added,ind_ctpp_fin_ult1_added,...,ind_hip_fin_ult1_added,ind_plan_fin_ult1_added,ind_pres_fin_ult1_added,ind_reca_fin_ult1_added,ind_tjcr_fin_ult1_added,ind_valo_fin_ult1_added,ind_viv_fin_ult1_added,ind_nomina_ult1_added,ind_nom_pens_ult1_added,ind_recibo_ult1_added
0,1375586,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1050611,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1050612,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1050613,0,0,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1050614,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1050615,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1050616,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1050617,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1050619,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1050620,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
df_tmp.columns.to_list()

['fecha_dato',
 'ncodpers',
 'ind_empleado',
 'pais_residencia',
 'sexo',
 'age',
 'fecha_alta',
 'ind_nuevo',
 'antiguedad',
 'indrel',
 'indrel_1mes',
 'tiprel_1mes',
 'indresi',
 'indext',
 'canal_entrada',
 'indfall',
 'cod_prov',
 'ind_actividad_cliente',
 'renta',
 'segmento',
 'ind_ahor_fin_ult1',
 'ind_aval_fin_ult1',
 'ind_cco_fin_ult1',
 'ind_cder_fin_ult1',
 'ind_cno_fin_ult1',
 'ind_ctju_fin_ult1',
 'ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1',
 'ind_ctpp_fin_ult1',
 'ind_deco_fin_ult1',
 'ind_deme_fin_ult1',
 'ind_dela_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_fond_fin_ult1',
 'ind_hip_fin_ult1',
 'ind_plan_fin_ult1',
 'ind_pres_fin_ult1',
 'ind_reca_fin_ult1',
 'ind_tjcr_fin_ult1',
 'ind_valo_fin_ult1',
 'ind_viv_fin_ult1',
 'ind_nomina_ult1',
 'ind_nom_pens_ult1',
 'ind_recibo_ult1',
 'ind_ahor_fin_ult1_added',
 'ind_aval_fin_ult1_added',
 'ind_cco_fin_ult1_added',
 'ind_cder_fin_ult1_added',
 'ind_cno_fin_ult1_added',
 'ind_ctju_fin_ult1_added',
 'ind_ctma_fin_ult1_added',
 '

In [64]:
df_tmp.loc[df_tmp["ncodpers"] == 1050614, ["fecha_dato", "ncodpers", "ind_cco_fin_ult1", "ind_cco_fin_ult1_added"]]

Unnamed: 0,fecha_dato,ncodpers,ind_cco_fin_ult1,ind_cco_fin_ult1_added
4,2015-01-28,1050614,1,1
1243387,2015-02-28,1050614,1,0
1847830,2015-03-28,1050614,1,0
2507209,2015-04-28,1050614,1,0
2519201,2015-05-28,1050614,1,0
3176930,2015-06-28,1050614,1,0
4371722,2015-07-28,1050614,1,1
5136565,2015-08-28,1050614,1,0
6053844,2015-09-28,1050614,1,0
6492089,2015-10-28,1050614,1,1


In [75]:
col = "ind_ahor_fin_ult1"
df_train[["ncodpers", "fecha_dato", col]].groupby("ncodpers").apply(lambda df: is_prod_purchased(df, col))

KeyboardInterrupt: 

In [74]:
df_train[["ncodpers", "fecha_dato", col]].groupby("ncodpers").mean()

Unnamed: 0_level_0,ind_ahor_fin_ult1
ncodpers,Unnamed: 1_level_1
15889,0.0
15890,0.0
15891,0.0
15892,0.0
15893,0.0
...,...
1553685,0.0
1553686,0.0
1553687,0.0
1553688,0.0


In [120]:
a = ["a"]*10 + ["b"]*10
b = list(range(10)) + list(range(10))
c = [0, 0, 0, 1, 1, 1, 2, 2, 2, 2] + [0, 0, 1, 1, 1, 2, 2, 3, 3, 3]
d = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2] + [0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
df_tmp = pd.DataFrame({"a": a, "b": b, "c": c, "d": d})
df_tmp = df_tmp.sort_values(by=["a", "b"])
df_tmp

Unnamed: 0,a,b,c,d
0,a,0,0,0
1,a,1,0,0
2,a,2,0,0
3,a,3,1,0
4,a,4,1,1
5,a,5,1,1
6,a,6,2,1
7,a,7,2,2
8,a,8,2,2
9,a,9,2,2


In [114]:
def is_prod_purchased(ser):
    new_ser = (ser - ser.shift(1)).fillna(0).astype("int")
    return new_ser
    

In [121]:
tmp = df_tmp[["c", "d"]].groupby(df_tmp["a"]).transform(is_prod_purchased)
tmp.columns = ["c1", "d1"]

In [122]:
pd.concat([df_tmp, tmp], axis=1)

Unnamed: 0,a,b,c,d,c1,d1
0,a,0,0,0,0,0
1,a,1,0,0,0,0
2,a,2,0,0,0,0
3,a,3,1,0,1,0
4,a,4,1,1,0,1
5,a,5,1,1,0,0
6,a,6,2,1,1,0
7,a,7,2,2,0,1
8,a,8,2,2,0,0
9,a,9,2,2,0,0


In [97]:
df_tmp

Unnamed: 0,a,b,c,d
0,a,0,0,0
1,a,1,0,0
2,a,2,0,0
3,a,3,1,0
4,a,4,1,1
5,a,5,1,1
6,a,6,2,1
7,a,7,2,2
8,a,8,2,2
9,a,9,2,2
