In [1]:
import os
import pandas as pd
import numpy as np


In [2]:
def change_dtype_ser(ser):
    
    if ser.dtype == int:
        return ser.astype(np.int32)
    
    if ser.dtype == float:
        return ser.astype(np.float32)
    
    if ser.dtype == np.object:
        return ser.astype("category")
    
    return ser
    

def change_dtype_df(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    df["fecha_dato"] = pd.to_datetime(df["fecha_dato"])
    df["fecha_alta"] = pd.to_datetime(df["fecha_alta"])
    
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        df[col] = change_dtype_ser(df[col])

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtype_df(df)
    return df

In [3]:
INP_DIR = "data/data_/"

In [5]:
df_train = load_csv(os.path.join(INP_DIR, "train_cleaned.csv"))
df_test = load_csv(os.path.join(INP_DIR, "test_cleaned.csv"))

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 6946.48 MB
Memory usage after changing types 3302.66 MB
Memory usage before changing types 116.20 MB
Memory usage after changing types 49.28 MB


In [14]:
# drop this column becuase it is too imbalanced
df_train = df_train.drop(["ind_empleado"], axis=1)
df_test = df_test.drop(["ind_empleado"], axis=1)

In [15]:
df_train.shape, df_test.shape

((13647309, 67), (929615, 19))

In [16]:
df_train.columns

Index(['fecha_dato', 'ncodpers', 'pais_residencia', 'sexo', 'age',
       'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel', 'indrel_1mes',
       'tiprel_1mes', 'indresi', 'indext', 'canal_entrada', 'indfall',
       'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
       'ind_ahor_fin_ult1_NEW_PUR', 'ind_aval_fin_ult1_NEW_PUR',
       'ind_cco_fin_ult1_NEW_PUR', 'ind_cder_fin_ult1_NEW_PUR',
       'ind_cno_fin_ult1_NEW_PUR', 'ind_ctj

In [17]:
df_test.columns

Index(['fecha_dato', 'ncodpers', 'pais_residencia', 'sexo', 'age',
       'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel', 'indrel_1mes',
       'tiprel_1mes', 'indresi', 'indext', 'canal_entrada', 'indfall',
       'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento'],
      dtype='object')

In [18]:
df_test["fecha_dato"].nunique()

1

In [11]:
df_test["ncodpers"].nunique()

929615

In [13]:
df_train["ncodpers"].nunique()

956645

In [24]:
df_train["indrel_1mes"].value_counts()

1    9114353
1    4526109
3       2496
3       1854
P        874
2        659
2        658
4        188
4        118
Name: indrel_1mes, dtype: int64

In [25]:
df_train.head()

Unnamed: 0,fecha_dato,ncodpers,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,indrel_1mes,...,ind_hip_fin_ult1_NEW_PUR,ind_plan_fin_ult1_NEW_PUR,ind_pres_fin_ult1_NEW_PUR,ind_reca_fin_ult1_NEW_PUR,ind_tjcr_fin_ult1_NEW_PUR,ind_valo_fin_ult1_NEW_PUR,ind_viv_fin_ult1_NEW_PUR,ind_nomina_ult1_NEW_PUR,ind_nom_pens_ult1_NEW_PUR,ind_recibo_ult1_NEW_PUR
0,2015-01-28,15889,ES,V,56.0,1995-01-16,False,245.0,A,1,...,0,0,0,0,0,0,0,0,0,0
1,2015-02-28,15889,ES,V,56.0,1995-01-16,False,245.0,A,1,...,0,0,0,0,0,0,0,0,0,0
2,2015-03-28,15889,ES,V,56.0,1995-01-16,False,245.0,A,1,...,0,0,0,0,0,0,0,0,0,0
3,2015-04-28,15889,ES,V,56.0,1995-01-16,False,245.0,A,1,...,0,0,0,0,0,0,0,0,0,0
4,2015-05-28,15889,ES,V,56.0,1995-01-16,False,245.0,A,1,...,0,0,0,0,1,0,0,0,0,0


In [32]:
(df_train["fecha_dato"] == "2015-03-28").sum()

629209

In [30]:
df_train["fecha_dato"].value_counts().sort_index()

2015-01-28    625457
2015-02-28    627394
2015-03-28    629209
2015-04-28    630367
2015-05-28    631957
2015-06-28    632110
2015-07-28    829817
2015-08-28    843201
2015-09-28    865440
2015-10-28    892251
2015-11-28    906109
2015-12-28    912021
2016-01-28    916269
2016-02-28    920904
2016-03-28    925076
2016-04-28    928274
2016-05-28    931453
Name: fecha_dato, dtype: int64