In [1]:
import os
import re
import pandas as pd
import numpy as np

In [2]:
def change_dtype_ser(ser):
    """
    Convert a Series to a more compact dtype.

    Conversion rules, checked in this order:

    * dtype equal to ``int``    -> ``np.int32``
    * dtype equal to ``float``  -> ``np.float32``
    * dtype equal to ``object`` -> ``category``
    * anything else             -> returned unchanged

    :param ser: pandas Series
    :return: the converted Series, or ``ser`` itself when no rule applies
    """
    if ser.dtype == int:
        # NOTE: no range check is done here before narrowing to 32 bits.
        return ser.astype(np.int32)

    if ser.dtype == float:
        return ser.astype(np.float32)

    # ``np.object`` was only an alias of the builtin ``object``; the alias was
    # removed in NumPy 1.24, so compare against the builtin directly.
    if ser.dtype == object:
        return ser.astype("category")

    return ser
    

def change_dtype_df(df):
    """
    Shrink the memory footprint of a dataframe by converting column dtypes.

    ``fecha_dato`` and ``fecha_alta`` are parsed to datetimes first, then every
    column is passed through ``change_dtype_ser``. The memory usage (in MB) is
    printed before and after the per-column conversion. The dataframe is
    modified in place and also returned.

    :param df: dataframe with ``fecha_dato`` and ``fecha_alta`` columns
    :return df: the same dataframe object with converted columns
    """
    for date_col in ("fecha_dato", "fecha_alta"):
        df[date_col] = pd.to_datetime(df[date_col])

    # "before" figure is taken after the date parsing above
    megabytes = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % megabytes)

    for name in df.columns:
        df[name] = change_dtype_ser(df[name])

    megabytes = df.memory_usage().sum() / 10**6
    print("Memory usage after changing types %0.2f MB" % megabytes)
    return df


def load_csv(filename):
    """
    Read a CSV file with pandas and compact its dtypes.

    :param filename: path passed to ``pd.read_csv``
    :return: dataframe returned by ``change_dtype_df``
    """
    return change_dtype_df(pd.read_csv(filename))

In [3]:
# Directory holding the cleaned input CSVs
INP_DIR = "data/data_"
# Output directories; this notebook writes the 6-lag, 12-lag and 4-lag
# feature sets to OUT_DIR1, OUT_DIR2 and OUT_DIR3 respectively
OUT_DIR1 = "data/data1_"
OUT_DIR2 = "data/data2_"
OUT_DIR3 = "data/data3_"

In [4]:
# Load the cleaned train/test CSVs; load_csv also converts column dtypes
# and prints the memory usage before/after that conversion
df_train = load_csv(os.path.join(INP_DIR, "train_cleaned.csv"))
df_test = load_csv(os.path.join(INP_DIR, "test_cleaned.csv"))

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 9566.76 MB
Memory usage after changing types 4612.80 MB
Memory usage before changing types 116.20 MB
Memory usage after changing types 49.28 MB


In [5]:
# Drop this column because it is too imbalanced
df_train = df_train.drop(["ind_empleado"], axis=1)
df_test = df_test.drop(["ind_empleado"], axis=1)

In [5]:
# Replace fecha_alta by the number of days between fecha_alta and the record
# date fecha_dato (computed as fecha_alta - fecha_dato).
# NOTE: not idempotent - after one run fecha_alta holds day counts, not dates.
df_train["fecha_alta"] = (df_train["fecha_alta"] - df_train["fecha_dato"]).dt.days
df_test["fecha_alta"] = (df_test["fecha_alta"] - df_test["fecha_dato"]).dt.days

In [6]:
# Sanity check: total number of missing values in each frame
df_train.isnull().sum().sum(), df_test.isnull().sum().sum()

(0, 0)

In [7]:
# (rows, columns) of the train and test frames
df_train.shape, df_test.shape

((13647309, 92), (929615, 20))

In [8]:
# Column names available in the train frame
df_train.columns

Index(['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
       'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel', 'indrel_1mes',
       'tiprel_1mes', 'indresi', 'indext', 'canal_entrada', 'indfall',
       'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
       'ind_ahor_fin_ult1_NEW_PUR', 'ind_aval_fin_ult1_NEW_PUR',
       'ind_cco_fin_ult1_NEW_PUR', 'ind_cder_fin_ult1_NEW_PUR',
       'ind_cno_fin_ult1_NE

In [9]:
# Column names available in the test frame
df_test.columns

Index(['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
       'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel', 'indrel_1mes',
       'tiprel_1mes', 'indresi', 'indext', 'canal_entrada', 'indfall',
       'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento'],
      dtype='object')

In [10]:
# Product indicator columns: names of the form ind_*_ult1 with no further suffix
PROD_COLS = [col for col in df_train.columns if re.match(r"^ind_.*_ult1$", col)]
print(PROD_COLS)

['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']


In [11]:
# Columns of the form ind_*_ult1_NEW_PUR, in the order they appear in df_train
NEW_PURCH_COLS = [col for col in df_train.columns if re.match(r"^ind_.*_ult1_NEW_PUR$", col)]
print(NEW_PURCH_COLS, "\n")

# Each column's sum as a percentage of the sum over all these columns,
# sorted in decreasing order
prod_popul = df_train[NEW_PURCH_COLS].sum(axis=0)
prod_popul = prod_popul.sort_values(ascending=False)/prod_popul.sum() * 100
print(prod_popul, "\n")

# Re-bind NEW_PURCH_COLS to the same names ordered by decreasing share
NEW_PURCH_COLS = prod_popul.sort_values(ascending=False).index.to_list()
print(NEW_PURCH_COLS)

['ind_ahor_fin_ult1_NEW_PUR', 'ind_aval_fin_ult1_NEW_PUR', 'ind_cco_fin_ult1_NEW_PUR', 'ind_cder_fin_ult1_NEW_PUR', 'ind_cno_fin_ult1_NEW_PUR', 'ind_ctju_fin_ult1_NEW_PUR', 'ind_ctma_fin_ult1_NEW_PUR', 'ind_ctop_fin_ult1_NEW_PUR', 'ind_ctpp_fin_ult1_NEW_PUR', 'ind_deco_fin_ult1_NEW_PUR', 'ind_deme_fin_ult1_NEW_PUR', 'ind_dela_fin_ult1_NEW_PUR', 'ind_ecue_fin_ult1_NEW_PUR', 'ind_fond_fin_ult1_NEW_PUR', 'ind_hip_fin_ult1_NEW_PUR', 'ind_plan_fin_ult1_NEW_PUR', 'ind_pres_fin_ult1_NEW_PUR', 'ind_reca_fin_ult1_NEW_PUR', 'ind_tjcr_fin_ult1_NEW_PUR', 'ind_valo_fin_ult1_NEW_PUR', 'ind_viv_fin_ult1_NEW_PUR', 'ind_nomina_ult1_NEW_PUR', 'ind_nom_pens_ult1_NEW_PUR', 'ind_recibo_ult1_NEW_PUR'] 

ind_recibo_ult1_NEW_PUR      27.197280
ind_nom_pens_ult1_NEW_PUR    15.048197
ind_nomina_ult1_NEW_PUR      13.101134
ind_cco_fin_ult1_NEW_PUR     12.426018
ind_tjcr_fin_ult1_NEW_PUR    12.304237
ind_cno_fin_ult1_NEW_PUR      6.601516
ind_ecue_fin_ult1_NEW_PUR     4.682679
ind_dela_fin_ult1_NEW_PUR     2.2557

In [12]:
# Columns of the form ind_*_ult1_PUR_OR_CANCEL
PURCH_CANCEL_COLS = [col for col in df_train.columns if re.match(r"^ind_.*_ult1_PUR_OR_CANCEL$", col)]
print(PURCH_CANCEL_COLS)

['ind_ahor_fin_ult1_PUR_OR_CANCEL', 'ind_aval_fin_ult1_PUR_OR_CANCEL', 'ind_cco_fin_ult1_PUR_OR_CANCEL', 'ind_cder_fin_ult1_PUR_OR_CANCEL', 'ind_cno_fin_ult1_PUR_OR_CANCEL', 'ind_ctju_fin_ult1_PUR_OR_CANCEL', 'ind_ctma_fin_ult1_PUR_OR_CANCEL', 'ind_ctop_fin_ult1_PUR_OR_CANCEL', 'ind_ctpp_fin_ult1_PUR_OR_CANCEL', 'ind_deco_fin_ult1_PUR_OR_CANCEL', 'ind_deme_fin_ult1_PUR_OR_CANCEL', 'ind_dela_fin_ult1_PUR_OR_CANCEL', 'ind_ecue_fin_ult1_PUR_OR_CANCEL', 'ind_fond_fin_ult1_PUR_OR_CANCEL', 'ind_hip_fin_ult1_PUR_OR_CANCEL', 'ind_plan_fin_ult1_PUR_OR_CANCEL', 'ind_pres_fin_ult1_PUR_OR_CANCEL', 'ind_reca_fin_ult1_PUR_OR_CANCEL', 'ind_tjcr_fin_ult1_PUR_OR_CANCEL', 'ind_valo_fin_ult1_PUR_OR_CANCEL', 'ind_viv_fin_ult1_PUR_OR_CANCEL', 'ind_nomina_ult1_PUR_OR_CANCEL', 'ind_nom_pens_ult1_PUR_OR_CANCEL', 'ind_recibo_ult1_PUR_OR_CANCEL']


In [13]:
# Total number of products per row: row-wise sum of the PROD_COLS indicators
df_train["TOTAL_PRODS"] = df_train[PROD_COLS].sum(axis=1)

In [14]:
# Every remaining column - i.e. not a product column, not a *_NEW_PUR or
# *_PUR_OR_CANCEL column, and not fecha_dato / ncodpers / TOTAL_PRODS
DEMOG_COLS = [col for col in df_train.columns 
    if col not in PROD_COLS + NEW_PURCH_COLS + PURCH_CANCEL_COLS + ["fecha_dato", "ncodpers", "TOTAL_PRODS"]]
print(DEMOG_COLS)

['ind_empleado', 'pais_residencia', 'sexo', 'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext', 'canal_entrada', 'indfall', 'cod_prov', 'ind_actividad_cliente', 'renta', 'segmento']


In [15]:
# Non-product columns that are extracted for every lag month, together with
# PROD_COLS and PURCH_CANCEL_COLS (see the default of extract_lag_features)
LAG_COLS = ["fecha_alta", "ind_nuevo", "antiguedad", "indrel", "tiprel_1mes", 
            "ind_actividad_cliente", "renta", "segmento", "TOTAL_PRODS"]

In [16]:
def _is_unique(x):
    return len(x) == x.nunique()


def _onehot_reduce(df):
    df = df.copy()
    values = df.values
    
    new_values = values.copy()
    for i in range(values.shape[0]):
        row = values[i]
        if row.sum() > 1:
            new_row = np.zeros([row.shape[0]], dtype=np.int)
            
            where_1 = np.where(row == 1)[0]
            np.random.shuffle(where_1)
            idx = where_1[0]
            new_row[idx] = 1
            
            new_values[i] = new_row
    
    new_df = pd.DataFrame(data=new_values, index=df.index, columns=df.columns)
    return new_df


def extract_subset(df, row_filter, cols):
    """
    Select rows and columns of ``df`` and return them independent of ``df``.

    :param df: source dataframe (not modified)
    :param row_filter: row selector accepted by ``df.loc`` (a boolean mask in this notebook)
    :param cols: column selector accepted by ``df.loc`` (a list of names in this notebook)
    :return: the selection ``df.loc[row_filter, cols]``; dataframe/series results are copies
    """
    subset = df.loc[row_filter, cols]
    # Copy only what was selected: copying the whole frame up front duplicates
    # the full dataset in memory on every call just to keep a small slice.
    if isinstance(subset, (pd.DataFrame, pd.Series)):
        return subset.copy()
    return subset



def extract_y(df, timestamp, y_cols=NEW_PURCH_COLS):
    """
    Build the target frame for one month.

    Keeps the rows of ``df`` dated ``timestamp`` in which at least one of
    ``y_cols`` is positive, strips the ``_NEW_PUR`` suffix from the column
    names, reduces every row to a single positive via ``_onehot_reduce`` and
    appends the ``ncodpers`` column.

    :param df: dataframe with ``fecha_dato``, ``ncodpers`` and ``y_cols``
    :param timestamp: value compared against ``fecha_dato``
    :param y_cols: list of target column names
    :return: dataframe with the renamed target columns followed by ``ncodpers``
    """
    bought_something = df[y_cols].sum(axis=1) > 0
    on_date = df["fecha_dato"] == timestamp

    selected = extract_subset(df, on_date & bought_something, ["ncodpers"] + y_cols)
    assert _is_unique(selected["ncodpers"]), "ncodpers must be unique"

    renamed = []
    for name in selected.columns:
        if name.endswith("_NEW_PUR"):
            name = name.replace("_NEW_PUR", "")
        renamed.append(name)
    selected.columns = renamed

    customer_ids = selected["ncodpers"]
    product_cols = [name for name in selected.columns if name != "ncodpers"]

    targets = _onehot_reduce(selected[product_cols])
    # index is preserved by _onehot_reduce, so the ids align row by row
    targets["ncodpers"] = customer_ids
    return targets


def extract_x_demog(df, timestamp, customer_ids, demog_cols=DEMOG_COLS):
    """
    Extract ``ncodpers`` plus ``demog_cols`` for the given customers on one date.

    :param df: dataframe with ``fecha_dato``, ``ncodpers`` and ``demog_cols``
    :param timestamp: value compared against ``fecha_dato``
    :param customer_ids: ids to keep (passed to ``Series.isin``)
    :param demog_cols: list of column names to extract
    :return: dataframe with one row per matching customer
    """
    on_date = df["fecha_dato"] == timestamp
    wanted_customer = df["ncodpers"].isin(customer_ids)

    snapshot = extract_subset(df, on_date & wanted_customer, ["ncodpers"] + demog_cols)
    assert _is_unique(snapshot["ncodpers"]), "ncodpers must be unique"
    return snapshot


def extract_lag_features(df, timestamp_lag, customer_ids, suffix="_LAG",
                         sel_cols=PROD_COLS+PURCH_CANCEL_COLS+LAG_COLS):
    """
    Extract ``sel_cols`` for the given customers at a lag date and tag them with ``suffix``.

    The result can have fewer rows than ``len(customer_ids)``, because the
    rows present at ``timestamp_lag`` are not the same as those present at
    the target timestamp.

    :param df: dataframe with ``fecha_dato``, ``ncodpers`` and ``sel_cols``
    :param timestamp_lag: value compared against ``fecha_dato``
    :param customer_ids: ids to keep (passed to ``Series.isin``)
    :param suffix: string appended to every column name except ``ncodpers``
    :param sel_cols: list of column names to extract
    :return: dataframe with ``ncodpers`` followed by the suffixed columns
    """
    on_lag_date = df["fecha_dato"] == timestamp_lag
    wanted_customer = df["ncodpers"].isin(customer_ids)

    lagged = extract_subset(df, on_lag_date & wanted_customer, ["ncodpers"] + sel_cols)
    assert _is_unique(lagged["ncodpers"]), "ncodpers must be unique"

    renamed = ["ncodpers"]
    for name in lagged.columns:
        if name != "ncodpers":
            renamed.append(name + suffix)
    lagged.columns = renamed

    return lagged



def extract_X_y_train(df, timestamp, timestamp_lags):
    """
    Assemble the feature and target frames for one training month.

    Targets come from ``extract_y``. Features are the values of
    ``extract_x_demog`` at ``timestamp`` left-joined on ``ncodpers`` with one
    ``extract_lag_features`` frame per entry of ``timestamp_lags``; the i-th
    entry (1-based) gets the suffix ``_LAG<i>``. Shapes and null counts are
    printed along the way.

    :param df: full training dataframe
    :param timestamp: target date as a string
    :param timestamp_lags: date strings, each strictly before ``timestamp``
    :return: tuple ``(X_train, y_train)``
    """
    y_train = extract_y(df, timestamp)
    print("y_train.shape", y_train.shape)

    ids = y_train["ncodpers"]

    x_demog = extract_x_demog(df, timestamp, customer_ids=ids)
    print("x_demog.shape:", x_demog.shape)
    assert y_train.shape[0] == x_demog.shape[0], "x_demog must have the same number of rows as y_train"

    # First pass: extract every lag frame
    x_lags = []
    for lag_no, lag in enumerate(timestamp_lags, start=1):
        assert pd.to_datetime(lag) < pd.to_datetime(timestamp), lag + " lag is not before timestamp " +  timestamp
        lag_label = "_LAG%d" % lag_no

        x_lag = extract_lag_features(df, lag, customer_ids=ids, suffix=lag_label)
        print(lag, lag_label, x_lag.shape)
        x_lags.append(x_lag)

    # Second pass: left-join everything onto the target customer ids
    X_train = y_train[["ncodpers"]].merge(x_demog, how="left", on="ncodpers")
    print("Nulls after merging y and x_demog:", X_train.isnull().sum().sum())

    for lag_no, x_lag in enumerate(x_lags, start=1):
        X_train = X_train.merge(x_lag, how="left", on="ncodpers")
        print("Nulls at %d:" % lag_no, X_train.isnull().sum().sum())

    print("X_train.shape:", X_train.shape)
    print("y_train.shape:", y_train.shape)
    return X_train, y_train


def extract_X_test(train, test, timestamp, timestamp_lags):
    """
    Assemble the feature frame for the test month.

    The values of ``extract_x_demog`` are taken from ``test`` at ``timestamp``;
    the lag frames are taken from ``train`` for the customers of ``test`` and
    left-joined on ``ncodpers``. The i-th entry of ``timestamp_lags``
    (1-based) gets the suffix ``_LAG<i>``. Shapes and null counts are printed
    along the way.

    :param train: dataframe that supplies the lag features
    :param test: dataframe that supplies the customers and their values at ``timestamp``
    :param timestamp: test date as a string
    :param timestamp_lags: date strings, each strictly before ``timestamp``
    :return: feature dataframe ``X_test``
    """
    ids = test["ncodpers"]

    x_demog = extract_x_demog(test, timestamp, customer_ids=ids)
    print("x_demog.shape:", x_demog.shape)
    print("Nulls of x_demog:", x_demog.isnull().sum().sum())

    # First pass: extract every lag frame from the training data
    x_lags = []
    for lag_no, lag in enumerate(timestamp_lags, start=1):
        assert pd.to_datetime(lag) < pd.to_datetime(timestamp), lag + " lag is not before timestamp " +  timestamp
        lag_label = "_LAG%d" % lag_no

        x_lag = extract_lag_features(train, lag, customer_ids=ids, suffix=lag_label)
        print(lag, lag_label, x_lag.shape)
        x_lags.append(x_lag)

    # Second pass: left-join the lag frames onto the test-month frame
    X_test = x_demog
    for lag_no, x_lag in enumerate(x_lags, start=1):
        X_test = X_test.merge(x_lag, how="left", on="ncodpers")
        print("Nulls at %d:" % lag_no, X_test.isnull().sum().sum())

    print("X_test.shape:", X_test.shape)
    return X_test

In [17]:
# NOTE(review): this re-defines extract_x_demog with the same body as the
# definition in the previous cell. Running this cell re-evaluates the default
# `demog_cols=DEMOG_COLS` against the current value of DEMOG_COLS.
def extract_x_demog(df, timestamp, customer_ids, demog_cols=DEMOG_COLS):
    """
    Extract ``ncodpers`` plus ``demog_cols`` for the given customers on one date.

    :param df: dataframe with ``fecha_dato``, ``ncodpers`` and ``demog_cols``
    :param timestamp: value compared against ``fecha_dato``
    :param customer_ids: ids to keep (passed to ``Series.isin``)
    :param demog_cols: list of column names to extract
    :return: dataframe with one row per matching customer
    """
    row_filter = (df["fecha_dato"] == timestamp) & df["ncodpers"].isin(customer_ids)
    cols = ["ncodpers"] + demog_cols
    
    df_out = extract_subset(df, row_filter, cols)
    # each customer may appear at most once for a given date
    assert _is_unique(df_out["ncodpers"]), "ncodpers must be unique"

    return df_out

In [18]:
# Distinct record dates present in the training data
df_train["fecha_dato"].unique()

array(['2015-01-28T00:00:00.000000000', '2015-02-28T00:00:00.000000000',
       '2015-03-28T00:00:00.000000000', '2015-04-28T00:00:00.000000000',
       '2015-05-28T00:00:00.000000000', '2015-06-28T00:00:00.000000000',
       '2015-07-28T00:00:00.000000000', '2015-08-28T00:00:00.000000000',
       '2015-09-28T00:00:00.000000000', '2015-10-28T00:00:00.000000000',
       '2015-11-28T00:00:00.000000000', '2015-12-28T00:00:00.000000000',
       '2016-01-28T00:00:00.000000000', '2016-02-28T00:00:00.000000000',
       '2016-03-28T00:00:00.000000000', '2016-04-28T00:00:00.000000000',
       '2016-05-28T00:00:00.000000000'], dtype='datetime64[ns]')

# Use `2016-05` with 6-month lags to predict `2016-06`

## Extract `X_2016_04`, `y_2016_04`

In [23]:
# Target month 2016-04 with six lag months; lags are listed most recent
# first, so the first entry becomes _LAG1
timestamp = "2016-04-28"
timestamp_lags = ["2016-03-28", "2016-02-28", "2016-01-28", 
                  "2015-12-28", "2015-11-28", "2015-10-28"]

X_2016_04, y_2016_04 = extract_X_y_train(df_train, timestamp, timestamp_lags)

y_train.shape (26791, 25)
x_demog.shape: (26791, 18)
2016-03-28 _LAG1 (26736, 58)
2016-02-28 _LAG2 (25051, 58)
2016-01-28 _LAG3 (24255, 58)
2015-12-28 _LAG4 (23556, 58)
2015-11-28 _LAG5 (22805, 58)
2015-10-28 _LAG6 (22308, 58)
Nulls after merging y and x_demog: 0
Nulls at 1: 3135
Nulls at 2: 102315
Nulls at 3: 246867
Nulls at 4: 431262
Nulls at 5: 658464
Nulls at 6: 913995
X_train.shape: (26791, 360)
y_train.shape: (26791, 25)


In [26]:
# Write the 2016-04 features and targets to OUT_DIR1 (row index not saved)
X_2016_04.to_csv(os.path.join(OUT_DIR1, "X_2016_04.csv"), index=False)
y_2016_04.to_csv(os.path.join(OUT_DIR1, "y_2016_04.csv"), index=False)

## Extract `X_2016_05`, `y_2016_05`

In [27]:
# Target month 2016-05 with six lag months (most recent first)
timestamp = "2016-05-28"
timestamp_lags = ["2016-04-28", "2016-03-28", "2016-02-28", "2016-01-28",
                  "2015-12-28", "2015-11-28"]

X_2016_05, y_2016_05 = extract_X_y_train(df_train, timestamp, timestamp_lags)

y_train.shape (27916, 25)
x_demog.shape: (27916, 18)
2016-04-28 _LAG1 (27875, 58)
2016-03-28 _LAG2 (26372, 58)
2016-02-28 _LAG3 (25692, 58)
2016-01-28 _LAG4 (24936, 58)
2015-12-28 _LAG5 (23968, 58)
2015-11-28 _LAG6 (23553, 58)
Nulls after merging y and x_demog: 0
Nulls at 1: 2337
Nulls at 2: 90345
Nulls at 3: 217113
Nulls at 4: 386973
Nulls at 5: 612009
Nulls at 6: 860700
X_train.shape: (27916, 360)
y_train.shape: (27916, 25)


In [30]:
# Write the 2016-05 features and targets to OUT_DIR1 (row index not saved)
X_2016_05.to_csv(os.path.join(OUT_DIR1, "X_2016_05.csv"), index=False)
y_2016_05.to_csv(os.path.join(OUT_DIR1, "y_2016_05.csv"), index=False)

## Extract `X_2016_06`

In [31]:
# Test month 2016-06: values at the test date come from df_test, the six
# lag months (most recent first) come from df_train
timestamp = "2016-06-28"
timestamp_lags = ["2016-05-28", "2016-04-28", "2016-03-28", "2016-02-28", "2016-01-28",
                  "2015-12-28"]

X_2016_06 = extract_X_test(df_train, df_test, timestamp, timestamp_lags)

x_demog.shape: (929615, 18)
Nulls of x_demog: 0
2016-05-28 _LAG1 (929615, 58)
2016-04-28 _LAG2 (925252, 58)
2016-03-28 _LAG3 (920975, 58)
2016-02-28 _LAG4 (915679, 58)
2016-01-28 _LAG5 (909885, 58)
2015-12-28 _LAG6 (903429, 58)
Nulls at 1: 0
Nulls at 2: 248691
Nulls at 3: 741171
Nulls at 4: 1535523
Nulls at 5: 2660133
Nulls at 6: 4152735
X_test.shape: (929615, 360)


In [33]:
# Write the 2016-06 test features to OUT_DIR1 (row index not saved)
X_2016_06.to_csv(os.path.join(OUT_DIR1, "X_2016_06.csv"), index=False)

# Use `2016-05` with 12-month lags to predict `2016-06`

## Extract `X_2016_04`, `y_2016_04`

In [34]:
# Target month 2016-04 with twelve lag months (most recent first).
# NOTE: re-binds X_2016_04 / y_2016_04 from the 6-lag section.
timestamp = "2016-04-28"
timestamp_lags = ["2016-03-28", "2016-02-28", "2016-01-28", 
                  "2015-12-28", "2015-11-28", "2015-10-28",
                  "2015-09-28", "2015-08-28", "2015-07-28", 
                  "2015-06-28", "2015-05-28", "2015-04-28"]

X_2016_04, y_2016_04 = extract_X_y_train(df_train, timestamp, timestamp_lags)

y_train.shape (26791, 25)
x_demog.shape: (26791, 18)
2016-03-28 _LAG1 (26736, 58)
2016-02-28 _LAG2 (25051, 58)
2016-01-28 _LAG3 (24255, 58)
2015-12-28 _LAG4 (23556, 58)
2015-11-28 _LAG5 (22805, 58)
2015-10-28 _LAG6 (22308, 58)
2015-09-28 _LAG7 (21819, 58)
2015-08-28 _LAG8 (21415, 58)
2015-07-28 _LAG9 (21143, 58)
2015-06-28 _LAG10 (20223, 58)
2015-05-28 _LAG11 (20064, 58)
2015-04-28 _LAG12 (19885, 58)
Nulls after merging y and x_demog: 0
Nulls at 1: 3135
Nulls at 2: 102315
Nulls at 3: 246867
Nulls at 4: 431262
Nulls at 5: 658464
Nulls at 6: 913995
Nulls at 7: 1197399
Nulls at 8: 1503831
Nulls at 9: 1825767
Nulls at 10: 2200143
Nulls at 11: 2583582
Nulls at 12: 2977224
X_train.shape: (26791, 702)
y_train.shape: (26791, 25)


In [35]:
# Write the 12-lag 2016-04 features and targets to OUT_DIR2 (row index not saved)
X_2016_04.to_csv(os.path.join(OUT_DIR2, "X_2016_04.csv"), index=False)
y_2016_04.to_csv(os.path.join(OUT_DIR2, "y_2016_04.csv"), index=False)

## Extract `X_2016_05`, `y_2016_05`

In [36]:
# Target month 2016-05 with twelve lag months (most recent first)
timestamp = "2016-05-28"
timestamp_lags = ["2016-04-28", "2016-03-28", "2016-02-28", "2016-01-28",
                  "2015-12-28", "2015-11-28", "2015-10-28", "2015-09-28",
                 "2015-08-28", "2015-07-28", "2015-06-28", "2015-05-28"]

X_2016_05, y_2016_05 = extract_X_y_train(df_train, timestamp, timestamp_lags)

y_train.shape (27916, 25)
x_demog.shape: (27916, 18)
2016-04-28 _LAG1 (27875, 58)
2016-03-28 _LAG2 (26372, 58)
2016-02-28 _LAG3 (25692, 58)
2016-01-28 _LAG4 (24936, 58)
2015-12-28 _LAG5 (23968, 58)
2015-11-28 _LAG6 (23553, 58)
2015-10-28 _LAG7 (23179, 58)
2015-09-28 _LAG8 (22725, 58)
2015-08-28 _LAG9 (22315, 58)
2015-07-28 _LAG10 (22053, 58)
2015-06-28 _LAG11 (21110, 58)
2015-05-28 _LAG12 (20963, 58)
Nulls after merging y and x_demog: 0
Nulls at 1: 2337
Nulls at 2: 90345
Nulls at 3: 217113
Nulls at 4: 386973
Nulls at 5: 612009
Nulls at 6: 860700
Nulls at 7: 1130709
Nulls at 8: 1426596
Nulls at 9: 1745853
Nulls at 10: 2080044
Nulls at 11: 2467986
Nulls at 12: 2864307
X_train.shape: (27916, 702)
y_train.shape: (27916, 25)


In [37]:
# Write the 12-lag 2016-05 features and targets to OUT_DIR2 (row index not saved)
X_2016_05.to_csv(os.path.join(OUT_DIR2, "X_2016_05.csv"), index=False)
y_2016_05.to_csv(os.path.join(OUT_DIR2, "y_2016_05.csv"), index=False)

## Extract `X_2016_06`

In [38]:
# Test month 2016-06 with twelve lag months (most recent first) taken from df_train
timestamp = "2016-06-28"
timestamp_lags = ["2016-05-28", "2016-04-28", "2016-03-28", "2016-02-28", "2016-01-28",
                  "2015-12-28", "2015-11-28", "2015-10-28", "2015-09-28", "2015-08-28",
                 "2015-07-28", "2015-06-28"]

X_2016_06 = extract_X_test(df_train, df_test, timestamp, timestamp_lags)

x_demog.shape: (929615, 18)
Nulls of x_demog: 0
2016-05-28 _LAG1 (929615, 58)
2016-04-28 _LAG2 (925252, 58)
2016-03-28 _LAG3 (920975, 58)
2016-02-28 _LAG4 (915679, 58)
2016-01-28 _LAG5 (909885, 58)
2015-12-28 _LAG6 (903429, 58)
2015-11-28 _LAG7 (896458, 58)
2015-10-28 _LAG8 (881573, 58)
2015-09-28 _LAG9 (854574, 58)
2015-08-28 _LAG10 (832230, 58)
2015-07-28 _LAG11 (818424, 58)
2015-06-28 _LAG12 (622404, 58)
Nulls at 1: 0
Nulls at 2: 248691
Nulls at 3: 741171
Nulls at 4: 1535523
Nulls at 5: 2660133
Nulls at 6: 4152735
Nulls at 7: 6042684
Nulls at 8: 8781078
Nulls at 9: 13058415
Nulls at 10: 18609360
Nulls at 11: 24947247
Nulls at 12: 42458274
X_test.shape: (929615, 702)


In [39]:
# Write the 12-lag 2016-06 test features to OUT_DIR2 (row index not saved)
X_2016_06.to_csv(os.path.join(OUT_DIR2, "X_2016_06.csv"), index=False)

# Use `2015-06` with 4-month lags to predict `2016-06`

## Extract `X_2015_05`, `y_2015_05`

In [19]:
# Target month 2015-05 with four lag months (most recent first);
# 2015-01 is the earliest month present in the training data
timestamp = "2015-05-28"
timestamp_lags = ["2015-04-28", "2015-03-28", "2015-02-28", "2015-01-28"]

X_2015_05, y_2015_05 = extract_X_y_train(df_train, timestamp, timestamp_lags)

y_train.shape (21422, 25)
x_demog.shape: (21422, 19)
2015-04-28 _LAG1 (21203, 58)
2015-03-28 _LAG2 (20484, 58)
2015-02-28 _LAG3 (20057, 58)
2015-01-28 _LAG4 (19526, 58)
Nulls after merging y and x_demog: 0
Nulls at 1: 12483
Nulls at 2: 65949
Nulls at 3: 143754
Nulls at 4: 251826
X_train.shape: (21422, 247)
y_train.shape: (21422, 25)


In [20]:
# Write the 4-lag 2015-05 features and targets to OUT_DIR3 (row index not saved)
X_2015_05.to_csv(os.path.join(OUT_DIR3, "X_2015_05.csv"), index=False)
y_2015_05.to_csv(os.path.join(OUT_DIR3, "y_2015_05.csv"), index=False)

## Extract `X_2015_06`, `y_2015_06`

In [21]:
# Target month 2015-06 with four lag months (most recent first)
timestamp = "2015-06-28"
timestamp_lags = ["2015-05-28", "2015-04-28", "2015-03-28", "2015-02-28"]

X_2015_06, y_2015_06 = extract_X_y_train(df_train, timestamp, timestamp_lags)

y_train.shape (33519, 25)
x_demog.shape: (33519, 19)
2015-05-28 _LAG1 (33318, 58)
2015-04-28 _LAG2 (32453, 58)
2015-03-28 _LAG3 (32052, 58)
2015-02-28 _LAG4 (31503, 58)
Nulls after merging y and x_demog: 0
Nulls at 1: 11457
Nulls at 2: 72219
Nulls at 3: 155838
Nulls at 4: 270750
X_train.shape: (33519, 247)
y_train.shape: (33519, 25)


In [22]:
# Write the 4-lag 2015-06 features and targets to OUT_DIR3 (row index not saved)
X_2015_06.to_csv(os.path.join(OUT_DIR3, "X_2015_06.csv"), index=False)
y_2015_06.to_csv(os.path.join(OUT_DIR3, "y_2015_06.csv"), index=False)

## Extract `X_2016_05`, `y_2016_05`

In [23]:
# Target month 2016-05 with four lag months (most recent first).
# NOTE: re-binds X_2016_05 / y_2016_05 from the earlier sections.
timestamp = "2016-05-28"
timestamp_lags = ["2016-04-28", "2016-03-28", "2016-02-28", "2016-01-28"]

X_2016_05, y_2016_05 = extract_X_y_train(df_train, timestamp, timestamp_lags)

y_train.shape (27916, 25)
x_demog.shape: (27916, 19)
2016-04-28 _LAG1 (27875, 58)
2016-03-28 _LAG2 (26372, 58)
2016-02-28 _LAG3 (25692, 58)
2016-01-28 _LAG4 (24936, 58)
Nulls after merging y and x_demog: 0
Nulls at 1: 2337
Nulls at 2: 90345
Nulls at 3: 217113
Nulls at 4: 386973
X_train.shape: (27916, 247)
y_train.shape: (27916, 25)


In [24]:
# Write the 4-lag 2016-05 features and targets to OUT_DIR3 (row index not saved)
X_2016_05.to_csv(os.path.join(OUT_DIR3, "X_2016_05.csv"), index=False)
y_2016_05.to_csv(os.path.join(OUT_DIR3, "y_2016_05.csv"), index=False)

## Extract `X_2016_06`

In [25]:
# Test month 2016-06 with four lag months (most recent first) taken from df_train
timestamp = "2016-06-28"
timestamp_lags = ["2016-05-28", "2016-04-28", "2016-03-28", "2016-02-28"]

X_2016_06 = extract_X_test(df_train, df_test, timestamp, timestamp_lags)

x_demog.shape: (929615, 19)
Nulls of x_demog: 0
2016-05-28 _LAG1 (929615, 58)
2016-04-28 _LAG2 (925252, 58)
2016-03-28 _LAG3 (920975, 58)
2016-02-28 _LAG4 (915679, 58)
Nulls at 1: 0
Nulls at 2: 248691
Nulls at 3: 741171
Nulls at 4: 1535523
X_test.shape: (929615, 247)


In [None]:
# Write the 4-lag 2016-06 test features to OUT_DIR3 (row index not saved)
X_2016_06.to_csv(os.path.join(OUT_DIR3, "X_2016_06.csv"), index=False)