In [67]:
import pandas as pd
import numpy as np

In [149]:
def create_lags(X: pd.DataFrame, lag=1):
    """Return the columns of ``X`` shifted by ``lag`` rows within each ``id_qi`` group.

    The shift is positional (row based) inside every group, so the rows of
    each id are expected to already be in the desired order.

    Parameters
    ----------
    X : pd.DataFrame
        Frame that can be grouped by ``"id_qi"``.
    lag : int, default 1
        Number of rows to shift by; passed to ``groupby(...).shift``.

    Returns
    -------
    pd.DataFrame
        Same index as ``X``; every column ``c`` is renamed to ``f"{c}_l{lag}"``.
    """
    lagged_names = {col: f"{col}_l{lag}" for col in X.columns}
    return X.groupby("id_qi").shift(lag).rename(columns=lagged_names)


def grouped_roll(df, w="10D", mp=1):
    """Rolling mean of one group's rows, indexed by the first index level.

    Meant to be applied per group (``create_rolling_mean`` passes it to
    ``groupby(...).apply``). The frame is re-indexed by the values of its
    first index level converted to datetimes, sorted on that index, and a
    rolling mean is computed over it.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single group. Level 0 of its index must be convertible by
        ``pd.to_datetime``. The frame itself is left untouched.
    w : str or int, default "10D"
        Window forwarded to ``DataFrame.rolling`` (an offset string such as
        ``"10D"`` gives a time-based window, an integer a fixed row count).
    mp : int, default 1
        ``min_periods`` forwarded to ``DataFrame.rolling``.

    Returns
    -------
    pd.DataFrame
        Rolling means with a datetime index sorted in ascending order.
    """
    # Work on a copy so the caller's frame is never re-indexed or re-sorted
    # as a side effect of computing the rolling mean.
    out = df.copy()
    # Keep exactly one label per row: de-duplicating the level values would
    # make the new index shorter than the frame whenever a date repeats.
    out.index = pd.to_datetime(df.index.get_level_values(0))
    return out.sort_index().rolling(w, min_periods=mp).mean()

def create_rolling_mean(X: pd.DataFrame, w, mp=5):
    """Rolling mean of every column of ``X``, computed separately per ``id_qi``.

    Parameters
    ----------
    X : pd.DataFrame
        Frame that can be grouped by ``"id_qi"``.
    w : str or int
        Window forwarded (through ``grouped_roll``) to ``DataFrame.rolling``;
        also used verbatim in the output column suffix.
    mp : int, default 5
        ``min_periods`` forwarded to ``DataFrame.rolling``.

    Returns
    -------
    pd.DataFrame
        Per-group rolling means with the two index levels swapped back and
        sorted; every column ``c`` is renamed to ``f"{c}_w{w}"``.
    """
    rolled_names = {col: f"{col}_w{w}" for col in X.columns}
    # apply() puts the group key in front of each group's result index.
    per_id = X.groupby("id_qi").apply(grouped_roll, w=w, mp=mp)
    ordered = per_id.swaplevel().sort_index()
    return ordered.rename(columns=rolled_names)

# Toy feature panel: one row per (date, id_qi) observation with two features.
data = {
    'date': [
        '2020-03-01', '2020-03-01',
        '2020-03-02', '2020-03-02',
        '2020-03-03', '2020-03-03',
        '2020-03-04',
        '2020-03-05', '2020-03-05',
    ],
    'id_qi': [2, 3, 2, 4, 2, 3, 2, 2, 4],
    'f1': [0.5, 4.1, 0.6, 7.5, 0.55, 4.05, 0.51, 0.52, 7.51],
    'f2': [1.2, 0.3, 1.25, 10, 1.3, 0.25, 1.26, 1.27, 10.1],
}

# Parse the dates up front and index the frame by (date, id_qi).
X = pd.DataFrame({**data, 'date': pd.to_datetime(data['date'])}).set_index(['date', 'id_qi'])

In [150]:
# Sanity check: a grouped shift must equal shifting each id's rows on their own.
# The concat keys become the OUTER index level, i.e. the ids, so the levels are
# named (id_qi, date) and end up as (date, id_qi) after swaplevel, like X.
# DataFrame.equals does not compare index names, so mislabeled levels would
# not be caught by this assertion.
ids = X.index.get_level_values('id_qi').unique().tolist()
expected_shift = (
    pd.concat(
        [X.xs(i, level='id_qi').shift() for i in ids],
        keys=ids,
        names=['id_qi', 'date'],
    )
    .swaplevel()
    .sort_index()
)
assert X.groupby('id_qi').shift().equals(expected_shift)

In [187]:
# Wide target panel (rows: dates, columns: ids) filled with seeded random
# values, with a handful of entries blanked out.
np.random.seed(0)
index = pd.Series(['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-05', '2020-03-06'], name="date")
columns = pd.Series([2, 3, 4, 5], name="id_qi")
target = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns)
target.index = pd.to_datetime(target.index)
for row, col in ((0, 1), (1, 1), (2, 2), (3, 0)):
    target.iloc[row, col] = np.nan
display(target)

# Long format: one row per (date, id_qi), held in a single-column frame.
target_long = target.unstack().swaplevel().sort_index()
target = pd.DataFrame(target_long)

# Row-wise lags (2 and 1 steps back) and a 1-step lead, computed per id.
target_groupby = target.groupby('id_qi')
target_l2, target_l1, target_f1 = (target_groupby.shift(steps) for steps in (2, 1, -1))

# Put the four versions side by side and keep only the key as column label.
target_parts = (target_l2, target_l1, target, target_f1)
target_labels = ['onr_l2', 'onr_l1', 'onr', 'onr_f1']
target = pd.concat(target_parts, keys=target_labels, axis=1)
target.columns = target.columns.get_level_values(0)
target

id_qi,2,3,4,5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-01,1.764052,,0.978738,2.240893
2020-03-02,1.867558,,0.950088,-0.151357
2020-03-03,-0.103219,0.410599,,1.454274
2020-03-05,,0.121675,0.443863,0.333674
2020-03-06,1.494079,-0.205158,0.313068,-0.854096


Unnamed: 0_level_0,Unnamed: 1_level_0,onr_l2,onr_l1,onr,onr_f1
date,id_qi,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-01,2,,,1.764052,1.867558
2020-03-01,3,,,,
2020-03-01,4,,,0.978738,0.950088
2020-03-01,5,,,2.240893,-0.151357
2020-03-02,2,,1.764052,1.867558,-0.103219
2020-03-02,3,,,,0.410599
2020-03-02,4,,0.978738,0.950088,
2020-03-02,5,,2.240893,-0.151357,1.454274
2020-03-03,2,1.764052,1.867558,-0.103219,
2020-03-03,3,,,0.410599,0.121675


In [183]:
# Two ways to build lagged / rolling features
features = X.columns


def _add_lag_and_rolling(frame, cols, lag=1, w=10, mp=1):
    """Append the lags of `cols` to `frame`, then the rolling means of `cols`."""
    with_lags = pd.concat((frame, create_lags(frame[cols], lag)), axis=1)
    rolling_means = create_rolling_mean(with_lags[cols], w=w, mp=mp)
    return pd.concat((with_lags, rolling_means), axis=1)


# Option 1: build the features on every available row, regardless of the target
X_option1 = _add_lag_and_rolling(X, features)

# Option 2: keep only rows whose date appears in the target, then build the features
target_dates = target.index.get_level_values(0).unique()
on_target_dates = X.index.get_level_values(0).isin(target_dates)
X_option2 = _add_lag_and_rolling(X[on_target_dates], features)

In [184]:
# Inspect the Option 1 frame: raw features, their 1-row lags (_l1) and rolling means (_w10).
X_option1

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,f1_l1,f2_l1,f1_w10,f2_w10
date,id_qi,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-03-01,2,0.5,1.2,,,0.5,1.2
2020-03-01,3,4.1,0.3,,,4.1,0.3
2020-03-02,2,0.6,1.25,0.5,1.2,0.55,1.225
2020-03-02,4,7.5,10.0,,,7.5,10.0
2020-03-03,2,0.55,1.3,0.6,1.25,0.55,1.25
2020-03-03,3,4.05,0.25,4.1,0.3,4.075,0.275
2020-03-04,2,0.51,1.26,0.55,1.3,0.54,1.2525
2020-03-05,2,0.52,1.27,0.51,1.26,0.536,1.256
2020-03-05,4,7.51,10.1,7.5,10.0,7.505,10.05


In [185]:
# id_qi 4 is observed on two dates only, so its 1-row lag reaches back across the gap between them.
X.xs(4, level='id_qi')

Unnamed: 0_level_0,f1,f2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-02,7.5,10.0
2020-03-05,7.51,10.1


In [186]:
# Align each feature set with the target columns on the shared row index.
data_option1 = pd.concat((X_option1, target), axis=1)
data_option2 = pd.concat((X_option2, target), axis=1)

# Only a cell's last bare expression is rendered, so a standalone
# `data_option1.dropna()` would be computed and thrown away. Show the
# complete-case rows of both options explicitly instead.
display(data_option1.dropna(), data_option2.dropna())

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,f1_l1,f2_l1,f1_w10,f2_w10,onr_l2,onr_l1,onr,onr_f1
date,id_qi,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
