In [90]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd

In [96]:
# Toy daily dataset: ten consecutive days starting on 2018-02-02, with a
# response column `y`, one regressor `x1` and the observation `date`.
df = pd.DataFrame({
    'y': list(range(1, 11)),
    'x1': [1, 3, 2, 5, 4, 3, 2, 9, 12, 13],
    'date': pd.to_datetime(['2018-02-{:02d}'.format(day) for day in range(2, 12)]),
})

In [760]:
# Add a second regressor that duplicates `x1`, so multi-column feature
# extractors have two inputs to work with. Mutates `df` in place.
df['x2'] = df['x1']

In [761]:
df

Unnamed: 0,y,x1,date,x2
0,9999,9999,2018-02-02,9999
1,2,3,2018-02-03,3
2,3,2,2018-02-04,2
3,100,100,2018-02-05,100
4,5,4,2018-02-06,4
5,6,3,2018-02-07,3
6,7,2,2018-02-08,2
7,8,9,2018-02-09,9
8,9,12,2018-02-10,12
9,10,13,2018-02-11,13


In [133]:
# Lag-2 and lag-1 copies of `y`, indexed by date. The lag-2 series is renamed
# to 'a' so the two columns are distinguishable after concatenation; a bare
# `.rename()` does not yield the 'a' label shown in the recorded outputs.
a = [df.set_index('date')['y'].shift(2).rename('a'), df.set_index('date')['y'].shift(1)]

In [134]:
pd.concat(a, axis=1)

Unnamed: 0_level_0,a,y
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-02-02,,
2018-02-03,,1.0
2018-02-04,1.0,2.0
2018-02-05,2.0,3.0
2018-02-06,3.0,4.0
2018-02-07,4.0,5.0
2018-02-08,5.0,6.0
2018-02-09,6.0,7.0
2018-02-10,7.0,8.0
2018-02-11,8.0,9.0


In [136]:
a[0].name

'a'

In [439]:
def add_lags(series, depth = 5, suffix = '_lag_'):
    """Build a DataFrame of lagged copies of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Named series to lag; its ``name`` is used as the column prefix.
    depth : int or iterable of int, default 5
        An integer ``n`` produces lags ``1..n``. Any iterable (list, tuple,
        range, ...) is taken as the explicit set of lags.
    suffix : str, default '_lag_'
        Text inserted between the series name and the lag number.

    Returns
    -------
    pandas.DataFrame
        One column per lag, named ``<name><suffix><lag>``, on the index of
        ``series``. Has no columns when no lag is requested.
    """
    # Only a bare integer is expanded to 1..depth; every iterable is used as
    # given (previously only ``list`` was recognised, so a tuple or a range
    # fell through to ``depth + 1`` and raised TypeError).
    steps = list(depth) if hasattr(depth, '__iter__') else range(1, depth + 1)

    lags = [series.shift(i).rename(series.name + suffix + str(i)) for i in steps]

    # pd.concat rejects an empty list, so hand back an empty frame instead.
    if not lags:
        return pd.DataFrame(index=series.index)

    return pd.concat(lags, axis=1)

In [706]:
def m_add_lags(df, depth = 5, suffix = '_lag_'):
    """Build lagged copies of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all shifted together.
    depth : int or iterable of int, default 5
        An integer ``n`` produces lags ``1..n``. Any iterable (list, tuple,
        range, ...) is taken as the explicit set of lags.
    suffix : str, default '_lag_'
        Text inserted between each column label and the lag number.

    Returns
    -------
    pandas.DataFrame
        For each lag, one block of columns named ``<column><suffix><lag>``,
        concatenated side by side on the index of ``df``. Has no columns when
        no lag is requested.
    """
    # Only a bare integer is expanded to 1..depth; every iterable is used as
    # given (previously only ``list`` was recognised).
    steps = list(depth) if hasattr(depth, '__iter__') else range(1, depth + 1)

    # add_suffix formats each label, so non-string column labels work too.
    lags = [df.shift(i).add_suffix(suffix + str(i)) for i in steps]

    # pd.concat rejects an empty list, so hand back an empty frame instead.
    if not lags:
        return pd.DataFrame(index=df.index)

    return pd.concat(lags, axis=1)

In [708]:
def mov_avg(series, window=5, suffix = '_mm_'):
    """Return the rolling mean of ``series`` as a one-column DataFrame.

    The column is named ``<series.name><suffix><window>``; the first
    ``window - 1`` rows are NaN because the window is not yet full.
    """
    rolling_mean = series.rolling(window).mean()
    column_label = series.name + suffix + str(window)
    return rolling_mean.rename(column_label).to_frame()

In [501]:
# Scratch check of the window slicing that TimeCV.extract_recursive_features
# performs: keep rows current-max_k .. current (inclusive) of the dataset.
# NOTE(review): relies on a `CV` object created in a cell that appears further
# down, so this cell fails on a fresh kernel run from top to bottom.
max_k = 2
current = 3
to = CV.dataset.set_index('date').iloc[current-max_k:(current+1)]; to

Unnamed: 0_level_0,y,x1
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-02-03,2,3
2018-02-04,3,2
2018-02-05,100,100


In [504]:
# 3-row moving average over the 3-row window `to`: only the last row is non-NaN.
r = mov_avg(to['y'], 3);r 

Unnamed: 0_level_0,y_mm_3
date,Unnamed: 1_level_1
2018-02-03,
2018-02-04,
2018-02-05,35.0


In [511]:
add_lags(to['y'])

Unnamed: 0_level_0,y_lag_1,y_lag_2,y_lag_3,y_lag_4,y_lag_5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-02-03,,,,,
2018-02-04,2.0,,,,
2018-02-05,3.0,2.0,,,


In [512]:
to

Unnamed: 0_level_0,y,x1
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-02-03,2,3
2018-02-04,3,2
2018-02-05,100,100


In [510]:
r.dropna().index

DatetimeIndex(['2018-02-05'], dtype='datetime64[ns]', name='date', freq=None)

---------

In [762]:
class TimeCV(object):
    """Scaffold for time-ordered cross-validation with engineered features.

    Holds a main ``model``, the name of the response column, the full
    ``dataset`` and a working copy (``active_dataset``) in which features that
    depend on the response can be recomputed one observation at a time.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source data; a copy is kept as ``active_dataset``.
    model : estimator
        Object exposing ``fit(X, y)``.
    response_variable : str
        Name of the response column in ``dataset``.
    n_slices : int, default 10
        Stored on the instance; not used by the methods implemented so far.
    ignore_cols : list, optional
        Stored on the instance; not used by the methods implemented so far.
    """

    def __init__(self, dataset, model, response_variable, n_slices = 10, ignore_cols = None):
        self.model = model
        self.response_variable = response_variable
        self.dataset = dataset
        self.active_dataset = dataset.copy()
        self.n_slices = n_slices
        # A fresh list per instance: a mutable default would be shared by all.
        self.ignore_cols = [] if ignore_cols is None else list(ignore_cols)
        self.feature_extractors = []
        self.submodels = []
        # Positional row index treated as "now" by extract_recursive_features.
        self.current_observation = 6

        self.train_split_index = None
        self.test_split_index = None

    def _uses_response(self, apply_at):
        """Tell whether ``apply_at`` (a column name or a list of names) includes the response."""
        # A plain string must be compared for equality: ``in`` on a string is
        # a substring test, so 'y' would wrongly match a column named 'yield'.
        if isinstance(apply_at, str):
            return apply_at == self.response_variable
        return self.response_variable in apply_at

    def add_feature_extractor(self, f, apply_at, max_lags):
        """Register a feature extractor.

        ``f`` is called with ``dataset[apply_at]`` and must return a Series or
        DataFrame aligned on the same index. ``max_lags`` is the number of rows
        before the current observation that ``f`` needs to produce one value.
        """
        self.feature_extractors.append((f, apply_at, max_lags))

    def add_submodel(self, sec_model, response_variable, features, train_at_each_fold = True):
        """Register a secondary model whose predictions become a new column.

        The submodel id is its position in ``self.submodels``; it is used in
        the name of the prediction column.
        """
        model_id = len(self.submodels)
        self.submodels.append((model_id, sec_model, response_variable, features, train_at_each_fold))

    def extract_features(self, predict=False):
        """Run every registered extractor and append its output to ``self.dataset``.

        In prediction mode (``predict=True``) extractors that read the
        response variable are skipped.
        """
        for f, apply_at, max_lags in self.feature_extractors:

            # In prediction mode, features that use y are not computed.
            if predict and self._uses_response(apply_at):
                continue

            result = f(self.dataset[apply_at])
            self.dataset = pd.concat([self.dataset, result], axis=1)

    def extract_recursive_features(self):
        """Recompute response-dependent features for the current observation.

        For each extractor that reads the response variable, the extractor is
        applied to the smallest window of ``active_dataset`` ending at
        ``current_observation`` and the single resulting row is written back
        into ``active_dataset``.

        Raises
        ------
        Exception
            If an extractor yields more than one non-NaN row for the window.
        """
        for f, apply_at, max_k in self.feature_extractors:
            if not self._uses_response(apply_at):
                continue

            # Smallest window needed to compute the next feature value. The
            # start is clamped at 0: a negative start would be read by iloc
            # as an offset from the end and select the wrong rows.
            start = max(self.current_observation - max_k, 0)
            d_current = self.active_dataset.iloc[start:(self.current_observation + 1)]

            # Apply the extractor and keep only the fully populated rows.
            result = f(d_current[apply_at]).dropna()

            if result.shape[0] > 1:
                print(result)
                raise Exception('Feature obteve mais que uma linha')

            # Not enough history yet: nothing to write back.
            if result.shape[0] == 0:
                continue

            # Store the freshly computed feature value(s).
            self.active_dataset.loc[result.index, result.columns] = result.values[0]

    def extract_submodel_predictions(self):
        """Fit each submodel and store its in-sample predictions in ``self.dataset``.

        Only submodels registered with ``train_at_each_fold=True`` are
        processed. Rows with a missing feature or response are excluded from
        fitting and keep NaN in the prediction column, which is named
        ``<response>_pred_<model_id>``.
        """
        for model_id, model, response_variable, features, train_at_each_fold in self.submodels:
            if not train_at_each_fold:
                continue

            feature_cols = list(features)
            df_t = self.dataset[feature_cols + [response_variable]].dropna()
            not_na_indices = df_t.index

            # Fit and predict on the same plain array so the estimator sees a
            # consistent input type in both calls.
            X = df_t[feature_cols].values
            model.fit(X, df_t[response_variable].values)

            # Create the prediction column (the `pd.np` alias no longer exists
            # in current pandas, so a plain float NaN is used).
            pred_col = response_variable + '_pred_' + str(model_id)
            self.dataset[pred_col] = float('nan')

            # Fill in predictions for the rows that were usable.
            self.dataset.loc[not_na_indices, pred_col] = model.predict(X)

    def train(self):
        """Fit the main model on the training part of ``active_dataset``.

        Raises
        ------
        ValueError
            If ``train_split_index`` has not been set.
        """
        if self.train_split_index is None:
            raise ValueError('train_split_index must be set before calling train()')

        # NOTE(review): presumably a boolean row mask; a list of labels would
        # select columns instead -- confirm once the splitter is written.
        train_dataset = self.active_dataset[self.train_split_index]

        # Features are every column except the response.
        self.model.fit(train_dataset.drop(columns=self.response_variable),
                       train_dataset[self.response_variable])

    def predict(self):
        """Not implemented yet."""
        pass

In [798]:
# Build the cross-validation helper on the date-indexed frame; the 4th
# positional argument (3) is n_slices.
# NOTE(review): `m_bla` is defined in the cell that follows this one, so on a
# fresh kernel run top to bottom this cell raises NameError.
model = LinearRegression()
CV = TimeCV(df.set_index('date'), model, 'y', 3)
# Third argument = number of earlier rows the extractor needs for one value.
CV.add_feature_extractor(add_lags, 'x1', 5)
# mov_avg defaults to a 5-row window, i.e. the current row plus 4 earlier ones.
CV.add_feature_extractor(mov_avg, 'y', 4)
CV.add_feature_extractor(m_bla, ['x1','x2'], 4)
#CV.add_feature_extractor(lambda x: add_lags(x,1), 'y', 1)
CV.add_feature_extractor(lambda x: m_add_lags(x, 1), ['y','x2'], 1)

In [797]:
def m_bla(df):
    """Return the row-wise sum of columns ``x1`` and ``x2`` as a Series named ``soma``."""
    first, second = df['x1'], df['x2']
    total = first + second
    return total.rename('soma')

In [799]:
CV.extract_features(predict=False)

In [800]:
CV.dataset

Unnamed: 0_level_0,y,x1,x2,x1_lag_1,x1_lag_2,x1_lag_3,x1_lag_4,x1_lag_5,y_mm_5,soma,y_lag_1,x2_lag_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-02-02,9999,9999,9999,,,,,,,19998,,
2018-02-03,2,3,3,9999.0,,,,,,6,9999.0,9999.0
2018-02-04,3,2,2,3.0,9999.0,,,,,4,2.0,3.0
2018-02-05,100,100,100,2.0,3.0,9999.0,,,,200,3.0,2.0
2018-02-06,5,4,4,100.0,2.0,3.0,9999.0,,2021.8,8,100.0,100.0
2018-02-07,6,3,3,4.0,100.0,2.0,3.0,9999.0,23.2,6,5.0,4.0
2018-02-08,7,2,2,3.0,4.0,100.0,2.0,3.0,24.2,4,6.0,3.0
2018-02-09,8,9,9,2.0,3.0,4.0,100.0,2.0,25.2,18,7.0,2.0
2018-02-10,9,12,12,9.0,2.0,3.0,4.0,100.0,7.0,24,8.0,9.0
2018-02-11,10,13,13,12.0,9.0,2.0,3.0,4.0,8.0,26,9.0,12.0


In [774]:
CV.current_observation

6

In [775]:
# Refresh the working copy from the feature-enriched dataset. A copy is taken
# (as TimeCV.__init__ does) so that the manual edits made to active_dataset in
# the following cells do not also overwrite CV.dataset.
CV.active_dataset = CV.dataset.copy()

In [785]:
# Overwrite a few raw values by hand before calling
# CV.extract_recursive_features(), so the recomputed features for the current
# observation can be compared against the previous table.
CV.active_dataset.loc['2018-02-08','y'] = 1000
CV.active_dataset.loc['2018-02-07','y'] = 1000
CV.active_dataset.loc['2018-02-07','x2'] = 321321321

In [786]:
CV.extract_recursive_features()

In [787]:
CV.active_dataset

Unnamed: 0_level_0,y,x1,x2,x1_lag_1,x1_lag_2,x1_lag_3,x1_lag_4,x1_lag_5,y_mm_5,y_lag_1,x2_lag_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-02-02,9999,9999,9999,,,,,,,,
2018-02-03,2,3,3,9999.0,,,,,,9999.0,9999.0
2018-02-04,3,2,2,3.0,9999.0,,,,,2.0,3.0
2018-02-05,100,100,100,2.0,3.0,9999.0,,,,3.0,2.0
2018-02-06,5,4,4,100.0,2.0,3.0,9999.0,,2021.8,100.0,100.0
2018-02-07,1000,3,321321321,4.0,100.0,2.0,3.0,9999.0,23.2,5.0,4.0
2018-02-08,1000,2,2,3.0,4.0,100.0,2.0,3.0,421.6,1000.0,321321321.0
2018-02-09,8,9,9,2.0,3.0,4.0,100.0,2.0,25.2,7.0,2.0
2018-02-10,9,12,12,9.0,2.0,3.0,4.0,100.0,7.0,8.0,9.0
2018-02-11,10,13,13,12.0,9.0,2.0,3.0,4.0,8.0,9.0,12.0


In [633]:
(1000+6+5+100+3)/5

222.8

In [581]:
CV.active_dataset.loc[r]

ValueError: Cannot index with multidimensional key

In [394]:
CV.extract_features()

In [395]:
# Register a submodel that predicts y from its first two lags.
# NOTE(review): `model` looks like the same estimator object that was passed
# to TimeCV as the main model; fitting the submodel would then refit the main
# model too -- confirm, and pass a separate estimator instance if so.
CV.add_submodel(model, 'y', ['y_lag_1', 'y_lag_2'])

In [396]:
CV.extract_submodel_predictions()

In [397]:
CV.dataset

Unnamed: 0,y,x1,date,y_lag_1,y_lag_2,y_lag_3,y_lag_4,y_lag_5,x1_lag_1,x1_lag_2,x1_lag_3,x1_lag_4,x1_lag_5,y_pred_0
0,1,1,2018-02-02,,,,,,,,,,,
1,2,3,2018-02-03,1.0,,,,,1.0,,,,,
2,3,2,2018-02-04,2.0,1.0,,,,3.0,1.0,,,,3.0
3,4,5,2018-02-05,3.0,2.0,1.0,,,2.0,3.0,1.0,,,4.0
4,5,4,2018-02-06,4.0,3.0,2.0,1.0,,5.0,2.0,3.0,1.0,,5.0
5,6,3,2018-02-07,5.0,4.0,3.0,2.0,1.0,4.0,5.0,2.0,3.0,1.0,6.0
6,7,2,2018-02-08,6.0,5.0,4.0,3.0,2.0,3.0,4.0,5.0,2.0,3.0,7.0
7,8,9,2018-02-09,7.0,6.0,5.0,4.0,3.0,2.0,3.0,4.0,5.0,2.0,8.0
8,9,12,2018-02-10,8.0,7.0,6.0,5.0,4.0,9.0,2.0,3.0,4.0,5.0,9.0
9,10,13,2018-02-11,9.0,8.0,7.0,6.0,5.0,12.0,9.0,2.0,3.0,4.0,10.0


In [335]:
r

0     3.0
1     4.0
2     5.0
3     6.0
4     7.0
5     8.0
6     9.0
7    10.0
Name: y_pred_0, dtype: float64

In [336]:
CV.dataset

Unnamed: 0,y,x1,date,y_lag_1,y_lag_2,y_lag_3,y_lag_4,y_lag_5,x1_lag_1,x1_lag_2,x1_lag_3,x1_lag_4,x1_lag_5
0,1,1,2018-02-02,,,,,,,,,,
1,2,3,2018-02-03,1.0,,,,,1.0,,,,
2,3,2,2018-02-04,2.0,1.0,,,,3.0,1.0,,,
3,4,5,2018-02-05,3.0,2.0,1.0,,,2.0,3.0,1.0,,
4,5,4,2018-02-06,4.0,3.0,2.0,1.0,,5.0,2.0,3.0,1.0,
5,6,3,2018-02-07,5.0,4.0,3.0,2.0,1.0,4.0,5.0,2.0,3.0,1.0
6,7,2,2018-02-08,6.0,5.0,4.0,3.0,2.0,3.0,4.0,5.0,2.0,3.0
7,8,9,2018-02-09,7.0,6.0,5.0,4.0,3.0,2.0,3.0,4.0,5.0,2.0
8,9,12,2018-02-10,8.0,7.0,6.0,5.0,4.0,9.0,2.0,3.0,4.0,5.0
9,10,13,2018-02-11,9.0,8.0,7.0,6.0,5.0,12.0,9.0,2.0,3.0,4.0


In [325]:
pd.concat([r.reset_index(), CV.dataset], axis=1)

Unnamed: 0,index,y_pred_0,y,x1,date,y_lag_1,y_lag_2,y_lag_3,y_lag_4,y_lag_5,x1_lag_1,x1_lag_2,x1_lag_3,x1_lag_4,x1_lag_5
0,0,,1,1,2018-02-02,,,,,,,,,,
1,1,,2,3,2018-02-03,1.0,,,,,1.0,,,,
2,0,3.0,3,2,2018-02-04,2.0,1.0,,,,3.0,1.0,,,
3,1,4.0,4,5,2018-02-05,3.0,2.0,1.0,,,2.0,3.0,1.0,,
4,2,5.0,5,4,2018-02-06,4.0,3.0,2.0,1.0,,5.0,2.0,3.0,1.0,
5,3,6.0,6,3,2018-02-07,5.0,4.0,3.0,2.0,1.0,4.0,5.0,2.0,3.0,1.0
6,4,7.0,7,2,2018-02-08,6.0,5.0,4.0,3.0,2.0,3.0,4.0,5.0,2.0,3.0
7,5,8.0,8,9,2018-02-09,7.0,6.0,5.0,4.0,3.0,2.0,3.0,4.0,5.0,2.0
8,6,9.0,9,12,2018-02-10,8.0,7.0,6.0,5.0,4.0,9.0,2.0,3.0,4.0,5.0
9,7,10.0,10,13,2018-02-11,9.0,8.0,7.0,6.0,5.0,12.0,9.0,2.0,3.0,4.0


In [266]:
CV.dataset

Unnamed: 0,y,x1,date,y_lag_1,y_lag_2,y_lag_3,y_lag_4,y_lag_5,x1_lag_1,x1_lag_2,x1_lag_3,x1_lag_4,x1_lag_5,y_pred_0
0,1,1,2018-02-02,,,,,,,,,,,3.0
1,2,3,2018-02-03,1.0,,,,,1.0,,,,,4.0
2,3,2,2018-02-04,2.0,1.0,,,,3.0,1.0,,,,5.0
3,4,5,2018-02-05,3.0,2.0,1.0,,,2.0,3.0,1.0,,,6.0
4,5,4,2018-02-06,4.0,3.0,2.0,1.0,,5.0,2.0,3.0,1.0,,7.0
5,6,3,2018-02-07,5.0,4.0,3.0,2.0,1.0,4.0,5.0,2.0,3.0,1.0,8.0
6,7,2,2018-02-08,6.0,5.0,4.0,3.0,2.0,3.0,4.0,5.0,2.0,3.0,9.0
7,8,9,2018-02-09,7.0,6.0,5.0,4.0,3.0,2.0,3.0,4.0,5.0,2.0,10.0
8,9,12,2018-02-10,8.0,7.0,6.0,5.0,4.0,9.0,2.0,3.0,4.0,5.0,
9,10,13,2018-02-11,9.0,8.0,7.0,6.0,5.0,12.0,9.0,2.0,3.0,4.0,


In [184]:
CV.feature_extractors

[(<function __main__.add_lags(series, depth=5, suffix='_lag_')>, 'y', False)]

In [163]:
CV.feature_extractors[0](df['y'])

Unnamed: 0,y_lag_1,y_lag_2,y_lag_3,y_lag_4,y_lag_5
0,,,,,
1,1.0,,,,
2,2.0,1.0,,,
3,3.0,2.0,1.0,,
4,4.0,3.0,2.0,1.0,
5,5.0,4.0,3.0,2.0,1.0
6,6.0,5.0,4.0,3.0,2.0
7,7.0,6.0,5.0,4.0,3.0
8,8.0,7.0,6.0,5.0,4.0
9,9.0,8.0,7.0,6.0,5.0


In [100]:
CV.dataset

Unnamed: 0,y,x1,date
0,1,1,2018-02-02
1,2,3,2018-02-03
2,3,2,2018-02-04
3,4,5,2018-02-05
4,5,4,2018-02-06
5,6,3,2018-02-07
6,7,2,2018-02-08
7,8,9,2018-02-09
8,9,12,2018-02-10
9,10,13,2018-02-11
