In [7]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [63]:
def generate_embeddings(date_list, ticker_name='t1'):
    """Build a toy embeddings frame with one row per entry of ``date_list``.

    The returned DataFrame has four columns, in this order: ``date`` (the
    entries of ``date_list``), ``f1`` and ``f2`` (standard-normal draws taken
    from NumPy's global random state) and ``ticker`` (``ticker_name`` repeated
    on every row).
    """
    n_rows = len(date_list)
    # f1 is drawn before f2; under a fixed seed this order determines the values.
    feature_1 = np.random.randn(n_rows)
    feature_2 = np.random.randn(n_rows)
    columns = {
        'date': date_list,
        'f1': feature_1,
        'f2': feature_2,
        'ticker': [ticker_name for _ in range(n_rows)],
    }
    return pd.DataFrame(columns)

In [58]:
# Size of the rolling window (number of rows) passed to DataFrame.rolling in the
# cells below; the frames are reindexed to one row per calendar day first.
window = 7
# Minimum number of non-NaN observations a window needs to produce a value.
min_periods = 1

In [64]:
np.random.seed(0)
start_date = datetime(2018, 1, 1)
end_date = datetime(2018, 1, 15)

# true_date_list: every calendar day in [start_date, end_date], formatted dd/mm/YYYY.
# date_list: a random sub-sample of those days in which a day may be absent,
# present once, or present twice.
date_list = []
true_date_list = []

current_date = start_date
while current_date <= end_date:
    day_label = current_date.strftime("%d/%m/%Y")
    true_date_list.append(day_label)
    # Two independent coin flips per day; each success adds one row for that day.
    date_list.extend(day_label for _ in range(2) if np.random.random() < 0.5)
    current_date += timedelta(days=1)

print(len(date_list), len(true_date_list))

10 15


In [60]:
# Inspect the sampled dates: some calendar days are absent and a day can appear twice.
date_list

['03/01/2018',
 '04/01/2018',
 '05/01/2018',
 '08/01/2018',
 '08/01/2018',
 '09/01/2018',
 '12/01/2018',
 '13/01/2018',
 '14/01/2018',
 '15/01/2018']

# Rolling mean embedding of one ticker

## First version: mean by day and then mean of the last 7 days

In [66]:
np.random.seed(0)

# Average the rows that share a date, then align the daily means to the full
# calendar so that days without any row become NaN rows.
embeddings = (
    generate_embeddings(date_list)
    .drop(columns='ticker')
    .groupby('date')
    .mean()
    .reindex(true_date_list)
)
display(embeddings)
# Rolling mean over `window` consecutive calendar rows; NaN days are skipped and
# a value is produced once `min_periods` non-NaN days are inside the window.
embeddings.rolling(window=window, min_periods=min_periods).mean()

Unnamed: 0_level_0,f1,f2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
01/01/2018,,
02/01/2018,,
03/01/2018,1.764052,0.144044
04/01/2018,0.400157,1.454274
05/01/2018,0.978738,0.761038
06/01/2018,,
07/01/2018,,
08/01/2018,2.054226,0.282769
09/01/2018,-0.977278,0.333674
10/01/2018,,


Unnamed: 0_level_0,f1,f2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
01/01/2018,,
02/01/2018,,
03/01/2018,1.764052,0.144044
04/01/2018,1.082105,0.799159
05/01/2018,1.047649,0.786452
06/01/2018,1.047649,0.786452
07/01/2018,1.047649,0.786452
08/01/2018,1.299293,0.660531
09/01/2018,0.843979,0.59516
10/01/2018,0.613961,0.707939


## Second version: mean of the last 7 days, computed per ticker

In [47]:
# Sanity check: is the f2 feature column stored as float64?
embeddings['f2'].dtype=='float64'

True

In [67]:
np.random.seed(0)

# Three synthetic tickers over the same sampled dates. They are generated in
# t1, t2, t3 order, which fixes the random values each frame receives.
embeddings = generate_embeddings(date_list)
embeddings_2 = generate_embeddings(date_list, ticker_name='t2')
embeddings_3 = generate_embeddings(date_list, ticker_name='t3')

# Keep the raw stacked rows under their own name for the ordering checks below.
embeddings_inter = pd.concat([embeddings, embeddings_2, embeddings_3])
display(embeddings_inter)
# Row order can be normalised with embeddings_inter.sort_values(['ticker', 'date']).
# NOTE(review): rows are first averaged per (ticker, date), exactly like the
# daily mean of the first version -- confirm that this, rather than a mean over
# all raw rows inside the window, is what is intended.
embeddings = embeddings_inter.groupby(['ticker', 'date']).mean()
def rolling_mean_group(group: pd.DataFrame, dates=None, window_size=None, min_obs=None) -> pd.DataFrame:
    """Rolling mean of one ticker's rows, aligned to a full calendar.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single ticker. Must contain a ``date`` column whose values are
        unique (``reindex`` rejects duplicate labels) plus the numeric feature
        columns. A ``ticker`` column is dropped when present.
    dates : sequence, optional
        Labels to reindex onto, one per calendar day. Defaults to the
        notebook-level ``true_date_list``.
    window_size : int, optional
        Rolling-window length in rows. Defaults to the notebook-level ``window``.
    min_obs : int, optional
        Minimum non-NaN observations per window. Defaults to the notebook-level
        ``min_periods``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``dates``, holding the rolling mean of every remaining column.
    """
    # Fall back to the notebook globals only when no explicit value is given, so
    # the function can also be called (and tested) without any kernel state.
    if dates is None:
        dates = true_date_list
    if window_size is None:
        window_size = window
    if min_obs is None:
        min_obs = min_periods

    calendar_frame = (
        group.set_index('date')
        .reindex(dates)
        # errors='ignore': the grouping column is not always handed to functions
        # called through groupby.apply, so its absence must not raise.
        .drop(columns='ticker', errors='ignore')
    )
    return calendar_frame.rolling(window=window_size, min_periods=min_obs).mean()
# Turn the (ticker, date) index back into columns, then compute the calendar-aligned
# rolling mean separately for every ticker.
embeddings.reset_index(drop=False).groupby('ticker').apply(rolling_mean_group)

Unnamed: 0,date,f1,f2,ticker
0,03/01/2018,1.764052,0.144044,t1
1,04/01/2018,0.400157,1.454274,t1
2,05/01/2018,0.978738,0.761038,t1
3,08/01/2018,2.240893,0.121675,t1
4,08/01/2018,1.867558,0.443863,t1
5,09/01/2018,-0.977278,0.333674,t1
6,12/01/2018,0.950088,1.494079,t1
7,13/01/2018,-0.151357,-0.205158,t1
8,14/01/2018,-0.103219,0.313068,t1
9,15/01/2018,0.410599,-0.854096,t1


Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1
t1,01/01/2018,,
t1,02/01/2018,,
t1,03/01/2018,1.764052,0.144044
t1,04/01/2018,1.082105,0.799159
t1,05/01/2018,1.047649,0.786452
t1,06/01/2018,1.047649,0.786452
t1,07/01/2018,1.047649,0.786452
t1,08/01/2018,1.299293,0.660531
t1,09/01/2018,0.843979,0.59516
t1,10/01/2018,0.613961,0.707939


In [81]:
# Check that sorting by (ticker, date) restores the original row order after a shuffle.
# The shuffle is seeded so the cell is reproducible, and each row carries its
# original position as the last sort key: without it, rows sharing a
# (ticker, date) pair may come back in a different order than they started in.
inter_original = embeddings_inter.reset_index(drop=True)
inter_resorted = (
    inter_original.assign(_row=range(len(inter_original)))
    .sample(frac=1, random_state=0)
    .sort_values(['ticker', 'date', '_row'])
    .drop(columns='_row')
    .reset_index(drop=True)
)
# All-zero differences mean the original frame was already ordered by (ticker, date).
inter_resorted.drop(['ticker', 'date'], axis=1) - inter_original.drop(['ticker', 'date'], axis=1)

Unnamed: 0,f1,f2
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [83]:
# Show the stacked per-ticker rows with a fresh 0..n-1 index (pd.concat kept
# each frame's own row labels).
embeddings_inter.reset_index(drop=True)

Unnamed: 0,date,f1,f2,ticker
0,03/01/2018,1.764052,0.144044,t1
1,04/01/2018,0.400157,1.454274,t1
2,05/01/2018,0.978738,0.761038,t1
3,08/01/2018,2.240893,0.121675,t1
4,08/01/2018,1.867558,0.443863,t1
5,09/01/2018,-0.977278,0.333674,t1
6,12/01/2018,0.950088,1.494079,t1
7,13/01/2018,-0.151357,-0.205158,t1
8,14/01/2018,-0.103219,0.313068,t1
9,15/01/2018,0.410599,-0.854096,t1


In [56]:
# Shuffle every row, then sort by (ticker, date) to inspect the recovered order.
# NOTE(review): the shuffle is unseeded, so rows sharing a (ticker, date) pair
# may be listed in a different order from one run to the next.
embeddings_inter.sample(frac=1).sort_values(['ticker', 'date'])

Unnamed: 0,date,f1,f2,ticker
0,03/01/2018,1.764052,0.144044,t1
1,04/01/2018,0.400157,1.454274,t1
2,05/01/2018,0.978738,0.761038,t1
3,08/01/2018,2.240893,0.121675,t1
4,08/01/2018,1.867558,0.443863,t1
5,09/01/2018,-0.977278,0.333674,t1
6,12/01/2018,0.950088,1.494079,t1
7,13/01/2018,-0.151357,-0.205158,t1
8,14/01/2018,-0.103219,0.313068,t1
9,15/01/2018,0.410599,-0.854096,t1


In [31]:
# Full calendar: every day from start_date to end_date as "dd/mm/YYYY" strings.
true_date_list

['01/01/2018',
 '02/01/2018',
 '03/01/2018',
 '04/01/2018',
 '05/01/2018',
 '06/01/2018',
 '07/01/2018',
 '08/01/2018',
 '09/01/2018',
 '10/01/2018',
 '11/01/2018',
 '12/01/2018',
 '13/01/2018',
 '14/01/2018',
 '15/01/2018']