In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [3]:
def generate_embeddings(date_list, ticker_name='t1'):
    """Build a toy embeddings frame with one row per entry of ``date_list``.

    Parameters
    ----------
    date_list : sequence
        Values for the ``date`` column; may contain repeated dates.
    ticker_name : str, default 't1'
        Label repeated on every row of the ``ticker`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``date``, ``f1``, ``f2``, ``ticker`` (in that order), where
        ``f1`` and ``f2`` are standard-normal draws from NumPy's global RNG.
    """
    n_rows = len(date_list)
    columns = {'date': date_list}
    # Draw f1 before f2 so a seeded global RNG yields the same values per column.
    for feature in ('f1', 'f2'):
        columns[feature] = np.random.randn(n_rows)
    columns['ticker'] = [ticker_name for _ in range(n_rows)]
    return pd.DataFrame(columns)

def generate_date_list(start_date=datetime(2018, 1, 1), end_date=datetime(2018, 1, 15), max_duplicates=2):
    """Randomly sample dates between ``start_date`` and ``end_date`` (inclusive).

    Each calendar day gets ``max_duplicates`` independent coin flips taken from
    NumPy's global RNG; every flip below 0.5 adds one copy of that day, so a
    day can appear from zero up to ``max_duplicates`` times.

    Parameters
    ----------
    start_date, end_date : datetime
        Bounds of the range, stepped one day at a time from ``start_date``.
    max_duplicates : int, default 2
        Number of coin flips (and hence maximum copies) per day.

    Returns
    -------
    list of str
        Dates formatted as ``"%d/%m/%Y"``, in chronological order.

    Side effects
    ------------
    Prints the length of the generated list.
    """
    # Number of whole-day steps that still land on or before end_date.
    whole_days = (end_date - start_date).days
    date_list = []
    for offset in range(whole_days + 1):
        label = (start_date + timedelta(days=offset)).strftime("%d/%m/%Y")
        # One RNG call per potential duplicate, in day-major order.
        hits = sum(np.random.random() < 0.5 for _ in range(max_duplicates))
        date_list.extend([label] * hits)
    print(f"Length of the date list: {len(date_list)}")
    return date_list

def create_true_date_list(start_date=datetime(2018, 1, 1), end_date=datetime(2018, 1, 15)):
    """Return every calendar day from ``start_date`` to ``end_date`` inclusive.

    Parameters
    ----------
    start_date, end_date : datetime
        Bounds of the range, stepped one day at a time from ``start_date``.

    Returns
    -------
    list of str
        One ``"%d/%m/%Y"`` string per day, with no gaps and no duplicates.
        Empty when ``end_date`` precedes ``start_date``.

    Side effects
    ------------
    Prints the length of the generated list.
    """
    # Number of one-day steps from start_date that do not overshoot end_date.
    n_steps = (end_date - start_date).days + 1
    true_date_list = [
        (start_date + timedelta(days=step)).strftime("%d/%m/%Y")
        for step in range(n_steps)
    ]
    print(f"Length of the true date list: {len(true_date_list)}")
    return true_date_list

In [4]:
# Rolling-window size in rows; the cells below reindex to one row per calendar
# day before rolling, so this amounts to a 7-day window.
window = 7
# Minimum number of non-NaN rows a window needs to produce a value.
min_periods = 1
# Gap-free list of "%d/%m/%Y" day labels (default range of
# create_true_date_list), used below as the reindexing target.
true_date_list = create_true_date_list()

Length of the true date list: 15


# Rolling mean embedding of one ticker

## First version: mean by day and then mean of the last 7 days

In [5]:
np.random.seed(0)
date_list = generate_date_list()
# Average duplicate rows per date, then expand to the gap-free calendar so that
# missing days appear as NaN rows.
embeddings = (
    generate_embeddings(date_list)
    .drop(columns='ticker')
    .groupby('date')
    .mean()
    .reindex(true_date_list)
)
display(embeddings)
# Rolling mean over the daily means; NaN days are skipped inside each window.
embeddings.rolling(window, min_periods=min_periods).mean()

Length of the date list: 10


Unnamed: 0_level_0,f1,f2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
01/01/2018,,
02/01/2018,,
03/01/2018,0.864436,-0.887786
04/01/2018,-0.742165,-1.980796
05/01/2018,2.269755,-0.347912
06/01/2018,,
07/01/2018,,
08/01/2018,-0.704304,0.69332
09/01/2018,-0.187184,1.20238
10/01/2018,,


Unnamed: 0_level_0,f1,f2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
01/01/2018,,
02/01/2018,,
03/01/2018,0.864436,-0.887786
04/01/2018,0.061136,-1.434291
05/01/2018,0.797342,-1.072165
06/01/2018,0.797342,-1.072165
07/01/2018,0.797342,-1.072165
08/01/2018,0.421931,-0.630794
09/01/2018,0.300108,-0.264159
10/01/2018,0.159026,-0.108252


## Second version: mean of the last 7 days, per ticker (after averaging duplicates by day)

In [27]:
np.random.seed(0)
# Draw the three date lists first and the three feature frames afterwards, so
# the global RNG is consumed in a fixed order.
date_list, date_list_2, date_list_3 = [generate_date_list() for _ in range(3)]
embeddings, embeddings_2, embeddings_3 = [
    generate_embeddings(dates, ticker_name=name)
    for dates, name in zip((date_list, date_list_2, date_list_3), ('t1', 't2', 't3'))
]
# Raw, un-aggregated rows of all tickers; later cells read this frame.
embeddings_inter = pd.concat([embeddings, embeddings_2, embeddings_3])
# Collapse duplicate (ticker, date) rows to their mean.
embeddings = embeddings_inter.groupby(['ticker', 'date']).mean()
def rolling_mean_group(group: pd.DataFrame) -> pd.DataFrame:
    """Rolling mean of one ticker's rows over the gap-free calendar.

    Reads the notebook-level ``true_date_list``, ``window`` and ``min_periods``.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single ticker with a ``date`` column and numeric feature
        columns. A ``ticker`` column is dropped if present. Dates must be
        unique within the group, because ``reindex`` rejects a duplicated
        index.

    Returns
    -------
    pd.DataFrame
        Indexed by every entry of ``true_date_list``; each feature column holds
        the rolling mean over the last ``window`` rows, and is NaN where fewer
        than ``min_periods`` non-NaN rows are available.
    """
    per_day = group.set_index('date').reindex(true_date_list)
    # errors='ignore': groupby.apply does not always hand over the grouping
    # column (include_groups=False on pandas 2.2+, and the default in newer
    # releases), in which case there is no 'ticker' column to drop.
    features = per_day.drop(columns='ticker', errors='ignore')
    return features.rolling(window=window, min_periods=min_periods).mean()
# Per-ticker rolling mean, shown with the date as the outer index level and the
# ticker as the inner one. The date labels are "%d/%m/%Y" strings, so
# sort_index orders them lexicographically.
(
    embeddings
    .reset_index()
    .groupby('ticker')
    .apply(rolling_mean_group)
    .swaplevel()
    .sort_index()
)

Length of the date list: 10
Length of the date list: 17
Length of the date list: 18


Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
01/01/2018,t1,,
01/01/2018,t2,0.208275,1.480515
01/01/2018,t3,0.707055,1.663812
02/01/2018,t1,,
02/01/2018,t2,0.592457,1.674037
02/01/2018,t3,0.689675,0.375495
03/01/2018,t1,1.139401,0.465662
03/01/2018,t2,0.51376,1.418039
03/01/2018,t3,0.399374,0.217181
04/01/2018,t1,-0.047713,-0.535291


In [81]:
# Shuffle the raw rows, sort them back by (ticker, date), and subtract the
# feature values of the original row order position by position; a frame of
# zeros means the round trip restored the original order.
(
    embeddings_inter
    .sample(frac=1)
    .sort_values(by=['ticker', 'date'])
    .drop(columns=['ticker', 'date'])
    .reset_index(drop=True)
    .sub(embeddings_inter.drop(columns=['ticker', 'date']).reset_index(drop=True))
)

Unnamed: 0,f1,f2
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [83]:
# Show the raw concatenated rows under a fresh 0..n-1 index (the concat keeps
# each ticker frame's own row labels, so they repeat across tickers).
embeddings_inter.reset_index(drop=True)

Unnamed: 0,date,f1,f2,ticker
0,03/01/2018,1.764052,0.144044,t1
1,04/01/2018,0.400157,1.454274,t1
2,05/01/2018,0.978738,0.761038,t1
3,08/01/2018,2.240893,0.121675,t1
4,08/01/2018,1.867558,0.443863,t1
5,09/01/2018,-0.977278,0.333674,t1
6,12/01/2018,0.950088,1.494079,t1
7,13/01/2018,-0.151357,-0.205158,t1
8,14/01/2018,-0.103219,0.313068,t1
9,15/01/2018,0.410599,-0.854096,t1


In [56]:
# Shuffle all raw rows, then order them by ticker and date label.
(
    embeddings_inter
    .sample(frac=1)
    .sort_values(by=['ticker', 'date'])
)

Unnamed: 0,date,f1,f2,ticker
0,03/01/2018,1.764052,0.144044,t1
1,04/01/2018,0.400157,1.454274,t1
2,05/01/2018,0.978738,0.761038,t1
3,08/01/2018,2.240893,0.121675,t1
4,08/01/2018,1.867558,0.443863,t1
5,09/01/2018,-0.977278,0.333674,t1
6,12/01/2018,0.950088,1.494079,t1
7,13/01/2018,-0.151357,-0.205158,t1
8,14/01/2018,-0.103219,0.313068,t1
9,15/01/2018,0.410599,-0.854096,t1


In [12]:
# Toy frame to see what sample(frac=1) does: every row is returned once, in a
# shuffled order fixed by random_state, with the original index labels kept.
pd.DataFrame(
    [[1, 3], [2, 4], [3, 6], [6, 8]],
    columns=['a', 'b'],
).sample(frac=1, random_state=42)

Unnamed: 0,a,b
1,2,4
3,6,8
0,1,3
2,3,6
