# Vocabulary

+ `embeddings1d`: embeddings of the tweets for one day. Some tickers have multiple tweets, some tickers have no tweets.
+ `embeddings1d_da`: daily average of the embeddings per ticker. Each ticker present that day is represented by a unique embedding.
+ `embeddings_da`: the `embeddings1d_da` of every day stacked into a single matrix. Its index is a MultiIndex with levels (clean_id_qis, date), i.e. the level order is swapped relative to `embeddings`. Each ticker is represented by a single embedding for each day on which it is present.
+ `embeddings`: final matrix of embeddings whose index is a MultiIndex with levels (date, clean_id_qis). Each ticker is represented by a single embedding for each day. If a ticker is not mentioned during a day, then it does not have any embedding for that day.

In [43]:
import numpy as np
import pandas as pd
from datetime import datetime
from typing import List
import os

In [90]:
def get_embeddings1d_da(embeddings1d: pd.DataFrame, weights_col='log_followers') -> pd.DataFrame:
    """Average one day's tweet embeddings per ticker, optionally weighted.

    Parameters
    ----------
    embeddings1d : pd.DataFrame
        One row per tweet. Must contain a ``date`` column holding exactly one
        unique value, a ``clean_id_qis`` column and, when weighting, the
        ``weights_col`` column. Every other column is treated as a feature
        and must be numeric. The input is not modified.
    weights_col : str or None
        Name of the column holding the per-tweet weights. ``None`` or ``''``
        gives every tweet a weight of 1.0 (plain mean).

    Returns
    -------
    pd.DataFrame
        One row per ``clean_id_qis`` (sorted by ticker), default integer
        index, columns ``clean_id_qis``, the feature columns in their input
        order, then ``date``.

    Raises
    ------
    AssertionError
        If the ``date`` column does not hold exactly one unique value.
    """
    unique_dates = embeddings1d['date'].unique()
    assert len(unique_dates) == 1, 'The dataframe must represent one day only!'
    date = unique_dates[0]

    # Work on a re-indexed copy: the caller's frame stays untouched and the
    # row-wise alignment below is safe even if the input index has duplicates
    # (e.g. a frame produced by pd.concat).
    frame = embeddings1d.reset_index(drop=True)
    tickers = frame['clean_id_qis']
    non_feature_cols = ['date', 'clean_id_qis']
    if weights_col:
        weights = frame[weights_col]
        non_feature_cols.append(weights_col)
    else:
        # Unit weights are kept outside the frame so that a feature column
        # that happens to be called 'weights' is not overwritten.
        weights = pd.Series(1.0, index=frame.index)
    features = frame.drop(non_feature_cols, axis=1)

    # Weighted mean = sum(w * x) / sum(w), computed per ticker. Only feature
    # columns are multiplied, so the string ticker column is never involved.
    weighted_sum = features.multiply(weights, axis=0).groupby(tickers).sum()
    weight_total = weights.groupby(tickers).sum()
    embeddings1d_da = weighted_sum.divide(weight_total, axis=0)

    embeddings1d_da['date'] = date
    return embeddings1d_da.reset_index(drop=False)

def get_embeddings_da(embeddings1d_das: List[pd.DataFrame]):
    """Stack the per-day averaged embeddings into one frame.

    The frames are concatenated row-wise, indexed by
    (``clean_id_qis``, ``date``) and sorted on those two levels.
    """
    index_levels = ['clean_id_qis', 'date']
    stacked = pd.concat(embeddings1d_das, axis=0)
    indexed = stacked.set_index(index_levels)
    return indexed.sort_index(level=index_levels)

def get_embeddings1d_das(pq_files: List[str], weights_col='log_followers'):
    """Build the per-ticker daily average of the embeddings for each file.

    For every metadata parquet file, the matching feature file is loaded
    with ``load_embeddings``, both are joined column-wise, and the result is
    reduced with ``get_embeddings1d_da``.

    Parameters
    ----------
    pq_files : List[str]
        Paths of the metadata parquet files. Each must contain the ``date``
        and ``clean_id_qis`` columns and, when weighting, ``weights_col``.
    weights_col : str or None
        Column of the metadata file used as per-tweet weight. ``None`` or
        ``''`` requests an unweighted average; no weight column is read or
        required in that case.

    Returns
    -------
    list of pd.DataFrame
        One averaged frame per input file, in the order of ``pq_files``.
    """
    # Keep only the columns the averaging step needs. The weight column is
    # taken from `weights_col` rather than hard-coded, so that any weight
    # column can be used and none leaks in as a feature when unweighted.
    meta_cols = ['date', 'clean_id_qis']
    if weights_col:
        meta_cols.append(weights_col)

    embeddings1d_das = []
    for pq_file in pq_files:
        meta = pd.read_parquet(pq_file)[meta_cols]
        features = load_embeddings(pq_file)
        # NOTE(review): the column-wise concat aligns rows on the index, so
        # it assumes the metadata and feature files share the same index.
        # TODO: verify the alignment (e.g. when the tweet body is identical).
        embeddings1d = pd.concat([meta, features], axis=1)
        embeddings1d_das.append(get_embeddings1d_da(embeddings1d, weights_col=weights_col))
    return embeddings1d_das

def load_embeddings(pq_file: str) -> pd.DataFrame:
    """Load the feature parquet file paired with a metadata parquet file.

    The feature file name is obtained by replacing a leading ``file`` in
    ``pq_file`` with ``features`` (``file_3.pq`` -> ``features_3.pq``). If
    ``pq_file`` does not start with ``file``, ``features`` is simply
    prepended to it.

    Parameters
    ----------
    pq_file : str
        Name of the metadata parquet file.

    Returns
    -------
    pd.DataFrame
        Content of the matching feature file, as read by ``pd.read_parquet``.
    """
    prefix = 'file'
    # str.lstrip('file') would strip any leading run of the characters
    # f/i/l/e (e.g. 'filelife_0.pq' -> '_0.pq'), not the literal prefix, so
    # the prefix is removed explicitly instead.
    suffix = pq_file[len(prefix):] if pq_file.startswith(prefix) else pq_file
    return pd.read_parquet(f"features{suffix}")


def get_embeddings(embeddings_da: pd.DataFrame, window='7D', min_periods=1):
    """Smooth the daily embeddings of each ticker with a rolling mean.

    Parameters
    ----------
    embeddings_da : pd.DataFrame
        Frame indexed by (``clean_id_qis``, ``date``).
    window
        Rolling window passed to ``rolling`` (default ``'7D'``); it is
        applied along the ``date`` index within each ticker.
    min_periods : int
        Minimum number of observations in the window to produce a value.

    Returns
    -------
    pd.DataFrame
        Rolling means indexed by (``date``, ``clean_id_qis``), sorted by date.
    """
    ordered = embeddings_da.sort_index(level=['clean_id_qis', 'date'])
    # Move the ticker out of the index so that `date` alone drives the window.
    dated = ordered.reset_index(level='clean_id_qis', drop=False)
    per_ticker = dated.groupby('clean_id_qis')
    smoothed = per_ticker.rolling(window, min_periods=min_periods).mean()
    # The grouped result is indexed (clean_id_qis, date); put the date first.
    date_first = smoothed.swaplevel(i='clean_id_qis', j='date')
    return date_first.sort_index(level='date')

In [62]:
def generate_emdeddings1d(date, features=['f1', 'f2'], clean_id_qis=['t1', 't2', 't3'], size=7, seed=0):
    """Generate one day of fake tweet embeddings.

    Re-seeds NumPy's global random generator with ``seed``, draws ``size``
    tickers from ``clean_id_qis`` and then, for each name in ``features``
    (in order), ``size`` absolute values of standard normal draws rounded
    to two decimals.

    Returns a DataFrame with columns ``date``, ``clean_id_qis`` and one
    column per feature.
    """
    np.random.seed(seed)
    # Draw order matters for reproducibility: tickers first, then features.
    columns = {
        'date': date,
        'clean_id_qis': np.random.choice(clean_id_qis, size=size),
    }
    for feature in features:
        draws = np.round(np.random.randn(size), 2)
        columns[feature] = np.abs(draws)
    return pd.DataFrame(columns)

def generate_embeddings1d_das(dates: List[datetime], features=['f1', 'f2', 'log_followers'], clean_id_qis=['t1', 't2', 't3'], weights_col='log_followers', seed=0):
    """Generate fake raw and daily-averaged embeddings for several days.

    A per-day seed and a per-day number of tweets (1 to 6) are drawn after
    seeding NumPy's global generator with ``seed``. For each date a subset
    of ``clean_id_qis`` is drawn, fake tweets are generated with
    ``generate_emdeddings1d`` and averaged with ``get_embeddings1d_da``.

    Returns
    -------
    tuple of (list of pd.DataFrame, list of pd.DataFrame)
        The averaged frames and the raw frames, both in the order of
        ``dates``.
    """
    np.random.seed(seed)
    n_days = len(dates)
    day_seeds = np.random.randint(0, int(1e5), size=n_days)
    day_sizes = np.random.randint(1, 7, size=n_days)
    n_draws = len(clean_id_qis) + 1

    averaged_frames, raw_frames = [], []
    for date, day_seed, day_size in zip(dates, day_seeds, day_sizes):
        # Tickers present that day: drawn with replacement then de-duplicated,
        # so some tickers can be absent. This draw uses the global generator
        # and must happen before the per-day generation call below.
        present = np.unique(np.random.choice(clean_id_qis, size=n_draws, replace=True))
        raw = generate_emdeddings1d(date, features, present, day_size, day_seed)
        averaged_frames.append(get_embeddings1d_da(raw, weights_col))
        raw_frames.append(raw)
    return averaged_frames, raw_frames

# From `embeddings1d` to `embeddings1d_da`

In [60]:
# Build one day of fake tweets with a raw `followers` column and two features.
date = datetime(2018, 1, 1)
embeddings1d = generate_emdeddings1d(date, features=['followers', 'f1', 'f2'])
# Derive the weight column as 1 + log(1 + followers), rounded to 2 decimals,
# then drop the raw follower count so that only features and weights remain.
embeddings1d['log_followers'] = np.round(1 + np.log(1 + embeddings1d['followers'].fillna(0)), 2)
embeddings1d.drop('followers', axis=1, inplace=True)
print(f"This is how embeddings1d should look like before passing to get_embeddings1d_da:")
display(embeddings1d)
print(f"This is how embeddings1d should look like before passing to get_embeddings1d_da(, weights_col=None):\nMake sure you delete any undesirable column!!!")
display(embeddings1d.drop('log_followers', axis=1))
# Unweighted average: the weight column is removed first, otherwise it would
# be averaged like any other feature.
embeddings1d_da_unweighted = get_embeddings1d_da(embeddings1d.drop('log_followers', axis=1), None)
# Weighted average using `log_followers` as the per-tweet weight.
embeddings1d_da = get_embeddings1d_da(embeddings1d, 'log_followers')

This is how embeddings1d should look like before passing to get_embeddings1d_da:


Unnamed: 0,date,clean_id_qis,f1,f2,log_followers
0,2018-01-01,t1,0.36,2.01,1.25
1,2018-01-01,t2,1.22,0.23,1.12
2,2018-01-01,t1,1.34,0.6,1.16
3,2018-01-01,t2,0.43,1.63,2.02
4,2018-01-01,t2,0.12,1.59,1.09
5,2018-01-01,t3,1.41,0.23,1.86
6,2018-01-01,t1,0.12,0.06,1.76


This is how embeddings1d should look like before passing to get_embeddings1d_da(, weights_col=None):
Make sure you delete any undesirable column!!!


Unnamed: 0,date,clean_id_qis,f1,f2
0,2018-01-01,t1,0.36,2.01
1,2018-01-01,t2,1.22,0.23
2,2018-01-01,t1,1.34,0.6
3,2018-01-01,t2,0.43,1.63
4,2018-01-01,t2,0.12,1.59
5,2018-01-01,t3,1.41,0.23
6,2018-01-01,t1,0.12,0.06


In [31]:
print(f"Output of get_embeddings1d_da:")
embeddings1d_da

Output of get_embeddings1d_da:


Unnamed: 0,clean_id_qis,f1,f2,date
0,t1,0.531319,0.794748,2018-01-01
1,t2,0.559291,1.249007,2018-01-01
2,t3,1.41,0.23,2018-01-01


In [32]:
print(f"Output of get_embeddings1d_da_unweighted:")
embeddings1d_da_unweighted

Output of get_embeddings1d_da_unweighted:


Unnamed: 0,clean_id_qis,f1,f2,date
0,t1,0.606667,0.89,2018-01-01
1,t2,0.59,1.15,2018-01-01
2,t3,1.41,0.23,2018-01-01


# From `embeddings1d_da` to `embeddings_da`

## Code

In [33]:
# Six non-consecutive days, so that some days fall outside a 7-day window.
dates = [datetime(2018, 1, 1), datetime(2018, 1, 2), datetime(2018, 1, 5), datetime(2018, 1, 7), datetime(2018, 1, 8), datetime(2018, 1, 13)]
# Default arguments: weighted by the generated `log_followers` column.
embeddings1d_das, embeddings1ds = generate_embeddings1d_das(dates)

In [34]:
# Notice that the only columns are the relevant features (no meta-features), the date and the clean_id_qis
print(f"This is how embeddings1d_das should look like before passing to get_embeddings_da:")
embeddings1d_das

This is how embeddings1d_das should look like before passing to get_embeddings_da:


[  clean_id_qis    f1    f2       date
 0           t2  1.41  0.86 2018-01-01
 1           t3  1.69  0.82 2018-01-01,
   clean_id_qis        f1        f2       date
 0           t3  0.753251  0.618587 2018-01-02,
   clean_id_qis        f1        f2       date
 0           t1  1.221971  0.671971 2018-01-05
 1           t3  0.786667  0.218288 2018-01-05,
   clean_id_qis    f1    f2       date
 0           t1  0.69  0.89 2018-01-07
 1           t2  0.28  0.18 2018-01-07
 2           t3  0.01  0.27 2018-01-07,
   clean_id_qis        f1        f2       date
 0           t1  0.250000  1.020000 2018-01-08
 1           t2  0.199405  0.464762 2018-01-08
 2           t3  0.224340  0.854864 2018-01-08,
   clean_id_qis    f1    f2       date
 0           t1  0.51  1.79 2018-01-13]

In [35]:
# Each DataFrame only has the default integer index (clean_id_qis and date are regular columns)
embeddings1d_das[0]

Unnamed: 0,clean_id_qis,f1,f2,date
0,t2,1.41,0.86,2018-01-01
1,t3,1.69,0.82,2018-01-01


In [36]:
embeddings_da = get_embeddings_da(embeddings1d_das)
embeddings_da

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
clean_id_qis,date,Unnamed: 2_level_1,Unnamed: 3_level_1
t1,2018-01-05,1.221971,0.671971
t1,2018-01-07,0.69,0.89
t1,2018-01-08,0.25,1.02
t1,2018-01-13,0.51,1.79
t2,2018-01-01,1.41,0.86
t2,2018-01-07,0.28,0.18
t2,2018-01-08,0.199405,0.464762
t3,2018-01-01,1.69,0.82
t3,2018-01-02,0.753251,0.618587
t3,2018-01-05,0.786667,0.218288


## A few tests

In [37]:
# Do not include log_followers since weights_col is None
embeddings1d_das, embeddings1ds = generate_embeddings1d_das(dates, features=['f1', 'f2'], weights_col=None)

In [38]:
pd.concat(embeddings1ds)

Unnamed: 0,date,clean_id_qis,f1,f2
0,2018-01-01,t2,1.41,0.86
1,2018-01-01,t3,1.69,0.82
0,2018-01-02,t3,0.69,0.56
1,2018-01-02,t3,0.74,0.7
2,2018-01-02,t3,1.69,1.5
3,2018-01-02,t3,0.52,0.29
0,2018-01-05,t3,1.56,1.18
1,2018-01-05,t1,0.66,1.61
2,2018-01-05,t3,1.8,0.2
3,2018-01-05,t3,1.51,0.12


In [13]:
# Reference result for the unweighted case: a plain groupby-mean over the raw
# tweets of all days, indexed by (clean_id_qis, date).
embeddings_da = pd.concat(embeddings1ds).groupby(['clean_id_qis', 'date']).mean()
embeddings_da

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
clean_id_qis,date,Unnamed: 2_level_1,Unnamed: 3_level_1
t1,2018-01-05,0.66,1.61
t1,2018-01-08,0.25,1.02
t1,2018-01-13,0.51,1.79
t2,2018-01-01,1.41,0.86
t2,2018-01-05,0.19,0.955
t2,2018-01-07,0.145,0.395
t2,2018-01-08,0.195,0.46
t3,2018-01-01,1.69,0.82
t3,2018-01-02,0.91,0.7625
t3,2018-01-05,1.623333,0.5


In [14]:
# Largest absolute difference between the pipeline output and the reference;
# it should be at floating-point noise level.
(get_embeddings_da(embeddings1d_das) - embeddings_da).abs().max().max()

1.1102230246251565e-16

# From `embeddings_da` to `embeddings`

In [91]:
# Weighted pipeline on fake data: per-day averages -> stacked frame -> rolling mean.
dates = [datetime(2018, 1, 1), datetime(2018, 1, 2), datetime(2018, 1, 5), datetime(2018, 1, 7), datetime(2018, 1, 8), datetime(2018, 1, 13)]
embeddings1d_das, embeddings1ds = generate_embeddings1d_das(dates)
embeddings_da = get_embeddings_da(embeddings1d_das)
print(f"This is how embeddings_da should look like before passing to get_embeddings:")
display(embeddings_da)
# Default arguments: 7-day window, at least one observation per window.
embeddings = get_embeddings(embeddings_da)

This is how embeddings_da should look like before passing to get_embeddings:


Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
clean_id_qis,date,Unnamed: 2_level_1,Unnamed: 3_level_1
t1,2018-01-05,1.221971,0.671971
t1,2018-01-07,0.69,0.89
t1,2018-01-08,0.25,1.02
t1,2018-01-13,0.51,1.79
t2,2018-01-01,1.41,0.86
t2,2018-01-07,0.28,0.18
t2,2018-01-08,0.199405,0.464762
t3,2018-01-01,1.69,0.82
t3,2018-01-02,0.753251,0.618587
t3,2018-01-05,0.786667,0.218288


In [92]:
embeddings

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
date,clean_id_qis,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,t2,1.41,0.86
2018-01-01,t3,1.69,0.82
2018-01-02,t3,1.221625,0.719293
2018-01-05,t1,1.221971,0.671971
2018-01-05,t3,1.076639,0.552292
2018-01-07,t1,0.955986,0.780986
2018-01-07,t2,0.845,0.52
2018-01-07,t3,0.809979,0.481719
2018-01-08,t1,0.720657,0.860657
2018-01-08,t2,0.239702,0.322381


In [99]:
# Unweighted variant: with weights_col=None the generated `log_followers`
# column is averaged like a feature, so it is dropped before stacking.
dates = [datetime(2018, 1, 1), datetime(2018, 1, 2), datetime(2018, 1, 5), datetime(2018, 1, 7), datetime(2018, 1, 8), datetime(2018, 1, 13)]
embeddings1d_das_unw, embeddings1ds_unw = generate_embeddings1d_das(dates, features=['f1', 'f2', 'log_followers'], weights_col=None)
embeddings_da_unw = get_embeddings_da([df.drop('log_followers', axis=1) for df in embeddings1d_das_unw])
# Alternative that keeps `log_followers` as an averaged column:
# embeddings_da_unw = get_embeddings_da(embeddings1d_das_unw)
# NOTE(review): the recorded output of this cell still shows a `log_followers`
# column, which presumably comes from a run of the alternative above — confirm
# by re-running.
print(f"This is how embeddings_da should look like before passing to get_embeddings:")
display(embeddings_da_unw)
embeddings_unw = get_embeddings(embeddings_da_unw)

This is how embeddings_da should look like before passing to get_embeddings:


Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,log_followers
clean_id_qis,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
t1,2018-01-05,1.1775,0.6925,0.6975
t1,2018-01-07,0.69,0.89,1.43
t1,2018-01-08,0.25,1.02,0.3
t1,2018-01-13,0.51,1.79,0.33
t2,2018-01-01,1.41,0.86,0.16
t2,2018-01-07,0.28,0.18,0.51
t2,2018-01-08,0.195,0.46,0.42
t3,2018-01-01,1.69,0.82,2.51
t3,2018-01-02,0.91,0.7625,0.7075
t3,2018-01-05,0.95,0.375,1.665


In [100]:
embeddings_unw

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,log_followers
date,clean_id_qis,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01,t2,1.41,0.86,0.16
2018-01-01,t3,1.69,0.82,2.51
2018-01-02,t3,1.3,0.79125,1.60875
2018-01-05,t1,1.1775,0.6925,0.6975
2018-01-05,t3,1.183333,0.6525,1.6275
2018-01-07,t1,0.93375,0.79125,1.06375
2018-01-07,t2,0.845,0.52,0.335
2018-01-07,t3,0.89,0.556875,1.273125
2018-01-08,t1,0.705833,0.8675,0.809167
2018-01-08,t2,0.2375,0.32,0.465


# Manual verification

In [17]:
def weighted_average(s, w=None):
    """Return the average of ``s``, weighted by ``w`` when it is given.

    ``s`` and ``w`` are expected to be NumPy arrays of the same length; with
    ``w=None`` the plain mean of ``s`` is returned.
    """
    if w is not None:
        weighted_total = (s * w).sum()
        return weighted_total / w.sum()
    return np.mean(s)

In [18]:
# Manual check of the weighted daily average, using the values shown in the
# embeddings1d table of 2018-01-01 above.
# s1 / s2: f1 / f2 of the three t1 tweets; w: their log_followers.
s1 = np.array([0.36, 1.34, 0.12])
w = np.array([1.25, 1.16, 1.76])
s2 = np.array([2.01, 0.6, 0.06])
# s3 / s4: f1 / f2 of the three t2 tweets; w2: their log_followers.
s3 = np.array([1.22, 0.43, .12])
s4 = np.array([.23, 1.63, 1.59])
w2 = np.array([1.12, 2.02, 1.09])
weighted_average(s1, w), weighted_average(s2, w), weighted_average(s3, w2), weighted_average(s4, w2)

(0.5313189448441247, 0.794748201438849, 0.559290780141844, 1.2490070921985816)

In [19]:
weighted_average(s1), weighted_average(s2), weighted_average(s3), weighted_average(s4)

(0.6066666666666668, 0.89, 0.59, 1.1500000000000001)

In [20]:
s1 = np.array([0.2, 0.86, 1.61, 2.04])
s2 = np.array([1.41, 0.6, .21, .55])
w = np.array([.71, 0.51, .87, .7])
weighted_average(s1, w), weighted_average(s2, w)

(1.2219713261648746, 0.6719713261648745)

In [21]:
t1 = np.array([1.22197, 0.69, 0.25, 0.51])
np.mean(t1[1:])

0.48333333333333334

In [22]:
t1 = np.array([1.41, 0.28, 0.199405])
np.mean(t1[1:])

0.2397025

In [23]:
t1 = np.array([1.69, 0.753251, 0.7866, 0.01, 0.22434])
np.mean(t1[1:])

0.44354775

# Full pipeline

## Generation of fake data

In [39]:
dates = [datetime(2018, 1, 1), datetime(2018, 1, 2), datetime(2018, 1, 5), datetime(2018, 1, 7), datetime(2018, 1, 8), datetime(2018, 1, 13)]
embeddings1d_das, embeddings1ds = generate_embeddings1d_das(dates)

In [40]:
embeddings1ds[4]

Unnamed: 0,date,clean_id_qis,f1,f2,log_followers
0,2018-01-08,t3,0.28,0.96,1.82
1,2018-01-08,t1,0.25,1.02,0.3
2,2018-01-08,t2,0.01,0.26,0.41
3,2018-01-08,t3,0.19,0.79,2.95
4,2018-01-08,t2,0.38,0.66,0.43


In [41]:
# Split each fake day into two parquet files, mimicking the real layout:
# `features_{i}.pq` holds the feature columns, `file_{i}.pq` the remaining
# columns (date, ticker, weights).
for i, embeddings1d in enumerate(embeddings1ds):
    # NOTE(review): features are selected by the name prefix 'f'; any other
    # column starting with 'f' would be written to the feature file as well.
    cols_features = [col for col in embeddings1d.columns if col.startswith('f')]
    _embeddings = embeddings1d[cols_features]
    _embeddings.to_parquet(f'features_{i}.pq')
    embeddings1d.drop(cols_features, axis=1).to_parquet(f'file_{i}.pq')

## Actual pipeline

In [83]:
# Weighted pipeline on the parquet files of the current directory.
# os.listdir returns names in arbitrary order; get_embeddings_da sorts its
# result by (clean_id_qis, date).
files = [f for f in os.listdir('.') if f.startswith('file') and f.endswith('.pq')]
embeddings1d_das = get_embeddings1d_das(pq_files=files, weights_col='log_followers')
embeddings_da = get_embeddings_da(embeddings1d_das)
embeddings = get_embeddings(embeddings_da)

In [84]:
embeddings

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
date,clean_id_qis,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,t2,1.41,0.86
2018-01-01,t3,1.69,0.82
2018-01-02,t3,1.221625,0.719293
2018-01-05,t1,1.221971,0.671971
2018-01-05,t3,1.076639,0.552292
2018-01-07,t1,0.955986,0.780986
2018-01-07,t2,0.845,0.52
2018-01-07,t3,0.809979,0.481719
2018-01-08,t1,0.720657,0.860657
2018-01-08,t2,0.239702,0.322381


In [89]:
# Unweighted pipeline on the same parquet files (weights_col=None).
# NOTE(review): the recorded output prints "dropping" once per file, which no
# statement in the visible code produces — presumably stale output from an
# earlier version of get_embeddings1d_das; confirm by re-running.
files = [f for f in os.listdir('.') if f.startswith('file') and f.endswith('.pq')]
embeddings1d_das_unw = get_embeddings1d_das(pq_files=files, weights_col=None)
embeddings_da_unw = get_embeddings_da(embeddings1d_das_unw)
embeddings_unw = get_embeddings(embeddings_da_unw)

dropping
dropping
dropping
dropping
dropping
dropping


In [88]:
embeddings_unw

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
date,clean_id_qis,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,t2,1.41,0.86
2018-01-01,t3,1.69,0.82
2018-01-02,t3,1.3,0.79125
2018-01-05,t1,1.1775,0.6925
2018-01-05,t3,1.183333,0.6525
2018-01-07,t1,0.93375,0.79125
2018-01-07,t2,0.845,0.52
2018-01-07,t3,0.89,0.556875
2018-01-08,t1,0.705833,0.8675
2018-01-08,t2,0.2375,0.32
