# Vocabulary

+ `embeddings1d`: embeddings of the tweets for one day. Some tickers have multiple tweets, some tickers have no tweets. The index is nothing special. The columns are clean_id_qis, date, features.
+ `cctnd_embeddings1d` or `c_e1d`: concatenation of the one-day embeddings (`embeddings1d`) of several days. The index is nothing special. The columns are clean_id_qis, date, features.
+ `cctnd_embeddings1d_tck` or `c_e1d_t`: concatenation of the one-day embeddings of several days, restricted to a fixed ticker. The index is nothing special. The columns are clean_id_qis, date, features.
+ `embeddings`: final matrix of embeddings whose index is a MultiIndex with levels (date, clean_id_qis). Each ticker is represented by a single embedding for each day. If a ticker is not mentioned during a day, then it does not have any embedding for that day.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from typing import List
import os

In [111]:
def generate_emdeddings1d(date, features=['f1', 'f2'], clean_id_qis=['t1', 't2', 't3'], size=7, seed=0):
    """Generate a toy ``embeddings1d`` frame: random "tweets" for one day.

    Args:
        date: Value stored in the ``date`` column of every row.
        features: Names of the feature columns to generate.
        clean_id_qis: Tickers to sample from (with replacement), so a ticker
            may appear several times or not at all.
        size: Number of rows (tweets) to generate.
        seed: Seed passed to ``np.random.seed``. This reseeds NumPy's global
            generator, which is therefore left in a seed-dependent state
            after the call.

    Returns:
        A DataFrame with columns ``date``, ``clean_id_qis`` and one column
        per feature holding non-negative values rounded to two decimals.
    """
    np.random.seed(seed)
    # The draw order is part of the reproducible output: tickers first,
    # then one vector per feature in the order the features are listed.
    columns = {
        'date': date,
        'clean_id_qis': np.random.choice(clean_id_qis, size=size),
    }
    for name in features:
        columns[name] = np.abs(np.round(np.random.randn(size), 2))
    return pd.DataFrame(columns)

def generate_cctnd_embeddings1d(dates: List[datetime], features=['f1', 'f2', 'f4'], clean_id_qis=['t1', 't2', 't3'], seed=0):
    """Generate toy one-day embeddings for several days and stack them.

    For every date a random non-empty subset of ``clean_id_qis`` and a random
    number of tweets (1 to 6) are drawn, and ``generate_emdeddings1d`` builds
    that day's frame with its own per-day seed.

    Args:
        dates: Days to generate tweets for, one ``embeddings1d`` frame each.
        features: Names of the feature columns.
        clean_id_qis: Universe of tickers the per-day subsets are drawn from.
        seed: Seed for NumPy's global generator, making the output
            reproducible.

    Returns:
        The per-day frames concatenated row-wise, with columns ``date``,
        ``clean_id_qis`` and the features. Each day keeps its own row labels,
        so index values repeat across days. An empty frame with these columns
        is returned when ``dates`` is empty.
    """
    np.random.seed(seed)
    seeds = np.random.randint(0, int(1e5), size=len(dates))
    sizes = np.random.randint(1, 7, size=len(dates))
    embeddings1ds = []
    for date, day_size, day_seed in zip(dates, sizes, seeds):
        # Sampling with replacement and deduplicating gives a random,
        # non-empty subset of the tickers for this day.
        # NOTE: this draw uses the global generator, which the previous
        # generate_emdeddings1d call reseeded with that day's seed; the
        # result is still fully determined by `seed`.
        ciq = np.unique(np.random.choice(clean_id_qis, size=len(clean_id_qis) + 1, replace=True))
        embeddings1ds.append(generate_emdeddings1d(date, features, ciq, day_size, day_seed))
    if not embeddings1ds:
        # pd.concat raises ValueError on an empty list of frames.
        return pd.DataFrame(columns=['date', 'clean_id_qis', *features])
    return pd.concat(embeddings1ds, axis=0)

In [94]:
def restore_all_dates(df, dates):
    """Left-join ``df`` onto the full list of ``dates``.

    Args:
        df: Frame with a ``date`` column.
        dates: Dates that must all be present in the result.

    Returns:
        A frame with a default integer index, in the order of ``dates``.
        A date with several rows in ``df`` yields several rows; a date absent
        from ``df`` yields one row whose other columns are NaN. Rows of
        ``df`` whose date is not in ``dates`` are dropped.
    """
    calendar = pd.DataFrame({'date': dates})
    return pd.merge(calendar, df, how='left', on='date')

def get_embeddings_noda(c_e1d: pd.DataFrame, all_tickers=False, window="7D", n_features=None):
    """Build one trailing-window embedding per (date, ticker).

    "noda" means no daily average: for each ticker the features of all its
    tweets inside the trailing ``window`` are averaged directly, without first
    averaging the tweets of each day.

    Args:
        c_e1d: Concatenated one-day embeddings with columns ``date``,
            ``clean_id_qis`` and the feature columns. ``date`` must be
            datetime-like for the time-based rolling window.
        all_tickers: If True, each ticker is first left-merged onto every date
            present in ``c_e1d``, so it also gets an embedding on dates where
            it has no tweet of its own but the trailing window contains one.
            Dates absent from ``c_e1d`` altogether are not added.
        window: Rolling window passed to ``DataFrame.rolling``.
        n_features: Optional expected number of feature columns (for example
            3 for the toy data); when given, a mismatch raises ``ValueError``.

    Returns:
        A DataFrame of the feature columns indexed by a sorted MultiIndex
        ``(date, clean_id_qis)``. Rows whose features are all NaN are dropped.

    Raises:
        ValueError: If a required column is missing or ``n_features`` does
            not match.
    """
    print(f"No index. Expected columns are: {c_e1d.columns.values}")
    key_cols = ("date", "clean_id_qis")
    missing = [col for col in key_cols if col not in c_e1d.columns]
    if missing:
        raise ValueError(f"c_e1d is missing required column(s): {missing}")
    feature_cols = [col for col in c_e1d.columns if col not in key_cols]
    if n_features is not None and len(feature_cols) != n_features:
        raise ValueError(
            f"expected {n_features} feature column(s), got {len(feature_cols)}"
        )

    unique_id_qis = c_e1d["clean_id_qis"].unique()
    if len(unique_id_qis) == 0:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # that has the same layout as a regular result.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=["date", "clean_id_qis"])
        return pd.DataFrame(columns=feature_cols, index=empty_index)

    # Loop-invariant: the dates every ticker is aligned to when all_tickers.
    all_dates = c_e1d["date"].unique()
    c_e1d_t_s = []
    for clean_id_qi in unique_id_qis:
        c_e1d_t = c_e1d[c_e1d["clean_id_qis"] == clean_id_qi].drop("clean_id_qis", axis=1)
        if all_tickers:
            c_e1d_t = restore_all_dates(c_e1d_t, all_dates)
        c_e1d_t = c_e1d_t.set_index("date").sort_index()
        # min_periods=1 lets NaN placeholder rows (added by all_tickers) pick
        # up the mean of the real tweets in the window.
        c_e1d_t = c_e1d_t.rolling(window=window, min_periods=1).mean()
        # Several tweets can share a date; the last row of a date has seen all
        # of that date's tweets, so it is the one that is kept.
        c_e1d_t = c_e1d_t[~c_e1d_t.index.duplicated(keep='last')]
        c_e1d_t_s.append(c_e1d_t)
    return (pd.concat(c_e1d_t_s, axis=0, keys=unique_id_qis, names=["clean_id_qis"])
        .dropna(how='all')
        .swaplevel(i='clean_id_qis', j='date')
        .sort_index())

In [95]:
# Six irregularly spaced days in January 2018, so the rolling window used
# later sees both neighbouring days and gaps of several days.
dates = [datetime(2018, 1, day) for day in (1, 2, 5, 7, 8, 13)]
c_e1d = generate_cctnd_embeddings1d(dates)
c_e1d

Unnamed: 0,date,clean_id_qis,f1,f2,f4
0,2018-01-01,t2,1.41,0.86,0.16
1,2018-01-01,t3,1.69,0.82,2.51
0,2018-01-02,t3,0.69,0.56,0.61
1,2018-01-02,t3,0.74,0.7,1.04
2,2018-01-02,t3,1.69,1.5,0.28
3,2018-01-02,t3,0.52,0.29,0.9
0,2018-01-05,t1,0.2,1.41,0.71
1,2018-01-05,t1,0.86,0.6,0.51
2,2018-01-05,t3,0.58,0.02,2.4
3,2018-01-05,t1,1.61,0.21,0.87


In [102]:
# Show the raw rows of each ticker separately, to compare by hand with the
# rolling means computed below.
for clean_id_qi in c_e1d['clean_id_qis'].unique():
    display(c_e1d[c_e1d['clean_id_qis']==clean_id_qi])

Unnamed: 0,date,clean_id_qis,f1,f2,f4
0,2018-01-01,t2,1.41,0.86,0.16
2,2018-01-07,t2,0.28,0.18,0.51
2,2018-01-08,t2,0.01,0.26,0.41
4,2018-01-08,t2,0.38,0.66,0.43


Unnamed: 0,date,clean_id_qis,f1,f2,f4
1,2018-01-01,t3,1.69,0.82,2.51
0,2018-01-02,t3,0.69,0.56,0.61
1,2018-01-02,t3,0.74,0.7,1.04
2,2018-01-02,t3,1.69,1.5,0.28
3,2018-01-02,t3,0.52,0.29,0.9
2,2018-01-05,t3,0.58,0.02,2.4
4,2018-01-05,t3,1.32,0.73,0.93
1,2018-01-07,t3,0.01,0.27,0.21
0,2018-01-08,t3,0.28,0.96,1.82
3,2018-01-08,t3,0.19,0.79,2.95


Unnamed: 0,date,clean_id_qis,f1,f2,f4
0,2018-01-05,t1,0.2,1.41,0.71
1,2018-01-05,t1,0.86,0.6,0.51
3,2018-01-05,t1,1.61,0.21,0.87
5,2018-01-05,t1,2.04,0.55,0.7
0,2018-01-07,t1,0.69,0.89,1.43
1,2018-01-08,t1,0.25,1.02,0.3
0,2018-01-13,t1,0.51,1.79,0.33


In [103]:
# all_tickers=False: a ticker only gets a row on dates where it has at least
# one tweet of its own.
embeddings = get_embeddings_noda(c_e1d, all_tickers=False)
embeddings

No index. Expected columns are: ['date' 'clean_id_qis' 'f1' 'f2' 'f4']


Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,f4
date,clean_id_qis,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01,t2,1.41,0.86,0.16
2018-01-01,t3,1.69,0.82,2.51
2018-01-02,t3,1.066,0.774,1.068
2018-01-05,t1,1.1775,0.6925,0.6975
2018-01-05,t3,1.032857,0.66,1.238571
2018-01-07,t1,1.08,0.732,0.844
2018-01-07,t2,0.845,0.52,0.335
2018-01-07,t3,0.905,0.61125,1.11
2018-01-08,t1,0.941667,0.78,0.753333
2018-01-08,t2,0.223333,0.366667,0.45


In [110]:
# all_tickers=True: each ticker is aligned to every date present in c_e1d, so
# it also gets a row on dates without tweets of its own whenever the trailing
# window still contains one of its tweets.
embeddings = get_embeddings_noda(c_e1d, all_tickers=True)
embeddings

No index. Expected columns are: ['date' 'clean_id_qis' 'f1' 'f2' 'f4']


Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,f4
date,clean_id_qis,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01,t2,1.41,0.86,0.16
2018-01-01,t3,1.69,0.82,2.51
2018-01-02,t2,1.41,0.86,0.16
2018-01-02,t3,1.066,0.774,1.068
2018-01-05,t1,1.1775,0.6925,0.6975
2018-01-05,t2,1.41,0.86,0.16
2018-01-05,t3,1.032857,0.66,1.238571
2018-01-07,t1,1.08,0.732,0.844
2018-01-07,t2,0.845,0.52,0.335
2018-01-07,t3,0.905,0.61125,1.11
