# Vocabulary

+ `embeddings1d`: embeddings of the tweets for one day. Some tickers have multiple tweets, some tickers have no tweets. The index is nothing special. The columns are clean_id_qis, date, features and optionally, there is weights_col.
+ `embeddings1d_da`: daily average of the embeddings per ticker. Each ticker present that day is represented by a unique embedding. The index is nothing special. The columns are clean_id_qis, date, features. There is no weights_col.
+ `embeddings_da`: row-wise concatenation of the `embeddings1d_da` of every day, with the `clean_id_qis` and `date` columns moved into the index. It has a MultiIndex with levels (clean_id_qis, date). Each ticker is represented by a single embedding for each day.
+ `embeddings`: final matrix of embeddings, obtained as a per-ticker rolling mean of `embeddings_da` over a time window. Its index is a MultiIndex with levels (date, clean_id_qis). Each ticker is represented by a single embedding for each day. If a ticker is not mentioned during a day, then it does not have any embedding for that day.



In [3]:
import numpy as np
import pandas as pd
from datetime import datetime
from typing import List
import os

In [4]:
# Name of the column that stores, for one day, the number of rows found per clean_id_qis.
OCC_PTCK = 'occ_ptck'

In [5]:
def get_embeddings1d_da(embeddings1d: pd.DataFrame, n_features: int, weights_col=''):
    """
    Compute the daily average embedding of every ticker for a single day.

    Without `weights_col` the plain mean of the feature columns is taken per
    `clean_id_qis`. With `weights_col` the mean is weighted by that column,
    i.e. sum(feature * weight) / sum(weight) within each `clean_id_qis`.

    The caller's `embeddings1d` is not modified.

    Input: (index=nothing; columns=clean_id_qis, date, features, [opt] weights_col)
    Output: (index=nothing; columns=clean_id_qis, features, date)

    Parameters:
        embeddings1d: rows of a single day.
        n_features: number of feature columns expected in `embeddings1d`.
        weights_col: name of the weights column; None or '' means unweighted.

    Raises:
        AssertionError: if the column count does not match `n_features`, or if
            `embeddings1d` holds more than one distinct date.
    """
    has_weights = bool(weights_col)
    expected_cols = n_features + (3 if has_weights else 2)
    assert embeddings1d.shape[1] == expected_cols
    unique_dates = embeddings1d['date'].unique()
    assert len(unique_dates) == 1, 'The dataframe must represent one day only!'
    date = unique_dates[0]
    # Work on a new frame so the caller keeps its 'date' column.
    data = embeddings1d.drop(columns='date')
    if not has_weights:
        embeddings1d_da = data.groupby('clean_id_qis').mean()
    else:
        # Positional arrays avoid any index alignment ("the index is nothing special").
        keys = data['clean_id_qis'].to_numpy()
        weights = data[weights_col]
        weighted = (data
            .drop(columns=['clean_id_qis', weights_col])
            .multiply(weights.to_numpy(), axis=0))
        embeddings1d_da = (weighted
            .groupby(keys)
            .sum()
            .div(weights.groupby(keys).sum(), axis=0)
            .rename_axis('clean_id_qis'))
    embeddings1d_da['date'] = date
    embeddings1d_da = embeddings1d_da.reset_index(drop=False)
    # The result (not the input) must hold clean_id_qis, the features and date.
    assert embeddings1d_da.shape[1] == n_features + 2
    return embeddings1d_da

def get_embeddings_da(embeddings1d_das: List[pd.DataFrame]):
    """
    Concatenate the per-day frames row-wise and index the result by ticker and date.

    Input: List of embeddings1d_da and each of them has (index=nothing; columns=clean_id_qis, date, features)
    Return: (MultiIndex=clean_id_qis, date; columns=features)
    """
    stacked = pd.concat(embeddings1d_das, axis=0)
    return stacked.set_index(['clean_id_qis', 'date'])

def get_embeddings(embeddings_da: pd.DataFrame, window='7D', min_periods=1):
    """
    Per-ticker rolling mean of the daily embeddings over a window.

    Input: (MultiIndex=clean_id_qis, date; columns=features)
    Output: (MultiIndex=date, clean_id_qis; columns=features)

    `window` and `min_periods` are forwarded to `rolling`.
    """
    ordered = embeddings_da.sort_index(level=['clean_id_qis', 'date'])
    # Keep only 'date' in the index so the rolling window runs over it.
    by_date = ordered.reset_index(level='clean_id_qis', drop=False)
    rolled = (
        by_date.groupby('clean_id_qis')
        .rolling(window, min_periods=min_periods)
        .mean()
    )
    date_first = rolled.swaplevel(i='clean_id_qis', j='date')
    return date_first.sort_index(level='date')

def get_embeddings1d_da_noda(embeddings1d: pd.DataFrame, n_features: int):
    """
    Compute, for a single day, the mean embedding and the row count of every ticker.

    The row count per `clean_id_qis` is stored in the `OCC_PTCK` column.
    The caller's `embeddings1d` is not modified.

    Input: (index=nothing; columns=clean_id_qis, date, features)
    Output: (index=nothing; columns=clean_id_qis, features, occ_ptck, date)

    Parameters:
        embeddings1d: rows of a single day.
        n_features: number of feature columns expected in `embeddings1d`.

    Raises:
        AssertionError: if the column count does not match `n_features`, or if
            `embeddings1d` holds more than one distinct date.
    """
    assert embeddings1d.shape[1] == n_features + 2
    unique_dates = embeddings1d['date'].unique()
    assert len(unique_dates) == 1, 'The dataframe must represent one day only!'
    date = unique_dates[0]
    # Drop 'date' on a new frame so the caller's frame keeps all its columns.
    embeddings1d_by_id_qi = embeddings1d.drop(columns='date').groupby('clean_id_qis')
    embeddings1d_da = embeddings1d_by_id_qi.mean()
    embeddings1d_da[OCC_PTCK] = embeddings1d_by_id_qi.size()
    embeddings1d_da = embeddings1d_da.reset_index(drop=False)
    embeddings1d_da['date'] = date
    assert embeddings1d_da.shape[1] == n_features + 3
    return embeddings1d_da

def get_embeddings_da_noda(embeddings1d_das: List[pd.DataFrame]):
    """
    Concatenate the per-day frames row-wise and index the result by ticker and date.

    Input: List of embeddings1d_da s.t. (index=nothing; columns=clean_id_qis, date, features, occ_ptck)
    Return: (MultiIndex=clean_id_qis, date; columns=features, occ_ptck)
    """
    index_levels = ['clean_id_qis', 'date']
    combined = pd.concat(embeddings1d_das, axis=0)
    return combined.set_index(index_levels)

def get_embeddings_noda(embeddings_da: pd.DataFrame, window='7D', min_periods=1):
    """
    Per-ticker rolling mean of every column over a window.

    Input: (MultiIndex=clean_id_qis, date; columns=features)
    Output: (MultiIndex=date, clean_id_qis; columns=features)

    `window` and `min_periods` are forwarded to `rolling`.
    NOTE(review): any extra column in the input (e.g. occ_ptck) is averaged
    exactly like a feature - confirm this is intended.
    """
    sorted_da = embeddings_da.sort_index(level=['clean_id_qis', 'date'])
    # Move the ticker out of the index; 'date' stays as the rolling axis.
    date_indexed = sorted_da.reset_index(level='clean_id_qis', drop=False)
    per_ticker = date_indexed.groupby('clean_id_qis')
    rolling_mean = per_ticker.rolling(window, min_periods=min_periods).mean()
    swapped = rolling_mean.swaplevel(i='clean_id_qis', j='date')
    return swapped.sort_index(level='date')

# Tests

In [27]:
def generate_emdeddings1d(date, features=['f1', 'f2'], clean_id_qis=['t1', 't2', 't3'], size=7, seed=0):
    """
    Build a random one-day test frame with columns date, clean_id_qis and one column per feature.

    The global NumPy RNG is re-seeded with `seed`; tickers are drawn first,
    then each feature column in order. Feature values are absolute values of
    standard-normal draws rounded to 2 decimals.
    """
    np.random.seed(seed)
    columns = {
        'date': date,
        'clean_id_qis': np.random.choice(clean_id_qis, size=size),
    }
    for name in features:
        draws = np.random.randn(size)
        columns[name] = np.abs(draws.round(2))
    return pd.DataFrame(columns)

def generate_embeddings1d_das(dates: List[datetime], features=['f1', 'f2'], clean_id_qis=['t1', 't2', 't3'], seed=0):
    """
    Generate one random raw frame per date and its daily aggregate.

    For every date a random subset of `clean_id_qis` is drawn, a raw frame is
    generated with a per-day size and seed, printed, and aggregated with
    `get_embeddings1d_da_noda`.

    Returns a pair (list of daily aggregates, list of raw frames), both in the
    order of `dates`.
    """
    np.random.seed(seed)
    n_days = len(dates)
    # Per-day seeds are drawn before per-day sizes; the order matters for reproducibility.
    day_seeds = np.random.randint(0, int(1e5), size=n_days)
    day_sizes = np.random.randint(1, 7, size=n_days)
    aggregates, raw_frames = [], []
    for day, day_size, day_seed in zip(dates, day_sizes, day_seeds):
        drawn = np.random.choice(clean_id_qis, size=len(clean_id_qis)+1, replace=True)
        tickers_of_day = np.unique(drawn)
        raw = generate_emdeddings1d(day, features, tickers_of_day, day_size, day_seed)
        print(raw)
        aggregates.append(get_embeddings1d_da_noda(raw, len(features)))
        raw_frames.append(raw)
    return aggregates, raw_frames

In [30]:
# Hand check: plain mean of the five raw f1 values of t1 between 2018-01-01 and
# 2018-01-07; it should equal the weighted rolling mean of t1 on 2018-01-07.
np.mean([0.59, 0.33, 0.18, 0.01, 0.28])

0.27799999999999997

In [28]:
# Irregularly spaced test dates (gaps of 1 to 6 days) to exercise the time-based window.
dates = [datetime(2018, 1, 1), datetime(2018, 1, 2), datetime(2018, 1, 5), datetime(2018, 1, 7), datetime(2018, 1, 8), datetime(2018, 1, 13), datetime(2018, 1, 16), datetime(2018, 1, 22)]
# Raw frames are printed by the generator; the aggregates carry the occ_ptck column.
embeddings1d_das, embeddings1ds = generate_embeddings1d_das(dates)
# Stack the daily aggregates into a frame indexed by (clean_id_qis, date).
embeddings_da = get_embeddings_da(embeddings1d_das)
embeddings_da

        date clean_id_qis    f1    f2
0 2018-01-01           t2  1.50  3.34
1 2018-01-01           t3  0.59  0.89
2 2018-01-01           t1  0.59  0.54
3 2018-01-01           t2  1.44  0.30
        date clean_id_qis    f1    f2
0 2018-01-02           t3  0.39  0.56
1 2018-01-02           t3  1.51  0.57
2 2018-01-02           t3  0.83  0.76
3 2018-01-02           t3  0.57  0.35
4 2018-01-02           t1  0.33  0.80
5 2018-01-02           t3  1.58  0.23
        date clean_id_qis    f1    f2
0 2018-01-05           t3  0.88  0.66
1 2018-01-05           t1  0.18  1.80
2 2018-01-05           t3  1.56  1.51
        date clean_id_qis    f1    f2
0 2018-01-07           t2  0.69  0.18
1 2018-01-07           t1  0.01  1.43
2 2018-01-07           t1  0.28  0.21
3 2018-01-07           t2  0.89  0.51
4 2018-01-07           t2  0.27  0.77
        date clean_id_qis    f1    f2
0 2018-01-08           t3  0.68  0.45
        date clean_id_qis    f1    f2
0 2018-01-13           t1  0.51  1.79
        date

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2,occ_ptck
clean_id_qis,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
t1,2018-01-01,0.59,0.54,1
t2,2018-01-01,1.47,1.82,2
t3,2018-01-01,0.59,0.89,1
t1,2018-01-02,0.33,0.8,1
t3,2018-01-02,0.976,0.494,5
t1,2018-01-05,0.18,1.8,1
t3,2018-01-05,1.22,1.085,2
t1,2018-01-07,0.145,0.82,2
t2,2018-01-07,0.616667,0.486667,3
t3,2018-01-08,0.68,0.45,1


In [25]:
def func(grouped_df: pd.DataFrame):
    """Debug helper: print the index, then the columns, of the given frame."""
    for axis_labels in (grouped_df.index, grouped_df.columns):
        print(axis_labels)

def compute_rolling_weighted_mean(df: pd.DataFrame, weights_col: str=OCC_PTCK, n_features: int=384, w: str='7D', mp: int=1, min_rolling_occ=1):
    """
    Rolling weighted mean of the feature columns over a time window.

    For every row, the result is
    rolling_sum(feature * weight) / rolling_sum(weight), where both sums run
    over window `w`. Rows whose rolling weight sum is below `min_rolling_occ`
    are dropped from the result.

    The frame passed in is not modified, so the function is safe to use with
    `groupby(...).apply(...)`.

    Input: (index=Datetime; columns=features, weights_col)
    Output: (index=Datetime; columns=features)

    Parameters:
        df: one row per date; the index must hold unique values.
        weights_col: name of the column holding the weights.
        n_features: number of feature columns expected in `df`.
        w: window forwarded to `rolling` (e.g. the offset string '7D').
        mp: `min_periods` forwarded to `rolling`.
        min_rolling_occ: minimum rolling weight sum for a row to be kept.

    Raises:
        AssertionError: if the column count does not match `n_features`, or if
            the index contains duplicates.
    """
    assert df.shape[1] == n_features + 1
    assert df.shape[0] == df.index.nunique()
    # Build a new, date-sorted frame instead of editing the caller's object.
    ordered = df.set_axis(pd.to_datetime(df.index), axis=0).sort_index()
    weights = ordered[weights_col]
    features = ordered.drop(columns=weights_col)
    rolling_weighted_sum = (features
        .mul(weights, axis=0)
        .rolling(window=w, min_periods=mp)
        .sum())
    weights_rolling_sum = (weights
        .rolling(window=w, min_periods=mp)
        .sum())
    print(weights_rolling_sum)
    # Keep only the rows backed by enough accumulated weight.
    mask_occ = weights_rolling_sum >= min_rolling_occ
    weights_rolling_sum = weights_rolling_sum[mask_occ]
    rolling_weighted_sum = rolling_weighted_sum[mask_occ]
    rolling_weighted_mean = rolling_weighted_sum.div(weights_rolling_sum, axis=0)
    return rolling_weighted_mean

In [26]:
# Per ticker, apply the rolling weighted mean (weights = occ_ptck), then put
# 'date' first in the index and sort by it.
(embeddings_da
    .sort_index(level=['clean_id_qis', 'date'])
    .reset_index(level='clean_id_qis', drop=False)
    .groupby("clean_id_qis")
    .apply(compute_rolling_weighted_mean, weights_col=OCC_PTCK, n_features=2, min_rolling_occ=0, include_groups=False)
    .swaplevel(i='clean_id_qis', j='date')
    .sort_index(level='date')
)

date
2018-01-01    1.0
2018-01-02    2.0
2018-01-05    3.0
2018-01-07    5.0
2018-01-13    3.0
2018-01-22    1.0
Name: occ_ptck, dtype: float64
date
2018-01-01    2.0
2018-01-07    5.0
2018-01-16    2.0
2018-01-22    3.0
Name: occ_ptck, dtype: float64
date
2018-01-01    1.0
2018-01-02    6.0
2018-01-05    8.0
2018-01-08    8.0
2018-01-16    3.0
2018-01-22    4.0
Name: occ_ptck, dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f2
date,clean_id_qis,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,t1,0.59,0.54
2018-01-01,t2,1.47,1.82
2018-01-01,t3,0.59,0.89
2018-01-02,t1,0.46,0.67
2018-01-02,t3,0.911667,0.56
2018-01-05,t1,0.366667,1.046667
2018-01-05,t3,0.98875,0.69125
2018-01-07,t1,0.278,0.956
2018-01-07,t2,0.958,1.02
2018-01-08,t3,1.0,0.63625
