In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
from typing import List
import os

In [3]:
def generate_emdeddings1d(date, features=['f1', 'f2'], clean_id_qis=['t1', 't2', 't3'], size=7, seed=0):
    """Build a random toy frame of `size` rows that all share the same `date`.

    Parameters
    ----------
    date : value broadcast to every row of the 'date' column.
    features : names of the numeric columns to generate.
    clean_id_qis : candidate identifiers sampled (with replacement) into the
        'clean_id_qis' column.
    size : number of rows.
    seed : value passed to `np.random.seed`; note that this reseeds NumPy's
        *global* random state as a side effect.

    Returns
    -------
    pd.DataFrame with columns 'date', 'clean_id_qis', then one column per
    feature holding the absolute value of standard-normal draws rounded to
    two decimals.
    """
    np.random.seed(seed)
    # Draw order matters for reproducibility: identifiers first, then each
    # feature in the order given.
    columns = {
        'date': date,
        'clean_id_qis': np.random.choice(clean_id_qis, size=size),
    }
    for name in features:
        draws = np.random.randn(size)
        columns[name] = np.abs(np.round(draws, 2))
    return pd.DataFrame(columns)

def generate_cctnd_embeddings1d(dates: List[datetime], features=['f1', 'f2', 'f4'], clean_id_qis=['t1', 't2', 't3'], seed=0):
    """Generate one random toy frame per date and stack them vertically.

    For every date, a per-date seed (in [0, 100000)) and a row count
    (in [1, 7)) are drawn up front after seeding NumPy's global RNG with
    `seed`. The identifier pool for each date is the set of unique values
    obtained by sampling `len(clean_id_qis) + 1` identifiers with replacement.

    Note: `generate_emdeddings1d` reseeds the global RNG on every call, so
    the identifier pool drawn for the next date depends on the RNG state
    left behind by the previous date's generation.

    Returns the concatenation (axis=0) of the per-date frames; the original
    per-frame row labels are kept, so the index is not unique in general.
    """
    np.random.seed(seed)
    n_dates = len(dates)
    per_date_seeds = np.random.randint(0, int(1e5), size=n_dates)
    per_date_sizes = np.random.randint(1, 7, size=n_dates)

    frames = []
    for date, n_rows, date_seed in zip(dates, per_date_sizes, per_date_seeds):
        sampled_ids = np.random.choice(clean_id_qis, size=len(clean_id_qis)+1, replace=True)
        id_pool = np.unique(sampled_ids)
        frames.append(generate_emdeddings1d(date, features, id_pool, n_rows, date_seed))
    return pd.concat(frames, axis=0)

def generate_example(seed=0):
    """Return a small example frame indexed by date with columns f1, f2, weights.

    Six irregularly spaced dates in January 2018 are used. Only the first
    generated row of each date is kept, so the resulting index is unique.
    """
    dates = [datetime(2018, 1, day) for day in (1, 2, 5, 7, 8, 13)]
    embeddings = generate_cctnd_embeddings1d(dates, features=['f1', 'f2', 'weights'], seed=seed)
    without_ids = embeddings.drop('clean_id_qis', axis=1)
    one_row_per_date = without_ids.drop_duplicates(subset='date')
    return one_row_per_date.set_index('date')

def compute_rolling_weighted_mean(df: pd.DataFrame, weights_col: str, n_features: int, w: int, mp: int) -> pd.DataFrame:
    """
    Compute the rolling weighted mean of every feature column.

    Input: (index=Datetime; columns=features, weights_col)
    Output: (index=Datetime; columns=features)

    For each feature the result is
        rolling_sum(feature * weight) / rolling_sum(weight)
    evaluated over the same rolling window.

    Parameters
    ----------
    df : frame with a unique, datetime-convertible index, exactly
        `n_features` feature columns and one weights column. It is NOT
        modified; the computation runs on a copy.
    weights_col : name of the column holding the weights.
    n_features : expected number of feature columns (sanity check).
    w : forwarded as `window` to `rolling`. It is annotated as int, but
        anything `rolling` accepts works (e.g. the offset string '7D').
    mp : forwarded as `min_periods` to `rolling`.

    Returns
    -------
    pd.DataFrame sorted by index, containing the feature columns only.

    Raises
    ------
    AssertionError if the column count or index uniqueness check fails.
    """
    assert df.shape[1] == n_features + 1, (
        f"expected {n_features} feature column(s) plus the weights column, "
        f"got {df.shape[1]} column(s)"
    )
    assert df.shape[0] == df.index.nunique(), "index values must be unique"

    # Work on a copy so the caller's frame keeps its index, row order and
    # weights column; this also makes repeated calls on the same frame safe.
    frame = df.copy()
    frame.index = pd.to_datetime(frame.index)
    frame = frame.sort_index()  # offset-based windows need a monotonic index

    weights = frame[weights_col]
    features = frame.drop(columns=weights_col)

    rolling_weighted_sum = (features
        .mul(weights, axis=0)
        .rolling(window=w, min_periods=mp)
        .sum())
    weights_rolling_sum = (weights
        .rolling(window=w, min_periods=mp)
        .sum())
    rolling_weighted_mean = rolling_weighted_sum.div(weights_rolling_sum, axis=0)
    return rolling_weighted_mean

In [4]:
df_example = generate_example()
print("Initial dataframe")
display(df_example)
# Split the weights off into their own Series; after this line `df_example`
# holds the feature columns only (the table displayed above still shows it
# with the weights column).
weights = df_example.pop("weights")

Initial dataframe


Unnamed: 0_level_0,f1,f2,weights
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,1.41,0.86,0.16
2018-01-02,0.69,0.56,0.61
2018-01-05,0.2,1.41,0.71
2018-01-07,0.69,0.89,1.43
2018-01-08,0.28,0.96,1.82
2018-01-13,0.51,1.79,0.33


In [5]:
# Rolling window settings, reused by the function-based check in a later cell.
window = '7D'
min_periods = 1

# Numerator: rolling sum of feature * weight.
rolling_weighted_sum = (df_example
    .mul(weights, axis=0)
    .rolling(window=window, min_periods=min_periods)
    .sum())
# Denominator: rolling sum of the weights over the same window.
rolling_sum = (weights
    .rolling(window=window, min_periods=min_periods)
    .sum())

rolling_weighted_mean = rolling_weighted_sum.div(rolling_sum, axis=0)
rolling_weighted_mean.round(2)

Unnamed: 0_level_0,f1,f2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,1.41,0.86
2018-01-02,0.84,0.62
2018-01-05,0.53,1.0
2018-01-07,0.61,0.95
2018-01-08,0.45,0.95
2018-01-13,0.46,1.01


In [6]:
# Rebuild the example (the earlier cell stripped its weights column) and run
# the function-based implementation on it.
df_example = generate_example()
res = compute_rolling_weighted_mean(df_example, weights_col='weights', n_features=2, w=window, mp=min_periods)
display(res.round(2))
# The function must reproduce the inline computation. assert_frame_equal
# compares floats with a tolerance and, on failure, reports which column and
# values differ instead of a bare AssertionError.
pd.testing.assert_frame_equal(res, rolling_weighted_mean)

Unnamed: 0_level_0,f1,f2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,1.41,0.86
2018-01-02,0.84,0.62
2018-01-05,0.53,1.0
2018-01-07,0.61,0.95
2018-01-08,0.45,0.95
2018-01-13,0.46,1.01


In [7]:
def weighted_mean(a, w):
    """Return the weighted mean of `a` with weights `w`: sum(a * w) / sum(w).

    Both arguments must support elementwise `*` and a `.sum()` method
    (e.g. NumPy arrays).
    """
    weighted_total = (a * w).sum()
    weight_total = w.sum()
    return weighted_total / weight_total

In [8]:
# Hand check of weighted_mean on two rows of the example frame displayed
# above: the f1 values and weights for 2018-01-05 and 2018-01-07.
a = np.array([0.2, 0.69])    # f1
w = np.array([0.71, 1.43])   # weights
weighted_mean(a, w)

0.527429906542056

In [9]:
# NOTE(review): scratch arithmetic; nothing in the visible cells uses this
# value — consider removing the cell.
5 / 3

1.6666666666666667