In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the prepared dataset from the HDF5 file. Later cells read its
# `batchSize`, `Asset_ID`, `timestamp` and `Close` columns.
data = pd.read_hdf('03_non_padded_with_batch_info.h5')

In [None]:
# Read the asset details table; later cells use its `Asset_ID` and `Weight`
# columns to weight the per-asset returns.
weights = pd.read_csv('data/asset_details.csv')

In [None]:
# Display the asset details table.
weights

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

In [None]:
# Pick the rows whose `batchSize` equals the largest value present in the data.
batchSize = data.batchSize.max()
# Explicit copy: later cells add columns to `batch`, and writing into a
# filtered slice of `data` triggers SettingWithCopyWarning.
batch = data.loc[data.batchSize == batchSize].copy()

In [None]:
# Display the selected batch.
batch

In [None]:
# One scatter plot of the Close price per asset, plotted against the row
# position within that asset's data.
for asset_id in np.unique(data.Asset_ID):
    coin = data.loc[data.Asset_ID == asset_id]
    n_rows = len(coin)
    fig, ax = plt.subplots()
    ax.scatter(np.linspace(0, n_rows - 1, n_rows), coin.Close)
    ax.set_title(asset_id)


In [None]:
def R(batch, t):
    """Log return between the Close 16 minutes and 1 minute after ``t``.

    Parameters
    ----------
    batch : DataFrame with ``timestamp`` (seconds) and ``Close`` columns.
    t : reference timestamp in seconds.

    Returns
    -------
    numpy array with ``log(Close[t + 16 min] / Close[t + 1 min])`` for the
    rows matching those timestamps; it is empty when the later timestamp has
    no matching row.
    """
    minute = 60
    stamps = batch.timestamp
    close_late = batch.loc[stamps == t + 16 * minute, 'Close'].to_numpy()
    close_early = batch.loc[stamps == t + minute, 'Close'].to_numpy()
    return np.log(close_late / close_early)

# Sanity check: show the rows of asset 0 and evaluate R on them at one timestamp.
asset_zero = batch.loc[batch.Asset_ID == 0]
print(asset_zero)
R(asset_zero, 1615202340)

In [None]:
def M(data, t, weights):
    """Weighted average of the per-asset returns ``R`` at time ``t``.

    Parameters
    ----------
    data : DataFrame with ``Asset_ID``, ``timestamp`` and ``Close`` columns.
    t : reference timestamp in seconds, passed through to ``R``.
    weights : DataFrame with ``Asset_ID`` and ``Weight`` columns.

    Returns
    -------
    ``sum(weight * R) / sum(weight)`` over the asset ids listed in
    ``weights``, as a numpy array.
    """
    weighted_sum = 0
    weight_total = 0
    for asset_id in np.unique(weights.Asset_ID.to_numpy()):
        asset_weight = weights.loc[weights.Asset_ID == asset_id, 'Weight'].to_numpy()
        asset_return = R(data.loc[data.Asset_ID == asset_id], t)
        # Plain `a = a + b` on purpose: the operands are arrays whose shapes
        # may differ (an empty return), which in-place `+=` would reject.
        weighted_sum = weighted_sum + asset_weight * asset_return
        weight_total = weight_total + asset_weight
    return weighted_sum / weight_total

In [None]:
# Evaluate the weighted market return M on the batch at one timestamp.
M(batch, 1615202340, weights)

In [None]:
# Fill the per-row market return 'M' and asset return 'R' for the batch.
#
# Work on an explicit copy: `batch` was produced by filtering `data`, and
# adding columns to such a slice triggers SettingWithCopyWarning.
batch = batch.copy()
# NaN marks rows whose return cannot be computed (no price 1 or 16 minutes
# ahead) and keeps both columns float rather than int.
batch['M'] = np.nan
batch['R'] = np.nan

# Split the prices once per asset so R() only ever sees one asset's rows.
# Passing the whole batch would mix every asset's prices into each return.
prices_by_asset = {
    asset: batch.loc[batch.Asset_ID == asset, ['timestamp', 'Close']]
    for asset in np.unique(batch.Asset_ID.to_numpy())
}

for timestamp in np.unique(batch.timestamp.to_numpy()):
    at_time = batch.timestamp == timestamp

    # M() yields an empty array when a return is unavailable; leave NaN then.
    market = np.atleast_1d(M(batch, timestamp, weights))
    if market.size:
        batch.loc[at_time, 'M'] = market[0]

    for asset, prices in prices_by_asset.items():
        rows = at_time & (batch.Asset_ID == asset)
        if not rows.any():
            continue
        asset_return = np.atleast_1d(R(prices, timestamp))
        if asset_return.size:
            batch.loc[rows, 'R'] = asset_return[0]


In [None]:
# Write the batch to an HDF5 file under key 'df'; mode 'w' replaces any
# existing file.
batch.to_hdf('04_biggestBatch_with_M_and_R.h5', key = 'df', mode = 'w')

In [None]:
def recreateTarget(data, details, price_column='Close', window=3750):
    """Recompute the target from prices and compare it with ``data['Target']``.

    Steps:
      1. Per asset, ``r = log(price[t + 16 min] / price[t + 1 min])``.
      2. Per timestamp, ``m`` is the weighted average of ``r`` over the rows
         present at that time, using the weights from ``details``.
      3. Per asset, ``beta = rolling_mean(m * r) / rolling_mean(m ** 2)`` over
         a time window of ``window`` minutes.
      4. ``Target_recreated = r - beta * m`` and
         ``Target_diff = |Target - Target_recreated|``.

    Summary statistics of ``Target_diff`` are printed.

    Parameters
    ----------
    data : DataFrame with ``timestamp`` (seconds), ``Asset_ID``, ``Target``
        and the price column. It is not modified.
    details : DataFrame with ``Asset_ID`` and ``Weight`` columns. Only the
        asset ids listed here are kept in the result.
    price_column : name of the price column used for the returns.
    window : rolling window length in minutes; the same number of
        observations is required before a rolling mean is produced.

    Returns
    -------
    A new DataFrame sorted by ``Time`` holding the input columns plus
    ``Time``, ``r``, ``w``, ``weighted_asset_r``, ``m``, ``m2``, ``mr``,
    ``mr_rolling``, ``m2_rolling``, ``beta``, ``Target_recreated`` and
    ``Target_diff``.
    """
    # Private copy: the caller usually passes a filtered slice, and writing
    # the 'Time' column into it would both mutate the caller's frame and
    # raise SettingWithCopyWarning.
    data = data.copy()
    data['Time'] = pd.to_datetime(data['timestamp'], unit='s')
    ids = list(details.Asset_ID)

    chunks = []
    for asset_id in ids:
        asset = (
            data[data.Asset_ID == asset_id]
            .sort_values(by='Time')
            .set_index(keys='Time')
        )
        # A negative frequency shift moves future prices back onto the
        # current timestamp; timestamps without a matching row become NaN.
        # 'min' is used because the 'T' alias is deprecated in recent pandas.
        price_1 = asset[price_column].shift(freq='-1min')
        price_16 = asset[price_column].shift(freq='-16min')
        asset['r'] = np.log(price_16 / price_1)
        chunks.append(asset.reset_index())

    data = pd.concat(chunks, ignore_index=True).sort_values(by='Time')

    data['w'] = data['Asset_ID'].map(details.set_index(keys='Asset_ID')['Weight'])
    data['weighted_asset_r'] = data.w * data.r

    # Weighted market return per timestamp, normalised by the weights of the
    # rows actually present at that time.
    time_group = data.groupby('Time')
    data['m'] = (
        time_group['weighted_asset_r'].transform('sum')
        / time_group['w'].transform('sum')
    )

    data['m2'] = data.m ** 2
    data['mr'] = data.r * data.m

    rolling_window = f'{window}min'
    chunks = []
    for asset_id in ids:
        asset = (
            data[data.Asset_ID == asset_id]
            .sort_values(by='Time')
            .set_index(keys='Time')
        )
        asset['mr_rolling'] = (
            asset['mr'].rolling(window=rolling_window, min_periods=window).mean()
        )
        asset['m2_rolling'] = (
            asset['m2'].rolling(window=rolling_window, min_periods=window).mean()
        )
        chunks.append(asset.reset_index())

    data = pd.concat(chunks).sort_values(by='Time')
    data['beta'] = data['mr_rolling'] / data['m2_rolling']

    data['Target_recreated'] = data['r'] - data['beta'] * data['m']

    data['Target_diff'] = np.abs(data['Target'] - data['Target_recreated'])

    print(f'Average absolute error {data.Target_diff.mean():8.6f}')
    print(f'Max absolute error {data.Target_diff.max():8.6f}')
    print(f'Standard deviation {data.Target_diff.std():8.6f}')

    return data

In [None]:
# Re-select the largest batch from the untouched `data` and recompute the
# target on it. The explicit copy keeps column writes made by
# recreateTarget from landing on a slice of `data` (SettingWithCopyWarning).
newBatch = data.loc[data.batchSize == batchSize].copy()
modBatch = recreateTarget(newBatch, weights)

In [None]:
# One scatter plot per asset of the absolute difference between the provided
# and the recreated target, plotted against the row position.
# Swap in `coin.Target_recreated` to inspect the recreated target itself.
for asset_id in np.unique(data.Asset_ID):
    coin = modBatch.loc[modBatch.Asset_ID == asset_id]
    n_rows = len(coin)
    fig, ax = plt.subplots()
    ax.scatter(np.linspace(0, n_rows - 1, n_rows), coin.Target_diff)
    ax.set_title(asset_id)
