In [389]:
import pandas as pd
from scipy.stats import norm
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import pystable

# Get Data

Coinbase data (treated)

In [390]:
# Load the treated Coinbase ETH-BTC close prices; index_col=0 uses the CSV's first column as the row index.
coin_df = pd.read_csv('risk_pipeline/outputs/data/ETH-BTC_2022-06-01-00-00_2022-10-31-00-00_600secs_treated.csv', index_col=0)

In [391]:
# Preview the first rows to confirm the columns (time, close) and the row spacing.
coin_df.head()

Unnamed: 0,time,close
0,2022-06-01 00:00:00,0.06111
1,2022-06-01 00:10:00,0.061075
2,2022-06-01 00:20:00,0.060915
3,2022-06-01 00:30:00,0.060985
4,2022-06-01 00:40:00,0.06102


In [392]:
# Invert close (1 / price) so the Coinbase series is quoted the same way round
# as the Uniswap v3 TWAP it is compared with.
# NOTE: this overwrites the column in place, so re-running this cell without
# reloading coin_df inverts the prices back again.
coin_df.close = 1/coin_df.close

Uniswap data

In [393]:
# Load the Uniswap WBTC-WETH TWAP series; index_col=0 uses the CSV's first column as the row index.
uni_df = pd.read_csv('../csv/WBTC-WETH-10.0mTWAP-14881677-to-15864319.csv', index_col=0)

In [394]:
# Preview the first rows; note that the row index loaded from the CSV is not contiguous.
uni_df.head()

Unnamed: 0,timestamp,twap
0,2022-06-01 00:01:54,16.335589
2,2022-06-01 00:11:54,16.345683
4,2022-06-01 00:21:54,16.351692
10,2022-06-01 00:31:54,16.361773
11,2022-06-01 00:41:54,16.3666


In [395]:
# Replace the gappy index loaded from the CSV with a fresh 0..n-1 index; the old
# index is kept as a regular column named 'index'.
# NOTE: in-place and not idempotent - re-running the cell changes uni_df again.
uni_df.reset_index(inplace=True)

# Sanity checks

In [396]:
# Rows with a missing TWAP; an empty frame means there are none.
uni_df[uni_df.twap.isna()]

Unnamed: 0,index,timestamp,twap


In [397]:
# Rows with a missing close price; an empty frame means there are none.
coin_df[coin_df.close.isna()]

Unnamed: 0,time,close


In [398]:
# Size and time coverage of the Uniswap frame.
# timestamp is not parsed as a datetime by read_csv above, so min()/max() compare
# the raw values; that still orders correctly for this zero-padded format.
print('shape:', uni_df.shape)
print('first timestamp:', uni_df.timestamp.min())
print('last timestamp:', uni_df.timestamp.max())

shape: (21883, 3)
first timestamp: 2022-06-01 00:01:54
last timestamp: 2022-10-30 23:01:54


In [399]:
# Size and time coverage of the Coinbase frame, for comparison with the Uniswap one.
# time is not parsed as a datetime by read_csv above, so min()/max() compare
# the raw values; that still orders correctly for this zero-padded format.
print('shape:', coin_df.shape)
print('first timestamp:', coin_df.time.min())
print('last timestamp:', coin_df.time.max())

shape: (21889, 2)
first timestamp: 2022-06-01 00:00:00
last timestamp: 2022-10-31 00:00:00


# Combine data

In [400]:
# Put the Coinbase time/close columns and the Uniswap TWAP side by side.
# concat(axis=1) aligns on the row index, not on the timestamps themselves.
df = pd.concat([coin_df[['time', 'close']], uni_df['twap']], axis=1)

In [401]:
# Show the combined frame; twap is NaN on the trailing rows that only exist in coin_df.
df

Unnamed: 0,time,close,twap
0,2022-06-01 00:00:00,16.363934,16.335589
1,2022-06-01 00:10:00,16.373312,16.345683
2,2022-06-01 00:20:00,16.416318,16.351692
3,2022-06-01 00:30:00,16.397475,16.361773
4,2022-06-01 00:40:00,16.388069,16.366600
...,...,...,...
21884,2022-10-30 23:20:00,12.971010,
21885,2022-10-30 23:30:00,12.965124,
21886,2022-10-30 23:40:00,12.977743,
21887,2022-10-30 23:50:00,12.965964,


In [402]:
# Rename the price columns to short venue labels: close -> coin, twap -> uni.
df.columns = ['time', 'coin', 'uni']

In [403]:
# Count the missing values in each price column directly.
# (Counting the non-null entries of the first column of the filtered frame, as
# before, under-reports whenever that column is itself NaN on those rows, and
# relied on deprecated positional Series indexing via `[0]`.)
print('NAs in coinbase column =', df.coin.isna().sum())
print('NAs in uniswap column =', df.uni.isna().sum())


NAs in coinbase column = 0
NAs in uniswap column = 6


In [404]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

In [405]:
# Show the cleaned frame after the NaN rows have been dropped.
df

Unnamed: 0,time,coin,uni
0,2022-06-01 00:00:00,16.363934,16.335589
1,2022-06-01 00:10:00,16.373312,16.345683
2,2022-06-01 00:20:00,16.416318,16.351692
3,2022-06-01 00:30:00,16.397475,16.361773
4,2022-06-01 00:40:00,16.388069,16.366600
...,...,...,...
21878,2022-10-30 22:20:00,13.005592,12.995307
21879,2022-10-30 22:30:00,12.997141,12.995306
21880,2022-10-30 22:40:00,12.990387,12.995304
21881,2022-10-30 22:50:00,12.971851,12.995304


# Analysis

In [406]:
def line_chart(df, title, xcol, ycol, x_title, y_title):
    """Build a Plotly Express line chart with a title and axis labels.

    Args:
        df: Data passed straight to ``px.line``.
        title: Figure title.
        xcol: Column used for the x axis.
        ycol: Column name, or list of column names, plotted on the y axis.
        x_title: Label for the x axis.
        y_title: Label for the y axis.

    Returns:
        The figure created by ``px.line`` with its layout updated.
    """
    chart = px.line(df, x=xcol, y=ycol)
    # A single layout update covers the title and both axis labels.
    chart.update_layout(title=title, xaxis_title=x_title, yaxis_title=y_title)
    return chart

In [407]:
# Overlay both venues' prices on a single time axis.
plt = line_chart(
    df,
    title='Prices',
    xcol='time',
    ycol=['coin', 'uni'],
    x_title='Time',
    y_title='Prices',
)
plt.show()

In [408]:
# Pystable funcs

def gaussian():
    """Create a pystable distribution with alpha=2, beta=0, mu=0, sigma=1.

    alpha=2 is the Gaussian member of the stable family; the object is used as
    the starting point that ``pystable.fit`` later adjusts.
    """
    initial_params = dict(alpha=2.0, beta=0.0, mu=0.0, sigma=1.0, parameterization=1)
    return pystable.create(**initial_params)

def get_qs(dst):
    """Return ``pystable.q`` of ``dst`` at probabilities 0.01, 0.02, ..., 0.99."""
    probs = [pct / 100 for pct in range(1, 100)]
    return pystable.q(dst, probs, len(probs))

def pdf(dst, vals):
    """Evaluate ``pystable.pdf`` for distribution ``dst`` at every value in ``vals``."""
    n_vals = len(vals)
    return pystable.pdf(dst, vals, n_vals)

def log_returns(p):
    """Return the consecutive log returns ``log(p[i] / p[i-1])`` of a price series.

    Args:
        p: One-dimensional sequence of prices (list, NumPy array or pandas
            Series). Elements are taken by position, so a Series whose index
            does not start at 0 is handled correctly.

    Returns:
        A list with ``len(p) - 1`` elements; empty when ``p`` has fewer than
        two prices.
    """
    # Work on a plain array: integer indexing of a pandas Series is label-based
    # and breaks as soon as the index is not 0..n-1.
    prices = np.asarray(p, dtype=float)
    if prices.size < 2:
        return []
    # Same ratio-then-log formula as before, applied to the whole array at once.
    return list(np.log(prices[1:] / prices[:-1]))


In [409]:
# Data funcs
def get_qs_data(series):
    """Return the empirical 1st to 99th percentiles of ``series`` as a list of 99 values."""
    percents = np.arange(1, 100)
    # One call evaluates every percentile level at once.
    return list(np.percentile(series, percents))

In [410]:
# Per-step log returns for each venue (one element fewer than the price series).
log_coin = log_returns(df['coin'])
log_uni = log_returns(df['uni'])

In [411]:
# Start each distribution from the gaussian() defaults and fit it to that venue's log returns.
# pystable.fit is called for its effect on the distribution object it is given.
dst_coin = gaussian()
pystable.fit(dst_coin, log_coin, len(log_coin))

dst_uni = gaussian()
# The return value of this last call is what the cell displays
# (presumably a status code - confirm against the pystable docs).
pystable.fit(dst_uni, log_uni, len(log_uni))

0

In [412]:
# sigma of the Coinbase distribution after fitting (it was created with sigma=1.0).
dst_coin.contents.sigma

0.0007611643203630056

In [413]:
# Cross-check with SciPy: fit a plain normal to the same Coinbase log returns.
# norm.fit returns the (loc, scale) estimates, i.e. mean and standard deviation.
m, s = norm.fit(log_coin)
print(m, s)

-1.0574426902473633e-05 0.0017892309673637043


In [414]:
# Quantiles of each fitted distribution at probabilities 0.01 .. 0.99 (see get_qs).
coin_q = get_qs(dst_coin)
uni_q = get_qs(dst_uni)

In [415]:
# One row per probability level with each venue's fitted quantile alongside.
# The Percentage grid repeats the one built inside get_qs so the columns line up.
dist_df = pd.DataFrame(
    {'Percentage': [i/100 for i in range(1, 100, 1)],
     'coin': coin_q, 'uni': uni_q}
)

In [416]:
# Plot the fitted quantiles: x is the probability level, y the log-return quantile.
plt = line_chart(dist_df, 'CDFs', 'Percentage', ['coin', 'uni'], 'Percentage', 'Log returns')
plt.show()

In [417]:
# Attach the log returns to df. The leading 0 pads the first row, which has no
# previous price, so the lists match df's length.
df['uni_log_returns'] = [0] + log_uni
df['coin_log_returns'] = [0] + log_coin


In [418]:
# Histogram of the Uniswap log returns (includes the padded 0 in the first row).
fig = px.histogram(df, x=['uni_log_returns'])
fig.show()

In [419]:
# Histogram of the Coinbase log returns (includes the padded 0 in the first row).
fig = px.histogram(df, x=['coin_log_returns'])
fig.show()

The Uniswap log-return distribution has far too many zeros, so pystable may not be able to fit it. Check the Q-Q plot.

In [420]:
# Q-Q plot for Uniswap: empirical percentiles of the log returns (y) against the
# quantiles of the fitted distribution (x), both taken at the 1st to 99th levels.
uni_q_data = get_qs_data(log_uni)
fig = line_chart(pd.DataFrame({'data': uni_q_data, 'dist': uni_q}), 'Uni QQ plot', 'dist', 'data', 'Dist', 'Data')
fig.show()

In [421]:
# Q-Q plot for Coinbase: empirical percentiles of the log returns (y) against the
# quantiles of the fitted distribution (x), both taken at the 1st to 99th levels.
coin_q_data = get_qs_data(log_coin)
fig = line_chart(pd.DataFrame({'data': coin_q_data, 'dist': coin_q}), 'Coin QQ plot', 'dist', 'data', 'Dist', 'Data')
fig.show()

# Try adding noise

In [422]:
# Independent working copy of df without its first row: that row's log returns
# are only the padded 0 placeholder, not real returns.
noise_df = df.iloc[1:].copy()

In [423]:
# Show the working copy; its index now starts at 1.
noise_df

Unnamed: 0,time,coin,uni,uni_log_returns,coin_log_returns
1,2022-06-01 00:10:00,16.373312,16.345683,6.177501e-04,0.000573
2,2022-06-01 00:20:00,16.416318,16.351692,3.675261e-04,0.002623
3,2022-06-01 00:30:00,16.397475,16.361773,6.163437e-04,-0.001148
4,2022-06-01 00:40:00,16.388069,16.366600,2.949345e-04,-0.000574
5,2022-06-01 00:50:00,16.362595,16.366600,0.000000e+00,-0.001556
...,...,...,...,...,...
21878,2022-10-30 22:20:00,13.005592,12.995307,0.000000e+00,-0.001692
21879,2022-10-30 22:30:00,12.997141,12.995306,-1.136248e-07,-0.000650
21880,2022-10-30 22:40:00,12.990387,12.995304,-1.136248e-07,-0.000520
21881,2022-10-30 22:50:00,12.971851,12.995304,0.000000e+00,-0.001428


In [424]:
# Relative difference between the Uniswap and Coinbase prices, as a fraction:
# uni / coin - 1 (positive when the Uniswap price is higher).
noise_df['uni_coin_diff'] = noise_df.uni/noise_df.coin - 1
noise_df

Unnamed: 0,time,coin,uni,uni_log_returns,coin_log_returns,uni_coin_diff
1,2022-06-01 00:10:00,16.373312,16.345683,6.177501e-04,0.000573,-0.001687
2,2022-06-01 00:20:00,16.416318,16.351692,3.675261e-04,0.002623,-0.003937
3,2022-06-01 00:30:00,16.397475,16.361773,6.163437e-04,-0.001148,-0.002177
4,2022-06-01 00:40:00,16.388069,16.366600,2.949345e-04,-0.000574,-0.001310
5,2022-06-01 00:50:00,16.362595,16.366600,0.000000e+00,-0.001556,0.000245
...,...,...,...,...,...,...
21878,2022-10-30 22:20:00,13.005592,12.995307,0.000000e+00,-0.001692,-0.000791
21879,2022-10-30 22:30:00,12.997141,12.995306,-1.136248e-07,-0.000650,-0.000141
21880,2022-10-30 22:40:00,12.990387,12.995304,-1.136248e-07,-0.000520,0.000379
21881,2022-10-30 22:50:00,12.971851,12.995304,0.000000e+00,-0.001428,0.001808


In [425]:
# The mean relative price difference is expected to be ~0, since the two venues
# quote very similar prices.
noise_df.uni_coin_diff.mean()

1.8854669752077555e-05

In [439]:
# Distribution of the relative price difference, normalised to a probability
# density. It is expected to look Gaussian.
px.histogram(noise_df, x='uni_coin_diff', nbins=500, histnorm='probability density')

In [427]:
# Fit a normal distribution to the relative price difference.
# `mean` and `std_dev` are notebook-level names reused by the noise cells below.
mean, std_dev = norm.fit(noise_df.uni_coin_diff)
print('Mean =', mean)
print('Std Dev =', std_dev)

Mean = 1.8854669752077555e-05
Std Dev = 0.0025371914816321155


In [437]:
# Overlay the fitted normal density on the histogram of the relative price difference.
histogram = go.Histogram(x=noise_df.uni_coin_diff, nbinsx=500, histnorm='probability density', opacity=0.5)
x = np.linspace(noise_df.uni_coin_diff.min(), noise_df.uni_coin_diff.max(), 100)
# Named `fitted_pdf` rather than `pdf` so that the notebook's pystable helper
# function `pdf(dst, vals)` is not overwritten by an array.
fitted_pdf = norm.pdf(x, loc=mean, scale=std_dev)
line = go.Scatter(x=x, y=fitted_pdf, mode='lines')
fig = go.Figure(data=[histogram, line])

# update the layout
fig.update_layout(title='Histogram with PDF', xaxis_title='Value', yaxis_title='Probability density')

# show the plot
fig.show()


In [429]:
def gauss_data_point(mean, stddev, thresh):
    """Draw one normal sample, redrawing until it lies within ``thresh`` standard deviations.

    Args:
        mean: Centre of the normal distribution.
        stddev: Standard deviation of the normal distribution.
        thresh: Half-width of the accepted interval, in units of ``stddev``.

    Returns:
        A single sample in ``[mean - thresh * stddev, mean + thresh * stddev]``.

    Raises:
        ValueError: If ``thresh`` is not positive while ``stddev`` is positive,
            because no draw could ever be accepted and the loop would not end.
    """
    if stddev > 0 and thresh <= 0:
        raise ValueError('thresh must be positive when stddev is positive')
    lower_bound = mean - thresh*stddev
    upper_bound = mean + thresh*stddev
    # Sample with the `stddev` argument; the previous code read a notebook-level
    # `std_dev` variable and silently ignored the parameter.
    point = np.random.normal(loc=mean, scale=stddev, size=1)
    while (point < lower_bound) | (point > upper_bound):
        point = np.random.normal(loc=mean, scale=stddev, size=1)
    return point[0]

### Add noise to all uni data points

In [430]:
# Scale every Uniswap price by (1 + eps), where eps is a fresh draw from the
# normal fitted to uni_coin_diff (mean, std_dev), redrawn until it lies within
# 2 standard deviations.
# NOTE: no random seed is set in the visible cells, so the values change on every run.
noise_df['uni_noisy'] = noise_df.uni.apply(
    lambda x: (gauss_data_point(mean, std_dev, 2) + 1) * x
)

In [441]:
# Show the frame with the new uni_noisy column.
noise_df

Unnamed: 0,time,coin,uni,uni_log_returns,coin_log_returns,uni_coin_diff,uni_noisy
1,2022-06-01 00:10:00,16.373312,16.345683,6.177501e-04,0.000573,-0.001687,16.400136
2,2022-06-01 00:20:00,16.416318,16.351692,3.675261e-04,0.002623,-0.003937,16.392622
3,2022-06-01 00:30:00,16.397475,16.361773,6.163437e-04,-0.001148,-0.002177,16.406841
4,2022-06-01 00:40:00,16.388069,16.366600,2.949345e-04,-0.000574,-0.001310,16.379511
5,2022-06-01 00:50:00,16.362595,16.366600,0.000000e+00,-0.001556,0.000245,16.390693
...,...,...,...,...,...,...,...
21878,2022-10-30 22:20:00,13.005592,12.995307,0.000000e+00,-0.001692,-0.000791,13.029960
21879,2022-10-30 22:30:00,12.997141,12.995306,-1.136248e-07,-0.000650,-0.000141,13.002056
21880,2022-10-30 22:40:00,12.990387,12.995304,-1.136248e-07,-0.000520,0.000379,13.026808
21881,2022-10-30 22:50:00,12.971851,12.995304,0.000000e+00,-0.001428,0.001808,12.997517


In [433]:
# Log returns of the fully noised series. The column is converted to a NumPy
# array first so it is indexed by position (noise_df's index starts at 1).
log_uni_noisy = log_returns(np.array(noise_df.uni_noisy))

In [440]:
# Histogram of the log returns after adding noise to every Uniswap price.
fig = px.histogram(log_uni_noisy)
fig.show()

### Add noise only to those uni data points that repeat the previous value

In [446]:
# Previous row's Uniswap price (NaN on the first row).
noise_df['shift_uni'] = noise_df.uni.shift(1)
# Where the price is exactly equal to the previous one, use a noised price;
# otherwise keep the original. Prices that changed by even a tiny amount are
# not treated as repeats.
# The noised candidates are computed for every row before np.where selects
# among them, so a random draw is made for each row regardless of the mask.
noise_df['uni_noisy_v2'] = np.where(
    noise_df.uni == noise_df.shift_uni,
    noise_df.uni.apply(lambda x: (gauss_data_point(mean, std_dev, 2) + 1) * x),
    noise_df.uni
)

In [448]:
# Preview: uni_noisy_v2 differs from uni only on rows where uni equals shift_uni.
noise_df.head()

Unnamed: 0,time,coin,uni,uni_log_returns,coin_log_returns,uni_coin_diff,uni_noisy,shift_uni,uni_noisy_v2
1,2022-06-01 00:10:00,16.373312,16.345683,0.000618,0.000573,-0.001687,16.400136,,16.345683
2,2022-06-01 00:20:00,16.416318,16.351692,0.000368,0.002623,-0.003937,16.392622,16.345683,16.351692
3,2022-06-01 00:30:00,16.397475,16.361773,0.000616,-0.001148,-0.002177,16.406841,16.351692,16.361773
4,2022-06-01 00:40:00,16.388069,16.3666,0.000295,-0.000574,-0.00131,16.379511,16.361773,16.3666
5,2022-06-01 00:50:00,16.362595,16.3666,0.0,-0.001556,0.000245,16.390693,16.3666,16.399331


In [449]:
# Log returns of the selectively noised series. The column is converted to a
# NumPy array first so it is indexed by position (noise_df's index starts at 1).
log_uni_noisy_v2 = log_returns(np.array(noise_df.uni_noisy_v2))

In [450]:
# Histogram of the log returns after adding noise only to the repeated prices.
fig = px.histogram(log_uni_noisy_v2)
fig.show()