In [None]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import glob
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_white
from statsmodels.compat import lzip
from scipy.linalg import toeplitz
from patsy import dmatrices
from math import sqrt
sns.set_style('white')

In [None]:
# Collect the 2019 CSV paths in lexicographic order and show them.
filename_2019 = sorted(glob.glob('../data/2019/*.csv'))
filename_2019

In [None]:
# Identifiers of the two instruments: used as CSV file stems under ../data/2018/
# and as prefixes for the per-instrument column names.
X, Y = 'lm', 'vm'

In [None]:
# Read the 2018 bars for instrument X; the first CSV column is used as the index.
x_csv_path = f'../data/2018/{X}.csv'
df = pd.read_csv(x_csv_path, index_col=0, encoding='gbk')

# Overwrite the headers with fixed names; close and volume carry the X prefix
# so they stay distinguishable once the Y series are added to the same frame.
df.columns = ['code', 'time', 'open', 'high', 'low', f'{X}_close', f'{X}_volume', 'turnover', 'open interest']
df['time'] = pd.to_datetime(df['time'])

# Name the index "type" and move it back into an ordinary column, leaving a
# plain positional index on the frame.
df = df.rename_axis("type", axis='index').reset_index()
df

In [None]:
# Line plot of the X close series over the full sample, with the top/right spines removed.
x_close_ax = df[[f'{X}_close']].plot(figsize=(12, 4), grid=False)
sns.despine(ax=x_close_ax)

In [None]:
# Read the 2018 bars for instrument Y; the first CSV column is used as the index.
y_csv_path = f'../data/2018/{Y}.csv'
df_other = pd.read_csv(y_csv_path, index_col=0, encoding='gbk')

# Overwrite the headers with fixed names; close and volume carry the Y prefix.
df_other.columns = ['code', 'time', 'open', 'high', 'low', f'{Y}_close', f'{Y}_volume', 'turnover', 'open interest']
df_other['time'] = pd.to_datetime(df_other['time'])

# Name the index "type" and move it back into an ordinary column, leaving a
# plain positional index on the frame.
df_other = df_other.rename_axis("type", axis='index').reset_index()

df_other

In [None]:
# Line plot of the Y close series over the full sample, with the top/right spines removed.
y_close_ax = df_other[[f'{Y}_close']].plot(figsize=(12, 4), grid=False)
sns.despine(ax=y_close_ax)

In [None]:
# Reduce the X frame to the columns kept for the analysis by dropping the
# remaining bar fields.
irrelevant_col = ['code', 'open', 'high', 'low', 'turnover', 'open interest']
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a strict drop would raise KeyError.
df = df.drop(columns=irrelevant_col, errors='ignore')

# Attach the Y close and volume next to the X series.
# NOTE(review): the assignment aligns on the row label (the positional index
# left by reset_index), not on 'time'. This is only correct if both CSVs
# contain exactly the same timestamps in the same order - confirm, or join on
# 'time' instead.
df[f'{Y}_close'] = df_other.loc[:, f'{Y}_close'].copy()
df[f'{Y}_volume'] = df_other.loc[:, f'{Y}_volume'].copy()

In [None]:
# Calendar date of every bar, and the date of the bar that follows it.
df['date'] = df['time'].dt.date
df['date_after'] = df['date'].shift(-1)

# A bar is the last of its day when the next bar carries a different date.
# The final row has no successor (NaN after the shift), so it is flagged too.
df['daily_close'] = df['date'] != df['date_after']

# Keep only those end-of-day bars as an independent daily frame.
df_day = df[df['daily_close']].copy()

# Daily closes of both legs; plotted before the index is reset, so the x-axis
# shows the row labels of the original frame.
daily_close_cols = [f'{X}_close', f'{Y}_close']
daily_ax = df_day[daily_close_cols].plot(grid=False, figsize=(12, 4))
sns.despine(ax=daily_ax)

# Renumber the rows from 0; the previous labels are kept in an 'index' column.
df_day = df_day.reset_index()

In [None]:
# Daily log-return of each leg: log(close_t / close_{t-1}).
# The first row has no previous close and therefore becomes NaN.
for leg in (X, Y):
    leg_close = df_day[f'{leg}_close']
    df_day[f'{leg}_log_ret'] = np.log(leg_close / leg_close.shift(1))

In [None]:
# Display df_day to inspect the daily-close frame.
df_day

In [None]:
def calculate_hull_moving_average(df, period, close):
    """Add an EWM-based Hull-style moving average of ``df[close]`` to ``df``.

    The result is written in place to a new column named
    ``hma{period}_{close}`` and is computed as::

        raw = 2 * EWM(close, span=period / 2) - EWM(close, span=period)
        hma = EWM(raw, span=int(sqrt(period)))

    where ``EWM(s, span)`` is ``s.ewm(span=span).mean()``.

    NOTE(review): all three smoothing steps use exponentially weighted means.
    The Hull moving average is usually defined with linearly weighted moving
    averages - confirm that the EWM variant is what is intended here.

    Intermediate series are kept in local variables, so no scratch columns are
    written to ``df`` and existing columns (for example one named ``diff``)
    are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the price column; modified in place.
    period : int or float
        Span of the slow average. Must be at least 2 so that the half span
        passed to ``ewm`` is at least 1.
    close : str
        Name of the column in ``df`` to smooth.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``period`` is smaller than 2.
    KeyError
        If ``close`` is not a column of ``df``.
    """
    # pandas rejects span < 1; check up front so a bad period fails before
    # anything has been computed or written.
    if period < 2:
        raise ValueError(f'period must be >= 2 so that span=period/2 is >= 1, got {period}')

    prices = df[close]
    full_span_mean = prices.ewm(span=period).mean()
    half_span_mean = prices.ewm(span=period / 2).mean()

    # Over-weight the fast average to offset the lag of the slow one.
    raw_hull = half_span_mean * 2 - full_span_mean

    # Final smoothing pass over the lag-corrected series.
    df[f'hma{period}_{close}'] = raw_hull.ewm(span=int(math.sqrt(period))).mean()
    

In [None]:
# Smooth the daily close of both legs with a 7-period Hull-style average and
# plot the two smoothed series together.
hma_period = 7
hma_inputs = [f'{X}_close', f'{Y}_close']
for close_col in hma_inputs:
    calculate_hull_moving_average(df_day, hma_period, close_col)

hma_cols = [f'hma{hma_period}_{col}' for col in hma_inputs]
hma_ax = df_day[hma_cols].plot(grid=False, figsize=(12, 4))
sns.despine(ax=hma_ax)

In [None]:
fig, axs = plt.subplots(2, 3, figsize=(18, 15))

# One row of panels per leg: price path, log-return path, log-return histogram.
for row, leg in enumerate((X, Y)):
    leg_prices = df_day[f'{leg}_close']
    leg_returns = df_day[f'{leg}_log_ret']

    leg_prices.plot(ax=axs[row, 0], title=f'Plot of prices for {leg}', grid=True)
    leg_returns.plot(ax=axs[row, 1], title=f'Plot of log-returns for {leg}', grid=True)
    leg_returns.plot(kind='hist', bins=100, ax=axs[row, 2],
                     title=f'Distribution of log-returns for {leg}', grid=True)

    # Dashed red line marks the median log-return on the histogram.
    axs[row, 2].axvline(leg_returns.median(), color='red', linestyle='--')

In [None]:
# Column names of the two legs of the spread: long X, short Y.
long_leg = f'{X}_log_ret'
short_leg = f'{Y}_log_ret'

# The return series do not change inside the loop, so look them up once.
long_returns = df_day[long_leg]
short_returns = df_day[short_leg]

# Column M holds the difference between the M-day rolling sums of the long
# and short log-returns, for M = 1..50.
df_spreads = pd.DataFrame()
for rolling_period in range(1, 51):
    long_sum = long_returns.rolling(rolling_period).sum()
    short_sum = short_returns.rolling(rolling_period).sum()
    df_spreads[rolling_period] = long_sum - short_sum

In [None]:
# Summary statistics per rolling period: the deciles 10%..90% plus the
# 0.5% and 99.5% tail percentiles.
spread_percentiles = [decile / 10 for decile in range(1, 10)]
spread_percentiles += [0.005, 0.995]
df_spread_stats = df_spreads.describe(percentiles=spread_percentiles)

In [None]:
# Compare the spread for a handful of rolling windows M.
# The labels are built from X and Y so they always name the instruments that
# were actually loaded (the previous text hard-coded unrelated tickers).
df_spreads[[1, 10, 20, 30, 40, 50]].plot(
    figsize=(12, 6),
    grid=True,
    ylabel=f'Diff. of $M$ period log-returns {X} - {Y}',
    title=f'Log-return difference between {X} and {Y} for different values of rolling period $M$',
);