In [124]:
import numpy as np
import pandas as pd
import itertools
from datetime import datetime
import statsmodels.tsa.stattools as smts
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw table from a local Feather file (path is relative to the
# notebook's working directory). The next cell relies on its 'open_time',
# 'symbol' and 'close' columns.
df = pd.read_feather('okx_20221h.feather')

In [166]:
# Wide close-price table: one row per open_time, one column per symbol.
# An explicit drop of the four LUNA symbols was tried here earlier; the suffix
# filter below already removes all of them except LUNA-USDT.OK.
pivot_df = (
    pd.pivot_table(df, values='close', index='open_time', columns='symbol')
    # Keep only the symbols whose name does not end in one of these suffixes.
    .loc[:, lambda wide: ~wide.columns.str.endswith(
        ('-BTC.OK', '-ETH.OK', '-USDC.OK', '-USDK.OK', '-UST.OK', '-DAI.OK', '-OKB.OK'))]
    # Carry the last observed price forward over gaps (leading gaps stay NaN).
    .ffill()
)
pivot_df.head()

symbol,1INCH-USDT.OK,AAC-USDT.OK,AAVE-USDT.OK,ABT-USDT.OK,ACA-USDT.OK,ACT-USDT.OK,ADA-USDT.OK,AE-USDT.OK,AERGO-USDT.OK,AGLD-USDT.OK,...,YGG-USDT.OK,YOU-USDT.OK,YOYO-USDT.OK,ZBC-USDT.OK,ZEC-USDT.OK,ZEN-USDT.OK,ZIL-USDT.OK,ZKS-USDT.OK,ZRX-USDT.OK,ZYRO-USDT.OK
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01,2.5344,0.001475,266.53,0.1466,,0.00709,1.37919,0.141,0.2431,1.7022,...,5.5807,0.003715,0.01852,,150.35,63.94,0.07663,0.3786,0.8753,0.01063
2022-01-02,2.5531,0.001494,265.41,0.1582,,0.00695,1.37697,0.1461,0.2776,1.8319,...,5.6038,0.003722,0.01859,,152.96,65.05,0.07704,0.3724,0.8882,0.01074
2022-01-03,2.5625,0.001447,267.66,0.1516,,0.00709,1.31932,0.1517,0.251,1.7179,...,5.3267,0.003723,0.01786,,150.62,63.42,0.07438,0.3646,0.9404,0.01053
2022-01-04,2.4783,0.001406,252.0,0.147,,0.00701,1.30888,0.1538,0.2584,1.6257,...,5.1844,0.003634,0.01754,,147.32,64.4,0.07111,0.3587,0.8894,0.01057
2022-01-05,2.3211,0.001306,230.07,0.1387,,0.00668,1.2301,0.1458,0.236,1.5681,...,4.7126,0.003519,0.01629,,136.88,60.39,0.06676,0.3302,0.8253,0.00926


## Distance Method

In [169]:
def _distance_score(p1, p2):
        diff = p1 - p2
        return (diff * diff).sum()

def _compute_stat(p):
        return np.mean(p), np.std(p)

def _distance_transform(training_pair):
    """Standardise each series of a pair with its own mean and deviation.

    Parameters
    ----------
    training_pair : iterable of exactly two series
        Unpacked as ``(first, second)``.

    Returns
    -------
    tuple
        ``((first - mean1) / std1, (second - mean2) / std2)`` where the
        statistics come from ``_compute_stat`` applied to each series.
    """
    first, second = training_pair
    first_mean, first_std = _compute_stat(first)
    second_mean, second_std = _compute_stat(second)
    # No guard on the divisor: a zero deviation propagates as NaN/inf.
    return (first - first_mean) / first_std, (second - second_mean) / second_std

def _plot_two_series(x1, x2, label1, label2, title, plt_width=20, plt_height=5):
    """Draw two series on one pyplot figure with dot markers and show it.

    Parameters
    ----------
    x1, x2 : series accepted by ``plt.plot``
    label1, label2 : legend labels for ``x1`` and ``x2`` respectively
    title : figure title
    plt_width, plt_height : figure size written to ``rcParams``

    Note that the size is set through ``plt.rcParams['figure.figsize']``, so
    it remains the default for figures created afterwards as well.
    """
    plt.rcParams['figure.figsize'] = [plt_width, plt_height]
    for series, legend_name in ((x1, label1), (x2, label2)):
        plt.plot(series, marker='.', label=legend_name)
    plt.title(title)
    plt.legend(loc='best')
    plt.show()
        
def pair_select(df, n = 5):
    """Rank every pair of columns by distance and return the ``n`` closest.

    Each pair is standardised with ``_distance_transform`` and scored with
    ``_distance_score`` (sum of squared differences); a lower score means the
    two normalised series track each other more closely.

    Parameters
    ----------
    df : pandas.DataFrame
        One price series per column.
    n : int, default 5
        Maximum number of pairs to return. Fewer are returned when fewer
        pairs can be scored; values below zero are treated as zero.

    Returns
    -------
    list of tuple
        ``((column_a, column_b), score)`` entries sorted by ascending score.

    Notes
    -----
    The number of candidate pairs is printed as a progress hint.
    NOTE(review): the score is a sum, and pandas sums skip NaN, so pairs with
    a shorter shared history may be favoured; confirm whether a per-observation
    average is wanted instead.
    """
    all_pairs = list(itertools.combinations(df.columns, 2))
    print(len(all_pairs))

    # Pairs that cannot be scored stay at +inf so they never rank as "close".
    scores = np.full(len(all_pairs), np.inf)
    for idx, (first, second) in enumerate(all_pairs):
        p1, p2 = _distance_transform((df[first], df[second]))
        # A flat or empty column normalises to all-NaN; a NaN-skipping sum
        # would then report 0.0 and put the pair at the top of the ranking.
        if np.isfinite(p1 - p2).any():
            scores[idx] = _distance_score(p1, p2)

    # A full sort (instead of argpartition) yields best-first order and stays
    # valid when n is at least the number of pairs.
    ranked = np.argsort(scores, kind='stable')
    ranked = ranked[np.isfinite(scores[ranked])]
    return [(all_pairs[i], scores[i]) for i in ranked[:max(n, 0)]]

In [170]:
# Score every pair of columns in pivot_df and show five low-distance pairs.
pair_select(pivot_df, n = 5)

66066


[(('BTC-USDT.OK', 'WBTC-USDT.OK'), 0.0012734742398090961),
 (('BETH-USDT.OK', 'ETH-USDT.OK'), 0.416863267715406),
 (('BORA-USDT.OK', 'T-USDT.OK'), 1.87303404135761),
 (('BORA-USDT.OK', 'PCI-USDT.OK'), 2.684909475371553),
 (('PCI-USDT.OK', 'T-USDT.OK'), 2.7013239886117533)]

## Cointegration

In [161]:
def coint(df, sig_level = 0.01):
    """Screen every pair of columns for a stationary OLS residual.

    For each unordered pair ``(a, b)`` of columns, ``a`` is regressed on ``b``
    with an intercept and an Augmented Dickey-Fuller test is run on the
    residuals. A pair is kept when the ADF p-value is below ``sig_level`` and
    the fitted slope is positive.

    Parameters
    ----------
    df : pandas.DataFrame
        One price series per column. Gaps are forward- then backward-filled
        on an internal copy; the caller's frame is not modified.
    sig_level : float, default 0.01
        Upper bound (exclusive) on the ADF p-value for a pair to be kept.

    Returns
    -------
    list of tuple
        ``(column_a, column_b, p_value)`` for every accepted pair, in
        ``itertools.combinations`` order over the usable columns.
    """
    # Fill on a copy: an in-place fill would silently rewrite the frame the
    # caller keeps using in later cells.
    prices = df.ffill().bfill()

    # Constant or all-NaN columns are skipped. With a constant regressor
    # add_constant adds no intercept by default, so the two-value unpack of
    # the fitted parameters below would fail.
    usable = [col for col in prices.columns if prices[col].nunique() > 1]

    cointegrated_pairs = []
    for stock_1, stock_2 in itertools.combinations(usable, 2):
        design = sm.add_constant(prices[stock_2])
        results = sm.OLS(prices[stock_1], design).fit()
        _, slope = results.params
        # NOTE(review): adfuller reports p-values from the plain unit-root
        # tables, not the Engle-Granger ones used by
        # statsmodels.tsa.stattools.coint for estimated residuals; confirm
        # which test is intended.
        p_value = smts.adfuller(results.resid)[1]
        if p_value < sig_level and slope > 0:
            cointegrated_pairs.append((stock_1, stock_2, p_value))
    return cointegrated_pairs

In [163]:
# Test every pair of columns in pivot_df at the default significance level
# (sig_level=0.01) and display the accepted (column, column, p-value) tuples.
coint(pivot_df)

[('1INCH-USDT.OK', 'ACA-USDT.OK', 1.9751969292429477e-05),
 ('1INCH-USDT.OK', 'ADA-USDT.OK', 0.002047551187243065),
 ('1INCH-USDT.OK', 'AKITA-USDT.OK', 0.0002118674519677873),
 ('1INCH-USDT.OK', 'ANW-USDT.OK', 0.005505452464221092),
 ('1INCH-USDT.OK', 'APIX-USDT.OK', 0.001520309434695305),
 ('1INCH-USDT.OK', 'AR-USDT.OK', 0.007136321935775084),
 ('1INCH-USDT.OK', 'ATOM-USDT.OK', 0.0025443365904616612),
 ('1INCH-USDT.OK', 'AUCTION-USDT.OK', 0.002863848123551835),
 ('1INCH-USDT.OK', 'BADGER-USDT.OK', 7.126160753590234e-05),
 ('1INCH-USDT.OK', 'BAND-USDT.OK', 0.0031436603349244193),
 ('1INCH-USDT.OK', 'BORA-USDT.OK', 0.0048423097233382135),
 ('1INCH-USDT.OK', 'BORING-USDT.OK', 0.008218944641635893),
 ('1INCH-USDT.OK', 'CELO-USDT.OK', 0.002718803921701999),
 ('1INCH-USDT.OK', 'CELR-USDT.OK', 0.00986032062007259),
 ('1INCH-USDT.OK', 'CFG-USDT.OK', 0.008024572936191342),
 ('1INCH-USDT.OK', 'CLV-USDT.OK', 7.137346304149092e-05),
 ('1INCH-USDT.OK', 'CQT-USDT.OK', 0.0014672151934755446),
 ('1IN