## Imports

In [1]:
# Remove unwanted warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data extraction and management
import pandas as pd
import numpy as np

# Feature Engineering
from sklearn.preprocessing import StandardScaler

# Machine Learning
from sklearn.cluster import KMeans
from sklearn import metrics
from kneed import KneeLocator

import statsmodels.api as sm

# Reporting visualization
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

data_location = "D:\\data_dump\\deriv_market_data-5M-20221231-20110101.hd5"

### Data Extraction

In [2]:
# get a list of all the symbols we will be working with
symbols = []
with pd.HDFStore(data_location) as hdf:
    keys = hdf.keys()
    for k in keys:
        symbols.append(k[1:])

# for testing only fx pairs
symbols = ['AUDCAD', 'AUDCHF', 'AUDJPY', 'AUDNZD', 'CADCHF', 'CADJPY', 'CHFJPY', 'EURAUD', 'EURCAD', 'EURCHF', 'EURGBP', 'EURJPY', 'EURNOK', 'EURNZD', 'EURUSD', 'GBPAUD', 'GBPCAD', 'GBPCHF', 'GBPJPY', 'GBPNOK', 'GBPNZD', 'GBPUSD', 'NZDCAD', 'NZDJPY', 'NZDUSD', 'USDCAD', 'USDCHF', 'USDJPY', 'XAUEUR']

## K-Means Clustering

In [5]:
## func to return a list of features from each market data 
def get_market_features(sym, tf_rule='1D'):
    df = pd.read_hdf(data_location, key=sym)
    df.asfreq(freq='5T')
    df.set_index(['time'], inplace=True)
    agg_dict = {'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'tick_volume': 'sum', 'spread': 'mean'}
    df = df.resample(rule=tf_rule).agg(agg_dict) # 1 Day
    df.dropna(how='all', subset=['open', 'high', 'low', 'close', 'spread'], axis=0, inplace=True)
    
    # feature engineering 
    df['returns'] = df['close'].pct_change()
    ann_returns =  df['returns'].mean() * 255
    ann_volatility = df['returns'].std() * np.sqrt(255)
        
    return (sym, ann_returns, ann_volatility)


def do_kmeans_clustering(tf_rule='1D'):
    ## run get_market_features on all symbols to build our new data frame
    all_market_features = [get_market_features(sym, tf_rule) for sym in symbols]
    data = pd.DataFrame.from_records(all_market_features, columns=['ticker', 'ann_returns', 'ann_volatility'], index='ticker')

    # Scale Features
    scaler = StandardScaler()
    scaler = scaler.fit_transform(data)
    scaled_data = pd.DataFrame(scaler, columns=data.columns, index=data.index)
    
    # Find the optimum number of clusters
    X = scaled_data.copy()
    K = range(1, 15)
    distortions = []
    for k in K:
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(X)
        distortions.append(kmeans.inertia_)

    kl = KneeLocator(K, distortions, curve="convex", direction="decreasing")
    c = kl.elbow
    
    # Fit K-Means Model
    k_means = KMeans(n_clusters=c)
    k_means.fit(X)
    prediction = k_means.predict(scaled_data)
    
    groupings = {}
    for k, v in zip(X.index, k_means.labels_.flatten()):
        if v in groupings:
            groupings[v].append(k)
        else:
            groupings[v] = [k]
            
    return c, groupings
    
    

In [6]:
(k, groupings) = do_kmeans_clustering()
print(k)
print(groupings)



5
{4: ['AUDCAD', 'AUDNZD', 'EURCHF', 'EURGBP', 'EURUSD', 'GBPUSD'], 3: ['AUDCHF', 'CADCHF', 'GBPCHF', 'GBPNZD', 'NZDUSD'], 0: ['AUDJPY', 'CADJPY', 'CHFJPY', 'GBPJPY', 'GBPNOK', 'NZDJPY'], 1: ['EURAUD', 'EURCAD', 'EURJPY', 'EURNOK', 'EURNZD', 'GBPAUD', 'GBPCAD', 'NZDCAD', 'USDCAD', 'USDCHF', 'USDJPY'], 2: ['XAUEUR']}


In [7]:
various_timeframes = ['5M', '15M', '1H', '4H', '1D']
results = {tf:do_kmeans_clustering(tf) for tf in various_timeframes}



In [8]:
results

{'5M': (4,
  {2: ['AUDCAD',
    'EURAUD',
    'EURCAD',
    'EURGBP',
    'EURNOK',
    'GBPCAD',
    'GBPNOK',
    'NZDCAD',
    'USDCAD',
    'USDCHF'],
   0: ['AUDCHF', 'AUDNZD', 'CADCHF', 'EURCHF', 'EURUSD', 'GBPCHF', 'GBPUSD'],
   1: ['AUDJPY',
    'CADJPY',
    'EURJPY',
    'EURNZD',
    'GBPAUD',
    'GBPJPY',
    'GBPNZD',
    'NZDJPY',
    'NZDUSD'],
   3: ['CHFJPY', 'USDJPY', 'XAUEUR']}),
 '15M': (3,
  {2: ['AUDCAD',
    'EURAUD',
    'EURCAD',
    'EURGBP',
    'GBPAUD',
    'GBPCAD',
    'GBPNOK',
    'GBPNZD',
    'NZDCAD',
    'NZDUSD',
    'USDCHF'],
   0: ['AUDCHF',
    'AUDNZD',
    'CADCHF',
    'EURCHF',
    'EURNZD',
    'EURUSD',
    'GBPCHF',
    'GBPUSD'],
   1: ['AUDJPY',
    'CADJPY',
    'CHFJPY',
    'EURJPY',
    'EURNOK',
    'GBPJPY',
    'NZDJPY',
    'USDCAD',
    'USDJPY',
    'XAUEUR']}),
 '1H': (5,
  {3: ['AUDCAD', 'AUDNZD', 'EURCHF', 'EURGBP', 'EURUSD', 'GBPUSD'],
   2: ['AUDCHF', 'CADCHF', 'GBPCHF', 'GBPNZD', 'NZDUSD'],
   4: ['AUDJPY', 'CADJPY', '