In [314]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from dtaidistance import dtw


In [160]:
class Backtest:
    def __init__(self, csv :str, start :int, interval :int, port) -> None:
        self.df = pd.read_csv(csv, delimiter=',')
        self.start_idx = self.current_idx = self.df.index[self.df['unix'] == start][0]
        self.interval = int(interval / 3600)
        self.port = port
        self.owned = 0

    def get_close(self, datapoints :int):
        return self.df.iloc[self.start_idx:self.start_idx + self.interval * datapoints:self.interval]['close']
    
    def get_price(self):
        return self.df.iloc[self.current_idx]['open']

    def buy(self, volume: float):
            self.owned += volume
            self.port -= volume * self.get_price()

    def sell(self, volume: float):
        if self.owned >= volume:
            self.port += volume * self.get_price()
            self.owned -= volume
        else:
            self.port = self.owned * self.get_price()
            self.owned = 0

    def simple_rsi(self, datapoints :int, lookback=24):
        changes = self.get_close(datapoints).pct_change()
        simple_rsis = [np.nan]*lookback

        for window in changes.rolling(window=lookback):
    
            if len(window) != lookback: continue

            positives = window[window>0].sum()
            negatives = window[window<0].sum() * -1

            simple_rsis.append(100 - 100/(1 + positives/negatives))

        return pd.Series(index = self.get_close(datapoints).index, data=simple_rsis[:-1])
    
    def get_signals(self, datapoints, buy_thresh: int, sell_thresh: int):
        rsi_data = self.simple_rsi(datapoints)
        print(f'length of rsi_data is {len(rsi_data)}')
        actions = []
        for rsi in rsi_data:
            if rsi == np.nan: actions.append(None)
            
            if rsi >= sell_thresh: actions.append('SELL')
            elif rsi <= buy_thresh: actions.append('BUY')
            else: actions.append(None)
        print(f'length of actions is {len(actions)}')
        return pd.Series(index=rsi_data.index, data=actions)

    def calc_pnl(self):
        return self.port + self.owned * self.get_price()

In [163]:
def run_algo():
    ETHUSD = Backtest('BacktestData.csv', 1670396400, 3600*4, 10000)
    signals = ETHUSD.get_signals(2000, 35, 65)
    k = []
    for signal in signals:
        if signal == 'BUY' :
            ETHUSD.buy(ETHUSD.port/ETHUSD.get_price())
        elif signal == 'SELL':
            ETHUSD.sell(ETHUSD.owned)
        else:
            pass
        ETHUSD.current_idx += ETHUSD.interval
        k.append(ETHUSD.calc_pnl())
    print(max(k))
    print(ETHUSD.calc_pnl())

In [47]:
run_algo()

NameError: name 'run_algo' is not defined

In [None]:
# data labeling
# write + implement dtw
# implement kmeans clustering (+ heirarchical clustering as a side bonus?)
# cut out outliers from the dataset
# create the ml model using indicators (see paper)
# train the model (80/20 train test split)
# signal generation + final implementation + paper live test?
# + finding optimal k

In [180]:
class KMeansDTW():
    def __init__(self, k: int = 8, max_iter: int = 3000, tol: float = 0.001):
        self.k = k
        self.max_iter = max_iter
        self.tol = tol
        
    def create_clusters(self, data: np.ndarray):
        # Initialize centroids randomly
        rand_idx = np.random.choice(data.shape[0], self.k, replace=False)
        self.centroids = data[rand_idx]
        
        for _ in tqdm(range(self.max_iter)):
            self.classifications = [[] for _ in range(self.k)]
            
            # Precompute distances between each datapoint and each centroid
            # This step is assumed to be the optimized part; depending on the fastdtw implementation details
            # You might need to manually loop through data and centroids if fastdtw cannot be vectorized directly
            for i, datapoint in enumerate(data):
                distances = np.array([dtw.distance_fast(centroid, datapoint) for centroid in self.centroids])
                closest_centroid_idx = np.argmin(distances)
                self.classifications[closest_centroid_idx].append(datapoint)
            
            prev_centroids = np.copy(self.centroids)
            for i, classification in enumerate(self.classifications):
                # Efficiently compute new centroids
                if classification:  # Check if classification is not empty
                    self.centroids[i] = np.mean(classification, axis=0)
            
            # Check for convergence
            optimised_flag = True
            for i in range(self.k):
                diff = np.linalg.norm(prev_centroids[i] - self.centroids[i])
                if diff >= self.tol:
                    optimised_flag = False
                    break
            
            if optimised_flag:
                break

    def elbow_method(self):
        total_var = 0
        for i, centroid in enumerate(self.centroids):
            for datapoint in self.classifications[i]:
                total_var += dtw.distance_fast(centroid, datapoint)
        return total_var
    
    def display_clusters(self):
        for i, cluster in enumerate(self.classifications, start = 1):
            plt.figure(figsize=(3, 1.5))
            for series in cluster:
                plt.plot(series)

        plt.title(f'Cluster {i} Time Series')
        plt.show()



In [107]:
my_KMeans = KMeansDTW()

In [108]:
my_KMeans.create_clusters(tester)
print(my_KMeans.classifications)

<class 'numpy.ndarray'>
[203 400 274]
[[-0.9924995  -0.99150183 -0.98640069 -0.97765588 -0.97927818 -0.97677967
  -0.98046672 -0.97646735 -0.97794217 -0.9796946  -0.97722211 -0.98251411
  -0.98142968 -0.98086578 -0.98156849 -0.98000692 -0.97939097 -0.97811568
  -0.97616372 -0.97378665 -0.97752575 -0.97228581 -0.97290176 -0.97326613
  -0.97637192 -0.97501856 -0.96800884 -0.97102788 -0.96825175 -0.9687983
  -0.96983067 -0.97022974 -0.97915673 -0.9791307  -0.97782072 -0.97848005]
 [ 1.57500343  1.58638555  1.67893476  1.68174559  1.72048994  1.69703165
   1.69608603  1.69127985  1.67959409  1.6902041   1.68434821  1.68044428
   1.65086983  1.65838273  1.63270353  1.66095065  1.68825214  1.65205836
   1.62945893  1.6491174   1.63053468  1.76366742  1.72786403  1.67189033
   1.68414     1.77134515  1.80802476  1.84202366  1.83946442  1.84806174
   1.93428657  1.95412722  1.94246748  2.08698235  2.14286062  2.12751383]
 [-0.75553086 -0.75298897 -0.75094157 -0.73772893 -0.7401407  -0.73982838

  0%|          | 0/3000 [00:13<?, ?it/s]


ValueError: operands could not be broadcast together with shapes (234,) (234,36) 

In [299]:
class HierarchDTW():
    def __init__(self, data: np.ndarray):
        self.data = data
        self.linkages_matrix = None
    
    def calc_dist_matrix(self):
        num_datapoints = self.data.shape[0]
        self.distance_matrix = np.zeros((num_datapoints, num_datapoints))
        for i in range(num_datapoints):
            for j in range(i + 1, num_datapoints):
                self.distance_matrix[i][j] = self.distance_matrix[j][i] = dtw.distance_fast(self.data[i], self.data[j])
        
    def cluster(self, method = 'ward'):
        if self.linkages_matrix is None:
            self.calc_dist_matrix()
        self.linkages_matrix = linkage(squareform(self.distance_matrix), method = method)

    def plot_dendrogram(self):
        if self.linkages_matrix is None:
            print('not clustered yet')
        else:
            plt.figure(figsize = (10, 7))
            dendrogram(self.linkages_matrix)
            plt.xlabel('Sample index')
            plt.ylabel('Distance')
            plt.show()

    def display_clusters(self, max_clusters: int):
        print(self.linkages_matrix.shape)
        cluster_labels = fcluster(self.linkages_matrix, t = max_clusters, criterion = 'maxclust')
        print(cluster_labels, type(cluster_labels))
        for i in range(1,  len(np.unique(cluster_labels)) + 1):
            idx = np.where(cluster_labels == i)
            plt.plot(self.data[idx])
            plt.title(f'Cluster {i} for hierarchical')
            plt.show()
        
    def find_outliers_idx(self, max_clusters: int, cluster_nums : [int]):
        # assumes that 
        idx = np.array([])
        cluster_labels = fcluster(self.linkages_matrix, t = max_clusters, criterion = 'maxclust')
        for i in cluster_nums:
            idx = np.append(idx, np.where(cluster_labels == i))
        return idx
            
    

In [None]:
# on visual inspection, cluster 6, 7 and 12 look to be outliers so i will cut them outj

In [339]:
class Indicators():
    
    def __init__(self, data):
        self.data = data

    def rate_of_change(self, period=14):
        return self.data['close'].diff(period) / self.data['close'].shift(period)
    
    def compute_rsi(self, window=14):
        delta = self.data['close'].diff()
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)

        avg_gain = gain.rolling(window=window, min_periods=1).mean()
        avg_loss = loss.rolling(window=window, min_periods=1).mean()

        rs = avg_gain / avg_loss
        rsi = 100 - (100 / (1 + rs))
        return rsi

    def exponential_moving_average(self, period=14):
        return self.data['close'].ewm(span=period, adjust=False).mean()

    def moving_average_convergence_divergence(self, slow=26, fast=12, signal=9):
        ema_fast = self.exponential_moving_average(fast)
        ema_slow = self.exponential_moving_average(slow)
        macd = ema_fast - ema_slow
        signal_line = macd.ewm(span=signal, adjust=False).mean()
        return macd, signal_line

    def commodity_channel_index(self, period=14):
        TP = (self.data['high'] + self.data['low'] + self.data['close']) / 3
        CCI = (TP - TP.rolling(window=period).mean()) / (0.015 * TP.rolling(window=period).std())
        return CCI

    def bollinger_bands(self, period=14, num_std_dev=2):
        sma = self.data['close'].rolling(window=period).mean()
        std_dev = self.data['close'].rolling(window=period).std()
        upper_band = sma + (std_dev * num_std_dev)
        lower_band = sma - (std_dev * num_std_dev)
        return upper_band, lower_band

    def stochastic_oscillator(self, period=14):
        low_min = self.data['low'].rolling(window=period).min()
        high_max = self.data['high'].rolling(window=period).max()
        stoch = ((self.data['close'] - low_min) / (high_max - low_min)) * 100
        return stoch

    def price_volume_volatility(self, period=14):
        price_volatility = self.data['close'].rolling(window=period).std()
        volume_volatility = self.data['Volume USD'].rolling(window=period).std()
        return price_volatility, volume_volatility
    
    def label_data(self, thresh):
        result = np.zeros_like(self.data)
        changes = self.data.pct_change()
        print(changes)
        result[changes >= thresh] = 1
        result[changes <= - thresh] = -1

        return result


In [336]:
def split_time_series(series, window: int, slide: int, train_proportion = 0.7):
    split = []
    series = zscore(series)
    for i in range(0, int(len(series) * train_proportion) - window, slide):
        split.append(list(series.iloc[i:i+window]))
    return np.array(split)

# Return array with 1 if price has gone up or -1 if price has gone down more than threshold
def label_data(data, thresh):
    result = np.zeros_like(data)
    changes = data.pct_change()
    print(changes)
    result[changes >= thresh] = 1
    result[changes <= - thresh] = -1

    return result

In [338]:
train_df = pd.read_csv('TrainData.csv', delimiter=',')
my_Hierarch = HierarchDTW(train_df)
my_Hierarch.cluster()
my_Hierarch.find_outliers_idx(12, [6, 7, 12])

indicators = Indicators(train_df)
# X = indicators.data[['RSI', 'EMA', 'MACD', 'MACD_signal', 'CCI', 'Upper_BB', 'Lower_BB', 'Stochastic_Oscillator', 'Price_Volatility']]
# Y = indicators.data['Label']
X = [indicators.compute_rsi(), indicators.exponential_moving_average(), indicators.moving_average_convergence_divergence()[0], indicators.moving_average_convergence_divergence()[1], indicators.commodity_channel_index(), indicators.bollinger_bands()[0], indicators.bollinger_bands()[1], indicators.stochastic_oscillator(), indicators.price_volume_volatility()]
Y = indicators.label_data(0.5)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, shuffle = None, )
tester = split_time_series(train_df['close'], 36, 18)


TypeError: unsupported operand type(s) for /: 'str' and 'str'