In [1]:
# !pip install --upgrade pip

In [2]:
# !pip install yfinance

In [3]:
# import libraries

import numpy as np 
import pandas as pd
import yfinance as yf

from sklearn.preprocessing import StandardScaler # use it because kmean is sentsitive for the wide range of features, then we apply it for normalization

import matplotlib.pyplot as plt 
%matplotlib inline

In [4]:
start = "2010-01-01"
end = "2023-12-31"

In [5]:
# Raw daily price data for ARKK.
# auto_adjust = False is passed explicitly: recent yfinance releases default to
# auto_adjust = True, which drops the 'Adj Close' column that the rest of this
# notebook relies on.
arkk_df = yf.download("ARKK", start = start, end = end, progress = False, auto_adjust = False)

# Recent yfinance releases may return (field, ticker) MultiIndex columns even for a
# single ticker; keep only the field level so that arkk_df['Adj Close'] is a Series.
if isinstance(arkk_df.columns, pd.MultiIndex):
    arkk_df.columns = arkk_df.columns.get_level_values(0)

arkk_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-10-31,20.42,20.42,20.379999,20.379999,18.391508,2700
2014-11-03,20.49,20.49,20.35,20.379999,18.391508,2300
2014-11-04,20.200001,20.27,20.200001,20.26,18.283218,7900
2014-11-05,20.51,20.51,20.0,20.0,18.048588,14900
2014-11-06,20.09,20.139999,20.07,20.139999,18.174925,2000


In [6]:
arkk_df.shape

NameError: name 'arkk_df' is not defined

In [None]:
arkk_df.index

In [None]:
type(arkk_df)

In [None]:
stock = arkk_df.copy()
stock.head()

In [None]:
stock.shape

In [None]:
# plot 

plt.plot(stock['Adj Close'])

In [None]:
# get feature for clustering

# calculate the log return between consecutive rows: ln(P_t / P_{t-1})
# (the first row has no previous price, so its log return is NaN)
stock['Log Rets'] = np.log(stock['Adj Close'] / stock['Adj Close'].shift(1))
stock.head()

In [None]:
stock.shape

In [None]:
stock.dropna(inplace = True)

In [None]:
stock.head()

In [None]:
stock.shape

In [None]:
# prepare features to be used for clustering

stock['Rets Mean'] = stock['Log Rets'].rolling(5).mean() # mean of the log returns over a rolling 5-row window (current row included); first 4 rows are NaN
stock.head()

In [None]:
stock['Rets STD'] = stock['Log Rets'].rolling(5).std() # standard deviation (not mean) of the log returns over a rolling 5-row window; first 4 rows are NaN
stock.head()

In [None]:
stock.dropna(inplace = True)
stock.head()

# or we can use
# stock = stock.dropna()

In [None]:
stock.shape

In [None]:
# plot

plt.scatter(stock['Rets Mean'], stock['Rets STD']); # plot X and y

In [None]:
# get array X in order to perform clustering

X1 = stock['Rets Mean']
X1

In [None]:
type(X1)

In [None]:
X2 = stock['Rets STD']
X2

In [None]:
type(X2)

In [None]:
X1 = stock['Rets Mean'].values
X1

In [None]:
X2 = stock['Rets STD'].values
X2

In [None]:
type(X1), type(X2)

In [None]:
# Stack the two 1-D feature vectors side by side into one (n_samples, 2) array:
# column 0 = rolling mean of log returns, column 1 = rolling std of log returns.
# np.column_stack avoids the Python-level zip/list round trip and keeps a
# 2-D shape even when the inputs are empty.
X = np.column_stack((X1, X2))
X

In [None]:
type(X)

In [None]:
# plot 

plt.scatter(X[:, 0], X[:, 1]); # not need to use stock dataframe

In [None]:
# feature X are not bad but require normalization for kmean - better

def normalizedFeature(x):
    """Return a standardized copy of the feature matrix ``x``.

    A fresh ``StandardScaler`` is fitted on ``x`` and immediately applied to it,
    so the statistics used for scaling come from ``x`` itself.

    Parameters
    ----------
    x : array-like accepted by ``StandardScaler.fit_transform``

    Returns
    -------
    The transformed data as returned by ``StandardScaler.fit_transform``.
    """
    return StandardScaler().fit_transform(x)


In [None]:
# call a function and keep data in X_norm
X_norm = normalizedFeature(X)
X_norm

In [None]:
# plot 

plt.scatter(X_norm[:, 0], X_norm[:, 1]);

In [None]:
# start KMean Clustering

# prepare a function for distance between 2 points

def distance_fn(a, b, ax = 1):
    """Norm of the difference ``a - b`` taken along axis ``ax``.

    With the default ``ax = 1`` and a 2-D difference this gives one Euclidean
    distance per row; with ``ax = None`` a single norm over the whole
    difference is returned.
    """
    difference = a - b
    return np.linalg.norm(difference, axis = ax)

In [None]:
def plotCluster(data, C, K, cluster_label):
    """Scatter-plot the samples coloured by cluster, together with the centroids.

    Only the first two columns of ``data`` and ``C`` are drawn.

    Parameters
    ----------
    data : array-like, shape (n_samples, >= 2)
        Samples to plot.
    C : ndarray, shape (K, >= 2)
        Centroid coordinates; centroid ``i`` is annotated with the number ``i``.
    K : int
        Number of clusters.
    cluster_label : array-like with n_samples entries
        Cluster index of every sample; a flat array or an (n_samples, 1)
        column are both accepted.
    """
    colors = ['green', 'blue', 'orange', 'red', 'yellow', 'pink', 'purple', 'salmon']

    data = np.asarray(data)
    labels = np.asarray(cluster_label).reshape(-1) # flatten (n, 1) labels so they can be used as a mask

    plt.figure(figsize = (8, 6))

    for i in range(K):
        points = data[labels == i] # all samples assigned to cluster i
        # an empty cluster has nothing to scatter (indexing its columns would fail)
        if len(points) > 0:
            # cycle through the palette so that K > len(colors) does not raise IndexError
            plt.scatter(points[:, 0], points[:, 1], s = 7, c = colors[i % len(colors)])
        plt.annotate(str(i), (C[i, 0], C[i, 1]), size = 20)

    plt.scatter(C[:, 0], C[:, 1], marker = '*', color = 'black', s = 200)

    plt.xlabel('Variable #1')
    plt.ylabel('Variable #2')
    plt.title('Cluster')

In [None]:
def kmeanClustering(data, K, max_iter = 300, tol = 0.0, seed = None, show_progress = True):
    """Cluster ``data`` into ``K`` groups with the k-means (Lloyd) iteration.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Samples to cluster.
    K : int
        Number of clusters, ``1 <= K <= n_samples``.
    max_iter : int, optional
        Upper bound on the number of assign/update iterations, so the loop
        always terminates.
    tol : float, optional
        Stop as soon as the total centroid movement of one iteration is
        ``<= tol``. The default ``0.0`` stops only when no centroid moved.
    seed : int or None, optional
        Seed for the random choice of the starting centroids. Pass an integer
        to make the result reproducible.
    show_progress : bool, optional
        If true, call ``plotCluster`` and print the centroid movement after
        every iteration.

    Returns
    -------
    ndarray, shape (n_samples, 1), dtype float
        Cluster index (0 .. K-1) of every sample.

    Raises
    ------
    ValueError
        If ``data`` is not a non-empty 2-D array or ``K`` is out of range.
    """
    data = np.asarray(data, dtype = float)
    if data.ndim != 2 or len(data) == 0:
        raise ValueError("data must be a non-empty 2-D array of shape (n_samples, n_features)")
    if not 1 <= K <= len(data):
        raise ValueError("K must be between 1 and the number of samples")

    rng = np.random.default_rng(seed)

    # step 1. start from K distinct samples chosen at random.
    # Drawing integer coordinates from the data range (np.random.randint) places
    # centroids only on integer positions, can produce identical centroids and
    # fails when the truncated bounds coincide; real samples avoid all of that.
    C = data[rng.choice(len(data), size = K, replace = False)]

    cluster_label = np.zeros((len(data), 1))

    for _ in range(max_iter):
        # step 2. label every sample with the index of its nearest centroid
        distance = np.linalg.norm(data[:, None, :] - C[None, :, :], axis = 2) # shape (n_samples, K)
        cluster_label = np.argmin(distance, axis = 1).astype(float).reshape(-1, 1)

        # step 3. move every centroid to the mean of its members
        C_old = C.copy()
        for i in range(K):
            members = data[cluster_label[:, 0] == i]
            # an empty cluster keeps its previous centroid; taking the mean of
            # nothing would give NaN and the convergence test could never pass
            if len(members) > 0:
                C[i] = members.mean(axis = 0)

        # total centroid movement in this iteration
        error = np.linalg.norm(C - C_old)

        if show_progress:
            plotCluster(data, C, K, cluster_label)
            print(error)

        # stop once the centroids no longer move (labels then match the final centroids)
        if error <= tol:
            break

    return cluster_label

In [None]:
# call a function
K = 3
label_kmean = kmeanClustering(X_norm, K)

In [None]:
label_kmean

### Use the labels from K-Means

In [None]:
stock['Class'] = label_kmean 
stock.head(10)

In [None]:
# idea to trade

idx_c0 = np.where(stock['Class'] == 0) # find index in class 0
idx_c0

In [None]:
len(idx_c0)

In [None]:
len(idx_c0[0])

In [None]:
idx_c1 = np.where(stock['Class'] == 1)
idx_c1

In [None]:
len(idx_c1[0])

In [None]:
idx_c2 = np.where(stock['Class'] == 2)
idx_c2

In [None]:
len(idx_c2[0])

In [None]:
len(stock), len(idx_c0[0]) + len(idx_c1[0]) + len(idx_c2[0])

In [None]:
# if we would like to trade only some class

profit_c0 = stock['Log Rets']
profit_c0

In [None]:
type(profit_c0)

In [None]:
profit_c0 = np.array(profit_c0).reshape(-1, 1)
profit_c0

In [None]:
profit_c0[idx_c1, 0] = 0
profit_c0[idx_c2, 0] = 0
profit_c0

In [None]:
plt.plot(profit_c0.cumsum())

In [None]:
profit_c1 = np.array(stock['Log Rets']).reshape(-1, 1)
profit_c1

In [None]:
profit_c1[idx_c0, 0] = 0
profit_c1[idx_c2, 0] = 0
profit_c1

In [None]:
plt.plot(profit_c1.cumsum())

In [None]:
profit_c2 = np.array(stock['Log Rets']).reshape(-1, 1)
profit_c2

In [None]:
profit_c2[idx_c0, 0] = 0
profit_c2[idx_c1, 0] = 0
profit_c2

In [None]:
plt.plot(profit_c2.cumsum())

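*** The class to trade and the trading period must be selected correctly before trading
- What still needs to be done
    - Avoid look-ahead bias (the class of a day is computed from a rolling window that includes that day's own return)
    - Split the data into in-sample and out-of-sample periods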

In [None]:
stock.head()