# Analysis of Stocks

Import packages

In [None]:
import psycopg2 as pg
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from datetime import date
from tslearn.clustering import TimeSeriesKMeans
from sklearn.decomposition import PCA

Set up constants

In [None]:
# Clustering configuration.
SEED = 1990                # random_state handed to the clustering model
N_CLUSTERS = 8             # number of clusters requested
METRIC = "dtw"             # distance metric name handed to the clustering model

# Rolling-window VaR configuration.
WINDOW_SIZE = 100          # number of consecutive observations in each window
VAR_PERCENTILE_LEVEL = 5   # percentile passed to np.percentile for the VaR

# Font properties shared by plot titles and axis labels (used as fontdict).
FONT = dict(family='serif', color='black', weight='normal', size=20)

Import data from the local database. If the API or the local database is not working, load the CSV file directly.

In [None]:
# Load the daily-change table, preferring the local PostgreSQL database and
# falling back to the CSV export next to the notebook when that path fails.
# NOTE(review): the database credentials are hard-coded below; consider moving
# them to environment variables or a config file kept out of version control.
try:
    connection = pg.connect(database="postgres", user="postgres", password="Crypto01", host="localhost", port=5430)
    try:
        cur = connection.cursor()
        cur.execute("SELECT * FROM input.stock_daily_changes ORDER BY date DESC")
        # Re-sort ascending on the first column (presumably the date - confirm
        # against the table schema) so rows run in increasing order.
        rows = sorted(cur.fetchall(), key=lambda row: row[0])
        column_names = [desc[0] for desc in cur.description]
    finally:
        # Always release the connection, even when the query itself fails.
        connection.close()
    data = pd.DataFrame(data=rows, columns=column_names, index=[row[0] for row in rows], dtype="float64")
except Exception:
    # Deliberate best-effort fallback: any database-side failure switches to
    # the CSV file. `except Exception` (instead of a bare `except:`) lets
    # KeyboardInterrupt and SystemExit propagate.
    try:
        data = pd.read_csv("./stock_daily_changes.csv", sep=",")
        data = data.sort_values(by="date", ascending=True)
        # Index by plain `datetime.date` objects parsed from the date column.
        data.index = pd.to_datetime(data.date).dt.date
    except Exception as csv_error:
        # Chain the cause so the real reason stays visible in the traceback.
        raise Exception("Unable to read data") from csv_error

Transform data

In [None]:
# Restrict the data to the analysis period (both bounds inclusive) and drop
# the explicit date column, which is redundant with the index.
period_start = date(2016, 1, 1)
period_end = date(2018, 7, 31)
value_columns = data.drop(columns=["date"])
in_period = (value_columns.index <= period_end) & (value_columns.index >= period_start)

# Shift every value down by 1.
# NOTE(review): presumably the stored values are day-over-day ratios, so this
# yields simple returns - confirm against the data source.
daily_changes = value_columns[in_period] - 1

# One row per original column, one column per date.
transpose_df = daily_changes.T

Daily return/change dataset

In [None]:
# Bare expression: the notebook renders the filtered, shifted DataFrame
# (rows indexed by date, one column per series).
daily_changes

Transposed daily return/change dataset

In [None]:
# Bare expression: the notebook renders the transposed DataFrame
# (one row per series, one column per date).
transpose_df

Correlation matrix

In [None]:
# Pairwise correlation between the columns of daily_changes.
corr_mat = daily_changes.corr()
# Column labels of the correlation matrix; used as tick labels here and for
# the cluster-membership listing later in the notebook.
cryptos = np.array(corr_mat.columns)
plt.clf()
plt.rcParams['figure.figsize'] = [20, 15]
# Draw the matrix as an image; rows and columns follow the order in `cryptos`.
plt.matshow(corr_mat)
plt.xticks(range(len(cryptos)), cryptos, rotation="vertical", fontsize=24)
plt.yticks(range(len(cryptos)), cryptos, fontsize=24)

# Reserve a narrow axes to the right of the matrix for the colour bar.
ax = plt.gca()
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1)

# Draw the colour bar inside the reserved axes.
plt.colorbar(cax=cax)
plt.tick_params(labelsize=24)
plt.show()

Change the transposed dataframe into an array

In [None]:
# Plain NumPy copy of transpose_df: axis 0 follows the columns of
# daily_changes, axis 1 follows the dates.
transpose_array = np.array(transpose_df)

Plot the cumulative explained variance percentage of PCs

In [None]:
# Fit a PCA with the largest number of components the data supports and plot
# each component's explained-variance ratio (bars) together with the running
# total (line).
# The component count is derived from the data instead of being hard-coded, so
# the cell keeps working when the number of series changes.
n_components = min(transpose_array.shape)
pca_plot = PCA(n_components=n_components)
pca_plot.fit(transpose_array)

cumsum_variance_ratio = np.cumsum(pca_plot.explained_variance_ratio_)
pc_names = ["PC" + str(i) for i in range(1, n_components + 1)]
pc_positions = range(n_components)

plt.clf()
plt.plot(cumsum_variance_ratio, color="darkorange")
plt.bar(pc_positions, pca_plot.explained_variance_ratio_, color="blue")
plt.yticks(fontsize=24)
plt.xticks(pc_positions, pc_names, rotation=40, fontsize=24)
plt.xlabel("Principal Component", fontdict=FONT, fontsize=28)
plt.ylabel("Explained Variance Percentage", fontdict=FONT, fontsize=28)
plt.show()

Apply PCA, then time-series K-means clustering (DTW metric)

In [None]:
# Project every row of transpose_array onto 27 principal components.
pca = PCA(n_components=27)
pca_array = pca.fit_transform(transpose_array)

# Cluster the projected rows with time-series k-means; the seed fixes the
# clustering's random state.
kmeans_options = {
    "n_clusters": N_CLUSTERS,
    "max_iter": 100,
    "metric": METRIC,
    "verbose": True,
    "max_iter_barycenter": 10,
    "random_state": SEED,
}
dba_km = TimeSeriesKMeans(**kmeans_options)
pred_clusters = dba_km.fit_predict(pca_array)

Explained variance percentage

In [None]:
# Total share of variance retained by the components kept in `pca`
# (sum of the per-component explained-variance ratios).
print("Percentage of interpretation:", sum(pca.explained_variance_ratio_))

Cluster components

In [None]:
# List the labels that fall into each cluster; cluster ids are shown 1-based.
for label in set(pred_clusters):
    in_cluster = pred_clusters == label
    print("Cluster", label + 1, ":", cryptos[in_cluster])

Cluster visualisation

In [None]:
# Plot every cluster in its own subplot: member series in translucent black,
# the cluster centre in red.
plt.clf()
plt.rcParams['figure.figsize'] = [20, 15]

# plt.subplot needs integer grid dimensions; true division (`/`) would pass a
# float and is rejected by current matplotlib. Same grid as before: two
# columns and N_CLUSTERS // 2 + 1 rows.
n_rows = N_CLUSTERS // 2 + 1
for yi in set(pred_clusters):
    plt.subplot(n_rows, 2, yi + 1)
    for xx in pca_array[pred_clusters == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, pca_array.shape[1])
    plt.ylim(-0.2, 0.2)
    title = "Cluster : " + str(yi + 1)
    plt.title(title, fontdict=FONT, fontsize=32)
    plt.xticks(fontsize=24)
    plt.yticks(fontsize=24)
    # Only the left-hand column carries the y-axis label.
    if yi % 2 == 0:
        plt.ylabel("Return", rotation="vertical", fontdict=FONT, fontsize=28)
    # Only the last two clusters carry the x-axis label (previously hard-coded
    # as ids 6 and 7, i.e. valid for exactly eight clusters).
    if yi >= N_CLUSTERS - 2:
        plt.xlabel("Principal Component", fontdict=FONT, fontsize=28)

plt.tight_layout()
plt.show()

VaR estimation

In [None]:
# Rolling-window VaR estimation.
# For every window of WINDOW_SIZE consecutive observations, compute:
#   * "before": the VAR_PERCENTILE_LEVEL-th percentile of each row on its own;
#   * "after":  the same percentile over the pooled observations of all rows
#               in the same cluster, assigned to every member of that cluster.
# The estimate from window [start, start + WINDOW_SIZE) is stored under the
# column label at position start + WINDOW_SIZE.
n_stocks, n_days = transpose_array.shape  # derived, no hard-coded stock count
vars_arr_before = []
vars_arr_after = []

for start in range(n_days - WINDOW_SIZE):
    window = transpose_array[:, start: start + WINDOW_SIZE]

    # Per-row percentile in a single call.
    vars_arr_before.append(np.percentile(window, VAR_PERCENTILE_LEVEL, axis=1))

    # Every row belongs to exactly one label in pred_clusters, so each slot of
    # cluster_vars is filled before it is stored.
    cluster_vars = np.empty(n_stocks)
    for label in set(pred_clusters):
        members = pred_clusters == label
        # Without `axis`, np.percentile pools all selected rows together.
        cluster_vars[members] = np.percentile(window[members], VAR_PERCENTILE_LEVEL)
    vars_arr_after.append(cluster_vars)

# Re-orient to (row of transpose_array, window) and label both axes.
vars_arr_before = np.array(vars_arr_before).transpose()
vars_df_before = pd.DataFrame(vars_arr_before, columns=transpose_df.columns[WINDOW_SIZE:], index=daily_changes.columns)
vars_arr_after = np.array(vars_arr_after).transpose()
vars_df_after = pd.DataFrame(vars_arr_after, columns=transpose_df.columns[WINDOW_SIZE:], index=daily_changes.columns)

VaRs before clustering

In [None]:
# Bare expression: the notebook renders the per-row VaR estimates
# (one row per series, one column per forecast date).
vars_df_before

VaRs after clustering

In [None]:
# Bare expression: the notebook renders the cluster-pooled VaR estimates
# (one row per series, one column per forecast date).
vars_df_after

VaR performance

In [None]:
# "Accuracy" of a VaR series = share of forecast dates on which the realised
# value stayed strictly above the VaR estimate, computed row by row.
realised = transpose_array[:, WINDOW_SIZE:]

held_before = np.sum(realised > vars_arr_before, axis=1)
accuracies_before = held_before / vars_arr_before.shape[1]
accuracies_before = pd.DataFrame(accuracies_before, columns=["Accuracy"], index=daily_changes.columns)

held_after = np.sum(realised > vars_arr_after, axis=1)
accuracies_after = held_after / vars_arr_after.shape[1]
accuracies_after = pd.DataFrame(accuracies_after, columns=["Accuracy"], index=daily_changes.columns)

VaR performance before clustering

In [None]:
# Bare expression: the notebook renders the per-series accuracy table for the
# individually estimated VaRs.
accuracies_before

VaR performance after clustering

In [None]:
# Bare expression: the notebook renders the per-series accuracy table for the
# cluster-pooled VaRs.
accuracies_after

Average accuracy before clustering

In [None]:
# Mean of the "Accuracy" column over all series (individually estimated VaRs).
print("Average accuracy before clustering:", np.mean(np.array(accuracies_before)))

Average accuracy after clustering

In [None]:
# Mean of the "Accuracy" column over all series (cluster-pooled VaRs).
print("Average accuracy after clustering:", np.mean(np.array(accuracies_after)))

p-value of exactly 5% before clustering

In [None]:
# Share of series whose number of VaR violations (realised value strictly
# below the individually estimated VaR) is exactly 27.
# NOTE(review): 27 presumably is the violation count expected at the 5% level
# for this sample length - confirm; it is not derived from the data here.
n_stocks = vars_arr_before.shape[0]  # replaces the hard-coded series count
violations = np.sum(transpose_array[:, WINDOW_SIZE:] < vars_arr_before, axis=1)
count = np.sum(violations == 27)
p_value_equal_before = count / n_stocks

print("p-value of exactly 5% before clustering:", p_value_equal_before)

p-value of exactly 5% after clustering

In [None]:
# Share of series whose number of VaR violations (realised value strictly
# below the cluster-pooled VaR) is exactly 27.
# NOTE(review): 27 presumably is the violation count expected at the 5% level
# for this sample length - confirm; it is not derived from the data here.
n_stocks = vars_arr_after.shape[0]  # replaces the hard-coded series count
violations = np.sum(transpose_array[:, WINDOW_SIZE:] < vars_arr_after, axis=1)
count = np.sum(violations == 27)
p_value_equal_after = count / n_stocks

print("p-value of exactly 5% after clustering:", p_value_equal_after)

1 sided p-value before clustering

In [None]:
# Share of series with at least 29 VaR violations (realised value strictly
# below the individually estimated VaR).
# The threshold 29 is a violation count; it is unrelated to the number of
# series, which is now read from the array shape instead of being hard-coded.
n_stocks = vars_arr_before.shape[0]
violations = np.sum(transpose_array[:, WINDOW_SIZE:] < vars_arr_before, axis=1)
count = np.sum(violations >= 29)
p_value_1side_before = count / n_stocks

print("1 sided p-value of underestimation before clustering:", p_value_1side_before)

1 sided p-value after clustering

In [None]:
# Share of series with at least 29 VaR violations (realised value strictly
# below the cluster-pooled VaR).
# The threshold 29 is a violation count; it is unrelated to the number of
# series, which is now read from the array shape instead of being hard-coded.
n_stocks = vars_arr_after.shape[0]
violations = np.sum(transpose_array[:, WINDOW_SIZE:] < vars_arr_after, axis=1)
count = np.sum(violations >= 29)
p_value_1side_after = count / n_stocks

print("1 sided p-value of underestimation after clustering:", p_value_1side_after)

2 sided p-value before clustering

In [None]:
# Share of series whose VaR violation count (realised value strictly below
# the individually estimated VaR) is at most 25 or at least 29.
# The violations are counted once per series instead of twice, and the number
# of series is read from the array shape instead of being hard-coded. The
# thresholds 25 and 29 are violation counts.
n_stocks = vars_arr_before.shape[0]
violations = np.sum(transpose_array[:, WINDOW_SIZE:] < vars_arr_before, axis=1)
count = np.sum((violations <= 25) | (violations >= 29))
p_value_2side_before = count / n_stocks

print("2 sided p-value before clustering:", p_value_2side_before)

2 sided p-value after clustering

In [None]:
# Share of series whose VaR violation count (realised value strictly below
# the cluster-pooled VaR) is at most 25 or at least 29.
# The violations are counted once per series instead of twice, and the number
# of series is read from the array shape instead of being hard-coded. The
# thresholds 25 and 29 are violation counts.
n_stocks = vars_arr_after.shape[0]
violations = np.sum(transpose_array[:, WINDOW_SIZE:] < vars_arr_after, axis=1)
count = np.sum((violations <= 25) | (violations >= 29))
p_value_2side_after = count / n_stocks

print("2 sided p-value after clustering:", p_value_2side_after)