In [1]:
import os
import math
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

In [20]:
def process_data(n=50):
    data_path = "./data"
    stock_data_df = pd.read_pickle(os.path.join(data_path, 'stock_data_3year.pkl'))
    stock_data_df = (stock_data_df['Close'] - stock_data_df['Open'])/stock_data_df['Open']

    # calculate percentage change
#     stock_data_df = stock_data_df.pct_change()
    
    # calculate correlation matrix using percentage change
    stock_data_corr = stock_data_df.corr()
    
    # adding mean in the NaN values
    stock_data_corr.fillna(value=0, inplace=True)

    # get correlation matrix of 100 stock data
    stock_data_corr = stock_data_corr.iloc[0:n, 0:n]
    return stock_data_corr

def dist(x):
    return math.sqrt(2*(1-x))

In [22]:
def dbscan_cluster(stock_corr, eps):
    distance_mtx = stock_corr.applymap(lambda x: math.sqrt(2*(1 - x)))
    # get the stock labels
    clusters_dbscan = DBSCAN(eps=eps).fit(distance_mtx)
    return clusters_dbscan.labels_

In [38]:
def main(n=50, eps=1.8):
    stock_corr_data = process_data(n)
    # get the stock labels
    labels = dbscan_cluster(stock_corr_data, eps)
    label_set = set(labels)
    print("Number of clusters: {}".format(len(label_set)))
    for x in label_set:
        print("Label: {}, Cluster size: {}".format(x, np.count_nonzero(labels==x)))
    return label_set, labels

In [47]:
label_set, labels = main(500, 1.8)

Number of clusters: 16
Label: 0, Cluster size: 12
Label: 1, Cluster size: 55
Label: 2, Cluster size: 91
Label: 3, Cluster size: 24
Label: 4, Cluster size: 13
Label: 5, Cluster size: 5
Label: 6, Cluster size: 5
Label: 7, Cluster size: 7
Label: 8, Cluster size: 18
Label: 9, Cluster size: 6
Label: 10, Cluster size: 8
Label: 11, Cluster size: 5
Label: 12, Cluster size: 6
Label: 13, Cluster size: 6
Label: 14, Cluster size: 5
Label: -1, Cluster size: 234


In [48]:
cluster = dict()
labels_list = list(labels)
for x in label_set:
    c_set = list()
    for i in range(len(labels_list)):
        if labels_list[i] == x:
            c_set.append(i)
    cluster[x] = c_set

In [52]:
stock_corr = process_data(500)
# negative edges inside cluster
negative_edge_tot = 0
for c,v in cluster.items():
    negative_edge = 0
    for i in range(len(v)):
        for j in range(i+1, len(v)):
            if stock_corr.iloc[i,j] < 0:
                negative_edge += 1
    negative_edge_tot += negative_edge
    print("Cluster: {} has {} negative edges".format(c, negative_edge))
print("Total negative edges: {}".format(negative_edge_tot))

Cluster: 0 has 0 negative edges
Cluster: 1 has 18 negative edges
Cluster: 2 has 47 negative edges
Cluster: 3 has 2 negative edges
Cluster: 4 has 0 negative edges
Cluster: 5 has 0 negative edges
Cluster: 6 has 0 negative edges
Cluster: 7 has 0 negative edges
Cluster: 8 has 1 negative edges
Cluster: 9 has 0 negative edges
Cluster: 10 has 0 negative edges
Cluster: 11 has 0 negative edges
Cluster: 12 has 0 negative edges
Cluster: 13 has 0 negative edges
Cluster: 14 has 0 negative edges
Cluster: -1 has 454 negative edges
Total negative edges: 522


In [50]:
stock_corr = process_data(500)
# positive edges inside cluster
positive_edge_tot = 0
for c,v in cluster.items():
    positive_edge = 0
    for i in range(len(v)):
        for j in range(i+1, len(v)):
            if stock_corr.iloc[i,j] >= 0:
                positive_edge += 1
    positive_edge_tot += positive_edge
    print("Cluster: {} has {} positive edges".format(c, positive_edge))
print("Total positive edges: {}".format(positive_edge_tot))

Cluster: 0 has 66 positive edges
Cluster: 1 has 1467 positive edges
Cluster: 2 has 4048 positive edges
Cluster: 3 has 274 positive edges
Cluster: 4 has 78 positive edges
Cluster: 5 has 10 positive edges
Cluster: 6 has 10 positive edges
Cluster: 7 has 21 positive edges
Cluster: 8 has 152 positive edges
Cluster: 9 has 15 positive edges
Cluster: 10 has 28 positive edges
Cluster: 11 has 10 positive edges
Cluster: 12 has 15 positive edges
Cluster: 13 has 15 positive edges
Cluster: 14 has 10 positive edges
Cluster: -1 has 26807 positive edges
Total positive edges: 33026
