In [6]:
import os
import pandas as pd
import numpy as np
import scipy.cluster.hierarchy as spc

In [26]:
def process_data(n=50):
    data_path = "./data"
    stock_data_df = pd.read_pickle(os.path.join(data_path, 'stock_data_3year.pkl'))
    stock_data_df = (stock_data_df['Close'] - stock_data_df['Open'])/stock_data_df['Open']

    # calculate percentage change
#     stock_data_df = stock_data_df.pct_change()
    
    # calculate correlation matrix using percentage change
    stock_data_corr = stock_data_df.corr()
    
    # adding mean in the NaN values
    stock_data_corr.fillna(value=0, inplace=True)

    # get correlation matrix of 100 stock data
    stock_data_corr = stock_data_corr.iloc[0:n, 0:n]
    return stock_data_corr

In [9]:
def hierarchial_cluster(stock_corr):
    pdist = spc.distance.pdist(stock_corr)
    linkage = spc.linkage(pdist, method='complete')
    idx = spc.fcluster(linkage, 0.5 * pdist.max(), 'distance')
    return idx

In [31]:
import numpy as np
def main(n=50):
    stock_corr_data = process_data(n)
    # get the stock labels
    # stock_label = stock_corr_data
    labels = hierarchial_cluster(stock_corr_data)
    label_set = set(labels)
    print("Number of clusters: {}".format(len(label_set)))
    for x in label_set:
        print("Label: {}, Cluster size: {}".format(x, np.count_nonzero(labels==x)))
    return label_set, labels

In [44]:
# using pct_change
# main(500)

In [45]:
# without pct_change
# main(500)

In [46]:
# without pct_change, 3 years data
label_set, labels = main(500)

Number of clusters: 9
Label: 1, Cluster size: 72
Label: 2, Cluster size: 89
Label: 3, Cluster size: 61
Label: 4, Cluster size: 78
Label: 5, Cluster size: 6
Label: 6, Cluster size: 41
Label: 7, Cluster size: 36
Label: 8, Cluster size: 25
Label: 9, Cluster size: 92


In [47]:
cluster = dict()
labels_list = list(labels)
for x in label_set:
    c_set = list()
    for i in range(len(labels_list)):
        if labels_list[i] == x:
            c_set.append(i)
    cluster[x] = c_set

In [39]:
# cluster

In [48]:
stock_corr = process_data(500)
# negative edges inside cluster
negative_edge_tot = 0
for c,v in cluster.items():
    negative_edge = 0
    for i in range(len(v)):
        for j in range(i+1, len(v)):
            if stock_corr.iloc[i,j] < 0:
                negative_edge += 1
    negative_edge_tot += negative_edge
    print("Cluster: {} has {} negative edges".format(c, negative_edge))
print("Total negative edges: {}".format(negative_edge_tot))

Cluster: 1 has 25 negative edges
Cluster: 2 has 47 negative edges
Cluster: 3 has 25 negative edges
Cluster: 4 has 28 negative edges
Cluster: 5 has 0 negative edges
Cluster: 6 has 8 negative edges
Cluster: 7 has 4 negative edges
Cluster: 8 has 2 negative edges
Cluster: 9 has 48 negative edges
Total negative edges: 187


In [49]:
stock_corr = process_data(500)
# negative edges inside cluster
positive_edge_tot = 0
for c,v in cluster.items():
    positive_edge = 0
    for i in range(len(v)):
        for j in range(i+1, len(v)):
            if stock_corr.iloc[i,j] >= 0:
                positive_edge += 1
    positive_edge_tot += positive_edge
    print("Cluster: {} has {} positive edges".format(c, positive_edge))
print("Total positive edges: {}".format(positive_edge_tot))

Cluster: 1 has 2531 positive edges
Cluster: 2 has 3869 positive edges
Cluster: 3 has 1805 positive edges
Cluster: 4 has 2975 positive edges
Cluster: 5 has 15 positive edges
Cluster: 6 has 812 positive edges
Cluster: 7 has 626 positive edges
Cluster: 8 has 298 positive edges
Cluster: 9 has 4138 positive edges
Total positive edges: 17069
