# Stock Clustering - S&P 500

In [None]:
from pylab import plot,show
from numpy import vstack,array
from numpy.random import rand
import numpy as np
from scipy.cluster.vq import kmeans,vq
import pandas as pd
import pandas_datareader as dr
from math import sqrt
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt


sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# Scrape the page; read_html returns one DataFrame per HTML table it finds.
data_table = pd.read_html(sp500_url)

# First table, column 0, skipping row 0.
# NOTE(review): presumably row 0 is the table header parsed as data and
# column 0 holds the ticker symbols - confirm against the current page layout.
tickers = data_table[0][1:][0].tolist()

# Download the adjusted close series of every ticker, one single-column frame each.
prices_list = []
for ticker in tickers:
    print('Retrieving  {}'.format(ticker))
    try:
        prices = dr.DataReader(ticker,'yahoo','01/01/2017')['Adj Close']
    except Exception as exc:
        # Best-effort download: report the ticker that failed and move on.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupts working during the long download loop.
        print('Skipping {}: {}'.format(ticker, exc))
        continue
    prices = pd.DataFrame(prices)
    prices.columns = [ticker]
    prices_list.append(prices)

# pd.concat raises an unhelpful error on an empty list; fail with a clear message.
if not prices_list:
    raise ValueError('No price data could be retrieved for any ticker')

# Combine once, after the loop, instead of rebuilding (and printing) the whole
# frame after every single ticker.
prices_df = pd.concat(prices_list,axis=1)
print(prices_df)

prices_df.sort_index(inplace=True)

prices_df.head()

Retrieving  MMM
                   MMM
Date                  
2017-01-03  169.468918
2017-01-04  169.725876
2017-01-05  169.145309
2017-01-06  169.640244
2017-01-09  168.726486
2017-01-10  168.069763
2017-01-11  169.316589
2017-01-12  168.888306
2017-01-13  168.840714
2017-01-17  168.716965
2017-01-18  169.887695
2017-01-19  170.068527
2017-01-20  169.887695
2017-01-23  169.906723
2017-01-24  167.489120
2017-01-25  168.212524
2017-01-26  168.298187
2017-01-27  168.926361
2017-01-30  166.965652
2017-01-31  166.394592
2017-02-01  166.727676
2017-02-02  165.785400
2017-02-03  166.603989
2017-02-06  166.661072
2017-02-07  167.289276
2017-02-08  168.536118
2017-02-09  169.592636
2017-02-10  170.373123
2017-02-13  172.295776
2017-02-14  172.847809
...                ...
2018-12-06  202.679993
2018-12-07  198.240005
2018-12-10  198.320007
2018-12-11  197.460007
2018-12-12  199.809998
2018-12-13  202.130005
2018-12-14  196.100006
2018-12-17  192.820007
2018-12-18  194.559998
2018-12-19  189.96

In [None]:
# Annualise the daily percentage changes: 252 observations are treated as one
# year, so the mean is scaled by 252 and the standard deviation by sqrt(252).
daily_changes = prices_df.pct_change()
returns = (daily_changes.mean() * 252).to_frame('Returns')
returns['Volatility'] = daily_changes.std() * sqrt(252)

# Two-column array (return, volatility), one row per ticker, for the clustering code
data = np.column_stack((returns['Returns'], returns['Volatility']))

# Fit K-Means for k = 2..19 and record the inertia of each fit
X = data
cluster_counts = range(2, 20)
distorsions = []
for k in cluster_counts:
    k_means = KMeans(n_clusters=k).fit(X)
    distorsions.append(k_means.inertia_)

# Inertia against number of clusters
fig = plt.figure(figsize=(15, 5))
plt.plot(cluster_counts, distorsions)
plt.grid(True)
plt.title('Elbow curve')

In [None]:
# SciPy k-means on the (return, volatility) points, requesting 5 centroids
centroids, _ = kmeans(data, 5)
# label every row of `data` with the index of the centroid it is assigned to
idx, _ = vq(data, centroids)

# One marker style per cluster label 0..4; build the (x, y, style) triples
# and hand them to a single plot() call.
marker_styles = ('ob', 'oy', 'or', 'og', 'om')
series = []
for label, style in enumerate(marker_styles):
    members = data[idx == label]
    series += [members[:, 0], members[:, 1], style]
plot(*series)
# centroids drawn as green squares
plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
show()

In [None]:
# Print, for each column of `returns`, the index label of its largest value
# (used to spot the outlier)
column_maxima = returns.idxmax()
print(column_maxima)

In [None]:
# Drop the outlier stock from the return/volatility table.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# (or a run in which 'BHF' was never downloaded) raises KeyError.
returns.drop('BHF',inplace=True,errors='ignore')

# Rebuild the (return, volatility) array that is fed into the algorithm
data = np.asarray([np.asarray(returns['Returns']),np.asarray(returns['Volatility'])]).T

In [None]:
# SciPy k-means on the current `data` array, requesting 5 centroids
centroids, _ = kmeans(data, 5)
# label every row of `data` with the index of the centroid it is assigned to
idx, _ = vq(data, centroids)

# One marker style per cluster label 0..4; build the (x, y, style) triples
# and hand them to a single plot() call.
marker_styles = ('ob', 'oy', 'or', 'og', 'om')
series = []
for label, style in enumerate(marker_styles):
    members = data[idx == label]
    series += [members[:, 0], members[:, 1], style]
plot(*series)
# centroids drawn as green squares
plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
show()

In [None]:
# Pair each index label of `returns` with its cluster label, then print one pair per line
details = list(zip(returns.index, idx))

for ticker_cluster in details:
    print(ticker_cluster)