**k-Means clustering and cluster analysis**

In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans



In [2]:
# Source CSV of per-company stock fundamentals (hosted on GitHub).
STOCK_DATA_URL = 'https://raw.githubusercontent.com/djcrow-instructor/datasets/main/StockDataWithoutClusters_v2.csv'

# Load the raw data into the frame every later cell works on.
stockData = pd.read_csv(STOCK_DATA_URL)

In [3]:
# Peek at the first five rows to sanity-check the load.
stockData.head(n=5)

Unnamed: 0,Company,Sector,Price,Market Cap,Free Float Market Cap %,6m ADV,RoE %,RoCE %,EBIT Margin %,EPS,PAT %,Stock Return %
0,Hindustan Oil Exploration Company Ltd.,Crude Oil,68.4,131.299505,0.8371,1.136947,-1.0,-0.046311,-0.923766,4.533827,-1.0,90.794979
1,United Breweries (Holdings) Ltd.,Alcohol,40.0,39.316572,0.4766,0.162805,-1.0,-0.535856,-1.0,158.744001,-1.0,38.169257
2,Hindustan Motors Ltd.,Auto & Auto Anc.,7.15,21.946366,0.6766,0.041085,-0.868864,-1.0,-1.0,6.880137,-1.0,-8.333333
3,HMT Ltd.,Industrials,42.85,477.697516,0.0631,0.013322,-0.183166,-0.167937,-1.0,5.865182,-1.0,-23.070018
4,ABG Shipyard Ltd.,Industrials,31.1,24.718436,0.8177,0.178504,-0.766749,-0.051679,-0.86516,182.175768,-1.0,-57.014513


In [None]:
# All % columns except 'Stock Return %' are stored as fractions (1 means 100%).
# Convert them to percentage points so every % column shares the same scale.
PCT_FRACTION_COLUMNS = [
    'Free Float Market Cap %',
    'RoE %',
    'RoCE %',
    'EBIT Margin %',
    'PAT %',
]

for pct_col in PCT_FRACTION_COLUMNS:
    # Idempotency guard: this cell mutates stockData in place, so re-running it
    # would otherwise multiply by 100 a second time. A column is still on the
    # fraction scale only while all of its values lie within [-1, 1]; once
    # converted, its largest magnitude exceeds 1 and the column is skipped.
    # NOTE(review): relies on the raw fractions being bounded by +/-1, which
    # holds for this dataset - confirm before reusing on other data.
    if stockData[pct_col].abs().max() <= 1:
        stockData[pct_col] = stockData[pct_col] * 100

stockData.describe()

Unnamed: 0,Price,Market Cap,Free Float Market Cap %,6m ADV,RoE %,RoCE %,EBIT Margin %,EPS,PAT %,Stock Return %
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,524.523282,1007.097584,42.49911,1.476287,5.812153,13.48316,7.695264,137.198393,3.048752,7.087798
std,1975.921838,3977.721963,15.629644,4.790404,25.26401,15.847713,17.072537,318.540047,17.421837,50.988015
min,0.35,1.027453,6.26,4e-06,-100.0,-100.0,-100.0,0.234502,-100.0,-91.12801
25%,60.55,40.744259,29.18,0.02905,1.393013,5.415892,3.374554,24.568699,0.654241,-21.670429
50%,170.1,113.898556,40.38,0.123546,8.981234,11.664688,7.695264,56.775656,3.935423,-1.576713
75%,448.05,417.201805,51.23,0.54884,17.094221,19.677723,13.344717,125.772649,8.648472,25.083403
max,51579.2,66127.30299,100.0,52.834197,100.0,100.0,100.0,6136.311392,100.0,586.043761


In [None]:
# Clustering features = every numeric column of stockData.
# Selecting by dtype (rather than by position with columns[2:]) skips the text
# columns 'Company' and 'Sector' explicitly, and dropping 'cluster' keeps this
# cell safe to re-run after the cluster labels have been added to stockData -
# otherwise the labels themselves would leak in as a clustering feature.
features = (
    stockData
    .select_dtypes(include='number')
    .columns
    .drop('cluster', errors='ignore')
)
features

Index(['Price', 'Market Cap', 'Free Float Market Cap %', '6m ADV', 'RoE %',
       'RoCE %', 'EBIT Margin %', 'EPS', 'PAT %', 'Stock Return %'],
      dtype='object')

In [None]:
# k-Means is distance-based, so features on large scales (e.g. Market Cap)
# would dominate the clustering. Rescale each feature to the [0, 1] range
# first. Note this is min-max normalisation, not z-score standardisation.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
stockDataFeatures_scaled = pd.DataFrame(
    scaler.fit_transform(stockData[features]),
    columns=features,
)
stockDataFeatures_scaled.describe()

Unnamed: 0,Price,Market Cap,Free Float Market Cap %,6m ADV,RoE %,RoCE %,EBIT Margin %,EPS,PAT %,Stock Return %
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,0.010163,0.015214,0.386592,0.027942,0.529061,0.567416,0.538476,0.022321,0.515244,0.145038
std,0.038309,0.060153,0.166734,0.090669,0.12632,0.079239,0.085363,0.051913,0.087109,0.075296
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.001167,0.000601,0.244506,0.00055,0.506965,0.527079,0.516873,0.003966,0.503271,0.10257
50%,0.003291,0.001707,0.363985,0.002338,0.544906,0.558323,0.538476,0.009215,0.519677,0.132243
75%,0.00868,0.006294,0.479731,0.010388,0.585471,0.598389,0.566724,0.020459,0.543242,0.171613
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# k-Means configuration.
# - n_jobs is intentionally not passed: it was deprecated in scikit-learn 0.23
#   and removed in 1.0, where passing it raises a TypeError.
# - n_init is pinned because its default differs between scikit-learn versions.
# - random_state is fixed so cluster labels and sizes are reproducible across
#   runs; without it the numbering and membership change on every execution.
N_CLUSTERS = 7
KMEANS_N_INIT = 10
KMEANS_RANDOM_STATE = 42

kmeans = KMeans(
    n_clusters=N_CLUSTERS,
    n_init=KMEANS_N_INIT,
    random_state=KMEANS_RANDOM_STATE,
)

In [None]:
# Fit the model on the scaled features and keep the per-row cluster labels.
clus = kmeans.fit(stockDataFeatures_scaled).labels_

In [None]:
# Attach each company's cluster label to the (unscaled) data, then show how
# many companies fall into each cluster.
cluster_labels = pd.Series(clus, index=stockData.index, name='cluster')
stockData['cluster'] = cluster_labels
cluster_labels.value_counts()

1    432
5    404
2    168
0    124
3     63
6     19
4     15
Name: cluster, dtype: int64

In [None]:
# Per-cluster profile: the mean of every column from position 2 onward
# (i.e. skipping the first two columns), grouped by cluster label and rounded
# to 3 decimals, with the cluster size inserted as the first column.
cluster_sizes = stockData['cluster'].value_counts()
cluster_means = stockData.iloc[:, 2:].groupby('cluster').mean()
clusterDesc = cluster_means.round(3)
clusterDesc.insert(0, 'size', cluster_sizes)

In [None]:
# Display the per-cluster summary table (cluster size plus column means).
clusterDesc

Unnamed: 0_level_0,size,Price,Market Cap,Free Float Market Cap %,6m ADV,RoE %,RoCE %,EBIT Margin %,EPS,PAT %,Stock Return %
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,124,1080.754,2705.938,44.352,3.373,34.23,38.111,22.061,34.712,17.21,7.621
1,432,598.294,569.398,27.941,0.519,10.269,14.316,9.183,103.236,5.801,8.996
2,168,630.69,635.866,67.993,1.623,5.126,10.2,7.806,208.097,3.181,2.828
3,63,92.13,218.797,43.803,0.626,-73.107,-7.897,-8.172,308.978,-23.213,-9.171
4,15,1013.843,25916.713,57.296,34.915,15.054,18.932,13.068,102.046,8.39,37.485
5,404,303.213,349.926,45.681,0.811,6.972,11.1,7.699,152.188,3.605,9.032
6,19,31.53,76.196,52.317,0.138,-45.198,-19.873,-72.576,90.791,-82.077,-13.529
