In [9]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

## Company Stock Price Movements Dataset

In [11]:
df = pd.read_csv('https://assets.datacamp.com/production/course_2072/datasets/company-stock-movements-2010-2015-incl.csv', 
                 index_col=0)
df.head()

Unnamed: 0,2010-01-04,2010-01-05,2010-01-06,2010-01-07,2010-01-08,2010-01-11,2010-01-12,2010-01-13,2010-01-14,2010-01-15,...,2013-10-16,2013-10-17,2013-10-18,2013-10-21,2013-10-22,2013-10-23,2013-10-24,2013-10-25,2013-10-28,2013-10-29
Apple,0.58,-0.220005,-3.409998,-1.17,1.680011,-2.689994,-1.469994,2.779997,-0.680003,-4.999995,...,0.320008,4.519997,2.899987,9.590019,-6.540016,5.959976,6.910011,-5.359962,0.840019,-19.589981
AIG,-0.640002,-0.65,-0.210001,-0.42,0.710001,-0.200001,-1.130001,0.069999,-0.119999,-0.5,...,0.919998,0.709999,0.119999,-0.48,0.010002,-0.279998,-0.190003,-0.040001,-0.400002,0.66
Amazon,-2.350006,1.260009,-2.350006,-2.009995,2.960006,-2.309997,-1.640007,1.209999,-1.790001,-2.039994,...,2.109985,3.699982,9.570008,-3.450013,4.820008,-4.079986,2.579986,4.790009,-1.760009,3.740021
American express,0.109997,0.0,0.260002,0.720002,0.190003,-0.270001,0.75,0.300004,0.639999,-0.130001,...,0.680001,2.290001,0.409996,-0.069999,0.100006,0.069999,0.130005,1.849999,0.040001,0.540001
Boeing,0.459999,1.77,1.549999,2.690003,0.059997,-1.080002,0.36,0.549999,0.530002,-0.709999,...,1.559997,2.480003,0.019997,-1.220001,0.480003,3.020004,-0.029999,1.940002,1.130005,0.309998


In [19]:
movements = df.values
companies = df.index.values
movements.shape, companies.shape

((60, 963), (60,))

## Clustering stocks using KMeans

In [7]:
# Create a normalizer
normalizer = Normalizer()

In [8]:
# Create a KMeans model with 10 clusters
kmeans = KMeans(n_clusters=10)

In [10]:
# Make a pipeline chaining normalizer and kmeans
pipeline = make_pipeline(normalizer, kmeans)

In [13]:
# Fit pipeline to the daily price movements
pipeline.fit(movements)

Pipeline(steps=[('normalizer', Normalizer(copy=True, norm='l2')), ('kmeans', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0))])

## Which stocks move together?

In [24]:
# Predict the cluster labels
labels = pipeline.predict(movements)

In [21]:
# Create a DataFrame aligning labels and companies
df = pd.DataFrame({'labels': labels, 'companies': companies})

In [22]:
# Display df sorted by cluster label
print(df.sort_values('labels'))

                             companies  labels
48                              Toyota       0
45                                Sony       0
34                          Mitsubishi       0
7                                Canon       0
21                               Honda       0
44                        Schlumberger       1
35                            Navistar       1
32                                  3M       1
51                   Texas instruments       1
53                       Valero Energy       1
13                   DuPont de Nemours       1
12                             Chevron       1
0                                Apple       1
10                      ConocoPhillips       1
57                               Exxon       1
8                          Caterpillar       1
54                            Walgreen       2
40                      Procter Gamble       2
9                    Colgate-Palmolive       2
39                              Pfizer       2
38           