<a href="https://colab.research.google.com/github/Amelrich/Capstone-Fall-2020/blob/master/romane_silhouette_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from random import sample
import numpy.random as rd

!pip install tslearn

Collecting tslearn
[?25l  Downloading https://files.pythonhosted.org/packages/a7/67/aa3149fdfef2582d881ce4a5117c9e6a465d5082dd57866904ca508a157c/tslearn-0.4.1-cp36-cp36m-manylinux2010_x86_64.whl (770kB)
[K     |▍                               | 10kB 14.8MB/s eta 0:00:01[K     |▉                               | 20kB 2.1MB/s eta 0:00:01[K     |█▎                              | 30kB 2.7MB/s eta 0:00:01[K     |█▊                              | 40kB 3.0MB/s eta 0:00:01[K     |██▏                             | 51kB 2.5MB/s eta 0:00:01[K     |██▌                             | 61kB 2.7MB/s eta 0:00:01[K     |███                             | 71kB 3.0MB/s eta 0:00:01[K     |███▍                            | 81kB 3.3MB/s eta 0:00:01[K     |███▉                            | 92kB 3.5MB/s eta 0:00:01[K     |████▎                           | 102kB 3.3MB/s eta 0:00:01[K     |████▊                           | 112kB 3.3MB/s eta 0:00:01[K     |█████                           | 1

# 1. Silhouette Score Function

In [11]:
#Silhouette scores per clusters
from sklearn.metrics import silhouette_samples

def sil_sample_cluster(data, col1, col2, thresh):
  '''
  computes the percentage of silhouette scores about a threshold within each cluster
  (input) data: DataFrame; col1: str refering to cluster column; col2: str refering to silhouette scores column; thresh: float between -1 and 1
  (output) ss_scores; dictionary
  '''
  ss_scores = {}
  for i in range(data[col1].nunique()):
    subs = data[data[col1]==i]
    ss_thresh = subs[subs[col2]>=thresh]
    ss_scores[i] = len(ss_thresh)/len(subs)
  
  return ss_scores

# 2. Example 

In [12]:
#Get the data

##Time series generator function
class TS_generator:
  def __init__(self, nb_timeseries=2000, chunk_size=100):
    
    self.chunk_size = chunk_size
    self.nb_timeseries = nb_timeseries

    #Retrieve the stocks names
    self.symbols = pd.read_csv('https://raw.githubusercontent.com/Amelrich/Capstone-Fall-2020/master/sp500.csv', index_col=False)
    self.symbols = list(self.symbols['Symbol'].values)
    self.symbols = ['BF-B' if x=='BF.B' else x for x in self.symbols]
    self.symbols = ['BRK-B' if x=='BRK.B' else x for x in self.symbols]

    self.list_df = []

    #Build the random time series
    self.build_()

  def build_(self):    
    for _ in range(self.nb_timeseries):

      #Pick a random stock
      stock = self.symbols[rd.randint(len(self.symbols))]
      TS = pd.read_csv('https://raw.githubusercontent.com/Amelrich/Capstone-Fall-2020/master/data/'+stock+'.csv')
      
      #Pick a random starting point
      timemax = len(TS) - self.chunk_size
      start = rd.randint(timemax)
      stock_df = TS[start : start+self.chunk_size]

      self.list_df.append( stock_df )

  def get_list_of_df(self):
    #
    return self.list_df

  def get_array(self):
    #Return adjusted close array
    close_array = np.zeros((self.nb_timeseries, self.chunk_size))

    for i in range(self.nb_timeseries):
      close_array[i,:] = self.list_df[i]['Adj Close'].to_numpy()

    return close_array

##Scraping and getting the time series
nb_timeseries = 1000

gen = TS_generator(nb_timeseries=nb_timeseries, chunk_size=100) #default values but just for the syntax
X = gen.get_array()

In [14]:
#Fourier Transform Example
from tslearn.clustering import TimeSeriesKMeans
from sklearn.preprocessing import StandardScaler
from scipy.fftpack import fft,ifft, rfft, irfft

X_scaled = StandardScaler().fit_transform(X.T).T
X_ft = fft(X_scaled)

X_ft_real_imag = np.concatenate((X_ft.real,X_ft.imag),axis=1)

tsm = TimeSeriesKMeans(n_clusters=10, metric="dtw", max_iter=5,random_state=0).fit(X_ft_real_imag)
y_train_pred = tsm.predict(X_ft_real_imag)

print(Counter(y_train_pred))

  '{} 1-dimensional timeseries'.format(X.shape[0]))


Counter({0: 240, 1: 126, 3: 108, 4: 104, 9: 102, 2: 98, 5: 73, 7: 60, 8: 50, 6: 39})


In [15]:
#Overall Silhouette Score (mean silhouette scores over all clusters)
from tslearn.clustering import silhouette_score

silhouette_score(X_ft_real_imag, y_train_pred, metric = 'dtw')

0.14569857686201193

In [23]:
#Silhouette scores per clusters
from sklearn.metrics import silhouette_samples

silhouette_scores = silhouette_samples(X_ft_real_imag, y_train_pred)

clusters_ss = pd.concat([pd.DataFrame(y_train_pred, columns= ['cluster']), pd.DataFrame(silhouette_scores, columns = ['silhouette_score'])], axis = 1)
clusters_ss.head() 


sil_sample_cluster(clusters_ss, 'cluster', 'silhouette_score', 0.5) #0 for all

{0: 1.0,
 1: 1.0,
 2: 0.6632653061224489,
 3: 0.7592592592592593,
 4: 0.9038461538461539,
 5: 0.9452054794520548,
 6: 0.8205128205128205,
 7: 0.95,
 8: 0.76,
 9: 0.6470588235294118}