Unsupervised Clusters</br>
- Using the DTW distance matrices computed on gas consumption time serieses this notebook runs K-Medoids, Agglomerative, DBSCAN and Affinity Propagation clustering models and saves models creating clusters of size 3 to 22 in nested dictionaries to use as input features for Supervised learning (Supervised_Unsupervised.p). The silhouette scores of these clusters are stored as a dictionary in S_Scores.p
- Dependencies: Merging/Unsupervised_Merge_DTW_Matrix.ipynb (DTW_Matrix_Norm.csv, DTW_Matrix_Raw.csv, Merge_Norm.csv, Merge_Raw.csv)</br>
- Produces: Clusters_Norm.p, Clusters_Raw.p, S_Scores.p</br>
</br>


In [2]:
#uncomment line below and run if not yet installed
#! pip install scikit-learn-extra

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")
import math

## Additional imports can be inlcuded here
from  sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import AffinityPropagation
from sklearn_extra.cluster import KMedoids
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [52]:
#Finding potential k based on 'Rule of Thumb' √n/2
#Decided to use 22 (√n) as the top cut off for k to add buffer, and 3 as the bottom cut off.
data = len(pd.read_csv('../Merging/Merge_Raw.csv'))
k = np.sqrt(data/2)
k

15.858751527153705

In [3]:
def hierarchical_cluster (dmatrix, k=None, d_thresh=None, compute_d = False, linkage='complete'):
    '''Input: dmatrix =  distance matrix of plants
        k = number of clusters or none if using distance threshold
        d_thresh = distance threshold to use for clustering assignments
        compute_d = False, or True if distance threshold used
        encode = whether to return results with onehot encoding or not
    Output: df of plants and their cluster labels, either one hot encoded or as one label column'''
    
    model = AgglomerativeClustering(metric='precomputed', n_clusters=k, distance_threshold = d_thresh,
                                    linkage=linkage, compute_distances=compute_d).fit(dmatrix)
    
    labels = model.labels_
    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(labels.reshape(-1, 1))
    results  = pd.DataFrame(data=onehot)
    results.columns = results.columns.astype(str)

    k = len(results.columns)
    if k > 3 and k < 22:
        s_score = silhouette_score(dmatrix, labels, metric='precomputed', random_state=0)
    else:
        s_score = -2
    
    return results, s_score

In [4]:
def DBSCAN_cluster(dmatrix, eps=0.5, min_samples=5, encode=True):
    '''Input: dmatrix =  distance matrix of plants
        eps = max distance between 2 samples to be considered in same cluster
        min_samples = the minimum number of samples in a cluster
        encode = whether to return results with onehot encoding or not
    Output: df of plants and their cluster labels, either one hot encoded or as one label column'''
    
    model = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed').fit(dmatrix)
    
    labels = model.labels_
    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(labels.reshape(-1, 1))
    results  = pd.DataFrame(data=onehot)
    results.columns = results.columns.astype(str)
    
    k = len(results.columns)
    if k > 3 and k < 22:
        s_score = silhouette_score(dmatrix, labels, metric='precomputed', random_state=0)
    else:
        s_score = -2
    
    return results, s_score

In [5]:
def AffinityProp_cluster(dmatrix, damping=.9, max_iter=15, preference=None):
    '''Input: dmatrix =  distance matrix of plants
        dampening = damping factor - the extent to which current value is maintained relative to incoming values
        max_iter = maximum number of iterations
        encode = whether to return results with onehot encoding or not
    Output: df of plants and their cluster labels, either one hot encoded or as one label column'''

    model = AffinityPropagation(damping=damping, max_iter=max_iter, affinity='precomputed', preference=preference, random_state=0).fit(dmatrix)
    
    labels = model.labels_
    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(labels.reshape(-1, 1))
    results  = pd.DataFrame(data=onehot)
    results.columns = results.columns.astype(str)

    k = len(results.columns)
    if k > 3 and k < 22:
        s_score = silhouette_score(dmatrix, labels, metric='precomputed', random_state=0)
    else:
        s_score = -1
        
    return results, s_score

In [6]:
def kMediods_cluster (dmatrix, k=8, init='random', max_iter=300):
    '''Input: dmatrix =  distance matrix of plants
        k = number of clusters
        encode = whether to return results with onehot encoding or not
    Output: df of plants and their cluster labels, either one hot encoded or as one label column'''
    
    model = KMedoids(n_clusters=k, random_state=0, metric='precomputed', method='pam', init=init, max_iter=max_iter).fit(dmatrix)

    labels = model.labels_
    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(labels.reshape(-1, 1))
    results  = pd.DataFrame(data=onehot)
    results.columns = results.columns.astype(str)
    
    s_score = silhouette_score(dmatrix, labels, metric='precomputed', random_state=0)
    
    return results, s_score

In [54]:
def dictionary_clusters (K=22, norm = None, dzero=False):
    '''Input: K = number of clusters to find
        norm = True if using normalized rather than raw data
        dzero = True if using dezero'd data
    Output: results_dict =  a dictionary where the keys are the model type and parameters
        and the values are a data frame with the labels for the particular clustering one hot encoded
        s_scores = df of the silhouette scores for model and parameter combo, organized by s_score
        models = a simple count of the number of models that were iterated through'''

    results_dict = {'KM':{}, 'H':{}, 'DB':{}, 'AP':{}}
    s_scores = []
    
    if norm:
        dmatrix = pd.read_csv('../Merging/DTW_Matrix_Norm.csv')
    else:
        dmatrix = pd.read_csv('../Merging/DTW_Matrix_Raw.csv')

    models = 0
    for k in range(3,K):
        
        for init in ['random', 'heuristic', 'k-medoids++', 'build']:
            for max_iter in np.arange(1, 300, 25):
                models += 1
                results, s_score = kMediods_cluster (dmatrix, k=k, init=init, max_iter=max_iter)
                results = pd.concat([pd.Series(dmatrix.columns,name='plantCode'),results], axis=1)
                results['plantCode'] = results['plantCode'].astype(np.int64)
                key = f'{k}KM_I{init[:4]}_MI{max_iter}'
                results_dict['KM'][key] = results
                s_scores.append({'Model': 'KM', 'Parameter Key': key,
                                'params':{'init': init, 'max_iter': max_iter},'s_score': s_score, 'k':k})

        for link in ['complete', 'average', 'single']:
                models += 1
                results, s_score = hierarchical_cluster (dmatrix, k=k, linkage=link)
                results = pd.concat([pd.Series(dmatrix.columns,name='plantCode'),results], axis=1)
                results['plantCode'] = results['plantCode'].astype(np.int64)
                key = f'{k}H_L{link[:4]}'
                results_dict['H'][key] = results
                s_scores.append({'Model': 'H', 'Parameter Key': key, 
                                 'params':{'link': link, 'dist': 'None'}, 's_score': s_score, 'k':k})
    
    if norm: 
        epss = np.linspace(0.000005,0.001, 26)
        dists = np.linspace(0.025, 0.25, 26)
    else:
        epss = np.arange(1000,30000,1000)
        dists = np.arange(50000000000000, 500000000000000, 20000000000000)

    for eps in epss:
        for mins in np.linspace(5, 100, 5):
            models += 1
            results, s_score = DBSCAN_cluster(dmatrix, eps=eps, min_samples=int(mins))
            k = len(results.columns)
            if s_score == -2:
                s_score = -1
            else:
                results = pd.concat([pd.Series(dmatrix.columns,name='plantCode'),results], axis=1)
                results['plantCode'] = results['plantCode'].astype(np.int64)
                key = f'{k}DB_M{int(mins)}eps{round(eps,6)}'
                results_dict['DB'][key] = results 
                s_scores.append({'Model': 'DB', 'Parameter Key': key, 
                                'params':{'eps': eps, 'mins': int(mins)}, 's_score': s_score, 'k': k,})
                

    for link in ['complete', 'average', 'single']:
        for dist in dists:
            models += 1
            results, s_score = hierarchical_cluster (dmatrix, d_thresh=dist, linkage = link)
            k = len(results.columns)
            if s_score == -2:
                s_score = -1
            else:
                results = pd.concat([pd.Series(dmatrix.columns,name='plantCode'),results], axis=1)
                results['plantCode'] = results['plantCode'].astype(np.int64)
                key = f'{k}H_L{link[:4]}_DT{round(dist,4)}'
                results_dict['H'][key] = results
                s_scores.append({'Model': 'H', 'Parameter Key': key,
                                'params': {'link': link, 'dist': dist, }, 's_score': s_score, 'k':k})
                
    n=0
    j=0
    for pref in range(0,10):
        for max_iter in range(1,15):
            for damp in np.linspace(.5, .9, 5):
                models += 1
                results, s_score = AffinityProp_cluster(dmatrix, max_iter=max_iter, damping=damp, preference=pref)
                k = len(results.columns)
                #As AP returns 1 (or on normative sometimes 481 of 483 clusters), this makes sure that at least 1 'example' saved
                if (n ==0 and k<len(dmatrix)) or (j ==0 and k > 1) or (k>1 and k<len(dmatrix)-2):
                    if (n ==0 and k<len(dmatrix)):
                        n = 1
                    if (j ==0 and k > 1):
                        j=1
                    results = pd.concat([pd.Series(dmatrix.columns,name='plantCode'),results], axis=1)
                    results['plantCode'] = results['plantCode'].astype(np.int64)
                    key = f'{k}AP_MI{max_iter}_D{damp}'
                    results_dict['AP'][key] = results
                    s_scores.append({'Model': 'AP', 'Parameter Key': key,
                                    'params': {'pref': pref, 'max_iter': max_iter, 'damp': damp},
                                    's_score': s_score, 'k':k})  

    s_scores = pd.DataFrame(s_scores).sort_values(by=['s_score'], ascending=False).reset_index(drop=True)

    return results_dict, s_scores, models
        

In [37]:
def raw_and_norm (K=22, dzero=False):
    '''Input: K = number of max clusters to find
        dezero = whether to dezero or not
    Output: results_dict = returns and saves to pickle files (split by raw and normalized version of the data) a dictionary where 
        the keys are the model and parameters and the values are a df with the labels for the particular clustering one hot encoded
        s_scores = dictionary where keys are raw/normalized and values are dfs of models + params sorted by S-score
        num_models = simple count of all models created in hyperparameter tuning   '''
    
    results_dict = {}
    s_scores = {}
    results_dict['norm'], s_scores['norm'], norm_models = dictionary_clusters(K=K, norm = True, dzero=dzero)
    results_dict['raw'], s_scores['raw'], raw_models = dictionary_clusters(K=K, dzero=dzero)
    num_models = norm_models + raw_models

    #was too big a file to upload to github, so spliting dictionary into 2 groups for storage
    #pickle.dump(results_dict, open(f"Plant_Clusters.p", "wb"))
    pickle.dump(results_dict['norm'], open(f"Clusters_Norm.p", "wb"))
    pickle.dump(results_dict['raw'], open(f"Clusters_Raw.p", "wb"))
    pickle.dump(s_scores, open(f"S_Scores.p", "wb"))   

    return results_dict, s_scores, num_models

In [43]:
clusters, scores, num_models = raw_and_norm()

In [41]:
#number of total models created in hyperparameter tuning
num_models

3760

In [57]:
#Number of models meeting criteria to be evaluated (between 3-22 clusters, plus AP models)
len(scores['raw']) + len(scores['norm'])

2078

In [42]:
scores['norm'].head()

Unnamed: 0,index,Model,Parameter Key,params,s_score,k
0,1050,H,4H_Lsing_DT0.043,"{'link': 'single', 'dist': 0.043000000000000003}",0.809829,4
1,101,H,4H_Lsing,"{'link': 'single', 'dist': 'None'}",0.809829,4
2,1047,H,4H_Laver_DT0.088,"{'link': 'average', 'dist': 0.088}",0.785676,4
3,100,H,4H_Laver,"{'link': 'average', 'dist': 'None'}",0.785676,4
4,151,H,5H_Laver,"{'link': 'average', 'dist': 'None'}",0.776311,5


In [472]:
clusters.keys()

dict_keys(['norm', 'raw'])

In [474]:
clusters['norm'].keys()

dict_keys(['KM', 'H', 'DB', 'AP'])

In [473]:
clusters['raw']['KM']['3KM_Irand_MI1']

Unnamed: 0,plantCode,0,1,2
0,9,0.0,1.0,0.0
1,99,0.0,1.0,0.0
2,136,0.0,1.0,0.0
3,298,0.0,1.0,0.0
4,550,0.0,1.0,0.0
...,...,...,...,...
498,66596,0.0,1.0,0.0
499,66597,0.0,1.0,0.0
500,66612,0.0,1.0,0.0
501,66613,0.0,1.0,0.0
