In [2]:
#uncomment line below and run if not yet installed
#! pip install scikit-learn-extra

In [290]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")
import math

## Additional imports can be included here
from  sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import AffinityPropagation
from sklearn_extra.cluster import KMedoids
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder


In [291]:
def make_pds (norm=None, dezero=None, norm_vis=None):
    '''Read the DTW distance matrix and the merged plant dataframe from ../Merging.

    Input: norm - truthy selects the "Norm" distance matrix, otherwise the "Raw" one
        dezero - truthy selects the "_No0" variant of the distance matrix
        norm_vis - truthy selects Merge_Norm.csv for the results (visualization)
            dataframe, otherwise Merge_Raw.csv
    Output: tuple (dmatrix, results), both loaded with pd.read_csv'''
    # One entry per (norm, dezero) combination; flags are only tested for truthiness.
    matrix_files = {
        (True, True): '../Merging/DTW_Matrix_Norm_No0.csv',
        (True, False): '../Merging/DTW_Matrix_Norm.csv',
        (False, True): '../Merging/DTW_Matrix_Raw_No0.csv',
        (False, False): '../Merging/DTW_Matrix_Raw.csv',
    }
    merge_file = '../Merging/Merge_Norm.csv' if norm_vis else '../Merging/Merge_Raw.csv'

    dmatrix = pd.read_csv(matrix_files[(bool(norm), bool(dezero))])
    results = pd.read_csv(merge_file)
    return dmatrix, results

In [292]:
def hierarchical_cluster (dmatrix, k=None, d_thresh=None, compute_d = False, encode=True):
    '''Complete-linkage agglomerative clustering on a precomputed distance matrix.

    Input: dmatrix = distance matrix of plants (handed to fit() as precomputed distances)
        k = number of clusters, or None if using a distance threshold
        d_thresh = distance threshold to use for clustering assignments, or None if using k
        compute_d = forwarded as AgglomerativeClustering(compute_distances=...)
        encode = True returns one-hot encoded labels, False returns a single 'label' column
    Output: df with one row per plant (same order as dmatrix); either one float 0/1
        column per cluster (columns 0..n-1) or one integer 'label' column'''

    model = AgglomerativeClustering(metric='precomputed', n_clusters=k, distance_threshold = d_thresh,
                                    linkage='complete', compute_distances=compute_d).fit(dmatrix)

    labels = model.labels_
    if not encode:
        # Previously `encode` was ignored and the labels were always one-hot encoded.
        return pd.DataFrame({'label': labels})

    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(labels.reshape(-1, 1))
    return pd.DataFrame(data=onehot)

In [293]:
def DBSCAN_cluster(dmatrix, eps=0.5, min_samples=5, encode=True):
    '''DBSCAN clustering on a precomputed distance matrix.

    Input: dmatrix = distance matrix of plants (handed to fit() as precomputed distances)
        eps = max distance between 2 samples for them to count as neighbours
        min_samples = number of samples in a neighbourhood for a point to be a core point
        encode = True returns one-hot encoded labels, False returns a single 'label' column
    Output: df with one row per plant (same order as dmatrix); either one float 0/1
        column per distinct label or one integer 'label' column.
        DBSCAN marks noise points with label -1; when one-hot encoding, that label is
        treated like any other value and gets its own column (the first one).'''

    model = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed').fit(dmatrix)

    labels = model.labels_
    if not encode:
        # Previously `encode` was ignored and the labels were always one-hot encoded.
        return pd.DataFrame({'label': labels})

    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(labels.reshape(-1, 1))
    return pd.DataFrame(data=onehot)

In [294]:
def AffinityProp_cluster(dmatrix, damping=.9, max_iter=15, encode=True):
    '''Affinity-propagation clustering driven by a precomputed distance matrix.

    Input: dmatrix = distance matrix of plants (larger value = less alike)
        damping = damping factor - the extent to which current value is maintained relative to incoming values
        max_iter = maximum number of iterations
        encode = True returns one-hot encoded labels, False returns a single 'label' column
    Output: df with one row per plant (same order as dmatrix); either one float 0/1
        column per distinct label or one integer 'label' column'''

    # affinity='precomputed' expects *similarities* (larger = more alike), so the
    # distances are negated before fitting; feeding raw distances inverts the clustering.
    similarity = -np.asarray(dmatrix, dtype=float)

    # NOTE(review): max_iter=15 is well below scikit-learn's default of 200; the fit may
    # stop before converging (warnings are silenced in this notebook) - confirm intended.
    model = AffinityPropagation(damping=damping, max_iter=max_iter, affinity='precomputed',random_state=0).fit(similarity)

    labels = model.labels_
    if not encode:
        # Previously `encode` was ignored and the labels were always one-hot encoded.
        return pd.DataFrame({'label': labels})

    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(labels.reshape(-1, 1))
    return pd.DataFrame(data=onehot)

In [295]:
def kMediods_cluster (dmatrix, k=8, encode=True):
    '''k-medoids clustering on a precomputed distance matrix.

    Input: dmatrix = distance matrix of plants (handed to fit() as precomputed distances)
        k = number of clusters
        encode = True returns one-hot encoded labels, False returns a single 'label' column
    Output: df with one row per plant (same order as dmatrix); either one float 0/1
        column per cluster or one integer 'label' column'''

    # random_state is pinned so repeated runs give the same medoids.
    model = KMedoids(n_clusters=k, random_state=0, metric='precomputed').fit(dmatrix)

    labels = model.labels_
    if not encode:
        # Previously `encode` was ignored and the labels were always one-hot encoded.
        return pd.DataFrame({'label': labels})

    encoder = OneHotEncoder(sparse_output=False)
    onehot = encoder.fit_transform(labels.reshape(-1, 1))
    return pd.DataFrame(data=onehot)

In [312]:
def _store_clustering(results_dict, dmatrix, onehot, key, prefix):
    '''Prefix the label columns, prepend the plantCode column and file the frame under prefix+key.'''
    if prefix:
        onehot.columns = [f'{prefix}{col}' for col in onehot.columns]
    # Plant codes are the column headers of the distance matrix, in row order.
    labelled = pd.concat([pd.Series(dmatrix.columns, name='plantCode'), onehot], axis=1)
    labelled['plantCode'] = labelled['plantCode'].astype(np.int64)
    results_dict[f'{prefix}{key}'] = labelled


def dictionary_clusters (K=22, norm = None, dzero=False):
    '''Run every clustering model over a parameter grid and collect the one-hot labels.

    Input: K = exclusive upper bound on the fixed cluster counts; k-medoids and
            hierarchical clustering are run for k = 3 .. K-1
        norm = True if using normalized rather than raw data
        dzero = True if using dezero'd data
    Output: returns and saves to a pickle file ("Norm_Plant_Clusters.p" or
        "Raw_Plant_Clusters.p" in the working directory) a dictionary where the keys are
        the model type and parameters and the values are a data frame with a plantCode
        column followed by the one hot encoded labels of that clustering.
        With norm set, every key and every label column carries an "N_" prefix.
        DBSCAN and distance-threshold hierarchical runs that yield fewer than 3 label
        columns are dropped.'''

    results_dict = {}
    dmatrix, _ = make_pds (norm=norm, dezero=dzero)

    # Decided up front: the file label used to be taken from a variable that was only set
    # inside the fixed-k loop, so K <= 3 crashed right before the pickle was written.
    prefix = 'N_' if norm else ''
    file_label = 'Norm' if norm else 'Raw'

    # Models that take the number of clusters directly.
    for k in range(3,K):
        _store_clustering(results_dict, dmatrix,
                          kMediods_cluster (dmatrix, k=k, encode = True), f'KM{k}', prefix)
        _store_clustering(results_dict, dmatrix,
                          hierarchical_cluster (dmatrix, k=k, encode=True), f'H{k}', prefix)

    # Normalized and raw distances live on very different scales, hence separate grids.
    if norm:
        eps_grid = np.linspace(0.000005,0.001, 20)
        dist_grid = np.linspace(0.025, 0.25, 20)
    else:
        eps_grid = np.arange(1000,30000,1000)
        dist_grid = np.linspace(50000000000000, 500000000000000, 20)

    for eps in eps_grid:
        for mins in np.linspace(5, 100, 5):
            results = DBSCAN_cluster(dmatrix, eps=eps, min_samples=int(mins))
            k = results.shape[1]  # number of label columns in the one-hot frame
            if k >= 3:
                # The key shows the un-truncated grid value (e.g. m28.75) although
                # min_samples is int(mins); kept as-is so existing keys stay valid.
                _store_clustering(results_dict, dmatrix, results,
                                  f'DB{k}_m{mins}eps{round(eps,6)}', prefix)

    for dist in dist_grid:
        results = hierarchical_cluster (dmatrix, d_thresh=dist, compute_d = True, encode=True)
        k = results.shape[1]
        if k >= 3:
            # Only the normalized keys round the threshold; raw keys use it verbatim.
            dist_tag = round(dist,4) if norm else dist
            _store_clustering(results_dict, dmatrix, results, f'H{k}_d{dist_tag}', prefix)

    # Affinity propagation now gets the same "N_" treatment as every other model; it
    # used to be stored un-prefixed even for normalized data.
    results = AffinityProp_cluster(dmatrix)
    _store_clustering(results_dict, dmatrix, results, f'AP{results.shape[1]}', prefix)

    # Context manager so the file handle is closed (and flushed) deterministically.
    with open(f"{file_label}_Plant_Clusters.p", "wb") as handle:
        pickle.dump(results_dict, handle)

    return results_dict
        

In [313]:
def raw_and_norm (K=22, dzero=False):
    '''Build the clustering dictionaries for both the normalized and the raw data.

    Input: K = number of max clusters to find (forwarded to dictionary_clusters)
        dzero = whether to dezero or not (forwarded to dictionary_clusters)
    Output: tuple (raw, norm) of the two dictionaries returned by dictionary_clusters;
        each call also saves its dictionary to a pickle file'''
    shared_args = dict(K=K, dzero=dzero)

    # Normalized run first, then raw - same order as before.
    norm_clusters = dictionary_clusters(norm=True, **shared_args)
    raw_clusters = dictionary_clusters(**shared_args)

    return raw_clusters, norm_clusters

In [314]:
# Build the raw and normalized clustering dictionaries with the default settings
# (K=22, dzero=False); each underlying dictionary_clusters call also writes a pickle file.
raw, norm = raw_and_norm()

In [315]:
# List the keys (model type + parameters) of the normalized clustering dictionary.
norm.keys()

dict_keys(['N_KM3', 'N_H3', 'N_KM4', 'N_H4', 'N_KM5', 'N_H5', 'N_KM6', 'N_H6', 'N_KM7', 'N_H7', 'N_KM8', 'N_H8', 'N_KM9', 'N_H9', 'N_KM10', 'N_H10', 'N_KM11', 'N_H11', 'N_KM12', 'N_H12', 'N_KM13', 'N_H13', 'N_KM14', 'N_H14', 'N_KM15', 'N_H15', 'N_KM16', 'N_H16', 'N_KM17', 'N_H17', 'N_KM18', 'N_H18', 'N_KM19', 'N_H19', 'N_KM20', 'N_H20', 'N_KM21', 'N_H21', 'N_DB5_m5.0eps5e-06', 'N_DB6_m5.0eps5.7e-05', 'N_DB8_m5.0eps0.00011', 'N_DB5_m5.0eps0.000162', 'N_DB4_m5.0eps0.000214', 'N_DB3_m28.75eps0.000214', 'N_DB5_m5.0eps0.000267', 'N_DB5_m28.75eps0.000267', 'N_DB5_m5.0eps0.000319', 'N_DB5_m28.75eps0.000319', 'N_DB4_m5.0eps0.000372', 'N_DB4_m28.75eps0.000372', 'N_DB4_m5.0eps0.000424', 'N_DB4_m28.75eps0.000424', 'N_DB4_m5.0eps0.000476', 'N_DB4_m28.75eps0.000476', 'N_DB4_m5.0eps0.000529', 'N_DB4_m28.75eps0.000529', 'N_DB4_m5.0eps0.000581', 'N_DB4_m28.75eps0.000581', 'N_DB4_m5.0eps0.000633', 'N_DB4_m28.75eps0.000633', 'N_DB4_m5.0eps0.000686', 'N_DB4_m28.75eps0.000686', 'N_DB4_m5.0eps0.000738', 'N

In [317]:
# Inspect the data frame stored under the 'N_KM3' key: plantCode plus one-hot label columns.
norm['N_KM3']

Unnamed: 0,plantCode,N_0,N_1,N_2
0,9,0.0,0.0,1.0
1,99,0.0,0.0,1.0
2,136,0.0,0.0,1.0
3,298,0.0,0.0,1.0
4,550,0.0,1.0,0.0
...,...,...,...,...
478,66596,1.0,0.0,0.0
479,66597,1.0,0.0,0.0
480,66612,1.0,0.0,0.0
481,66613,1.0,0.0,0.0
