# 8 PV image data clustering
- process: https://towardsdatascience.com/a-step-by-step-guide-for-clustering-images-4b45f9906128
- clustering methods: https://towardsdatascience.com/from-data-to-clusters-when-is-your-clustering-good-enough-5895440a978a
- HOG method for feature extraction: https://www.analyticsvidhya.com/blog/2019/09/feature-engineering-images-introduction-hog-feature-descriptor/

In [None]:
#imports
import pandas as pd
import os
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import random
import matplotlib.pyplot as plt
import plotly.io as pio

#image clustering library
try:
    from clustimage import Clustimage
except:
    pass

#machine learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

#folders
data_folder = "data"

#warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
#seed Python's random module; splitter() and the cluster sampling cell draw from it via random.sample
random.seed(42)

In [None]:
#set if data should be saved
#NOTE(review): this flag is not read by any of the visible cells
save_data = False

In [None]:
#plot styles
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#default plot size
#NOTE(review): scale_show() below does not read this dict, it carries its own 1500 x 750 values
size = {
    "width" : 1500 ,
    "height" : 750 ,
}

#function for plotting
def scale_show(fig, size_override = False, width = 1500, height = 750):
    """Apply the notebook's font sizes (and, by default, a fixed size) to a figure and show it.

    Parameters
    ----------
    fig
        Object exposing ``update_layout(**kwargs)`` and ``show()``.
    size_override : bool
        When truthy the figure's own size is left untouched; when falsy the
        figure is resized to ``width`` x ``height``.
    width, height : int
        Size applied when ``size_override`` is falsy. The defaults reproduce the
        previously hard-coded 1500 x 750.

    Returns
    -------
    None
    """

    #set font: 16 for general text, 20 for the title, 18 for both axis titles
    fig.update_layout(
        font = dict(size=16),
        title_font = dict(size=20),
        xaxis_title_font = dict(size=18),
        yaxis_title_font = dict(size=18),
    )

    #set size only if the caller did not ask to keep the figure's own size
    if not size_override:
        fig.update_layout(
            width=width,
            height=height,
        )

    #show
    fig.show()

    return

## 8.1 data preparation

In [None]:
#load the clustering table and drop the longitude column (it is not referenced in the visible cells)
csv_path = os.path.join(data_folder, "df_pv_clustering.csv")
df = pd.read_csv(csv_path).drop(columns = "longitude")
#df["date"] = pd.to_datetime(df["date"])
df.head()

In [None]:
#order rows by date, then level, then latitude and renumber the index from 0
#image_compiler() consumes the rows in exactly this order
df = df.sort_values(by = ["date", "level", "latitude"]).reset_index(drop = True)
df.head()

In [None]:
#number of distinct dates, latitudes and levels in the table
n_dates, n_lats, n_levels = (
    len(df[column].unique()) for column in ("date", "latitude", "level")
)

print(n_dates, n_lats, n_levels, sep = "\n")


In [None]:
#cost: high
#NOTE(review): everything below is a bare string literal, so none of it is executed;
#it reads like an earlier per-date / per-level way of collecting the images

"""
metric = "speed" #["speed", "t"]
images_na = {}

for date in df["date"].unique().tolist()[:3]:
    im = []

    for level in df["level"].unique():
        metrics = df.loc[(df["date"] == date) & (df["level"] == level)]["speed"].tolist()
        im.append(metrics)
    
    print(f"Compiling image: {date}", end = "\r")
    images_na[date] = im
"""

In [None]:
def image_compiler(df, metric):
    """Standardize one column of ``df`` and reshape it into a stack of 2-D images.

    The rows are consumed in their current order: every ``n_lats`` consecutive
    values form one image row and every ``n_levels`` image rows form one image,
    where ``n_lats`` / ``n_levels`` are the numbers of distinct values in the
    "latitude" and "level" columns. The caller therefore has to pass ``df``
    already ordered image by image, row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "latitude", "level" and ``metric``.
    metric : str
        Name of the column that supplies the pixel values.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_images, n_levels, n_lats)``.

    Raises
    ------
    ValueError
        If the rows do not fill at least one complete image.
    """

    n_lats      = df["latitude"].unique().shape[0]
    n_levels    = df["level"].unique().shape[0]

    #standardize pixels (z-score over the whole column, using pandas' .std())
    values = df[metric]
    pixels = ((values - values.mean()) / values.std()).to_numpy(dtype = float)

    pixels_per_image = n_lats * n_levels
    n_images = len(pixels) // pixels_per_image if pixels_per_image else 0

    if n_images == 0:
        raise ValueError(
            f"need at least {pixels_per_image} rows for one {n_levels} x {n_lats} image, got {len(pixels)}"
        )

    #one reshape replaces the pixel-by-pixel loop; trailing rows that do not
    #fill a complete image are left out, as before
    images = pixels[: n_images * pixels_per_image].reshape(n_images, n_levels, n_lats)

    return images

In [None]:
#build the image stack from the "speed" column (df was sorted by date, level, latitude above)
images = image_compiler(df = df, metric = "speed")

In [None]:
#checksum: rows / (latitudes * levels) should equal the number of compiled images
print(df.shape[0] / (n_lats * n_levels))
print(len(images))

In [None]:
#heatmap of the first image in the stack
px.imshow(images[0], title = "Wind speed standardized", color_continuous_scale = plt_style_s)


In [None]:
#heatmap of the last image in the stack
px.imshow(images[-1], title = "Wind speed standardized", color_continuous_scale = plt_style_s)

In [None]:
#raw standardized values of the first image
images[0]

In [None]:
#the complete image stack
images

In [None]:
#split the data into a train and a test set (no validation set)

def splitter (images):
    """Randomly split a stack of images into an 80 % train set and a 20 % test set.

    The train indexes are drawn with ``random.sample``, so the split depends on
    the state of Python's global ``random`` generator. ``images`` itself is only
    read, never modified.

    Parameters
    ----------
    images
        Sequence of equally shaped arrays (or an array whose first axis
        enumerates the images); must support ``len()`` and integer indexing.

    Returns
    -------
    train_set : numpy.ndarray
        Stacked train images, in the order in which their indexes were sampled.
    test_set : numpy.ndarray
        Stacked remaining images, in ascending index order.
    index_matcher : dict
        For "train" and "test": "df_i" holds the positions in ``images`` and
        "set_i" the matching positions inside the returned set (both sets start at 0).

    Raises
    ------
    ValueError
        Raised by ``np.stack`` when one of the two sets ends up empty.
    """

    train_split     = 0.8   #the remaining 0.2 becomes the test set

    n_images = len(images)
    index_list = list(range(n_images))
    train_size = int(n_images * train_split)

    train_set_i     = random.sample(population = index_list, k = train_size)

    #membership test against a set keeps building the complement linear
    train_lookup    = set(train_set_i)
    test_set_i      = [index for index in index_list if index not in train_lookup]

    train_set = np.stack([images[i] for i in train_set_i])
    test_set = np.stack([images[i] for i in test_set_i])

    print(f"train set:\t{round(len(train_set) / len(images), 2)}\ntest set:\t{round(len(test_set) / len(images),2)}")

    #match indexes in df to indexes in sets
    index_matcher = {
        "train" : {
            "df_i" :    train_set_i,
            "set_i" :   list(range(len(train_set))),
        },
        "test" : {
            "df_i" :    test_set_i,
            "set_i":    list(range(len(test_set))),
        },
    }

    return train_set, test_set, index_matcher

In [None]:
#random 80/20 split of the image stack; index_matcher maps set positions back to positions in images
train_set, test_set, index_matcher = splitter(images)

In [None]:
#shape of a single train image
train_set[0].shape

## 8.2 modelling (clustimage)

In [None]:
#clustimage is optional (its import at the top is guarded), so stay best-effort here,
#but report why the model could not be created instead of failing silently
try:
    cl = Clustimage(
        method='hog',
        embedding='tsne',
        grayscale=False,
        dim=(10,45),

        params_hog = {
            "orientations"      : 8,
            "pixels_per_cell"   : (4,4),
        },

        verbose = True,
    )
except Exception as err:
    #Exception (not a bare except) so that KeyboardInterrupt / SystemExit still propagate
    cl = None
    print(f"Clustimage model not created: {type(err).__name__}: {err}")

In [None]:
#best-effort run of the full clustimage pipeline on the train images
try:
    results = cl.fit_transform(
        train_set,
        cluster='agglomerative',
        evaluate='silhouette',
        metric='euclidean',
        linkage='ward',
        min_clust=3,
        max_clust=15,
        cluster_space='high',
    )
except Exception as err:
    #report the real cause; the old fixed message blamed a datatype conversion for every
    #failure and a bare except also swallowed KeyboardInterrupt during a long fit
    print(f"Clustimage fit_transform failed: {type(err).__name__}: {err}")

In [None]:
#same clustimage run as a sequence of separate steps, still best-effort
try:
    #create model
    cl = Clustimage(method='hog')

    #extract features
    train_set_feat = cl.extract_feat(train_set)

    #embed the extracted features
    #NOTE(review): the original comment said tSNE; no embedding is passed here, so this
    #relies on clustimage's default - confirm
    xycoord = cl.embedding(train_set_feat)

    #cluster with the explicit settings below
    labels = cl.cluster(
        cluster='agglomerative',
        evaluate='silhouette',
        metric='euclidean',
        linkage='ward',
        min_clust=3,
        max_clust=15,
        cluster_space='high',
    )

    #keep the results collected on the model
    results = cl.results

except Exception as err:
    #report the real cause; the old fixed message blamed a datatype conversion for every
    #failure and a bare except also swallowed KeyboardInterrupt
    print(f"Clustimage step-by-step run failed: {type(err).__name__}: {err}")

## 8.3 modelling (sklearn)
- https://medium.com/@chengweizhang2012/how-to-do-unsupervised-clustering-with-keras-9e1284448437

In [None]:
class KMC():
    """KMeans clustering of 2-D images, with per-image bookkeeping of the assigned labels.

    The images are flattened to one feature vector each, KMeans is fitted on the
    train set and the test set is labelled with the fitted model.

    Attributes set by ``__init__``: ``train_set`` / ``test_set`` (private copies of
    the inputs), ``train_set_flat`` / ``test_set_flat`` (2-D views of those copies),
    ``random_state``, ``n_cluster`` and ``algorithm``.
    Attributes set by ``create_model``: ``model`` (the fitted ``KMeans``) and
    ``labels`` (list of dicts with the keys "set", "label" and "im"; all train
    images first, then all test images).
    """

    def __init__ (self, n_cluster, train_set, test_set):
        """Store copies of both image stacks and prepare their flattened versions.

        Parameters
        ----------
        n_cluster : int
            Number of clusters handed to ``KMeans``.
        train_set, test_set : numpy.ndarray
            Arrays of shape ``(n_samples, height, width)``.
        """

        #set unflattened values (copies, so the caller's arrays stay untouched)
        self.train_set      = train_set.copy()
        self.test_set       = test_set.copy()

        #model params
        self.random_state   = 42
        self.n_cluster      = n_cluster
        #classic KMeans; scikit-learn < 1.1 calls it "full", see _sklearn_algorithm()
        self.algorithm      = "lloyd"

        #flatten values
        self.train_set_flat     = self.reshape(self.train_set)
        self.test_set_flat      = self.reshape(self.test_set)

    def reshape(self, set):
        """Flatten ``(n_samples, height, width)`` to ``(n_samples, height * width)``.

        Raises ``ValueError`` if ``set`` does not have exactly three dimensions.
        """

        n_samples, height, width = set.shape
        images_flat = set.reshape((n_samples, height * width))

        return images_flat

    @staticmethod
    def _sklearn_algorithm(name):
        """Return the spelling of ``name`` that the installed scikit-learn accepts.

        scikit-learn 1.1 renamed the ``KMeans`` algorithm "full" to "lloyd" and
        later releases reject the old name, while releases before 1.1 only know
        "full". Any other name is passed through unchanged.
        """

        #local imports: only needed when a model is actually created
        import re
        import sklearn

        version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
        #an unparsable version string is treated as a current release
        has_lloyd = version is None or (int(version.group(1)), int(version.group(2))) >= (1, 1)

        if name in ("full", "lloyd"):
            return "lloyd" if has_lloyd else "full"

        return name

    def create_model(self):
        """Fit ``KMeans`` on the flattened train set, then label train and test images."""

        self.model = KMeans(
            n_clusters      = self.n_cluster,
            random_state    = self.random_state,
            algorithm       = self._sklearn_algorithm(self.algorithm),
        )

        self.model.fit(self.train_set_flat)
        self.__predict_match()

        return

    def __predict_match(self):
        """Fill ``self.labels`` with one dict per image: set name, cluster label, image."""

        #train labels come from the fit itself, test labels from predicting with the fitted model
        labels_train = self.model.labels_
        labels_test = self.model.predict(self.test_set_flat)

        #match image and labels for analysis (train)
        self.labels = [
            {"set" : "train", "label" : label, "im" : image}
            for label, image in zip(labels_train, self.train_set)
        ]

        #match image and labels for analysis (test)
        self.labels.extend(
            {"set" : "test", "label" : label, "im" : image}
            for label, image in zip(labels_test, self.test_set)
        )

        return

In [None]:
#fit KMeans with 9 clusters on the train images; create_model() also labels the test images
kmc = KMC(n_cluster = 9, train_set = train_set, test_set = test_set)
kmc.create_model()

In [None]:
#one row per image: set (train / test), cluster label and the image array
df_cluster = pd.DataFrame(kmc.labels)
df_cluster

In [None]:
#normalised histogram of the cluster labels, train and test bars next to each other
fig = px.histogram(
    df_cluster,
    x = "label",
    color = "set",
    barmode = "group",
    histfunc = "count",
    histnorm = "probability density",
    color_discrete_sequence = plt_style_s,
    title = "KMeans Clusters",
)

#default size handling of scale_show
scale_show(fig)

In [None]:
#raw list of per-image dicts (set, label, im)
kmc.labels

In [None]:
#draw a few random images per cluster to get an idea of what each cluster looks like
sample_plots = 3

#row indexes of df_cluster, grouped by cluster label
label_indexes = {
    label : df_cluster[df_cluster["label"] == label].index.tolist()
    for label in df_cluster["label"].unique().tolist()
}

#walk through the clusters in ascending label order
for label in sorted(label_indexes):

    im_ind_all = label_indexes[label]

    for _ in range(sample_plots):

        #each draw is independent, so the same image can come up more than once
        im_ind = random.sample(population = im_ind_all, k = 1)[0]

        #retrieve data for plotting
        record      = kmc.labels[im_ind]
        im_data     = record["im"]
        im_label    = record["label"]
        im_set      = record["set"]

        #generate image plot
        fig = px.imshow(
            im_data,
            title = f"Wind speed (norm) - Label: {im_label} ({im_set})",
            color_continuous_scale = plt_style_s,
            range_color = [-2,6],
            width = 1500,
            height = 500,
        )

        fig.update_xaxes(title_text="Latitude (offset by -44)")
        fig.update_yaxes(title_text="Pressure level [hPa]")

        #NOTE(review): the figure is built but neither saved nor shown, both calls are disabled
        #fig.write_image("test.png") #f"Wind_speed_(norm)-Label:{im_label}({im_set}).png"
        #scale_show(fig, size_override = True)

In [None]:
#match df data with image clusters
#stack the train and test lookup tables (columns df_i and set_i) on top of each other
#NOTE(review): set_i starts at 0 in both tables, so on its own it is not a unique key in df_index
df_index_train, df_index_test = (
    pd.DataFrame(index_matcher[name]) for name in ("train", "test")
)

df_index = pd.concat(objs = [df_index_train, df_index_test])

df_index.head()

In [None]:
#rows should add up to train + test images
df_index.shape

In [None]:
#spot check: pixel sum of the image in the third row
df_cluster.iloc[2]["im"].sum()

In [None]:
#attach df_i (position of the image in the original stack) to every clustered image
#
#index_matcher numbers set_i from 0 inside the train set AND from 0 inside the test set,
#while df_cluster lists all train images first and then all test images. Merging on a
#running row number alone pairs the first rows with both a train and a test entry and
#finds no partner for the test rows, so the key has to be (set, position inside the set).
df_cluster["set_i"] = df_cluster.groupby("set").cumcount()

df_index_keyed = pd.concat(
    objs = [pd.DataFrame(index_matcher[name]).assign(set = name) for name in ("train", "test")],
    ignore_index = True,
)

df_cluster = pd.merge(
    left = df_cluster,
    right = df_index_keyed,
    on = ["set", "set_i"],
    validate = "one_to_one", #fail loudly if an image would be matched twice
)
df_cluster.drop(labels = "set_i", inplace = True, axis = 1)

In [None]:
#inspect the table after df_i was attached
df_cluster

In [None]:
#lookup table: df_i = position of a date among the distinct dates of df, in order of appearance
df_date = (
    pd.DataFrame({"date" : df["date"].unique().tolist()})
    .rename_axis("df_i")
    .reset_index()
)

df_date.head()

In [None]:
#attach the date belonging to each df_i
df_cluster = pd.merge(left = df_cluster, right = df_date, left_on = "df_i", right_on = "df_i")

In [None]:
#inspect the table after the date column was attached
df_cluster

In [None]:
#cut month and year out of the date text by position (characters 5-6 and 0-3)
#NOTE(review): assumes the text starts with YYYY-MM - confirm against the csv
date_text = df_cluster["date"].astype(str)
df_cluster["month"] = date_text.map(lambda text: int(text[5:7]))
df_cluster["year"] = date_text.map(lambda text: int(text[:4]))

In [None]:
#inspect the table after month and year were added
df_cluster

In [None]:
#histogram over months, one facet row per cluster label
fig = px.histogram(
    df_cluster.sort_values(by = "label"),
    x = "month",
    y = "label",
    facet_row = "label",
    histfunc = "count",
    histnorm = "density",
    color_discrete_sequence = plt_style_c,
    title = "Distribution of clusters",
    width = 1250,
    height = 3000,
)

#keep the width / height given above
scale_show(fig, size_override = True)

In [None]:
#histogram over cluster labels, one facet row per month
fig = px.histogram(
    df_cluster.sort_values(by = "month"),
    x = "label",
    facet_row = "month",
    histfunc = "count",
    histnorm = "density",
    color_discrete_sequence = plt_style_c,
    title = "Distribution of clusters",
    width = 750,
    height = 3000,
)

#keep the width / height given above
scale_show(fig, size_override = True)

In [None]:
#histogram over years, one facet row per cluster label
fig = px.histogram(
    df_cluster.sort_values(by = "year"),
    x = "year",
    y = "label",
    facet_row = "label",
    histfunc = "count",
    histnorm = "density",
    color_discrete_sequence = plt_style_c,
    title = "Distribution of clusters",
    width = 750,
    height = 3000,
)

#keep the width / height given above
scale_show(fig, size_override = True)