In [56]:
%load_ext autoreload
%autoreload 2
from DataPaths import Paths
from DatasetReader import DatasetReader
from HalfhourClusterer import HalfhourClusterer
from ClusterVisualisation import visualise_model

import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
dr = DatasetReader()
year_data_paths = dr.read_directories(Paths.mean_pickles).full_file_paths

In [7]:
year_data_paths

['YOUR_PATH\\data\\mean_pickles\\\\means_2016_from_1st_monday_february_3_weeks.pkl',
 'YOUR_PATH\\data\\mean_pickles\\\\means_2017_from_1st_monday_february_3_weeks.pkl',
 'YOUR_PATH\\data\\mean_pickles\\\\means_2020_from_1st_monday_february_3_weeks.pkl',
 'YOUR_PATH\\data\\mean_pickles\\\\means_2021_from_1st_monday_february_3_weeks.pkl']

In [115]:
filter = 15
hhc_2016 = HalfhourClusterer(year_data_paths[0], filter)
hhc_2017 = HalfhourClusterer(year_data_paths[1], filter)
hhc_2020 = HalfhourClusterer(year_data_paths[2], filter)
hhc_2021 = HalfhourClusterer(year_data_paths[3], filter)

In [116]:
model_2016 = hhc_2016.train(6, True, False)
model_2017 = hhc_2017.train(6, True, False)
model_2020 = hhc_2020.train(6, True, False)
model_2021 = hhc_2021.train(6, True, False)

In [117]:
preds_2016 = hhc_2016.predict(model_2016, True, False)
preds_2017 = hhc_2017.predict(model_2017, True, False)
preds_2020 = hhc_2020.predict(model_2020, True, False)
preds_2021 = hhc_2021.predict(model_2021, True, False)

In [118]:
%matplotlib qt

visualise_model(model_2016, preds_2016, hhc_2016.get_dataset(True, False), True, False)
visualise_model(model_2017, preds_2017, hhc_2017.get_dataset(True, False), True, False, "g")
visualise_model(model_2020, preds_2020, hhc_2020.get_dataset(True, False), True, False, "b")
visualise_model(model_2021, preds_2021, hhc_2021.get_dataset(True, False), True, False, "y")


In [119]:
model_2016_unscaled = hhc_2016.train(6, False, False)
model_2017_unscaled = hhc_2017.train(6, False, False)
model_2020_unscaled = hhc_2020.train(6, False, False)
model_2021_unscaled = hhc_2021.train(6, False, False)

In [120]:
visualise_model(model_2016_unscaled, hhc_2016.predict(model_2016_unscaled, True, False), hhc_2016.get_dataset(True, False), True, False)
visualise_model(model_2017_unscaled, hhc_2017.predict(model_2017_unscaled, True, False), hhc_2017.get_dataset(True, False), True, False, "g")
visualise_model(model_2020_unscaled, hhc_2020.predict(model_2020_unscaled, True, False), hhc_2020.get_dataset(True, False), True, False, "b")
visualise_model(model_2021_unscaled, hhc_2021.predict(model_2021_unscaled, True, False), hhc_2021.get_dataset(True, False), True, False, "y")

In [121]:
from tslearn.metrics import dtw

In [122]:
def find_closest_clusters(model_list):
  models = len(model_list)
  classes = model_list[0].n_clusters
  all_pairs = []

  for i in range(1, models):
    m1 = model_list[i-1]
    m2 = model_list[i]
    dtws = np.zeros((classes, classes))
    for j in range(classes):
      cluster1 = m1.cluster_centers_[j].ravel()
      for k in range(classes):
        cluster2 = m2.cluster_centers_[k].ravel()
        dtws[j, k] = dtw(cluster1, cluster2, sakoe_chiba_radius=5)
    mask = np.ones((classes, classes))
    pairs = []

    while len(pairs) != classes:
      mask, place = row_or_column_filled(mask)
      
      if (len(place) == 2):
        pairs.append(place)
      else:
        mask = fill_largest_to_mask(dtws, mask)
    
    all_pairs.append(pairs)

  return all_pairs

def fill_largest_to_mask(matrix, mask):
  new_mask = mask.copy()
  largest = 0
  largest_i = 0
  largest_j = 0
  matrix_size = matrix.shape[0]
  for i in range(matrix_size):
    for j in range(matrix_size):
      if mask[i, j] == 1 and matrix[i, j] > largest:
        largest = matrix[i, j]
        largest_i = i
        largest_j = j
  
  new_mask[largest_i, largest_j] = 0
  return new_mask

def find_one_col(mask, col):
  for i in range(len(mask[col])):
    if mask[i, col] == 1:
      return i

def find_one_row(mask, row):
  for i in range(len(mask[row])):
    if mask[row, i] == 1:
      return i

def row_or_column_filled(mask):
  mask_l = mask.shape[0]
  new_mask = mask.copy()
  for i in range(mask_l):
    if mask[i,:].sum() == 1:
      row = find_one_row(mask, i)
      new_mask[i,:] = 0
      new_mask[:,row] = 0
      return new_mask, (row, i)
    elif mask[:,i].sum() == 1:
      col = find_one_col(mask, i)
      new_mask[col,:] = 0
      new_mask[:,i] = 0
      return new_mask, (i, col)
  return new_mask, ()


  

In [123]:
all_models = [model_2016_unscaled, model_2017_unscaled, model_2020_unscaled, model_2021_unscaled]

In [124]:
pairs = find_closest_clusters(all_models)

In [125]:
pairs

[[(4, 5), (1, 2), (5, 4), (3, 0), (0, 1), (2, 3)],
 [(4, 4), (5, 1), (0, 5), (3, 3), (1, 0), (2, 2)],
 [(2, 4), (0, 0), (4, 5), (3, 3), (1, 1), (5, 2)]]

In [126]:
similar_clusters = []
for i in range(6):
  clusters = [i]
  comparator = i
  for pair_list in pairs:
    skip = False
    j = 0
    while not skip:
      pair = pair_list[j]
      j += 1
      if pair[0] == comparator:
        comparator = pair[1]
        clusters.append(pair[1])
        skip = True
  similar_clusters.append(clusters)



In [127]:
# PLOT SIMILAR CLUSTERS TOGETHER
similar_clusters

[[0, 1, 0, 0],
 [1, 2, 2, 4],
 [2, 3, 3, 3],
 [3, 0, 5, 2],
 [4, 5, 1, 1],
 [5, 4, 4, 5]]

In [128]:
model_dir = "YOUR_PATH\\models\\"

model_2016.to_pickle(f"{model_dir}model_2016")
model_2017.to_pickle(f"{model_dir}model_2017")
model_2020.to_pickle(f"{model_dir}model_2020")
model_2021.to_pickle(f"{model_dir}model_2021")
model_2016_unscaled.to_pickle(f"{model_dir}model_2016_unscaled")
model_2017_unscaled.to_pickle(f"{model_dir}model_2017_unscaled")
model_2020_unscaled.to_pickle(f"{model_dir}model_2020_unscaled")
model_2021_unscaled.to_pickle(f"{model_dir}model_2021_unscaled")

In [154]:
import matplotlib.pyplot as plt

def visualise_clusters(models, cluster_similarity):
  class_c = models[0].n_clusters

  cluster_center_groups = []
  # For each cluster find all similar centers from all models
  for i in range(class_c):
    centers = []
    for j in range(len(models)):
      centers.append(models[j].cluster_centers_[cluster_similarity[i][j]].ravel())
    cluster_center_groups.append(centers)

  for asd in range(len(cluster_center_groups)):
    cluster_group = cluster_center_groups[asd]
    plt.subplot(int(class_c / 3), 3, asd+1)
    for cluster_center in cluster_group:
      plt.plot(cluster_center)
    plt.xlim(0, 336)
    plt.ylim(-2, 800)

  plt.tight_layout()
  plt.show()

In [155]:
visualise_clusters(all_models, similar_clusters)

In [10]:
import pandas as pd

In [5]:
models = dr.read_directories(Paths.model_path).full_file_paths

In [6]:
models

['YOUR_PATH\\models\\\\model_2016',
 'YOUR_PATH\\models\\\\model_2016_unscaled',
 'YOUR_PATH\\models\\\\model_2017',
 'YOUR_PATH\\models\\\\model_2017_unscaled',
 'YOUR_PATH\\models\\\\model_2020',
 'YOUR_PATH\\models\\\\model_2020_unscaled',
 'YOUR_PATH\\models\\\\model_2021',
 'YOUR_PATH\\models\\\\model_2021_unscaled']

In [11]:
means_data_2016_feb_3weeks = pd.read_pickle(year_data_paths[0])

In [44]:
def filter_only_lams_in_all_datasets(data_paths):
  datas = []
  for path in data_paths:
    datas.append(pd.read_pickle(path))
  
  all_df = pd.concat(datas, axis=0)
  grouped = all_df.groupby(["lamId", "direction"])
  filtered = grouped.filter(lambda x: x["year"].count() == len(data_paths))

  return filtered

In [47]:
all_datas = filter_only_lams_in_all_datasets(year_data_paths)

In [63]:
def get_year_of_data(full_data, year):
  return full_data[full_data["year"] == year].reset_index()

In [64]:
data_2016 = get_year_of_data(all_datas, 2016)
data_2017 = get_year_of_data(all_datas, 2017)
data_2020 = get_year_of_data(all_datas, 2020)
data_2021 = get_year_of_data(all_datas, 2021)

In [65]:
filter = 15
hhc_2016_new = HalfhourClusterer(data_2016, filter)
hhc_2017_new = HalfhourClusterer(data_2017, filter)
hhc_2020_new = HalfhourClusterer(data_2020, filter)
hhc_2021_new = HalfhourClusterer(data_2021, filter)

In [67]:
model_2016_new = hhc_2016_new.train(6, True, False)
model_2017_new = hhc_2017_new.train(6, True, False)
model_2020_new = hhc_2020_new.train(6, True, False)
model_2021_new = hhc_2021_new.train(6, True, False)

In [68]:
preds_2016_new_scaled = hhc_2016_new.predict(model_2016_new, True, False)
preds_2017_new_scaled = hhc_2017_new.predict(model_2017_new, True, False)
preds_2020_new_scaled = hhc_2020_new.predict(model_2020_new, True, False)
preds_2021_new_scaled = hhc_2021_new.predict(model_2021_new, True, False)

In [69]:
%matplotlib qt

visualise_model(model_2016_new, preds_2016_new_scaled, hhc_2016_new.get_dataset(True, False), True, False)
visualise_model(model_2017_new, preds_2017_new_scaled, hhc_2017_new.get_dataset(True, False), True, False, "g")
visualise_model(model_2020_new, preds_2020_new_scaled, hhc_2020_new.get_dataset(True, False), True, False, "b")
visualise_model(model_2021_new, preds_2021_new_scaled, hhc_2021_new.get_dataset(True, False), True, False, "y")
