In [18]:
%load_ext autoreload
%autoreload 2
from DataPaths import Paths
from DatasetReader import DatasetReader
from HalfhourClusterer import HalfhourClusterer
from ClusterVisualisation import visualise_model
from ClusterVisualisation import visualise_models

import pickle
import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Locate the per-year pickles under Paths.mean_pickles_v2 and keep their full paths.
dr = DatasetReader()
mean_pickle_listing = dr.read_directories(Paths.mean_pickles_v2)
year_data_paths = mean_pickle_listing.full_file_paths

In [26]:
# Run configuration for this notebook.
retrain = True  # True: fit the models again; False: load previously pickled models instead
save_models = True  # True: pickle the fitted models to disk in the save cells
model_prefix = "pickled_halfhour_cluster"  # filename prefix used when pickling the models

# Display the input pickle paths found by the DatasetReader (one file per year).
year_data_paths

['YOUR_PATH\\data\\mean_pickles_v2\\means_2016_from_1st_monday_february_3_weeks.pkl',
 'YOUR_PATH\\data\\mean_pickles_v2\\means_2017_from_1st_monday_february_3_weeks.pkl',
 'YOUR_PATH\\data\\mean_pickles_v2\\means_2020_from_1st_monday_february_3_weeks.pkl',
 'YOUR_PATH\\data\\mean_pickles_v2\\means_2021_from_1st_monday_february_3_weeks.pkl']

In [23]:
# Value handed to every HalfhourClusterer as its second argument.
# NOTE(review): the name `filter` shadows the Python builtin for the rest of the
# notebook; its exact meaning is defined by HalfhourClusterer - TODO confirm.
filter = 15

# One clusterer per yearly pickle, in the order the paths are listed in year_data_paths.
hhc_2016, hhc_2017, hhc_2020, hhc_2021 = [
  HalfhourClusterer(year_data_paths[position], filter)
  for position in range(4)
]

In [24]:
# Obtain one model per year: either fit it now or load a previously saved pickle.
if retrain:
  # Arguments are passed positionally as in the rest of the notebook; the second
  # one presumably toggles scaling (the `False` variant is called "unscaled"
  # further down) - TODO confirm against HalfhourClusterer.train.
  model_2016 = hhc_2016.train(6, True, False)
  model_2017 = hhc_2017.train(6, True, False)
  model_2020 = hhc_2020.train(6, True, False)
  model_2021 = hhc_2021.train(6, True, False)
else:
  # Load the files written by the save cell, which names them
  # f"{model_prefix}_{year}.pkl". The previous hard-coded names
  # ("halhour_cluster_<year>.pkl") did not match what this notebook saves.
  # NOTE: unpickling executes arbitrary code - only load model files you trust.
  model_2016 = pd.read_pickle(f"{model_prefix}_2016.pkl")
  model_2017 = pd.read_pickle(f"{model_prefix}_2017.pkl")
  model_2020 = pd.read_pickle(f"{model_prefix}_2020.pkl")
  model_2021 = pd.read_pickle(f"{model_prefix}_2021.pkl")

In [25]:
# Run each year's model through its own clusterer's predict(), using the same
# positional flags (True, False) that were used for training.
preds_2016, preds_2017, preds_2020, preds_2021 = [
  clusterer.predict(year_model, True, False)
  for clusterer, year_model in (
    (hhc_2016, model_2016),
    (hhc_2017, model_2017),
    (hhc_2020, model_2020),
    (hhc_2021, model_2021),
  )
]

In [27]:
%matplotlib qt

# Per-year single-model visualisations, currently disabled in favour of the
# combined call at the bottom of this cell.
# visualise_model(model_2016, preds_2016, hhc_2016.get_dataset(True, False), True, False)
# visualise_model(model_2017, preds_2017, hhc_2017.get_dataset(True, False), True, False, "g")
# visualise_model(model_2020, preds_2020, hhc_2020.get_dataset(True, False), True, False, "b")
# visualise_model(model_2021, preds_2021, hhc_2021.get_dataset(True, False), True, False, "y")

# Models and colours are paired by position: 2016 -> "r", 2017 -> "g", 2020 -> "b", 2021 -> "y".
models = [model_2016, model_2017, model_2020, model_2021]
colors = ["r", "g", "b", "y"]
# Visualise all four yearly models with one call.
visualise_models(models, colors)

In [28]:
# Persist the models from `models`, one pickle per year, named f"{model_prefix}_{year}.pkl".
if save_models:
  years = [2016, 2017, 2020, 2021]  # same order as the entries of `models`
  for year, model in zip(years, models):
    filename = f"{model_prefix}_{year}.pkl"
    # The context manager closes (and flushes) the file even if dump() raises;
    # the previous bare open() call left the handle open.
    with open(filename, 'wb') as model_file:
      pickle.dump(model, model_file)
    # To read a model back:
    #   with open("pickled_halfhour_cluster_2016.pkl", 'rb') as f: model = pickle.load(f)


In [31]:
# Fit a second model per year with the second train() argument set to False
# (the "unscaled" variant, going by the variable names - TODO confirm the flag's meaning).
unscaled_models = [
  clusterer.train(6, False, False)
  for clusterer in (hhc_2016, hhc_2017, hhc_2020, hhc_2021)
]

# Keep the individual per-year names available as well.
model_2016_unscaled, model_2017_unscaled, model_2020_unscaled, model_2021_unscaled = unscaled_models

# Persist the unscaled models, one pickle per year.
if save_models:
  years = [2016, 2017, 2020, 2021]  # same order as the entries of `unscaled_models`
  for year, model in zip(years, unscaled_models):
    # NOTE(review): "unsclaed" is a typo for "unscaled". It is kept as-is so that
    # files written by earlier runs (and any code reading them) keep working;
    # rename it together with every reader if you want to fix it.
    filename = f"{model_prefix}_unsclaed_{year}.pkl"
    # The context manager closes (and flushes) the file even if dump() raises;
    # the previous bare open() call left the handle open.
    with open(filename, 'wb') as model_file:
      pickle.dump(model, model_file)
    # To read a model back:
    #   with open("pickled_halfhour_cluster_unsclaed_2016.pkl", 'rb') as f: model = pickle.load(f)

In [32]:
# Visualise the four unscaled models together, reusing the per-year `colors` list.
visualise_models(unscaled_models, colors)

In [121]:
from tslearn.metrics import dtw

In [33]:
# Keep only the TMS points that are present in all of the yearly datasets

In [34]:
def filter_only_lams_in_all_datasets(data_paths):
  """Combine the pickled datasets and keep only LAMs present in every one of them.

  Each path is read with `pd.read_pickle` and the frames are stacked row-wise.
  Rows are grouped by ("lamId", "direction"); a group is kept only when it
  covers as many distinct "year" values as there are datasets.

  Counting distinct years (instead of rows) means a pair that is duplicated in
  one year but missing from another is no longer kept by mistake.

  Args:
    data_paths: sequence of pickle paths; each frame needs the columns
      "lamId", "direction" and "year". Assumes every file holds a different
      year - TODO confirm.

  Returns:
    DataFrame with the rows of all datasets that belong to the kept groups.
    The original per-file index values are preserved (not reset).

  Raises:
    ValueError: from `pd.concat` when `data_paths` is empty.
  """
  # NOTE: unpickling executes arbitrary code - only pass paths to trusted files.
  frames = [pd.read_pickle(path) for path in data_paths]

  all_df = pd.concat(frames, axis=0)
  grouped = all_df.groupby(["lamId", "direction"])
  expected_years = len(data_paths)
  filtered = grouped.filter(lambda group: group["year"].nunique() == expected_years)

  return filtered

In [35]:
# All yearly datasets combined, restricted by filter_only_lams_in_all_datasets
# to the (lamId, direction) pairs it keeps.
all_datas = filter_only_lams_in_all_datasets(year_data_paths)

In [36]:
def get_year_of_data(full_data, year):
  """Return the rows of `full_data` whose "year" column equals `year`.

  The result is re-indexed from 0; because `reset_index()` is called without
  `drop=True`, the previous index values are kept as an extra column.
  """
  is_requested_year = full_data["year"] == year
  rows_for_year = full_data[is_requested_year]
  return rows_for_year.reset_index()

In [37]:
# Split the combined, filtered frame back into one frame per year.
data_2016, data_2017, data_2020, data_2021 = [
  get_year_of_data(all_datas, wanted_year)
  for wanted_year in (2016, 2017, 2020, 2021)
]

In [38]:
# Same second constructor argument as for the first set of clusterers.
# NOTE(review): the name `filter` shadows the Python builtin.
filter = 15

# Clusterers built from the filtered per-year frames; here HalfhourClusterer
# receives a DataFrame where the earlier cell passed a file path.
hhc_2016_new, hhc_2017_new, hhc_2020_new, hhc_2021_new = [
  HalfhourClusterer(year_frame, filter)
  for year_frame in (data_2016, data_2017, data_2020, data_2021)
]

In [40]:
# Fit one model per year on the filtered data, with the same positional
# arguments (6, True, False) as the first training cell.
model_2016_new, model_2017_new, model_2020_new, model_2021_new = [
  clusterer.train(6, True, False)
  for clusterer in (hhc_2016_new, hhc_2017_new, hhc_2020_new, hhc_2021_new)
]

In [41]:
# Run each filtered-data model through its own clusterer's predict(), using
# the same positional flags (True, False) as for training.
preds_2016_new_scaled, preds_2017_new_scaled, preds_2020_new_scaled, preds_2021_new_scaled = [
  clusterer.predict(year_model, True, False)
  for clusterer, year_model in (
    (hhc_2016_new, model_2016_new),
    (hhc_2017_new, model_2017_new),
    (hhc_2020_new, model_2020_new),
    (hhc_2021_new, model_2021_new),
  )
]

In [43]:
# Models fitted on the filtered data, in the same year order as `colors`.
models_new = [model_2016_new, model_2017_new, model_2020_new, model_2021_new]

# Visualise the four filtered-data models together.
visualise_models(models_new, colors)
