In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
from tqdm import tqdm

import utils

In [None]:
# Compact lookup table: cluster name -> region name -> station codes in that region.
_region_stations = {
	'canadian_cluster': {
		'CAN-0': ['NEW', 'T19', 'C10', 'LET', 'T03', 'T43'],
		'CAN-1': ['LET', 'T03', 'T43', 'RED', 'C06'],
		'CAN-2': ['T43', 'RED', 'C06', 'MEA', 'T36'],
	},
	'greenland_cluster': {
		'GRL-0': ['GHB', 'SKT', 'STF', 'ATU'],
		'GRL-1': ['SKT', 'STF', 'ATU', 'GDH'],
		'GRL-2': ['STF', 'ATU', 'GDH', 'UMQ'],
		'GRL-3': ['GHB', 'FHB', 'NAQ'],
	},
	'fennoscandinavian_cluster': {
		'FSC-0': ['RVK', 'LYC', 'DON', 'JCK'],
		'FSC-1': ['HAN', 'MEK', 'OUJ', 'NUR'],
		'FSC-2': ['MAS', 'NOR', 'IVA', 'KEV', 'KIL', 'MUO', 'SOR', 'TRO', 'ABK', 'KIR'],
		'FSC-3': ['MAS', 'AND', 'KIL', 'MUO', 'SOR', 'TRO', 'ABK', 'KIR'],
		'FSC-4': ['MAS', 'SOD', 'IVA', 'KEV', 'KIL', 'MUO', 'ABK', 'KIR', 'PEL'],
		'FSC-5': ['JCK', 'DON', 'ABK', 'KIR', 'LYC'],
		'FSC-6': ['MAS', 'AND', 'KIL', 'MUO', 'JCK', 'TRO', 'ABK', 'KIR', 'PEL'],
	},
	'central_european_cluster': {
		'CEU-0': ['ZAG', 'LVV', 'BEL', 'VYH'],
		'CEU-1': ['BEL', 'HLP', 'SZC', 'KLD'],
		'CEU-2': ['THY', 'BDV', 'WIC', 'NCK', 'HRB'],
		'CEU-3': ['ROE', 'BFE', 'WNG'],
	},
	'non_cluster_regions': {
		'SVLB': ['BBG', 'LYR', 'HOR', 'NAL', 'HRN', 'HOP'],
		'JPN-0': ['KUJ', 'KNY', 'KAG'],
		'JPN-1': ['MMB', 'ASB', 'RIK', 'MSR'],
		'ALSK': ['CMO', 'FYU', 'PKR', 'GAK'],
		'HUD-0': ['PIN', 'ISL', 'C05'],
		'HUD-1': ['FCC', 'EKP', 'RAN', 'BLC'],
	},
}

# Expand the table into the nested layout used throughout the notebook:
#   clusters[<cluster>]['regions'][<region>]['stations'] -> list of station codes
# Every region gets its own dict and its own copy of the station list, so later
# cells can attach extra per-region entries without touching the table above.
clusters = {
	cluster_name: {
		'regions': {
			region_name: {'stations': list(station_codes)}
			for region_name, station_codes in region_table.items()
		}
	}
	for cluster_name, region_table in _region_stations.items()
}

# Forecast lengths and window sizes swept by the analysis (units are not stated in this notebook).
forecasts = [1, *range(5, 31, 5)]
windows = list(forecasts)


In [None]:
def getting_storm_times(df, lead=24, recovery=48, twins=False, target_var=None, map_keys=None, classification=False):

	'''
	Builds the start and end time stamp of every storm window from a list of storm dates.

	As a side effect the index of ``df`` is converted to a datetime index IN PLACE. If ``df``
	has a 'Date_UTC' column, that column becomes the index and is removed from the columns.

	The storm dates are taken from:
		* ``map_keys`` when ``twins`` is True and ``map_keys`` is given,
		* 'outputs/regular_twins_map_dates.feather' (column 'dates') when ``twins`` is True and
		  ``map_keys`` is None,
		* 'stormList.csv' (one headerless column of dates) otherwise.

	Args:
		df (pd.DataFrame): data whose 'Date_UTC' column or index holds the time stamps. Modified in place.
		lead (int): amount of time subtracted from each storm date to get the window start.
					Interpreted as minutes when ``twins`` is True and as hours otherwise.
		recovery (int): amount of time added to each storm date to get the window end.
					Interpreted as minutes when ``twins`` is True and as hours otherwise.
		twins (bool): selects the date source and the lead/recovery units (see above). When True
					each date is also rounded to the nearest minute before the window is built.
		target_var: not used by this function; kept so existing calls keep working.
		map_keys (iterable, optional): storm dates formatted as '%Y-%m-%d %H:%M:%S'. Only read when ``twins`` is True.
		classification (bool): not used by this function; kept so existing calls keep working.

	Returns:
		pd.DataFrame: one row per storm date with the columns 'stime' (window start) and 'etime' (window end).
	'''

	date_format = '%Y-%m-%d %H:%M:%S'

	# setting the datetime index
	if 'Date_UTC' in df.columns:
		df.reset_index(drop=True, inplace=True)
		df.set_index('Date_UTC', inplace=True, drop=True)
		# parse with the explicit format and keep the result (it used to be computed and thrown away)
		df.index = pd.to_datetime(df.index, format=date_format)
	else:
		print('Date_UTC not in columns. Check to make sure index is datetime not integer.')
		df.index = pd.to_datetime(df.index)

	# loading the storm list
	if twins and map_keys is None:
		storm_dates = pd.read_feather('outputs/regular_twins_map_dates.feather', columns=['dates'])['dates']
	elif twins:
		storm_dates = [pd.to_datetime(key, format=date_format) for key in map_keys]
	else:
		# read_csv names the single column 'dates', so that is the name it has to be selected by
		storm_dates = pd.read_csv('stormList.csv', header=None, names=['dates'])['dates']

	# twins windows are measured in minutes, regular storm windows in hours
	if twins:
		lead_delta, recovery_delta = pd.Timedelta(minutes=lead), pd.Timedelta(minutes=recovery)
	else:
		lead_delta, recovery_delta = pd.Timedelta(hours=lead), pd.Timedelta(hours=recovery)

	stime, etime = [], []					# window start / end time stamps, one entry per storm date

	for date in storm_dates:
		if isinstance(date, str):
			date = pd.to_datetime(date, format=date_format)
		if twins:
			date = date.round('min')		# 'min' is the supported spelling of the deprecated 'T' alias
		stime.append(date - lead_delta)
		etime.append(date + recovery_delta)

	return pd.DataFrame({'stime':stime, 'etime':etime})


def extracting_storm_only_data(df, storm_list):

	'''
	Pulls the rows of ``df`` that fall inside the storm windows and stacks them into one frame.

	A window is skipped when it starts before the first index entry of ``df`` or ends after the
	last one, or when no rows fall inside it. Windows are processed in the order given, and rows
	covered by more than one window appear once per window.

	Args:
		df (pd.DataFrame): data indexed by time stamps comparable to the window bounds. The first and
						last index entries are used as the data range, so the index is assumed to be in
						chronological order - this is not checked here.
		storm_list (pd.DataFrame): window bounds in the columns 'stime' and 'etime' (both inclusive).

	Returns:
		pd.DataFrame: the rows of every kept window concatenated along the rows, or an empty
						DataFrame when ``df`` has no rows or no window was kept.
	'''

	# an empty frame has no first/last time stamp to compare the windows against
	if len(df.index) == 0:
		return pd.DataFrame()

	data_start, data_end = df.index[0], df.index[-1]
	storms = []								# one slice per kept storm, concatenated once at the end

	for start, end in zip(storm_list['stime'], storm_list['etime']):
		if start < data_start or end > data_end:			# if the storm is outside the range of the data, skip it
			continue
		storm = df[(df.index >= start) & (df.index <= end)]

		if len(storm) != 0:
			storms.append(storm)

	if not storms:
		return pd.DataFrame()

	# a single concat avoids re-copying the accumulated frame for every storm
	return pd.concat(storms, axis=0)
	 

In [None]:
# for cluster in tqdm(clusters.keys()):
# 	for region in clusters[cluster]['regions'].keys():

# 		RP = utils.RegionPreprocessing(cluster,region, features=['dbht'], maximum=True)
# 		region_df = RP()
# 		dates = getting_storm_times(region_df, twins=True)

# 		class_df = pd.DataFrame()
# 		for forecast in forecasts:
# 			print(f'Forecast length: {forecast}')
# 			for window in windows:
				
# 				temp_df = RP.classification_column(df=region_df, param='rsd', forecast=forecast, window=window)

# 				class_df[f'forecast_{forecast}_window_{window}'] = temp_df['classification']

# 		print('Extracting storm only data...')
# 		storms = extracting_storm_only_data(class_df, dates)

# 		ratio_df = pd.DataFrame(columns=['forecast', 'window', 'percentage'], index=[0])

# 		for forecast in forecasts:
# 			print(f'Forecast length: {forecast}')
# 			for window in windows:

# 				test = storms[f'forecast_{forecast}_window_{window}']

# 				ratio = test.sum()/len(test)

# 				ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)
		

# 		ratio_df = ratio_df.dropna().reset_index(drop=True, inplace=False)
# 		clusters[cluster]['regions'][region]['ratios'] = ratio_df

  0%|          | 0/5 [00:00<?, ?it/s]

{'cluster': 'canadian_cluster', 'region': 'CAN-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'canadian_cluster', 'region': 'CAN-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station NEW....
Loading station T19....
Loading station C10....
Loading station LET....
Loading station T03....
Loading station T43....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'canadian_cluster', 'region': 'CAN-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'canadian_cluster', 'region': 'CAN-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station LET....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station T03....
Loading station T43....
Loading station RED....
Loading station C06....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'canadian_cluster', 'region': 'CAN-2', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'canadian_cluster', 'region': 'CAN-2', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station T43....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station RED....
Loading station C06....
Loading station MEA....
Loading station T36....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)
 20%|██        | 1/5 [1:12:54<4:51:39, 4374.77s/it]

Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'greenland_cluster', 'region': 'GRL-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'greenland_cluster', 'region': 'GRL-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station GHB....
Loading station SKT....
Loading station STF....
Loading station ATU....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'greenland_cluster', 'region': 'GRL-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'greenland_cluster', 'region': 'GRL-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station SKT....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station STF....
Loading station ATU....
Loading station GDH....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'greenland_cluster', 'region': 'GRL-2', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'greenland_cluster', 'region': 'GRL-2', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station STF....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station ATU....
Loading station GDH....
Loading station UMQ....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'greenland_cluster', 'region': 'GRL-3', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'greenland_cluster', 'region': 'GRL-3', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station GHB....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station FHB....
Loading station NAQ....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)
 40%|████      | 2/5 [2:45:36<4:13:38, 5072.78s/it]

Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station RVK....
Loading station LYC....
Loading station DON....
Loading station JCK....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station HAN....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station MEK....
Loading station OUJ....
Loading station NUR....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-2', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-2', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station MAS....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station NOR....
Loading station IVA....
Loading station KEV....
Loading station KIL....
Loading station MUO....
Loading station SOR....
Loading station TRO....
Loading station ABK....
Loading station KIR....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-3', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-3', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station MAS....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station AND....
Loading station KIL....
Loading station MUO....
Loading station SOR....
Loading station TRO....
Loading station ABK....
Loading station KIR....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-4', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-4', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station MAS....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station SOD....
Loading station IVA....
Loading station KEV....
Loading station KIL....
Loading station MUO....
Loading station ABK....
Loading station KIR....
Loading station PEL....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-5', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-5', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station JCK....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station DON....
Loading station ABK....
Loading station KIR....
Loading station LYC....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-6', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'fennoscandinavian_cluster', 'region': 'FSC-6', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station MAS....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station AND....
Loading station KIL....
Loading station MUO....
Loading station JCK....
Loading station TRO....
Loading station ABK....
Loading station KIR....
Loading station PEL....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)
 60%|██████    | 3/5 [5:27:47<4:00:00, 7200.03s/it]

Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'central_european_cluster', 'region': 'CEU-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'central_european_cluster', 'region': 'CEU-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station ZAG....
Loading station LVV....
Loading station BEL....
Loading station VYH....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'central_european_cluster', 'region': 'CEU-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'central_european_cluster', 'region': 'CEU-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station BEL....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station HLP....
Loading station SZC....
Loading station KLD....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'central_european_cluster', 'region': 'CEU-2', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'central_european_cluster', 'region': 'CEU-2', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station THY....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station BDV....
Loading station WIC....
Loading station NCK....
Loading station HRB....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'central_european_cluster', 'region': 'CEU-3', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'central_european_cluster', 'region': 'CEU-3', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station ROE....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station BFE....
Loading station WNG....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)
 80%|████████  | 4/5 [7:00:08<1:49:04, 6544.90s/it]

Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'non_cluster_regions', 'region': 'SVLB', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'non_cluster_regions', 'region': 'SVLB', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station BBG....
Loading station LYR....
Loading station HOR....
Loading station NAL....
Loading station HRN....
Loading station HOP....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'non_cluster_regions', 'region': 'JPN-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'non_cluster_regions', 'region': 'JPN-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station KUJ....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station KNY....
Loading station KAG....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'non_cluster_regions', 'region': 'JPN-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'non_cluster_regions', 'region': 'JPN-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station MMB....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station ASB....
Loading station RIK....
Loading station MSR....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'non_cluster_regions', 'region': 'ALSK', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'non_cluster_regions', 'region': 'ALSK', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station CMO....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station FYU....
Loading station PKR....
Loading station GAK....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'non_cluster_regions', 'region': 'HUD-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'non_cluster_regions', 'region': 'HUD-0', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station PIN....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station ISL....
Loading station C05....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
{'cluster': 'non_cluster_regions', 'region': 'HUD-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False}
{'cluster': 'non_cluster_regions', 'region': 'HUD-1', 'features': ['dbht'], 'mean': False, 'std': False, 'maximum': True, 'median': False, 'forecast': 15, 'window': 15, 'classification': False}
Forecast: 15, Window: 15, Classification: False
Loading station FCC....


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)


Loading station EKP....
Loading station RAN....
Loading station BLC....


  max_station = rsd.idxmax(axis=1)


Date_UTC not in columns. Check to make sure index is datetime not integer.
Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30
Extracting storm only data...


  ratio_df = pd.concat([ratio_df, pd.DataFrame({'forecast':forecast, 'window':window, 'percentage':ratio}, index=[0])], axis=0)
100%|██████████| 5/5 [9:18:07<00:00, 6697.46s/it]  

Forecast length: 1
Forecast length: 5
Forecast length: 10
Forecast length: 15
Forecast length: 20
Forecast length: 25
Forecast length: 30





In [None]:
# # turning the ratio df into a pivot table
# pivot = ratio_df.pivot_table(index='forecast', columns='window', values='percentage')

# fig, axes = plt.subplots(1, 1, figsize=(10, 10))
# axes.set_title('Percentage of threshold crossings')
# im = axes.imshow(pivot, cmap='viridis')
# fig.colorbar(im)
# axes.set_xticks(np.arange(len(windows)))
# axes.set_xticklabels(windows)
# axes.set_xlabel('Window')
# axes.set_yticks(np.arange(len(forecasts)))
# axes.set_yticklabels(forecasts)
# axes.set_ylabel('Forecast')
# plt.show()

In [None]:
# Persist the `clusters` dictionary to disk as a pickle.
# NOTE(review): the only statement in this notebook that attaches a 'ratios' entry to a region
# sits in a cell that is currently commented out, so on a fresh kernel this writes `clusters`
# without any ratios - confirm that is intended before overwriting an existing file.
os.makedirs('outputs', exist_ok=True)		# open(..., 'wb') does not create a missing directory
with open('outputs/clusters_with_ratios.pkl', 'wb') as f:
	pickle.dump(clusters, f)