 Use a neural network to predict the change in the ecmwf-eps forecast for days 9-14 relative to the previous cycle's run (12 hours earlier)
 ### current model features
- the difference between the new ecmwf forecast for day 9 and the previous ecmwf-eps forecast for day 9[x]
- the difference between the new ecmwf-eps forecast for day 8 and the previous ecmwf-eps forecast for day 8[x]
- the difference between the new gfs-ens-bc forecast for days 9-14 and the previous ecmwf-eps forecast for days 9-14[x]
- the difference between the new cmc-ens forecast for days 9-14 and the new gfs-ens-bc forecast for days 9-14[x]

### new feature ideas
- rain data
- wind data
- ???

In [44]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
from datetime import datetime, time

In [45]:
# Directory that the glob patterns in the file-discovery cell are built from.
path = "RawData"

In [46]:
def extract_date_time(filename):
    """
    Extract the date and time fields from a forecast filename.

    The base name is split on '.'; the second field is returned as the date
    and the third as the time, both as strings.

    :param filename: file name or path whose base name is dot-separated
    :return: tuple ``(date, time)`` of the second and third fields
    :raises IndexError: if the base name has fewer than three dot-separated fields
    """
    import os

    # Split only the base name so that dots in the directory part
    # (e.g. "./RawData/...") cannot shift the field positions.
    fields = os.path.basename(filename).split('.')
    return fields[1], fields[2]

In [62]:
def get_date(df, file):
    """Combine a date read from the dataframe with the hour in the filename.

    The date is the 'Date' value of the first row whose third column equals 1;
    the hour is the third dot-separated field of the file's base name.

    :param df: dataframe with a 'Date' column holding '%Y-%m-%d' strings
    :param file: file name or path; the third dot-separated field of its base
        name is parsed as the hour
    :return: datetime object built from that date and hour (minute 0)
    :raises IndexError: if no row has 1 in the third column, or the base name
        has fewer than three dot-separated fields
    :raises ValueError: if the date or hour text cannot be parsed
    """
    import os

    # NOTE(review): the date comes from the dataframe, not from the filename,
    # so it may differ from the date field of the file name - confirm intended.
    first_match = df[df.iloc[:, 2] == 1].iloc[0]
    day = datetime.strptime(first_match['Date'], '%Y-%m-%d').date()

    # Split only the base name so that dots in the directory part
    # (e.g. "./RawData/...") cannot shift the field positions.
    hour = int(os.path.basename(file).split('.')[2])
    return datetime.combine(day, time(hour, 0))

In [78]:
def _run_key(name):
    """Return the second and third dot-separated fields of a file name as a sort key."""
    fields = name.split('.')
    return fields[1], fields[2]


# NOTE(review): the glob class [01][02] matches the fields 00, 02, 10 and 12;
# presumably only 00 and 12 runs exist on disk - confirm.

# ecmwf files, ordered by key; the first three entries are dropped.
ecmwf_files = glob.glob(path + '/ecmwf.*.[01][02].gw_hdd.csv')
ecmwf_sorted_files = sorted(ecmwf_files, key=_run_key)[3:]

# ecmwf-eps files, ordered by key; the first two entries are dropped.
ecmwf_eps_files = glob.glob(path + '/ecmwf-eps.*.[01][02].gw_hdd.csv')
ecmwf_eps_sorted_files = sorted(ecmwf_eps_files, key=_run_key)[2:]

# gfs-ens-bc files, ordered by key; the first two entries are dropped.
gfs_ens_bc_files = glob.glob(path + '/gfs-ens-bc.*.[01][02].gw_hdd.csv')
gfs_ens_bc_sorted_files = sorted(gfs_ens_bc_files, key=_run_key)[2:]

In [79]:
# (date, time) keys of the files present in each of the two ecmwf lists.
set1 = {extract_date_time(name) for name in ecmwf_sorted_files}
set2 = {extract_date_time(name) for name in ecmwf_eps_sorted_files}

In [80]:
# Keep only the files whose (date, time) key also occurs in the other list:
# each list is filtered against the key set built from the other one.
ecmwf_sorted_files = list(
    filter(lambda name: extract_date_time(name) in set2, ecmwf_sorted_files)
)
ecmwf_eps_sorted_files = list(
    filter(lambda name: extract_date_time(name) in set1, ecmwf_eps_sorted_files)
)

In [81]:
# Keys of the already-filtered ecmwf list; gfs-ens-bc files are kept only
# when their (date, time) key is one of them.
master_set = {extract_date_time(name) for name in ecmwf_sorted_files}
gfs_ens_bc_sorted_files = list(
    filter(lambda name: extract_date_time(name) in master_set, gfs_ens_bc_sorted_files)
)

In [82]:
# Number of files left in each list after filtering, one count per line.
print(
    len(ecmwf_sorted_files),
    len(ecmwf_eps_sorted_files),
    len(gfs_ens_bc_sorted_files),
    sep='\n',
)

3513
3513
3498


### step 1 get changes in ecmwf-eps compared to 12 hours ago for days 9-14

In [64]:
# Change of each ecmwf-eps file's 'Value' versus the file before it in
# ecmwf_eps_sorted_files, at row positions 8-13 (labelled diff_9..diff_14).
# NOTE(review): consecutive list entries are assumed to be consecutive runs;
# a missing run in the list would make a difference span more than one cycle.
diff_columns = ['diff_9', 'diff_10', 'diff_11', 'diff_12', 'diff_13', 'diff_14']
diff_rows = []
diff_index = []

# Each file is read once: the frame loaded for one run is reused as the
# "previous" frame of the next run instead of being read from disk again.
prev_ecmwf_eps_df = None
for eps_file in ecmwf_eps_sorted_files:
    ecmwf_eps_df = pd.read_csv(eps_file)
    if prev_ecmwf_eps_df is not None:
        diff_index.append(get_date(ecmwf_eps_df, eps_file))
        diff_rows.append([
            ecmwf_eps_df.iloc[day]['Value'] - prev_ecmwf_eps_df.iloc[day]['Value']
            for day in range(8, 14)
        ])
    prev_ecmwf_eps_df = ecmwf_eps_df

# Build the frame once: concatenating onto an (initially empty) frame inside
# the loop re-copies all earlier rows on every iteration and relies on
# pandas' deprecated handling of empty frames in concat.
ecmwf_eps_change_df = pd.DataFrame(diff_rows, columns=diff_columns, index=diff_index)


In [73]:
ecmwf_eps_change_df

Unnamed: 0,diff_9,diff_10,diff_11,diff_12,diff_13,diff_14
2018-07-11 00:00:00,-0.004,0.002,-0.001,-0.002,0.002,-0.002
2018-07-12 12:00:00,0.001,0.000,-0.001,-0.001,-0.001,-0.001
2018-07-12 00:00:00,0.001,-0.002,-0.002,0.001,-0.001,0.003
2018-07-13 12:00:00,0.000,-0.001,0.001,0.001,0.006,0.003
2018-07-13 00:00:00,-0.003,-0.001,-0.001,0.005,-0.002,-0.003
...,...,...,...,...,...,...
2023-05-15 00:00:00,-1.345,3.692,-0.092,-2.278,-0.519,-0.550
2023-05-16 12:00:00,-0.195,-0.020,-0.281,-0.132,0.050,-0.327
2023-05-16 00:00:00,3.558,-0.416,-2.051,-0.182,-1.027,0.912
2023-05-17 12:00:00,0.261,0.335,0.252,0.221,0.176,0.538


## step 2 recreate model

### step 2.1 get difference between the new ecmwf forecast for day 9 and the previous ecmwf-eps forecast for day 9

### problems:
- not all dates are in both sets of data

In [74]:
# Difference between each ecmwf file's 'Value' and that of the ecmwf-eps file
# one position earlier in its list, at row positions 8 and 9.
# NOTE(review): the ecmwf-eps change cell labels row position 8 as 'diff_9',
# while here positions 8 and 9 are labelled 'diff_8' and 'diff_9' - confirm
# which row offset is intended.
# NOTE(review): the range stops at len(ecmwf_sorted_files) - 1, so the last
# ecmwf file is never used - confirm this is intentional.
ecmwf_diff_rows = []
ecmwf_diff_index = []
for i in range(1, len(ecmwf_sorted_files)-1):
    ecmwf_df = pd.read_csv(ecmwf_sorted_files[i])
    ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i-1])  # one position behind

    ecmwf_diff_index.append(get_date(ecmwf_df, ecmwf_sorted_files[i]))
    ecmwf_diff_rows.append([
        ecmwf_df.iloc[day]['Value'] - ecmwf_eps_df.iloc[day]['Value']
        for day in range(8, 10)
    ])

# Build the frame once: concatenating onto an (initially empty) frame inside
# the loop re-copies all earlier rows on every iteration and relies on
# pandas' deprecated handling of empty frames in concat.
ecmwf_change_df = pd.DataFrame(
    ecmwf_diff_rows, columns=['diff_8', 'diff_9'], index=ecmwf_diff_index
)

### step 2.2 the difference between the new ecmwf-eps forecast for day 8 and the previous ecmwf-eps forecast for day 8 (possibly not feasible: the new ecmwf-eps forecast will not have been released yet when the prediction is made?)

### step 2.3 the difference between the new gfs-ens-bc forecast for days 9-14 and the previous ecmwf-eps forecast for days 9-14

In [None]:
# Change of each ecmwf-eps file's 'Value' versus the file before it in
# ecmwf_eps_sorted_files, at row positions 8-13 (labelled diff_9..diff_14).
# NOTE(review): this recomputes the ecmwf-eps run-to-run change; the
# gfs-ens-bc comparison named in the step 2.3 heading is not done here.
diff_columns = ['diff_9', 'diff_10', 'diff_11', 'diff_12', 'diff_13', 'diff_14']
diff_rows = []
diff_index = []

# Each file is read once: the frame loaded for one run is reused as the
# "previous" frame of the next run instead of being read from disk again.
prev_ecmwf_eps_df = None
for eps_file in ecmwf_eps_sorted_files:
    ecmwf_eps_df = pd.read_csv(eps_file)
    if prev_ecmwf_eps_df is not None:
        diff_index.append(get_date(ecmwf_eps_df, eps_file))
        diff_rows.append([
            ecmwf_eps_df.iloc[day]['Value'] - prev_ecmwf_eps_df.iloc[day]['Value']
            for day in range(8, 14)
        ])
    prev_ecmwf_eps_df = ecmwf_eps_df

# Build the frame once: concatenating onto an (initially empty) frame inside
# the loop re-copies all earlier rows on every iteration and relies on
# pandas' deprecated handling of empty frames in concat.
ecmwf_eps_change_df = pd.DataFrame(diff_rows, columns=diff_columns, index=diff_index)

In [None]:
# Empty frame with the columns diff_9 .. diff_14 for the gfs-ens-bc differences.
gfs_ens_bc_change_df = pd.DataFrame(
    columns=[f'diff_{day}' for day in range(9, 15)]
)
