In [None]:
import pandas as pd 
from prophet import Prophet
from datetime import datetime

In [None]:
# Load the processed water level data from the lakehouse copy of the training view.
training_csv = "/lakehouse/default/" + "Files/dim_fact_copies/vw_ml_training_flood_short.csv"
df = pd.read_csv(training_csv)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Parse the date column, keep only rows dated strictly after the cutoff
# (the later data is more complete), then rename to the ds / y column names
# that the Prophet fitting step selects further down.
cutoff = datetime(2024, 1, 30)

df = df.assign(date=pd.to_datetime(df['date']))
df = df[df['date'] > cutoff]

# Returns a new frame instead of renaming a filtered slice in place.
df = df.rename(columns={'date': 'ds', 'water_level_high': 'y'})

In [None]:
# Drop stations whose readings are only ever 0: fitting a model on them costs
# time for no benefit.

is_zero_reading = df['y'] == 0

# Keys seen with at least one zero reading / at least one reading that is not zero.
key_zero = df.loc[is_zero_reading, 'flood_area_key'].unique()
key_nonzero = df.loc[~is_zero_reading, 'flood_area_key'].unique()

# Keys that show up with a zero reading but never with anything else.
key_only_zero = set(key_zero).difference(key_nonzero)

keep_row = ~df['flood_area_key'].isin(key_only_zero)
df_final = df.loc[keep_row]

In [None]:
# read in data to filter by area
flood_area = pd.read_csv("/lakehouse/default/" + "Files/dim_fact_copies/dim_flood_area.csv")

# calculating risk just for Yorkshire for computation speed reasons
# na=False: a row with a missing county would otherwise put NaN in the mask,
#   and pandas raises ValueError when indexing with a mask that contains NaN.
# regex=False: 'Yorkshire' is a literal substring, not a pattern.
filt = flood_area['county'].str.contains('Yorkshire', na=False, regex=False)

# distinct flood area keys whose county text mentions Yorkshire
subsection = flood_area.loc[filt, 'flood_area_key'].unique()

In [None]:
# loop through measurement stations and make a 5 day forecast of water levels

FORECAST_DAYS = 5
# Prophet.fit raises ValueError when the history has fewer than 2 non-NaN rows,
# which would abort the whole loop and lose every forecast made so far.
MIN_OBSERVATIONS = 2

preds = []
skipped_keys = []

# restrict to the selected subsection of flood areas before grouping
stations = df_final[df_final['flood_area_key'].isin(subsection)]

for group, group_df in stations.groupby('flood_area_key'):
    history = group_df[['ds', 'y']]

    # skip stations Prophet cannot fit instead of crashing the run
    if history['y'].notna().sum() < MIN_OBSERVATIONS:
        skipped_keys.append(group)
        continue

    # a fresh model per station; each one is fitted independently
    m = Prophet()
    m.fit(history)

    # freq='D' is Prophet's default, spelled out so the horizon unit is visible;
    # the returned frame holds the history dates plus the future ones
    future = m.make_future_dataframe(periods=FORECAST_DAYS, freq='D')
    forecast = m.predict(future)

    # tag the forecast rows so stations can be told apart after concatenation
    forecast['flood_area_key'] = group
    preds.append(forecast)

if skipped_keys:
    print(f"Skipped {len(skipped_keys)} station(s) with fewer than "
          f"{MIN_OBSERVATIONS} non-null observations: {skipped_keys}")

In [None]:
# Stack the per-station forecasts into a single frame and save it to the lakehouse.
subsection_output = pd.concat(preds)

# NOTE(review): the index is written too (to_csv default), and it repeats per
# station because concat keeps each forecast's own index - confirm that
# downstream readers expect that leading column.
predictions_csv = "/lakehouse/default/" + "Files/water_level_predictions.csv"
subsection_output.to_csv(predictions_csv)