### Libraries:

In [2]:
import pandas as pd

import sys
sys.path.append("../src")

from data.acquisition import get_weather_data, get_bank_holiday_data
from features.preprocessing import past_events, prepare_delays, distances_between_hospitals, rolling_sums_and_means, enrich_data, situation_at_all_hospitals

### Reading in the fake data for Assignments, Incidents and Patient Transfers System (PTS)

In [3]:
# Load the three fake input tables (assignments/arrivals, incidents, PTS)
# from the outputs directory, in the same order as the names on the left.
df_arrived, df_incidents, df_pts = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../outputs/arrived.parquet',
        '../outputs/incidents.parquet',
        '../outputs/pts.parquet',
    )
)

### Preparing a dataframe for past events

The past events function takes the Assignments, Incidents and PTS dataframes and merges them together to give an overall representation of the hospital at each timestamp. We output a dataframe that describes the following: 
- hospital
- time
- number of ambulances arriving
- number of specific skills missing from ambulance arrivals (e.g. missing_skill_A, missing_skill_B)
- idea of flow in and out of the hospital using admissions, discharges and transfers (PTS)
- the mean handover delays
- the number of each age band arriving in ambulances
- the number of patients arriving with each responding priority
- the number of patients being transported by the ambulances
- the number of ambulances leaving the queue

For example, at a specific time there will be a 1 in missing_skill_A if no crew member on that ambulance has that skill.

In [4]:
# Combine the assignments, incidents and PTS frames into a single
# per-hospital, per-timestamp table of past events.
df_past_events = past_events(
    df_arrived,
    df_incidents,
    df_pts,
)

# Preview the first five rows.
df_past_events.head(n=5)

Unnamed: 0,hospital,time,num_ambulance_arrivals,missing_skill_E,missing_skill_C,missing_skill_D,missing_skill_A,missing_skill_B,flow,past_delay_mins,...,priority_4,priority_5,priority_6,priority_7,priority_8,priority_9,1_patient_transported,2_patients_transported,3_patients_transported,num_ambulance_departures
0,hospital_A,1953-01-12 19:46:17,1.0,0.0,1.0,1.0,1.0,1.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,
1,hospital_A,1953-01-12 20:39:57,1.0,1.0,0.0,1.0,1.0,0.0,,148948.46,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,
2,hospital_A,1953-01-12 21:00:31,,,,,,,,,...,,,,,,,,,,1.0
3,hospital_A,1953-01-12 23:09:35,,,,,,,,,...,,,,,,,,,,1.0
4,hospital_A,1953-01-13 00:06:37,1.0,1.0,1.0,1.0,0.0,0.0,,246817.97,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,


In [5]:
# Build the handover-delay frame from the arrivals data.
delays = prepare_delays(
    df_arrived,
)

# Preview the first five rows.
delays.head(n=5)

Unnamed: 0,time_plus_3,hospital,handover_delay_mins,time_plus_10,time_plus_24
0,1953-02-06 19:30:40,hospital_B,158224.37,1953-02-06 19:30:40,1953-02-06 19:30:40
1,1953-01-28 01:50:57,hospital_C,0.0,1953-01-28 01:50:57,1953-01-28 01:50:57
2,1953-04-12 10:53:12,hospital_B,0.0,1953-04-12 10:53:12,1953-04-12 10:53:12
3,1953-09-19 04:25:49,hospital_C,0.0,1953-09-19 04:25:49,1953-09-19 04:25:49
4,1954-01-04 11:26:56,hospital_B,333720.97,1954-01-04 11:26:56,1954-01-04 11:26:56


### Preparing distances dataframe

We use a function from preprocessing.py that calculates the Euclidean distance between each pair of the 11 hospitals. These distances are used to see whether the number of ambulances arriving at nearby hospitals has an impact on the situation at the current hospital.

In [6]:
# Pairwise hospital-to-hospital distances derived from the arrivals data.
distances = distances_between_hospitals(
    df_arrived,
)

# Preview the first five rows.
distances.head(n=5)

Unnamed: 0,hospital,hospital_other,distance
0,hospital_B,hospital_B,0.0
1,hospital_B,hospital_C,402984.403645
2,hospital_B,hospital_D,493500.578019
3,hospital_B,hospital_E,398383.74702
4,hospital_B,hospital_A,433103.747848


### Preparing three dataframes representing the situation at all hospitals. 

The three dataframes constructed here are to be used as training data: one for the prediction three hours ahead, one for the prediction ten hours ahead and one for the prediction twenty-four hours ahead. They are built with the `situation_at_all_hospitals` function from preprocessing.py, which combines rolling aggregations of the past events dataframe, aligned to the timestamps of the handover delays.

In [7]:
# One training frame per prediction horizon (3, 10 and 24 hours ahead).
# The calls run in the same order as the names on the left-hand side.
df_for_three_hour_pred, df_for_ten_hour_pred, df_for_twenty_four_hour_pred = (
    situation_at_all_hospitals(delays, distances, df_past_events, hours_ahead)
    for hours_ahead in (3, 10, 24)
)

### Enriching the situational data with weather data and bank holiday data

Adding on the following information:
- maximum temperature
- minimum temperature
- rainfall
- whether there is a bank holiday or not on that date (Yes/No)

In [8]:
# Enrich each horizon's frame and rebind it to the same name.
# NOTE(review): 1950 is passed unchanged as enrich_data's second argument;
# its meaning is not visible in this notebook — confirm in preprocessing.py.
# NOTE(review): this cell overwrites its own inputs, so re-running it without
# first re-running the previous cell enriches already-enriched frames.
df_for_three_hour_pred, df_for_ten_hour_pred, df_for_twenty_four_hour_pred = (
    enrich_data(situation_frame, 1950)
    for situation_frame in (
        df_for_three_hour_pred,
        df_for_ten_hour_pred,
        df_for_twenty_four_hour_pred,
    )
)

In [9]:
# Show the first five rows of the enriched three-hour-ahead frame as an example.
df_for_three_hour_pred.head(n=5)

Unnamed: 0,delay_id,time_plus_3,hospital,handover_delay_mins,time_plus_10,time_plus_24,missing_skill_A,missing_skill_B,missing_skill_C,missing_skill_D,...,time,date,day_of_week,month,year,hour_of_day,bank_holiday,tmax,tmin,rain
0,8237,1953-01-12 19:46:17,hospital_A,0.0,1953-01-12 19:46:17,1953-01-12 19:46:17,,,,,...,1953-01-12 19:46:17,1953-01-12,Monday,1,1953,19,No,5.4,0.8,21.8
1,2175,1953-01-12 20:39:57,hospital_A,148948.46,1953-01-12 20:39:57,1953-01-12 20:39:57,,,,,...,1953-01-12 20:39:57,1953-01-12,Monday,1,1953,20,No,5.4,0.8,21.8
2,5357,1953-01-13 00:06:37,hospital_A,246817.97,1953-01-13 00:06:37,1953-01-13 00:06:37,2.0,1.0,1.0,2.0,...,1953-01-13 00:06:37,1953-01-13,Tuesday,1,1953,0,No,5.4,0.8,21.8
3,8668,1953-01-13 04:09:46,hospital_A,117349.47,1953-01-13 04:09:46,1953-01-13 04:09:46,2.0,1.0,2.0,3.0,...,1953-01-13 04:09:46,1953-01-13,Tuesday,1,1953,4,No,5.4,0.8,21.8
4,2102,1953-01-13 06:04:33,hospital_A,156187.31,1953-01-13 06:04:33,1953-01-13 06:04:33,0.0,0.0,1.0,1.0,...,1953-01-13 06:04:33,1953-01-13,Tuesday,1,1953,6,No,5.4,0.8,21.8


### Saving the dataframes as `.parquet`:

In [10]:
# Write each enriched frame to its own parquet file in the outputs directory.
for frame_to_save, parquet_path in (
    (df_for_three_hour_pred, '../outputs/data_for_three_hour_pred.parquet'),
    (df_for_ten_hour_pred, '../outputs/data_for_ten_hour_pred.parquet'),
    (df_for_twenty_four_hour_pred, '../outputs/data_for_twenty_four_hour_pred.parquet'),
):
    frame_to_save.to_parquet(parquet_path)