In [1]:
%matplotlib inline
import pandas as pd
from scrape.graph import get_merged_summaries_with_final_actual_intensities, create_graph_images, generate_boxplot_ci, generate_boxplot_ci_error, generate_boxplot_ci_error_for_days, generate_plot_ci_lines, generate_boxplot_ci_error_per_hour

# Carbon intensity forecast tracking

How this repo operates, for national data:

1. Periodically scrape JSON data from the National Grid API
2. Convert to CSV
3. Summarise in a giant combined CSV
4. Plot graphs and calculate statistics.

Most operations are possible via the CLI: see the `python3 run.py ...` commands.

## 1. Scrape JSON data from the National Grid API

We'll use the existing repo folder `./data`. Subfolders will be created for each endpoint.

The supported endpoints are defined in `api.py`. For national data, we'll use two endpoints: `national_fw48h` and `national_pt24h`.

Run the following commands repeatedly every half-hour until you have gathered the data you want:
```
python3 run.py download --output_directory data --now --endpoint national_fw48h
python3 run.py download --output_directory data --now --endpoint national_pt24h
```

Gathering data as it is published, every half hour, is important because historical forecasts are overwritten/unavailable (hence this project).

## 2. Convert to CSV

```
python3 run.py wrangle --input_directory "data/national_fw48h" --endpoint "national_fw48h"
python3 run.py wrangle --input_directory "data/national_pt24h" --endpoint "national_pt24h"
```

This will create `.csv` files in the same directory, each with the same name as its source `.json` file (apart from the extension).

## 3. Summarise

```
python3 run.py summary --input_directory "data/national_fw48h" --output_directory "data" --endpoint "national_fw48h"
python3 run.py summary --input_directory "data/national_pt24h" --output_directory "data" --endpoint "national_pt24h"
```
This will create `summary_national_fw48h.csv` and `summary_national_pt24h.csv` files in the `data` directory which combine all the available CSVs into one file.

In [2]:
# Example: preview the combined forecast summary written by the `summary` step.
# Only the first five rows are parsed (nrows=5), so the giant combined CSV is
# not loaded in full just to display a preview.
df = pd.read_csv("./data/summary_national_fw48h.csv", nrows=5)
df

Unnamed: 0.1,Unnamed: 0,intensity.forecast,intensity.forecast.1,intensity.forecast.2,intensity.forecast.3,intensity.forecast.4,intensity.forecast.5,intensity.forecast.6,intensity.forecast.7,intensity.forecast.8,...,intensity.actual.86,intensity.actual.87,intensity.actual.88,intensity.actual.89,intensity.actual.90,intensity.actual.91,intensity.actual.92,intensity.actual.93,intensity.actual.94,intensity.actual.95
0,time_difference,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,...,43.0,43.5,44.0,44.5,45.0,45.5,46.0,46.5,47.0,47.5
1,2023-03-14T03:00Z,68.0,,,,,,,,,...,,,,,,,,,,
2,2023-03-14T03:30Z,69.0,69.0,,,,,,,,...,,,,,,,,,,
3,2023-03-14T04:00Z,76.0,72.0,72.0,,,,,,,...,,,,,,,,,,
4,2023-03-14T04:30Z,77.0,74.0,69.0,69.0,,,,,,...,,,,,,,,,,


## 4. Plots and statistics

```
python3 run.py graph --input_directory "data"
```

In [3]:
# Example: build the dataframe that the individual plotting cells take as input,
# from the "data" directory with filter="national".
summaries_merged_df = get_merged_summaries_with_final_actual_intensities(
    "data",
    filter="national",
)

In [None]:
# Line plot from the merged summaries, called with hours_of_data=24
# (presumably "ci" = carbon intensity — TODO confirm).
fig = generate_plot_ci_lines(
    summaries_merged_df,
    hours_of_data=24,
)

In [None]:
# Boxplot from the merged summaries, called with hours_of_data=24
# (presumably of carbon intensity values — TODO confirm).
fig = generate_boxplot_ci(
    summaries_merged_df,
    hours_of_data=24,
)

In [None]:
# Boxplot from the merged summaries, called with hours_of_data=24
# (presumably of forecast error — TODO confirm).
fig = generate_boxplot_ci_error(
    summaries_merged_df,
    hours_of_data=24,
)

In [None]:
# Boxplot from the merged summaries, called with days=7
# (presumably forecast error grouped by day — TODO confirm).
fig = generate_boxplot_ci_error_for_days(
    summaries_merged_df,
    days=7,
)

In [None]:
# Boxplot from the merged summaries, called with days=7
# (presumably forecast error grouped by hour — TODO confirm).
fig = generate_boxplot_ci_error_per_hour(
    summaries_merged_df,
    days=7,
)

In [None]:
# Generate the graph images from the "data" directory — presumably the same
# step as `python3 run.py graph --input_directory "data"`; TODO confirm.
create_graph_images(
    "data",
    hours_of_data=24,
    days=7,
)

Note that the confidence intervals (CIs) reported here assume the errors are approximately normally distributed, which the daily histograms show is not the case.

In [4]:
from scrape.graph import generate_combined_stats_dataframe, update_stats_history

In [5]:
# Per-day error statistics (count, mean, std, sem, 95% confidence interval) for
# the absolute and percentage forecast error; the helper is called with days=100.
stats_combined_df = generate_combined_stats_dataframe(summaries_merged_df, days=100)
stats_combined_df

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0_level_0,forecast,"error, gCO2/kWh","error, gCO2/kWh","error, gCO2/kWh","error, gCO2/kWh",percentage error,percentage error,percentage error,percentage error
Unnamed: 0_level_1,count,mean,std,sem,95% confidence interval,mean,std,sem,95% confidence interval
2023-03-14,903,-14.27,10.89,0.36,"(-14.98, -13.56)",-10.15,8.8,0.29,"(-10.72, -9.57)"
2023-03-15,3099,-4.92,22.33,0.4,"(-5.71, -4.13)",-0.76,16.88,0.3,"(-1.35, -0.16)"
2023-03-16,4405,-11.13,26.91,0.41,"(-11.93, -10.34)",-4.34,30.54,0.46,"(-5.24, -3.44)"
2023-03-17,4417,-11.55,36.93,0.56,"(-12.64, -10.46)",-0.08,29.48,0.44,"(-0.95, 0.79)"
2023-03-18,4415,31.8,24.47,0.37,"(31.07, 32.52)",22.16,21.62,0.33,"(21.52, 22.79)"
2023-03-19,3678,48.56,46.22,0.76,"(47.07, 50.06)",49.71,54.84,0.9,"(47.93, 51.48)"
2023-03-20,1921,-49.5,23.34,0.53,"(-50.54, -48.45)",-25.71,11.7,0.27,"(-26.23, -25.19)"
2023-03-21,1430,-15.41,14.57,0.39,"(-16.17, -14.66)",-14.38,13.69,0.36,"(-15.09, -13.67)"
2023-03-22,2551,0.63,14.07,0.28,"(0.08, 1.17)",4.24,22.89,0.45,"(3.36, 5.13)"
2023-03-23,3873,-0.26,18.08,0.29,"(-0.83, 0.31)",4.29,28.0,0.45,"(3.41, 5.17)"
