/
fossil_fuel_production.py
254 lines (190 loc) · 7.39 KB
/
fossil_fuel_production.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
"""Garden step for Fossil fuel production dataset (part of the OWID Energy dataset), based on a combination of the
Energy Institute Statistical Review dataset and Shift data on fossil fuel production.
"""
import numpy as np
from owid.catalog import Dataset, Table
from owid.datautils import dataframes
from etl.data_helpers.geo import add_population_to_table
from etl.helpers import PathFinder, create_dataset
# Get paths and naming conventions for the current step (used for dataset/table short names).
paths = PathFinder(__file__)
# Conversion factors.
# Terawatt-hours to kilowatt-hours (1 TWh = 1e9 kWh).
TWH_TO_KWH = 1e9
def prepare_statistical_review_data(tb_review: Table) -> Table:
    """Select and rename the fossil fuel production columns from the Statistical Review.

    Parameters
    ----------
    tb_review : Table
        Statistical Review data.

    Returns
    -------
    tb_review : Table
        Selected data from the Statistical Review.

    """
    # Flatten the index so that country and year become regular columns.
    tb_review = tb_review.reset_index()
    # Map original column names to human-readable output names.
    column_renaming = {
        "country": "country",
        "year": "year",
        "coal_production_twh": "Coal production (TWh)",
        "gas_production_twh": "Gas production (TWh)",
        "oil_production_twh": "Oil production (TWh)",
    }
    # Keep only the mapped columns, failing loudly if any is missing.
    return tb_review[list(column_renaming)].rename(columns=column_renaming, errors="raise")
def prepare_shift_data(tb_shift: Table) -> Table:
    """Select and rename the fossil fuel production columns from the Shift dataset.

    Parameters
    ----------
    tb_shift : Table
        Shift data.

    Returns
    -------
    shift_table : Table
        Selected data from Shift.

    """
    # Flatten the index so that country and year become regular columns.
    tb_shift = tb_shift.reset_index()
    # Map original column names to human-readable output names.
    column_renaming = {
        "country": "country",
        "year": "year",
        "coal": "Coal production (TWh)",
        "gas": "Gas production (TWh)",
        "oil": "Oil production (TWh)",
    }
    # Keep only the mapped columns, failing loudly if any is missing.
    return tb_shift[list(column_renaming)].rename(columns=column_renaming, errors="raise")
def combine_statistical_review_and_shift_data(tb_review: Table, tb_shift: Table) -> Table:
    """Merge the Statistical Review and Shift tables into one production table.

    Parameters
    ----------
    tb_review : Table
        Processed Statistical Review table.
    tb_shift : Table
        Processed Shift table.

    Returns
    -------
    combined : Table
        Combined data.

    """
    key_columns = ["country", "year"]
    # Sanity check: neither source may contain repeated country-year entries.
    assert tb_review[tb_review.duplicated(subset=["country", "year"])].empty, "Duplicated rows in Statistical Review."
    assert tb_shift[tb_shift.duplicated(subset=["country", "year"])].empty, "Duplicated rows in Shift data."
    # Shift reaches further back in time, while the Statistical Review is more up-to-date.
    # Where the two overlap, Statistical Review values take precedence.
    combined = dataframes.combine_two_overlapping_dataframes(df1=tb_review, df2=tb_shift, index_columns=key_columns)
    # Name the combined table after the current step.
    combined.metadata.short_name = paths.short_name
    # Drop rows where every data column (everything except the keys) is missing.
    data_columns = combined.drop(columns=["country", "year"]).columns
    combined = combined.dropna(subset=data_columns, how="all")
    # Order rows by country and year.
    combined = combined.sort_values(key_columns).reset_index(drop=True)

    return combined
def add_annual_change(tb: Table) -> Table:
    """Append year-on-year change columns (in % and in TWh) for each fuel.

    Parameters
    ----------
    tb : Table
        Combined Statistical Review and Shift data.

    Returns
    -------
    combined : Table
        Combined data after adding annual change variables.

    """
    result = tb.copy()

    # Ensure chronological order within each country before differencing.
    result = result.sort_values(["country", "year"]).reset_index(drop=True)
    for fuel in ("Coal", "Oil", "Gas"):
        # Select the production series of each country for this fuel.
        production_by_country = result.groupby("country")[f"{fuel} production (TWh)"]
        # Relative change with respect to the previous year, in percent.
        result[f"Annual change in {fuel.lower()} production (%)"] = production_by_country.pct_change() * 100
        # Absolute change with respect to the previous year, in TWh.
        result[f"Annual change in {fuel.lower()} production (TWh)"] = production_by_country.diff()

    return result
def add_per_capita_variables(tb: Table, ds_population: Dataset) -> Table:
    """Append per-capita production columns (in kWh per person) for each fuel.

    Parameters
    ----------
    tb : Table
        Combined Statistical Review and Shift data.
    ds_population : Dataset
        Population dataset.

    Returns
    -------
    combined : Table
        Combined data after adding per-capita variables.

    """
    tb = tb.copy()

    # Regions defined by the Energy Institute or by Shift carry a suffix in their name;
    # no population data is expected for them.
    expected_countries_without_population = [
        country for country in tb["country"].unique() if (("(EI)" in country) or ("(Shift)" in country))
    ]
    # Attach a population column, interpolating where years are missing.
    combined = add_population_to_table(
        tb=tb,
        ds_population=ds_population,
        warn_on_missing_countries=True,
        interpolate_missing_population=True,
        expected_countries_without_population=expected_countries_without_population,
    )
    # Divide production by population, converting TWh to kWh so the result is per person.
    for fuel in ("Coal", "Oil", "Gas"):
        combined[f"{fuel} production per capita (kWh)"] = (
            combined[f"{fuel} production (TWh)"] / combined["population"] * TWH_TO_KWH
        )
    # The population column is only auxiliary; remove it from the output.
    combined = combined.drop(errors="raise", columns=["population"])

    return combined
def remove_spurious_values(tb: Table) -> Table:
    """Replace infinities by nan and drop rows that carry no data at all.

    Infinities appear when the annual change is computed against a year whose value
    was zero or missing.

    Parameters
    ----------
    tb : Table
        Data that may contain infinity values.

    Returns
    -------
    tb : Table
        Corrected data.

    """
    # Turn every positive or negative infinity into nan.
    tb = tb.replace([np.inf, -np.inf], np.nan)
    # After the replacement, drop rows whose data columns (everything except the keys) are all missing.
    data_columns = tb.drop(columns=["country", "year"]).columns
    tb = tb.dropna(subset=data_columns, how="all").reset_index(drop=True)

    return tb
def run(dest_dir: str) -> None:
    """Build the fossil fuel production garden dataset and save it to `dest_dir`."""
    #
    # Load data.
    #
    # Statistical Review dataset and its main table.
    ds_review = paths.load_dataset("statistical_review_of_world_energy")
    tb_review = ds_review["statistical_review_of_world_energy"]

    # Shift dataset and its main table.
    ds_shift = paths.load_dataset("energy_production_from_fossil_fuels")
    tb_shift = ds_shift["energy_production_from_fossil_fuels"]

    # Population dataset (needed for per-capita variables).
    ds_population = paths.load_dataset("population")

    #
    # Process data.
    #
    # Select and rename the relevant columns of each source.
    tb_review = prepare_statistical_review_data(tb_review=tb_review)
    tb_shift = prepare_shift_data(tb_shift=tb_shift)

    # Merge both sources, prioritizing the Statistical Review on overlapping rows.
    tb = combine_statistical_review_and_shift_data(tb_review=tb_review, tb_shift=tb_shift)

    # Derive annual change and per-capita variables.
    tb = add_annual_change(tb=tb)
    tb = add_per_capita_variables(tb=tb, ds_population=ds_population)

    # Clean up spurious infinities and all-nan rows.
    tb = remove_spurious_values(tb=tb)

    # Index by country-year (verifying uniqueness) and sort.
    tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index()

    #
    # Save outputs.
    #
    ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True)
    ds_garden.save()