/
energy_production_from_fossil_fuels.py
148 lines (119 loc) · 4.64 KB
/
energy_production_from_fossil_fuels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""Garden step for Shift data on energy production from fossil fuels.
"""
import owid.catalog.processing as pr
from owid.catalog import Table
from structlog import get_logger
from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset
# Initialize log.
log = get_logger()
# Get paths and naming conventions for current step.
paths = PathFinder(__file__)
def split_ussr_and_russia(tb: Table) -> Table:
"""Split data for USSR & Russia into two separate entities (given that Shift treats them as the same entity).
Parameters
----------
tb : Table
Shift data after harmonizing country names.
Returns
-------
tb: Table
Shift data after separating data for USSR and Russia as separate entities.
"""
tb = tb.copy()
# Name that The Shift Data Portal uses for Russia and USSR.
shift_ussr_russia_name = "Russian Federation and USSR (Shift)"
# The relevant part of the data is originally from EIA, who have the first data point for Russia in 1992.
# Therefore we use this year to split USSR and Russia.
russia_start_year = 1992
# Filter to select rows of USSR & Russia data.
ussr_russia_filter = tb["country"] == shift_ussr_russia_name
ussr_data = (
tb[ussr_russia_filter & (tb["year"] < russia_start_year)]
.replace({shift_ussr_russia_name: "USSR"})
.reset_index(drop=True)
)
russia_data = (
tb[ussr_russia_filter & (tb["year"] >= russia_start_year)]
.replace({shift_ussr_russia_name: "Russia"})
.reset_index(drop=True)
)
# Remove rows where Russia and USSR are combined.
tb = tb[~ussr_russia_filter].reset_index(drop=True)
# Combine original data (without USSR and Russia as one entity) with USSR and Russia as separate entities.
tb = (
pr.concat([tb, ussr_data, russia_data], ignore_index=True)
.sort_values(["country", "year"])
.reset_index(drop=True)
)
return tb
def correct_historical_regions(data: Table) -> Table:
"""Correct some issues in Shift data involving historical regions.
Parameters
----------
data : Table
Shift data after harmonization of country names.
Returns
-------
data : Table
Shift data after doing some corrections related to historical regions.
"""
data = data.copy()
# For coal and oil, Czechoslovakia's data become Czechia and Slovakia in 1993.
# However, for gas, Czechia appear at an earlier date.
# We correct those rows to be part of Czechoslovakia.
data_to_add = pr.merge(
data[(data["year"] < 1980) & (data["country"] == "Czechoslovakia")]
.reset_index(drop=True)
.drop(columns=["gas"]),
data[(data["year"] < 1980) & (data["country"] == "Czechia")].reset_index(drop=True)[["year", "gas"]],
how="left",
on="year",
)
select_rows_to_correct = (data["country"].isin(["Czechia", "Czechoslovakia"])) & (data["year"] < 1980)
data = (
pr.concat([data[~select_rows_to_correct], data_to_add], ignore_index=True)
.sort_values(["country", "year"])
.reset_index(drop=True)
)
return data
def run(dest_dir: str) -> None:
#
# Load data.
#
# Load meadow dataset and read its main table.
ds_meadow = paths.load_dataset("energy_production_from_fossil_fuels")
tb_meadow = ds_meadow["energy_production_from_fossil_fuels"].reset_index()
# Load regions dataset.
ds_regions = paths.load_dataset("regions")
# Load income groups dataset.
ds_income_groups = paths.load_dataset("income_groups")
#
# Process data.
#
# Harmonize country names.
tb = geo.harmonize_countries(
df=tb_meadow, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
)
# Remove rows that only have nans.
tb = tb.dropna(subset=["coal", "oil", "gas"], how="all").reset_index(drop=True)
# Treat USSR and Russia as separate entities.
tb = split_ussr_and_russia(tb=tb)
# Correct gas data where Czechia and Czechoslovakia overlap.
tb = correct_historical_regions(data=tb)
# Create aggregate regions.
tb = geo.add_regions_to_table(
tb,
ds_regions=ds_regions,
ds_income_groups=ds_income_groups,
min_num_values_per_year=1,
ignore_overlaps_of_zeros=True,
)
# Prepare output data.
tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index()
#
# Save outputs.
#
# Create a new garden dataset (with the same metadata as the meadow version).
ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True)
ds_garden.save()