-
Notifications
You must be signed in to change notification settings - Fork 12
/
preprocess.py
137 lines (94 loc) · 4.62 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import scripts.read as read
from scripts.misc import upsample_df
def map_population(input_path, countries, interim_path, plot=True):
    """Map population geodata onto the weather grid, one country at a time.

    For each country the population per weather grid cell is read from a
    pickle cached in ``interim_path`` if one exists. Otherwise it is computed
    by a spatial join of the population geodata with square cells built
    around the weather grid points, and the result is written to the cache.

    Args:
        input_path: Location passed on to ``read.population`` and
            ``read.wind``.
        countries: Sequence of country codes. 'GB' and 'GR' are matched
            against 'UK' and 'EL' in the ``CNTR_CODE`` column of the
            population data. May be empty.
        interim_path: Directory holding the per-country cache files named
            ``population_<country>``.
        plot: If true, plot the result of the first country for visual
            inspection. Skipped when ``countries`` is empty.

    Returns:
        A dict mapping each country code to a pandas Series with the summed
        ``TOT_P`` per weather grid point (index entries are coordinate
        tuples, presumably (latitude, longitude) -- confirm against
        ``read.wind``).
    """
    # Country codes that are spelled differently in the population data
    cntr_code_aliases = {'GB': 'UK', 'GR': 'EL'}

    # Both inputs are expensive to read, so they are loaded lazily and only
    # once, the first time a country is missing from the cache.
    population = None
    weather_grid = None
    mapped_population = {}

    for country in countries:

        file = os.path.join(interim_path, 'population_{}'.format(country))

        if os.path.isfile(file):

            s = pd.read_pickle(file)
            print('{} already exists and is read from disk.'.format(file))

        else:

            if population is None:

                population = read.population(input_path)
                weather_data = read.wind(input_path)  # For the weather grid

                # Make GeoDataFrame from the weather data coordinates
                weather_grid = gpd.GeoDataFrame(index=weather_data.columns)
                weather_grid['geometry'] = weather_grid.index.map(lambda i: Point(reversed(i)))

                # Set coordinate reference system to 'latitude/longitude'
                weather_grid.crs = {'init': 'epsg:4326'}

                # Make polygons around the weather points
                # (cap_style=3 yields square buffers, .75 / 2 is half the cell width)
                weather_grid['geometry'] = weather_grid.geometry.apply(
                    lambda point: point.buffer(.75 / 2, cap_style=3)
                )

                # Make list from MultiIndex (this is necessary for the spatial join)
                weather_grid.index = weather_grid.index.tolist()

            # For Luxembourg, a single weather grid point is manually added for lack of population geodata
            if country == 'LU':
                s = pd.Series({(49.5, 6): 1})

            else:

                # Filter population data by country to cut processing time
                cntr_code = cntr_code_aliases.get(country, country)
                gdf = population[population['CNTR_CODE'] == cntr_code].copy()

                # Align coordinate reference systems
                gdf = gdf.to_crs({'init': 'epsg:4326'})

                # Spatial join
                gdf = gpd.sjoin(gdf, weather_grid, how="left", op='within')

                # Sum up population
                s = gdf.groupby('index_right')['TOT_P'].sum()

            # Write results to interim path
            s.to_pickle(file)

        mapped_population[country] = s

    # Without any country there is nothing to plot; indexing countries[0]
    # would raise an IndexError. len() is used instead of truthiness so that
    # array-like country selections are handled as well.
    if plot and len(countries) > 0:
        first_country = countries[0]
        print('Plot of the re-mapped population data of {} (first selected country) '
              'for visual inspection:'.format(first_country))
        first_population = mapped_population[first_country]
        gdf = gpd.GeoDataFrame(
            first_population,
            columns=['TOT_P'],
            geometry=[Point(reversed(i)) for i in first_population.index]
        )
        gdf.plot(column='TOT_P')

    return mapped_population
def wind(input_path, mapped_population, plot=True):
    """Return the time-averaged wind data for the grid points of each country.

    Args:
        input_path: Location passed on to ``read.wind``.
        mapped_population: Dict of country code -> Series whose index selects
            the grid points belonging to that country.
        plot: If true, plot the averages for visual inspection.

    Returns:
        A pandas Series of wind averages with the index levels
        'country', 'latitude' and 'longitude'.
    """
    wind_data = read.wind(input_path)

    # Temporal average
    averages = wind_data.mean(axis=0)

    if plot:
        print('Plot of the wind averages for visual inspection:')
        grid_points = [Point(reversed(coordinates)) for coordinates in averages.index]
        gdf = gpd.GeoDataFrame(averages, columns=['wind'], geometry=grid_points)
        gdf.plot(column='wind')

    # Wind data is filtered by country
    averages_by_country = [
        averages[population.index] for population in mapped_population.values()
    ]
    combined = pd.concat(
        averages_by_country,
        keys=mapped_population.keys(),
        names=['country', 'latitude', 'longitude'],
        axis=0
    )
    return combined.apply(pd.to_numeric, downcast='float')
def temperature(input_path, year_start, year_end, mapped_population, test_mode):
    """Read air and soil temperature data and arrange it by country.

    Args:
        input_path: Location passed on to the temperature reader.
        year_start: First year to read (inclusive).
        year_end: Last year to read (inclusive).
        mapped_population: Dict of country code -> Series whose index selects
            the grid points (columns) belonging to that country.
        test_mode: If true, ``read.dummy_temperature`` is used instead of
            ``read.temperature``.

    Returns:
        A pandas DataFrame whose columns carry the levels
        'parameter', 'country', 'latitude' and 'longitude'.
    """
    parameters = {
        'air': 't2m',
        'soil': 'stl1'
    }

    # TODO remove dummy usage
    read_year = read.dummy_temperature if test_mode else read.temperature

    frames_by_parameter = []
    for variable in parameters.values():

        # The data is read one year at a time and stacked along the time axis
        yearly_frames = []
        for year in range(year_start, year_end + 1):
            frame = read_year(input_path, year, year, variable)

            # Halve the memory footprint of the float64 columns
            float64_part = frame.select_dtypes(np.float64)
            frame[float64_part.columns] = float64_part.astype(np.float32)
            yearly_frames.append(frame)

        all_years = pd.concat(yearly_frames, axis=0)

        # Temperature data is filtered by country
        country_frames = [
            all_years[population.index] for population in mapped_population.values()
        ]
        by_country = pd.concat(
            country_frames,
            keys=mapped_population.keys(),
            axis=1,
            names=['country', 'latitude', 'longitude']
        )
        frames_by_parameter.append(by_country.apply(pd.to_numeric, downcast='float'))

    return pd.concat(
        frames_by_parameter,
        keys=parameters.keys(),
        names=['parameter', 'country', 'latitude', 'longitude'],
        axis=1
    )