# Reading station weather data



In [19]:
# disable future warning
import warnings
warnings.simplefilter('ignore')

# other imports
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from meteostat import Stations, Daily
import geopandas as gp
import pandas as pd
from shapely.geometry import Point
from datetime import date, datetime
from tqdm import tqdm

## Spacial Data
Each weather station has coordinates attached with it. To be able to merge it with the SOEP data set these coordinates need to be converted to some other format. We use the nuts format for now. 

In [20]:
# setup cache
Stations.cache_dir = "./prod/.meteostat/cache"

query = Stations()
query.region("DE")
stations = query.fetch()
stations.reset_index(inplace=True)

In [None]:
path = "prod/weather.db"
con = create_engine("sqlite:///"+path, echo=False)

In [21]:
# remove unnecessary station data
cols = [
    'id',
    'latitude',
    'longitude',
    'elevation',
    'daily_start',
    'daily_end'
]
stations = stations[cols]
stations.rename({'id':'station_id'}, axis=1, inplace=True)

In [22]:
# read shape files
shape = [
    gp.read_file(f"./data/nuts5000/5000_NUTS{i}.shp").to_crs(epsg=4326)
    for i in range(1, 4)
]

# spacial join shape files. This is possible since nuts is hirarchical
nuts:gp.GeoDataFrame
nuts = shape[2].sjoin(shape[1], how="left", lsuffix='3', rsuffix='2', predicate="within")\
               .sjoin(shape[0], how="left", rsuffix='1', predicate="within")
nuts.rename({"NUTS_CODE":"NUTS_CODE_1",	"NUTS_NAME":"NUTS_NAME_1"}, inplace=True, axis=1)
nuts.drop("NUTS_LEVEL_2 NUTS_LEVEL_3 NUTS_LEVEL index_2 index_1".split(), inplace=True, axis=1)
nuts.sort_index(inplace=True)
nuts.head()

Unnamed: 0,NUTS_CODE_3,NUTS_NAME_3,geometry,NUTS_CODE_2,NUTS_NAME_2,NUTS_CODE_1,NUTS_NAME_1
0,DE111,"Stuttgart, Stadtkreis","POLYGON ((9.13452 48.85668, 9.14122 48.86183, ...",DE11,Stuttgart,DE1,Baden-Württemberg
1,DE112,Böblingen,"POLYGON ((8.96647 48.82980, 8.99216 48.83356, ...",DE11,Stuttgart,DE1,Baden-Württemberg
2,DE113,Esslingen,"POLYGON ((9.40973 48.53721, 9.39153 48.53014, ...",DE11,Stuttgart,DE1,Baden-Württemberg
3,DE114,Göppingen,"POLYGON ((9.91934 48.63977, 9.94730 48.63369, ...",DE11,Stuttgart,DE1,Baden-Württemberg
4,DE115,Ludwigsburg,"MULTIPOLYGON (((9.30157 48.95210, 9.31516 48.9...",DE11,Stuttgart,DE1,Baden-Württemberg


In [23]:
# create GeoDataFrame from stations
x, y = stations["longitude"], stations["latitude"]
stations["geometry"] = gp.GeoSeries(map(Point, zip(x, y)))
stations.drop(["longitude", "latitude"], axis=1, inplace=True)
stations = gp.GeoDataFrame(stations)

In [None]:
# save to database
stations.to_sql('stationinfo', con)

In [24]:
# spactial join with nuts data
stations:gp.GeoDataFrame
stations = stations.sjoin(nuts, how="inner", predicate="within")
stations.drop("index_right", axis=1, inplace=True)
stations.sort_index(inplace=True)
stations.head()

Unnamed: 0,station_id,elevation,daily_start,daily_end,geometry,NUTS_CODE_3,NUTS_NAME_3,NUTS_CODE_2,NUTS_NAME_2,NUTS_CODE_1,NUTS_NAME_1
1230,10015,4.0,1952-05-01,2022-12-18,POINT (7.90000 54.18330),DEF09,Pinneberg,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein
1231,10018,16.0,2009-02-24,2022-04-25,POINT (8.35000 54.91670),DEF07,Nordfriesland,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein
1232,10020,26.0,1931-01-01,2022-12-18,POINT (8.41670 55.01670),DEF07,Nordfriesland,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein
1233,10022,7.0,1973-01-01,2022-12-18,POINT (8.95000 54.80000),DEF07,Nordfriesland,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein
1234,10026,28.0,1891-01-01,1974-06-30,POINT (9.15000 54.51670),DEF07,Nordfriesland,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein


## Weather data
Now that we have all the spacial data the only thing left to do is to get the historical weather data

In [26]:
# setup cache
Daily.cache_dir = "./prod/.meteostat/cache"
Daily.max_age = 12000000 # approx 4 months cache time 
Daily.threads = 20

# start end endtime of SOEP panel
start = datetime(1985, 1, 1)
end = datetime.combine(date.today(), datetime.min.time())

def work(station):
    s_id = station["station_id"]
    daily = Daily(s_id, start=start, end=end)
    data = daily.fetch()
    data['station_id'] = s_id
    data.to_sql('station', con, if_exists='append')

In [27]:
# get stations that are already in db
try:
    found = set(pd.read_sql_query(
        """
        SELECT "station_id" FROM stationPOINT (7.90000 54.18330)	
        """,
        con=con
    )["station_id"].tolist())
except:
    found = []

# fetch data: TODO make multistreaded
for _, station in tqdm(stations.iterrows(), total=stations.shape[0]):
    s_id = station["station_id"]
    if s_id in found:
        continue
    daily = Daily(s_id, start=start, end=end)
    data = daily.fetch()
    data['station_id'] = s_id
    data = data.join(station, how='left', on=["station_id"])
    data.to_sql('station', con, if_exists='append')

  0%|          | 0/1116 [00:00<?, ?it/s]


OperationalError: (sqlite3.OperationalError) table station has no column named station_id
[SQL: INSERT INTO station (time, tavg, tmin, tmax, prcp, snow, wdir, wspd, wpgt, pres, tsun, station_id, "1230") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: (('1985-01-01 00:00:00.000000', 1.6, 0.4, 5.9, 0.5, 0.0, 103.0, 40.7, 73.1, 1007.3, 0.0, '10015', None), ('1985-01-02 00:00:00.000000', 0.2, -0.8, 0.9, 0.0, 0.0, 35.0, 28.8, 71.3, 1012.0, 60.0, '10015', None), ('1985-01-03 00:00:00.000000', -2.7, -4.5, 0.5, 0.0, 0.0, 24.0, 34.2, 73.8, 1010.7, 60.0, '10015', None), ('1985-01-04 00:00:00.000000', -4.1, -5.2, -3.4, 0.0, 0.0, 31.0, 41.0, 78.8, 1011.0, 90.0, '10015', None), ('1985-01-05 00:00:00.000000', -5.3, -6.1, -3.3, 0.0, 0.0, 49.0, 25.9, 59.4, 1016.6, 54.0, '10015', None), ('1985-01-06 00:00:00.000000', -5.7, -7.5, -3.9, 3.5, 10.0, 131.0, 34.6, 77.8, 1009.9, 0.0, '10015', None), ('1985-01-07 00:00:00.000000', -6.7, -8.0, -5.0, 0.0, 130.0, 46.0, 32.0, 77.4, 1017.6, 366.0, '10015', None), ('1985-01-08 00:00:00.000000', -4.7, -7.7, -3.3, 4.9, 100.0, 55.0, 16.6, 58.7, 1014.3, 174.0, '10015', None)  ... displaying 10 of 13856 total bound parameter sets ...  ('2022-12-07 00:00:00.000000', 4.2, 3.9, 4.3, None, None, 107.0, 25.1, 38.9, 1022.4, None, '10015', None), ('2022-12-08 00:00:00.000000', 4.6, 4.3, 4.9, None, None, 169.0, 25.9, 38.9, 1015.8, None, '10015', None))]
(Background on this error at: https://sqlalche.me/e/14/e3q8)