# Reading station weather data



In [5]:
# disable future warning
import warnings
warnings.simplefilter('ignore')

# other imports
import os
import queue
from concurrent.futures import ThreadPoolExecutor
from datetime import date, datetime
from os import listdir

import geopandas as gp
import pandas as pd
from meteostat import Stations, Daily
from shapely.geometry import Point
from tqdm import tqdm

Read the station info. To see how it was created, see `./stationdata_info.ipynb`.

In [6]:
# Load the station metadata from the GeoJSON file and preview the first rows.
stations: gp.GeoDataFrame = gp.read_file("./prod/stationinfo.geojson")
stations.head()

Unnamed: 0,id,latitude,longitude,elevation,daily_start,daily_end,NUTS_CODE_3,NUTS_NAME_3,NUTS_CODE_2,NUTS_NAME_2,NUTS_CODE_1,NUTS_NAME_1,geometry
0,10015,54.1833,7.9,4.0,1952-05-01,2023-01-04,DEF09,Pinneberg,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein,POINT (7.90000 54.18330)
1,10018,54.9167,8.35,16.0,2009-02-24,2022-04-25,DEF07,Nordfriesland,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein,POINT (8.35000 54.91670)
2,10020,55.0167,8.4167,26.0,1931-01-01,2023-01-04,DEF07,Nordfriesland,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein,POINT (8.41670 55.01670)
3,10022,54.8,8.95,7.0,1973-01-01,2023-01-04,DEF07,Nordfriesland,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein,POINT (8.95000 54.80000)
4,10026,54.5167,9.15,28.0,1891-01-01,1974-06-30,DEF07,Nordfriesland,DEF0,Schleswig-Holstein,DEF,Schleswig-Holstein,POINT (9.15000 54.51670)


## Weather data
Now that we have all the spatial data, the only thing left to do is to get the historical weather data.

In [7]:
# Meteostat cache configuration (class-level settings shared by every Daily query)
Daily.cache_dir = "./prod/.meteostat/cache"
Daily.max_age = 12_000_000  # maximum cache age in seconds (~139 days, i.e. roughly 4 months)
Daily.threads = 20

# Query window: from the start of the SOEP panel up to today at midnight
start = datetime(year=1985, month=1, day=1)
end = datetime.combine(date.today(), datetime.min.time())

def task(station: pd.Series, pbar: tqdm) -> None:
    """Download the daily weather data of one station and store it as CSV.

    The data is requested for the module-level ``start``/``end`` window,
    tagged with the station id in an extra ``id`` column and written to
    ``./prod/stationdata/<id>.csv``.

    Parameters
    ----------
    station
        One row of the ``stations`` frame (as yielded by ``iterrows()``);
        only its ``"id"`` entry is read.
    pbar
        Progress bar that is advanced by one step after the file was written.

    Raises
    ------
    Any exception raised while fetching or writing propagates to the caller
    (or is stored in the future when run through an executor); in that case
    the progress bar is not advanced and no ``<id>.csv`` is created.
    """
    s_id = station["id"]
    data = Daily(s_id, start=start, end=end).fetch()
    data['id'] = s_id

    # Write to a temporary name first and rename afterwards, so an interrupted
    # run never leaves a truncated "<id>.csv" behind that a later run would
    # mistake for a finished download.
    target = f'./prod/stationdata/{s_id}.csv'
    partial = f'{target}.part'
    data.to_csv(partial)
    os.replace(partial, target)

    pbar.update()

In [8]:
# Ids of stations that already have a CSV from a previous run. Only a trailing
# ".csv" is stripped, and files with any other extension are ignored.
found = {
    name[:-len('.csv')]
    for name in listdir('./prod/stationdata/')
    if name.endswith('.csv')
}

# Stations that still need to be downloaded. The id is compared as a string
# because the ids in `found` come from file names.
work = [
    station
    for _, station in stations.iterrows()
    if str(station['id']) not in found
]

# The executor is the inner context manager: it waits for all downloads to
# finish before the progress bar is closed.
with tqdm(total=len(work)) as pbar, ThreadPoolExecutor(max_workers=20) as executor:
    futures = {
        executor.submit(task, station, pbar): station['id']
        for station in work
    }

# All futures are done at this point. Report failed downloads instead of
# silently dropping them; the affected stations are retried on the next run
# because no CSV was written for them.
failed = {
    s_id: future.exception()
    for future, s_id in futures.items()
    if future.exception() is not None
}
if failed:
    print(f'{len(failed)} of {len(futures)} downloads failed:')
    for s_id, error in failed.items():
        print(f'  {s_id}: {error!r}')

100%|██████████| 1116/1116 [03:52<00:00,  4.80it/s]
