In [6]:
import csv
import numpy as np
import pandas as pd

In this Jupyter notebook we read the weather data, prepare it for further analysis, and convert it to a pandas DataFrame.  
Note that for some statistical values (e.g. mean, max) I only use data from 06 to 19 UTC.

In [7]:
# Daily Tmax/Tmin: load both station files, keep 2022-2023 from the first and
# 2024 from the second, turn the -999 missing-value marker into NaN, and report
# NaN rows and gaps in the daily time axis.
df1 = (
    pd.read_csv(
        'DATA/DWD/produkt_klima_tag_19470101_20231231_05705.txt',
        sep=';',
        parse_dates=['MESS_DATUM'],
    )
    .loc[
        lambda d: (d['MESS_DATUM'] >= '2022-01-01') & (d['MESS_DATUM'] <= '2023-12-31'),
        ['MESS_DATUM', ' TXK', ' TNK'],
    ]
)
df2 = (
    pd.read_csv(
        'DATA/DWD/produkt_klima_tag_20230719_20250118_05705.txt',
        sep=';',
        parse_dates=['MESS_DATUM'],
    )
    .loc[
        lambda d: (d['MESS_DATUM'] >= '2024-01-01') & (d['MESS_DATUM'] <= '2024-12-31'),
        ['MESS_DATUM', ' TXK', ' TNK'],
    ]
)

# stack both periods on a fresh 0..n-1 index and mask the missing-value marker
dft = pd.concat([df1, df2], ignore_index=True).replace(-999, np.nan)

# rows that contain at least one NaN
nan_rows = dft.isna().any(axis=1).sum()
print('number of NaNs:',nan_rows)

# consecutive daily records must be exactly one day apart
dft['date_diff'] = dft['MESS_DATUM'].diff()
gaps = dft.loc[dft['date_diff'] > pd.Timedelta(days=1)]
print('Data gaps:', gaps)

# result frame that the following cells extend
df = dft.drop(columns='date_diff').rename(
    columns={'MESS_DATUM': 'Datum', ' TXK': 'Tmax', ' TNK': 'Tmin'}
)
display(df.head())

number of NaNs: 0
Data gaps: Empty DataFrame
Columns: [MESS_DATUM,  TXK,  TNK, date_diff]
Index: []


Unnamed: 0,Datum,Tmax,Tmin
0,2022-01-01,12.7,4.8
1,2022-01-02,11.8,3.0
2,2022-01-03,11.9,7.1
3,2022-01-04,10.8,4.3
4,2022-01-05,5.1,1.4


In [8]:
# Wind: read the hourly wind velocity ('  FF'), convert the -999 missing-value
# marker to NaN, check the hourly time axis for gaps, and compute the daily mean
# and max wind velocity from the 06-19 UTC records.
df1 = pd.read_csv('DATA/DWD/produkt_f_stunde_19490101_20231231_05705.txt', sep=';')
df1['MESS_DATUM'] = pd.to_datetime(df1['MESS_DATUM'], format='%Y%m%d%H')
df1 = df1.loc[
    (df1['MESS_DATUM'] >= '2022-01-01 00:00:00') & (df1['MESS_DATUM'] <= '2023-12-31 23:00:00'),
    ['MESS_DATUM', '  FF'],
]
df2 = pd.read_csv('DATA/DWD/produkt_f_stunde_20230719_20250118_05705.txt', sep=';')
df2['MESS_DATUM'] = pd.to_datetime(df2['MESS_DATUM'], format='%Y%m%d%H')
df2 = df2.loc[
    (df2['MESS_DATUM'] >= '2024-01-01 00:00:00') & (df2['MESS_DATUM'] <= '2024-12-31 23:00:00'),
    ['MESS_DATUM', '  FF'],
]

# no inplace mutation, so re-running the cell always starts from the concat result
dft = pd.concat([df1, df2], ignore_index=True).replace(-999, np.nan)
nan_rows = dft.isna().any(axis=1).sum()  # rows with at least one NaN
print('number of NaNs:', nan_rows)

# The records are hourly, so any step larger than one hour is a gap.
# (A one-day threshold would let gaps of up to 23 missing hours pass unnoticed.)
dft['date_diff'] = dft['MESS_DATUM'].diff()
gaps = dft[dft['date_diff'] > pd.Timedelta(hours=1)]
print('Data gaps:', gaps)

# restrict to the 06-19 UTC window (both bounds inclusive)
dfw = dft[(dft['MESS_DATUM'].dt.hour >= 6) & (dft['MESS_DATUM'].dt.hour <= 19)]

# dt.normalize() keeps the key as datetime64 (midnight of each day), so it can be
# merged on 'Datum' directly without converting date objects back to datetimes
wind = (
    dfw.groupby(dfw['MESS_DATUM'].dt.normalize())
    .agg(
        wmean=('  FF', 'mean'),  # mean wind velocity
        wmax=('  FF', 'max'),    # max wind velocity
    )
    .reset_index()
    .rename(columns={'MESS_DATUM': 'Datum'})
)

# drop columns left over from an earlier run so that re-executing this cell
# does not produce wmean_x / wmean_y duplicates
df = df.drop(columns=['wmean', 'wmax'], errors='ignore').merge(wind, on='Datum', how='left')
display(df.head())

number of NaNs: 70
Data gaps: Empty DataFrame
Columns: [MESS_DATUM,   FF, date_diff]
Index: []


Unnamed: 0,Datum,Tmax,Tmin,wmean,wmax
0,2022-01-01,12.7,4.8,2.292857,3.5
1,2022-01-02,11.8,3.0,4.592857,10.1
2,2022-01-03,11.9,7.1,5.757143,8.6
3,2022-01-04,10.8,4.3,4.335714,7.2
4,2022-01-05,5.1,1.4,6.821429,9.8


In [9]:
# Rainfall: read the hourly precipitation ('  R1'), convert the -999 missing-value
# marker to NaN, check the hourly time axis for gaps, and compute the accumulated
# rainfall and the rainfall duration from the 06-19 UTC records.
df1 = pd.read_csv('DATA/DWD/produkt_rr_stunde_19950901_20231231_05705.txt', sep=';')
df1['MESS_DATUM'] = pd.to_datetime(df1['MESS_DATUM'], format='%Y%m%d%H')
df1 = df1.loc[
    (df1['MESS_DATUM'] >= '2022-01-01 00:00:00') & (df1['MESS_DATUM'] <= '2023-12-31 23:00:00'),
    ['MESS_DATUM', '  R1'],
]
df2 = pd.read_csv('DATA/DWD/produkt_rr_stunde_20230719_20250118_05705.txt', sep=';')
df2['MESS_DATUM'] = pd.to_datetime(df2['MESS_DATUM'], format='%Y%m%d%H')
df2 = df2.loc[
    (df2['MESS_DATUM'] >= '2024-01-01 00:00:00') & (df2['MESS_DATUM'] <= '2024-12-31 23:00:00'),
    ['MESS_DATUM', '  R1'],
]

# no inplace mutation, so re-running the cell always starts from the concat result
dft = pd.concat([df1, df2], ignore_index=True).replace(-999, np.nan)
nan_rows = dft.isna().any(axis=1).sum()  # rows with at least one NaN
print('number of NaNs:', nan_rows)

# The records are hourly, so any step larger than one hour is a gap.
# (A one-day threshold would let gaps of up to 23 missing hours pass unnoticed.)
dft['date_diff'] = dft['MESS_DATUM'].diff()
gaps = dft[dft['date_diff'] > pd.Timedelta(hours=1)]
print('Data gaps:', gaps)

# 06-19 UTC window (both bounds inclusive); dfr is read again by the snow cell
dfr = dft[(dft['MESS_DATUM'].dt.hour >= 6) & (dft['MESS_DATUM'].dt.hour <= 19)]

# dt.normalize() keeps the key as datetime64 (midnight of each day), so it can be
# merged on 'Datum' directly without converting date objects back to datetimes
regen = (
    dfr.groupby(dfr['MESS_DATUM'].dt.normalize())
    .agg(
        rsum=('  R1', 'sum'),                       # accumulated rainfall; NaN hours are skipped
        rdauer=('  R1', lambda x: (x > 0).sum()),   # number of hourly records with rainfall > 0
    )
    .reset_index()
    .rename(columns={'MESS_DATUM': 'Datum'})
)

# drop columns left over from an earlier run so that re-executing this cell
# does not produce rsum_x / rsum_y duplicates
df = df.drop(columns=['rsum', 'rdauer'], errors='ignore').merge(regen, on='Datum', how='left')
display(df.head())

number of NaNs: 16
Data gaps: Empty DataFrame
Columns: [MESS_DATUM,   R1, date_diff]
Index: []


Unnamed: 0,Datum,Tmax,Tmin,wmean,wmax,rsum,rdauer
0,2022-01-01,12.7,4.8,2.292857,3.5,0.0,0
1,2022-01-02,11.8,3.0,4.592857,10.1,0.0,0
2,2022-01-03,11.9,7.1,5.757143,8.6,6.1,9
3,2022-01-04,10.8,4.3,4.335714,7.2,16.6,13
4,2022-01-05,5.1,1.4,6.821429,9.8,0.0,0


In [10]:
# SNOW:
# read the hourly temperature ('TT_TU'), convert the -999 missing-value marker to
# NaN, and check the hourly time axis for gaps.
# Rainfall of an hour counts as snow if the temperature of that hour is below
# 0.5°C; the hourly snow amounts are then accumulated per day (06-19 UTC).
# Requires `dfr` (hourly 06-19 UTC rainfall with columns 'MESS_DATUM' and '  R1')
# from the rainfall cell, so that cell has to be executed first.
df1 = pd.read_csv('DATA/DWD/produkt_tu_stunde_19480101_20231231_05705.txt', sep=';')
df1['MESS_DATUM'] = pd.to_datetime(df1['MESS_DATUM'], format='%Y%m%d%H')
df1 = df1.loc[
    (df1['MESS_DATUM'] >= '2022-01-01 00:00:00') & (df1['MESS_DATUM'] <= '2023-12-31 23:00:00'),
    ['MESS_DATUM', 'TT_TU'],
]
df2 = pd.read_csv('DATA/DWD/produkt_tu_stunde_20230719_20250118_05705.txt', sep=';')
df2['MESS_DATUM'] = pd.to_datetime(df2['MESS_DATUM'], format='%Y%m%d%H')
df2 = df2.loc[
    (df2['MESS_DATUM'] >= '2024-01-01 00:00:00') & (df2['MESS_DATUM'] <= '2024-12-31 23:00:00'),
    ['MESS_DATUM', 'TT_TU'],
]

# no inplace mutation, so re-running the cell always starts from the concat result
dft = pd.concat([df1, df2], ignore_index=True).replace(-999, np.nan)
nan_rows = dft.isna().any(axis=1).sum()  # rows with at least one NaN
print('number of NaNs:', nan_rows)

# The records are hourly, so any step larger than one hour is a gap.
# (A one-day threshold would let gaps of up to 23 missing hours pass unnoticed.)
dft['date_diff'] = dft['MESS_DATUM'].diff()
gaps = dft[dft['date_diff'] > pd.Timedelta(hours=1)]
print('Data gaps:', gaps)

# restrict to the 06-19 UTC window (both bounds inclusive)
dfs = dft[(dft['MESS_DATUM'].dt.hour >= 6) & (dft['MESS_DATUM'].dt.hour <= 19)]

# pair rainfall and temperature of the same hour; only the value columns are
# merged so the helper 'date_diff' columns do not end up duplicated in dfm
dfm = pd.merge(
    dfr[['MESS_DATUM', '  R1']],
    dfs[['MESS_DATUM', 'TT_TU']],
    on='MESS_DATUM',
    how='inner',
)

# vectorised replacement of the row-wise apply: comparisons with NaN are False,
# so hours with missing rainfall or temperature contribute 0 as before
is_snow = (dfm['  R1'] > 0) & (dfm['TT_TU'] < 0.5)
dfm['schnee'] = dfm['  R1'].where(is_snow, 0.0)

# dt.normalize() keeps the key as datetime64 (midnight of each day), so it can be
# merged on 'Datum' directly without converting date objects back to datetimes
schnee = (
    dfm.groupby(dfm['MESS_DATUM'].dt.normalize())
    .agg(schnee=('schnee', 'sum'))
    .reset_index()
    .rename(columns={'MESS_DATUM': 'Datum'})
)

# drop a column left over from an earlier run so that re-executing this cell
# does not produce schnee_x / schnee_y duplicates
df = df.drop(columns=['schnee'], errors='ignore').merge(schnee, on='Datum', how='left')
display(df.head())

number of NaNs: 45
Data gaps: Empty DataFrame
Columns: [MESS_DATUM, TT_TU, date_diff]
Index: []


Unnamed: 0,Datum,Tmax,Tmin,wmean,wmax,rsum,rdauer,schnee
0,2022-01-01,12.7,4.8,2.292857,3.5,0.0,0,0.0
1,2022-01-02,11.8,3.0,4.592857,10.1,0.0,0,0.0
2,2022-01-03,11.9,7.1,5.757143,8.6,6.1,9,0.0
3,2022-01-04,10.8,4.3,4.335714,7.2,16.6,13,0.0
4,2022-01-05,5.1,1.4,6.821429,9.8,0.0,0,0.0


In [11]:
# Sunshine: read the hourly sunshine duration ('SD_SO'), convert the -999
# missing-value marker to NaN, check for data gaps, and compute the daily
# sunshine duration from the 06-19 UTC records.
df1 = pd.read_csv('DATA/DWD/produkt_sd_stunde_19510101_20231231_05705.txt', sep=';')
df1['MESS_DATUM'] = pd.to_datetime(df1['MESS_DATUM'], format='%Y%m%d%H')
df1 = df1.loc[
    (df1['MESS_DATUM'] >= '2022-01-01 00:00:00') & (df1['MESS_DATUM'] <= '2023-12-31 23:00:00'),
    ['MESS_DATUM', 'SD_SO'],
]
df2 = pd.read_csv('DATA/DWD/produkt_sd_stunde_20230719_20250118_05705.txt', sep=';')
df2['MESS_DATUM'] = pd.to_datetime(df2['MESS_DATUM'], format='%Y%m%d%H')
df2 = df2.loc[
    (df2['MESS_DATUM'] >= '2024-01-01 00:00:00') & (df2['MESS_DATUM'] <= '2024-12-31 23:00:00'),
    ['MESS_DATUM', 'SD_SO'],
]

# no inplace mutation, so re-running the cell always starts from the concat result
dft = pd.concat([df1, df2], ignore_index=True).replace(-999, np.nan)
nan_rows = dft.isna().any(axis=1).sum()  # rows with at least one NaN
print('number of NaNs:', nan_rows)

# NOTE(review): a one-day threshold only detects missing whole days. It is kept
# here because this notebook does not show whether the sunshine file contains
# night-time rows - confirm, and tighten to pd.Timedelta(hours=1) if it does.
dft['date_diff'] = dft['MESS_DATUM'].diff()
gaps = dft[dft['date_diff'] > pd.Timedelta(days=1)]
print('Data gaps:', gaps)

# restrict to the 06-19 UTC window (both bounds inclusive)
dfsd = dft[(dft['MESS_DATUM'].dt.hour >= 6) & (dft['MESS_DATUM'].dt.hour <= 19)]

# Group the *filtered* frame by its own dates. The previous version grouped the
# unfiltered dft by keys taken from dfsd and only excluded the other hours
# through implicit index alignment of the grouping key.
# dt.normalize() keeps the key as datetime64, ready to merge on 'Datum'.
sd = (
    dfsd.groupby(dfsd['MESS_DATUM'].dt.normalize())
    .agg(sd=('SD_SO', 'sum'))
    .reset_index()
    .rename(columns={'MESS_DATUM': 'Datum'})
)

# drop a column left over from an earlier run so that re-executing this cell
# does not produce sd_x / sd_y duplicates
df = df.drop(columns=['sd'], errors='ignore').merge(sd, on='Datum', how='left')
display(df.head())  # rich display, consistent with the other cells

number of NaNs: 4
Data gaps: Empty DataFrame
Columns: [MESS_DATUM, SD_SO, date_diff]
Index: []
       Datum  Tmax  Tmin     wmean  wmax  rsum  rdauer  schnee    sd
0 2022-01-01  12.7   4.8  2.292857   3.5   0.0       0     0.0   0.0
1 2022-01-02  11.8   3.0  4.592857  10.1   0.0       0     0.0  41.0
2 2022-01-03  11.9   7.1  5.757143   8.6   6.1       9     0.0   0.0
3 2022-01-04  10.8   4.3  4.335714   7.2  16.6      13     0.0   0.0
4 2022-01-05   5.1   1.4  6.821429   9.8   0.0       0     0.0  92.0


In [12]:
# give the remaining German column names their English equivalents in one pass
df = df.rename(
    columns={
        'Datum': 'date',
        'rdauer': 'rdur',
        'schnee': 'snow',
    }
)
display(df.head())

Unnamed: 0,date,Tmax,Tmin,wmean,wmax,rsum,rdur,snow,sd
0,2022-01-01,12.7,4.8,2.292857,3.5,0.0,0,0.0,0.0
1,2022-01-02,11.8,3.0,4.592857,10.1,0.0,0,0.0,41.0
2,2022-01-03,11.9,7.1,5.757143,8.6,6.1,9,0.0,0.0
3,2022-01-04,10.8,4.3,4.335714,7.2,16.6,13,0.0,0.0
4,2022-01-05,5.1,1.4,6.821429,9.8,0.0,0,0.0,92.0


Tmax = daily maximum temperature; Tmin = daily minimum temperature; wmean = mean wind velocity (6-19 UTC);  
wmax = maximum wind velocity (6-19 UTC); rsum = accumulated rainfall (6-19 UTC); rdur = rainfall duration (6-19 UTC);  
snow = accumulated snow (6-19 UTC); sd = sunshine duration (6-19 UTC)

In [13]:
# write the prepared daily weather table to a .csv file, omitting the row index
df.to_csv(path_or_buf='DATA/DWD/weather-data_05705_prep.csv', index=False)