In [2]:
import pandas as pd
import numpy as np

# Metadata Analysis

This notebook serves to analyse the provided metadata for the historical DWD weather data. First the data is read line by line, as the original data is not machine readable

In [3]:
data_list = []
f = open("Data/station_description.txt")
for line in f:
    lst = []
    contents = " ".join(line.split())
    contents = contents.split(" ")
    lst.append(contents[0]) # id
    lst.append(contents[1]) # from_date
    lst.append(contents[2]) # to_date
    lst.append(contents[3]) # height
    lst.append(contents[4]) # geoLatitude
    lst.append(contents[5]) # geoLongitude
    lst.append(" ".join(contents[6:-1])) # name
    lst.append(contents[-1]) # area
    data_list.append(lst)
f.close()
columns = data_list.pop(0)
_ = data_list.pop(0)

In [13]:
# Setting index and casting to right format
df= pd.DataFrame(data_list, columns=columns).set_index("Stations_id")
df = df.astype({"von_datum": "int32", "bis_datum": "int32", "Stationshoehe": "int32", "geoBreite": "float64", "geoLaenge": "float64"})
df = df.rename({"von_datum": "from_date", "bis_datum": "to_date", "Stationshoehe": "Height", "geoBreite": "geoLatitude", "geoLaenge": "geoLongitude", "Bundesland": "State"}, axis=1)

In [14]:
# Filtering out all stations that weren't active for the flooding
current_stations = df[df["to_date"]>20210000]
current_stations.head()

Unnamed: 0_level_0,from_date,to_date,Height,geoLatitude,geoLongitude,Stationsname,State
Stations_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
44,19690101,20210921,44,52.9336,8.237,Großenkneten,Niedersachsen
73,19590301,20210921,340,48.6159,13.0506,Aldersbach-Kriestorf,Bayern
78,19610101,20210921,65,52.4853,7.9126,Alfhausen,Niedersachsen
90,19880219,20210921,305,50.7557,9.2583,Alsfeld,Hessen
91,19781101,20210921,300,50.7446,9.345,Alsfeld-Eifa,Hessen


The affected area is roughly contained between 7 and 9 degrees longitude and 49 to 51 degrees latitude. So we will filter for this region

In [15]:
relevant_laenge = current_stations[(current_stations["geoLongitude"]>7)&(current_stations["geoLongitude"]<9)]
relevant = relevant_laenge[(relevant_laenge["geoLatitude"] < 51)& (relevant_laenge["geoLatitude"] >49)]
relevant.head()

Unnamed: 0_level_0,from_date,to_date,Height,geoLatitude,geoLongitude,Stationsname,State
Stations_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
150,19510101,20210921,215,49.7273,8.1164,Alzey,Rheinland-Pfalz
161,19801201,20210921,75,50.4237,7.4202,Andernach,Rheinland-Pfalz
330,19370101,20210921,455,49.5617,8.9673,Oberzent-Beerfelden,Hessen
377,19470101,20210921,210,49.107,7.9967,"Bergzabern, Bad",Rheinland-Pfalz
390,19861201,20210921,610,50.9837,8.3683,"Berleburg, Bad-Stünzel",Nordrhein-Westfalen


In [16]:
# Filtering for the three states
filtered = relevant[relevant["State"].isin(["Rheinland-Pfalz", "Nordrhein-Westfalen", "Saarland"])]

Next we will calculate the distance from every station to the two hardest hit cities. Afterwards we will take the mean to get the stations most representative for the two cities.

In [17]:
lat_erftstadt = 50.7948
long_erftstadt = 6.7775
filtered["distance_erftstadt"] = np.sqrt((filtered["geoLatitude"] - lat_erftstadt)*(filtered["geoLatitude"] - lat_erftstadt) + (filtered["geoLongitude"] - long_erftstadt)*(filtered["geoLongitude"] - long_erftstadt))
filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,from_date,to_date,Height,geoLatitude,geoLongitude,Stationsname,State,distance_erftstadt
Stations_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
150,19510101,20210921,215,49.7273,8.1164,Alzey,Rheinland-Pfalz,1.71237
161,19801201,20210921,75,50.4237,7.4202,Andernach,Rheinland-Pfalz,0.742145
377,19470101,20210921,210,49.107,7.9967,"Bergzabern, Bad",Rheinland-Pfalz,2.082094
390,19861201,20210921,610,50.9837,8.3683,"Berleburg, Bad-Stünzel",Nordrhein-Westfalen,1.601976
535,19470101,20210921,417,50.0372,7.3079,Blankenrath,Rheinland-Pfalz,0.924815


In [18]:
lat_schuld = 50.4467
long_schuld = 6.88944
filtered["distance_schuld"] = np.sqrt((filtered["geoLatitude"] - lat_schuld)*(filtered["geoLatitude"] - lat_schuld) + (filtered["geoLongitude"] - long_schuld)*(filtered["geoLongitude"] - long_schuld))
filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,from_date,to_date,Height,geoLatitude,geoLongitude,Stationsname,State,distance_erftstadt,distance_schuld
Stations_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
150,19510101,20210921,215,49.7273,8.1164,Alzey,Rheinland-Pfalz,1.71237,1.422311
161,19801201,20210921,75,50.4237,7.4202,Andernach,Rheinland-Pfalz,0.742145,0.531258
377,19470101,20210921,210,49.107,7.9967,"Bergzabern, Bad",Rheinland-Pfalz,2.082094,1.738051
390,19861201,20210921,610,50.9837,8.3683,"Berleburg, Bad-Stünzel",Nordrhein-Westfalen,1.601976,1.573339
535,19470101,20210921,417,50.0372,7.3079,Blankenrath,Rheinland-Pfalz,0.924815,0.58549


In [20]:
filtered["mean_distance"] = (filtered["distance_erftstadt"] + filtered["distance_schuld"])/2
filtered.sort_values("mean_distance").head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,from_date,to_date,Height,geoLatitude,geoLongitude,Stationsname,State,distance_erftstadt,distance_schuld,mean_distance
Stations_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3490,19480101,20210714,111,50.5346,7.0853,"Neuenahr, Bad-Ahrweiler",Rheinland-Pfalz,0.403045,0.21468,0.308862
603,19860801,20210921,147,50.7293,7.204,Königswinter-Heiderhof,Nordrhein-Westfalen,0.4315,0.42286,0.42718
2667,19570701,20210921,92,50.8646,7.1575,Köln-Bonn,Nordrhein-Westfalen,0.386357,0.496484,0.441421
