In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from collections import defaultdict
from functools import reduce

from geopy.distance import geodesic

In [3]:
# Column labels applied to the station description files when they are read
# (the glosses are literal translations of the German names).
column_names = [
    "Stations_id",    # station id
    "von_datum",      # from date
    "bis_datum",      # to date
    "Stationshoehe",  # station height
    "geoBreite",      # geographic latitude
    "geoLaenge",      # geographic longitude
    "Stationsname",   # station name
    "Bundesland",     # federal state
    "Abgabe",         # release status; later filtered on the value 'Frei'
]

In [4]:
# Read the four fixed-width station description files (TU, RR, P0, FF).
#
# Forward slashes are accepted on Windows and POSIX alike. The previous
# "..\data\..." literals only worked because "\d", "\M", "\T", ... happen not to
# be recognised escape sequences, which Python reports with an invalid-escape
# warning, and they could not be resolved outside Windows.
meta_verzeichnis = "../data/Metha List"

# Shared reader options: the first two lines of each file are skipped
# (presumably header and separator line - confirm against the files) and the
# columns are labelled with `column_names`.
lese_optionen = dict(encoding="iso-8859-1", skiprows=2, names=column_names)

dataTUtxt = pd.read_fwf(f"{meta_verzeichnis}/TU_Stundenwerte_Beschreibung_Stationen.txt", **lese_optionen)
dataRRtxt = pd.read_fwf(f"{meta_verzeichnis}/RR_Stundenwerte_Beschreibung_Stationen.txt", **lese_optionen)
dataP0txt = pd.read_fwf(f"{meta_verzeichnis}/P0_Stundenwerte_Beschreibung_Stationen.txt", **lese_optionen)
dataFFtxt = pd.read_fwf(f"{meta_verzeichnis}/FF_Stundenwerte_Beschreibung_Stationen.txt", **lese_optionen)

In [5]:
# Wrap each freshly read table in a DataFrame of its own; these four frames are
# the ones the following cells work on.
dataTU, dataRR, dataP0, dataFF = (
    pd.DataFrame(roh) for roh in (dataTUtxt, dataRRtxt, dataP0txt, dataFFtxt)
)

In [6]:
# Sanity check: show the column labels of the TU frame (expected to match column_names).
print(dataTU.columns)

Index(['Stations_id', 'von_datum', 'bis_datum', 'Stationshoehe', 'geoBreite',
       'geoLaenge', 'Stationsname', 'Bundesland', 'Abgabe'],
      dtype='object')


In [7]:
# Bundle the four station frames so the following cells can process them uniformly.
# Order matters: index 0 (TU) is the frame the neighbour search reads coordinates from.
data_set= [dataTU, dataRR, dataP0, dataFF]

In [8]:
# Fixed start/end timestamps (not referenced by the cells visible here).
start_zeit = pd.Timestamp("2014-01-01")
end_zeit = pd.Timestamp("2023-12-31")

# No data before 1996.
# Window start years run over range(min_jahr, max_jahr).
min_jahr = 1996
max_jahr = 2024

# Candidate window lengths in years, and the neighbour radius that is compared
# against geodesic distances in kilometres.
fenster_längen = list(range(5, 11))
max_distance = 100

In [9]:
# Parse the YYYYMMDD validity columns of every station frame into datetimes.
# The frames are modified in place, so dataTU/dataRR/dataP0/dataFF see the change too.
for df in data_set:
    for spalte in ('von_datum', 'bis_datum'):
        # Skip columns that are already datetimes so the cell can be re-run:
        # their string form is 'YYYY-MM-DD', which would not match '%Y%m%d'.
        if pd.api.types.is_datetime64_any_dtype(df[spalte]):
            continue
        df[spalte] = pd.to_datetime(df[spalte].astype(str), format='%Y%m%d')

In [10]:
# Drop the 'Bundesland' column and keep only rows whose 'Abgabe' value is 'Frei'.
# The list is updated slot by slot, so `data_set` stays the same list object.
for i, df in enumerate(data_set):
    # errors='ignore' keeps the cell re-runnable once the column is already gone.
    ohne_bundesland = df.drop(columns=['Bundesland'], errors='ignore')
    data_set[i] = ohne_bundesland[ohne_bundesland['Abgabe'] == 'Frei']

In [11]:
# Pristine deep copies of the cleaned frames; the window search takes its
# per-window working frames from these.
data_set_orginal = [frame.copy(deep=True) for frame in data_set]

In [12]:
# Search all (start year, window length) combinations for the station that has the
# most neighbours within `max_distance` km, considering only stations that are
# listed in all four data sets for the whole window.
#
# Produces:
#   results           - one dict per window that has at least one common station
#   sortierte_results - `results` ordered by Score (descending), then start year (descending)
results = []

# geodesic() dominates the runtime and station coordinates are the same in every
# window, so each ordered coordinate pair is measured once and then reused.
_distanz_cache = {}


def _distanz_km(coord1, coord2):
    """Return the geodesic distance in km between two (lat, lon) tuples (memoised)."""
    schluessel = (coord1, coord2)
    if schluessel not in _distanz_cache:
        _distanz_cache[schluessel] = geodesic(coord1, coord2).km
    return _distanz_cache[schluessel]


def _nachbarn_je_station(stationen, radius_km):
    """Return one (Stationsname, neighbour count, ids) tuple per row of ``stationen``.

    ``ids`` starts with the row's own ``Stations_id`` followed by the ids of all
    other rows whose distance is at most ``radius_km``; the count excludes the
    station itself. The result keeps the row order of ``stationen``.
    """
    ids = stationen['Stations_id'].tolist()
    namen = stationen['Stationsname'].tolist()
    coords = list(zip(stationen['geoBreite'].tolist(), stationen['geoLaenge'].tolist()))

    kandidaten = []
    for pos, coord1 in enumerate(coords):
        nachbarn_id = [ids[pos]]
        for pos2, coord2 in enumerate(coords):
            if pos2 != pos and _distanz_km(coord1, coord2) <= radius_km:
                nachbarn_id.append(ids[pos2])
        kandidaten.append((namen[pos], len(nachbarn_id) - 1, tuple(nachbarn_id)))
    return kandidaten


for startjahr in range(min_jahr, max_jahr):
    fenster_start = pd.Timestamp(f"{startjahr}-01-01")

    for länge in fenster_längen:
        fenster_ende = fenster_start + pd.DateOffset(years=länge)
        # Windows that end on or after 1 January of max_jahr are not evaluated.
        if fenster_ende >= pd.Timestamp(f"{max_jahr}-01-01"):
            continue

        # Per-window working frames: stations whose validity period covers the whole
        # window. They are derived from the pristine copies, so the global `data_set`
        # is never left half-filtered if this cell is interrupted.
        fenster_frames = [
            df[(df['von_datum'] <= fenster_start) & (df['bis_datum'] >= fenster_ende)]
            for df in data_set_orginal
        ]

        # Station ids present in every data set (successive inner joins).
        gemeinsame_ids = fenster_frames[0][['Stations_id']]
        for df in fenster_frames[1:]:
            gemeinsame_ids = pd.merge(gemeinsame_ids, df[['Stations_id']], on='Stations_id')

        fenster_frames = [pd.merge(df, gemeinsame_ids, on='Stations_id') for df in fenster_frames]

        # Duplicate ids would inflate the joins and the neighbour counts, so every
        # frame is checked (the check used to cover only the last frame).
        for df in fenster_frames:
            assert df['Stations_id'].is_unique

        kandidaten = _nachbarn_je_station(fenster_frames[0], max_distance)

        if kandidaten:
            # max() returns the first maximal entry in row order, so ties are broken
            # reproducibly; collecting candidates in a set made ties depend on the
            # per-process hash seed.
            name, anzahl, nachbarn_ids = max(kandidaten, key=lambda eintrag: eintrag[1])
            results.append({
                "startjahr": startjahr,
                "start_zeitpunkt": fenster_start,
                "end_zeitpunkt": fenster_ende,
                "dauer_jahre": länge,
                "anzahl_stationen": anzahl,  # neighbours only; the centre station is not counted
                "Stationsname": name,
                "Score": länge * anzahl,
                "Stations_ids": nachbarn_ids,  # centre station first, then its neighbours
            })

sortierte_results = sorted(results, key=lambda x: (-x["Score"], -x["startjahr"]))

In [13]:
# Best window: highest Score, with the later start year winning ties.
print(sortierte_results[0])

{'startjahr': 2013, 'start_zeitpunkt': Timestamp('2013-01-01 00:00:00'), 'end_zeitpunkt': Timestamp('2023-01-01 00:00:00'), 'dauer_jahre': 10, 'anzahl_stationen': 18, 'Stationsname': 'Erfurt-Weimar', 'Score': 180, 'Stations_ids': (1270, 198, 656, 867, 1612, 1691, 2044, 2171, 2261, 2925, 3231, 3513, 3821, 3946, 4464, 4501, 5371, 5490, 7368)}
