# Create subsets of stations
Code that lets you create subsets of stations.

We use postcodes to filter stations according to our needs. All other address data related to the stations varies much more in data quality than the postcode, so the postcode is the most reliable field to filter on.

As a reference we use [a dataset from opendatasoft](https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/georef-germany-postleitzahl/exports/csv?lang=en&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B). It should be placed in the `data/` folder.

From the generated `df_georef` we can use the columns `state_name`, `kreis_name` and `location` (each referring to an administrative level) to filter for the necessary postcodes.

In [59]:
import glob
import re
import pandas as pd
from tqdm import tqdm
from config import paths


In [60]:
# Maps each federal state name to its official two-letter abbreviation.
# Useful later for bigger station subsets that span more than one state.
state_dict = {
    "Baden-Württemberg": "BW",
    "Bayern": "BY",
    "Berlin": "BE",
    "Brandenburg": "BB",
    "Bremen": "HB",
    "Hamburg": "HH",
    "Hessen": "HE",
    "Mecklenburg-Vorpommern": "MV",
    "Niedersachsen": "NI",
    "Nordrhein-Westfalen": "NW",
    "Rheinland-Pfalz": "RP",
    "Saarland": "SL",
    "Sachsen": "SN",
    "Sachsen-Anhalt": "ST",
    "Schleswig-Holstein": "SH",
    "Thüringen": "TH",
}

# Shortcut: administrative level number -> df_georef column to filter on.
administrative_level = {
    1: "state_name",
    2: "kreis_name",
    3: "location",
}


In [61]:
# Read the master postcode list and give its columns short, code-friendly names.
#
# The postcode column is read as text right away. German postcodes can start
# with "0"; if pandas parses the column as a number first, the leading zero is
# lost and those postcodes can never match the string values in the stations'
# `post_code` column (which also contains some faulty, non-numeric strings).
df_georef = pd.read_csv(
    "data/georef-germany-postleitzahl.csv",
    sep=';',
    dtype={'Postleitzahl / Post code': str},
).rename(columns={'Name': 'name',
                  'PLZ Name (short)': 'location',
                  'PLZ Name (long)': 'plz_long',
                  'Geometry': 'geometry',
                  'Postleitzahl / Post code': 'plz',
                  'Kreis code': 'kreis_code',
                  'Land name': 'state_name',
                  'Land code': 'state_code',
                  'Kreis name': 'kreis_name'})

# Safety net in case the export itself already lost leading zeros:
# pad every postcode back to the five digits a German postcode has.
df_georef["plz"] = df_georef["plz"].str.strip().str.zfill(5)

df_georef.head(2)

Unnamed: 0,name,location,plz_long,geometry,plz,kreis_code,state_name,state_code,kreis_name,geo_point_2d
0,47551,Bedburg-Hau,47551 Bedburg-Hau,"{""coordinates"": [[[6.1156615, 51.7419192], [6....",47551,5154,Nordrhein-Westfalen,5,Kreis Kleve,"51.7581345416, 6.20695861505"
1,52477,Alsdorf,52477 Alsdorf,"{""coordinates"": [[[6.1218109, 50.8594794], [6....",52477,5334,Nordrhein-Westfalen,5,Kreis Städteregion Aachen,"50.86861665, 6.17550818828"


In [62]:
# Work on the sample station files for now:
# build one station list from all daily files and deduplicate it by station uuid.
work_dir = paths.SAMPLE_DIR / "stations"

# Sorted so the run is reproducible: drop_duplicates() below keeps the first
# occurrence of every uuid, so the file order decides which row survives.
station_files = sorted(work_dir.glob("*.csv"))

# Date stamp (YYYY-MM-DD) expected somewhere in the path of each daily file.
date_pattern = re.compile(r"[\d]{4}-[\d]{2}-[\d]{2}")

frames = []
for filename in tqdm(station_files, desc="Processing station files"):
    date_match = date_pattern.search(str(filename))
    if date_match is None:
        # No date stamp -> not a daily station file. This skips e.g. the
        # `stations_<subset>.csv` files that create_station_subset() writes
        # into this very folder, which would otherwise break a re-run.
        continue

    # Read post codes as text so they stay comparable to the (string) postcodes
    # of the reference list, independent of what pandas would infer per file.
    current_file = pd.read_csv(filename, dtype={"post_code": str})
    current_file["file_date"] = date_match.group()
    frames.append(current_file)

if not frames:
    raise FileNotFoundError(f"No dated station files (*.csv) found in {work_dir}")

# Concatenate once instead of growing the frame inside the loop.
station_list = pd.concat(frames)
station_list["file_date"] = pd.to_datetime(station_list["file_date"])
station_list = station_list.drop_duplicates(subset="uuid")

Processing station files: 100%|██████████| 13/13 [00:01<00:00,  9.91it/s]


In [63]:
def create_station_subset(names: list, level: int, filename_part: str) -> pd.DataFrame:
    """
    Select the stations that belong to the given administrative units and save them.

    The postcodes of the requested units are looked up in the notebook-level
    `df_georef`; the rows of the notebook-level `station_list` whose
    `post_code` is among those postcodes form the subset. The subset is
    written (without its index) to `stations_<filename_part>.csv` in `work_dir`.

    Args:
        names: Values to match in the `df_georef` column chosen by `level`.
        level: Key into `administrative_level` (1, 2 or 3) selecting that column.
        filename_part: Suffix used in the name of the csv-file that is written.

    Returns:
        The filtered rows of `station_list`.
    """
    query_column = administrative_level[level]

    # postcodes of all reference rows that match one of the requested names
    postcodes = df_georef.query(f"{query_column} == @names")["plz"].to_list()

    is_in_subset = station_list["post_code"].isin(postcodes)
    selected_stations = station_list[is_in_subset]

    target_file = work_dir / f"stations_{filename_part}.csv"
    selected_stations.to_csv(target_file, index=False)

    return selected_stations

In [64]:
# Subset 1: Düsseldorf and the surrounding towns, matched on "location" (level 3)
dus_location_names = [
    "Düsseldorf",
    "Ratingen",
    "Erkrath",
    "Hilden",
    "Neuss",
    "Meerbusch",
]
dus_subset = create_station_subset(
    names=dus_location_names,
    level=3,
    filename_part='dus_plus',
)


In [65]:
# Subset 2: Rheinland - going for "kreis_name" (level 2) here
# NOTE: every entry needs its trailing comma. Python silently joins adjacent
# string literals into a single string, which would drop the affected
# districts from the subset without any error.
rheinland_kreis_names = [
    'Kreis Borken',
    'Kreis Düren',
    'Kreis Euskirchen',
    'Kreis Heinsberg',
    'Kreis Kleve',
    'Kreis Mettmann',
    'Kreis Oberbergischer Kreis',
    'Kreis Rhein-Erft-Kreis',
    'Kreis Rhein-Kreis Neuss',
    'Kreis Rhein-Sieg-Kreis',
    'Kreis Rheinisch-Bergischer Kreis',
    'Kreis Städteregion Aachen',
    'Kreis Viersen',
    'Kreis Wesel',
    'Kreisfreie Stadt Bonn',
    'Kreisfreie Stadt Duisburg',
    'Kreisfreie Stadt Düsseldorf',
    'Kreisfreie Stadt Essen',
    'Kreisfreie Stadt Krefeld',
    'Kreisfreie Stadt Köln',
    'Kreisfreie Stadt Leverkusen',
    'Kreisfreie Stadt Mönchengladbach',
    'Kreisfreie Stadt Mülheim an der Ruhr',
    'Kreisfreie Stadt Oberhausen',
    'Kreisfreie Stadt Remscheid',
    'Kreisfreie Stadt Solingen',
    'Kreisfreie Stadt Wuppertal',
]

rheinland_subset = create_station_subset(rheinland_kreis_names, 2, 'rheinland')

In [66]:
# Subset 3: all of Nordrhein-Westfalen, matched on "state_name" (level 1)
nrw_state_names = [
    'Nordrhein-Westfalen',
]
nrw_subset = create_station_subset(
    names=nrw_state_names,
    level=1,
    filename_part='nrw',
)

In [68]:
# Visual sanity check only: show the Düsseldorf subset ordered by the stations' `city` value
dus_subset.sort_values("city")

Unnamed: 0,uuid,name,brand,street,house_number,post_code,city,latitude,longitude,first_active,openingtimes_json,file_date
3001,9f1a9948-4a60-4db6-8ac5-0fdffcf51613,"DUESSELDORF, GRAFENBERGER ALLEE",Shell,GRAFENBERGER ALLEE 230,,40237,DUESSELDORF,51.234072,6.816542,2014-03-18 16:45:31+01,"{""overrides"":[{""startp"":""2019-07-01 12:00"",""en...",2023-05-01
9130,5e3c22de-18c6-4715-aa36-bc61244fd431,Esso Tankstelle,ESSO,ARNHEIMER STR. 22,,40489,DUESSELDORF,51.303623,6.740373,2014-03-18 16:45:31+01,"{""openingTimes"":[{""applicable_days"":63,""period...",2023-05-01
3869,c8608399-adf3-4162-8321-0f313ff043ac,TOTAL DUESSELDORF,TOTAL,REISHOLZER STR.,57-63,40231,DUESSELDORF,51.206093,6.832643,2014-03-18 16:45:31+01,{},2023-05-01
9718,51d4b6fa-a095-1aa0-e100-80009459e03a,JET DUESSELDORF RATHER BROICH 151,JET,RATHER BROICH,151,40472,DUESSELDORF,51.255400,6.821780,2014-03-18 16:45:31+01,"{""openingTimes"":[{""applicable_days"":31,""period...",2023-05-01
9804,51d4b683-a095-1aa0-e100-80009459e03a,JET DUESSELDORF WORRINGER STR. 33,JET,WORRINGER STR.,33,40211,DUESSELDORF,51.227690,6.796300,2014-03-18 16:45:31+01,"{""openingTimes"":[{""applicable_days"":31,""period...",2023-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...
2954,b605dd15-57f6-495a-b1f9-645731f11272,"RATINGEN, BAT HÖSEL (OSTSEIT",Shell,BAT HÖSEL,,40883,Ratingen,51.325534,6.891643,2014-03-18 16:45:31+01,"{""overrides"":[{""startp"":""2018-03-27 23:59"",""en...",2023-05-01
3571,79fb1f24-bebb-489e-841f-728f9053b555,TotalEnergies Ratingen,TotalEnergies,Homberger Str.,21,40882,Ratingen,51.295966,6.866760,2014-03-18 16:45:31+01,"{""openingTimes"":[{""applicable_days"":31,""period...",2023-05-01
7529,9a9aabaf-adee-4180-87cb-e1a4c7fcd33e,Fricke,Tankpoint,Stadionring,11,40878,Ratingen,51.298100,6.839630,2014-03-18 16:45:31+01,"{""openingTimes"":[{""applicable_days"":64,""period...",2023-05-01
1603,e7807347-796f-4aac-997d-07d0c988e109,Shell Ratingen Brachter Str. 36,Shell,Brachter Str.,36,40882,Ratingen,51.295287,6.914963,2014-03-18 16:45:31+01,{},2023-05-01


---

## Debugging & Testing Area

---

In [None]:
# Lift pandas' row display limit so that DataFrames are shown in full while debugging
pd.set_option('display.max_rows', None)

In [2]:
# Load the stored `stations_dus_plus.csv` and show the first three rows after sorting by brand
dus_stations = pd.read_csv("data/sample/stations/stations_dus_plus.csv")
dus_stations.sort_values("brand").head(3)

Unnamed: 0.1,Unnamed: 0,uuid,name,brand,street,house_number,post_code,city,latitude,longitude,first_active,openingtimes_json,file_date
127,15972,da6f245b-5aa2-419c-b00e-5daf159bdfc1,"Aral Tankstelle, Ratingen Hohenstein",ARAL,Broichhofstrasse,8(A52),40880,Ratingen,51.290546,6.8044,2020-04-23 06:01:16+02,{},2023-05-06
83,9481,529b7b56-0ff8-4f37-b116-d3b0144bf207,Aral Tankstelle,ARAL,Nürnberger Straße,33,40599,Düsseldorf,51.170425,6.859408,2014-03-18 16:45:31+01,"{""openingTimes"":[{""applicable_days"":31,""period...",2023-05-06
82,9103,39add580-4053-4eb2-8a4a-2744c249c48c,Aral Tankstelle,ARAL,Nievenheimer Straße,6,41469,Neuss,51.15772,6.731179,2014-03-18 16:45:31+01,"{""openingTimes"":[{""applicable_days"":31,""period...",2023-05-06


In [12]:
# Non-null entries per column for every brand, brands with the most stations (by uuid count) first
display(dus_stations.groupby(["brand"]).count().sort_values("uuid", ascending=False))

Unnamed: 0_level_0,Unnamed: 0,uuid,name,street,house_number,post_code,city,latitude,longitude,first_active,openingtimes_json,file_date
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Shell,26,26,26,26,22,26,26,26,26,26,26,26
ARAL,24,24,24,24,23,24,24,24,24,24,24,24
ESSO,21,21,21,21,21,21,21,21,21,21,21,21
TotalEnergies,9,9,9,9,9,9,9,9,9,9,9,9
JET,6,6,6,6,6,6,6,6,6,6,6,6
STAR,6,6,6,6,6,6,6,6,6,6,6,6
SB,5,5,5,5,4,5,5,5,5,5,5,5
Freie Tankstelle,3,3,3,3,3,3,3,3,3,3,3,3
AVIA,2,2,2,2,2,2,2,2,2,2,2,2
Supermarkt-Tankstelle,2,2,2,2,1,2,2,2,2,2,2,2


## Working with the processor classes

In [1]:
# Import the project's processing module and print its help text (lists the available functions)
import src.process as pcs
help(pcs)

Help on module src.process in src:

NAME
    src.process

FUNCTIONS
    add_time_columns(df: pandas.core.frame.DataFrame, date='date', attributes=['year', 'month', 'day', 'dayofyear', 'dayofweek', 'hour', 'minute']) -> pandas.core.frame.DataFrame
        Takes a Dataframe with a DateTime Index and creates columns for 
        ['year', 'month', 'day', 'dayofyear', 'dayofweek', 'hour', 'minute']
        
        Args:
            df (pd.DataFrame): DataFrame with DateTime ndex
            date (str, optional): Name of the DateTime index. Defaults to 'date'.
            attributes (list, optional): List if attributes from the DateTime library to create columns from. Defaults to ['year', 'month', 'day', 'dayofyear', 'dayofweek', 'hour', 'minute'].
        
        Returns:
            pd.DataFrame: DateTime Index DataFrame with new time columns
    
    extend_panel(df: pandas.core.frame.DataFrame, date='date', individual='station_uuid', names=['date', 'station']) -> pandas.core.frame.Data