#### Merging the datasets

The script merges the bridge dataset with the GDP data and population data. We want to include further information on the individual districts. We are going to combine cities (`Kreisfreie Stadt`) and their respective districts (`Kreis`). In the end, we add the following columns: 
- `GDP 2022`
- `Bevölkerung (insgesamt)`
- `Fläche (qkm)` (area of the district in square kilometres)

In [43]:
# load libraries
import pandas as pd
import re

In [84]:
# load the three raw inputs: bridge statistics (CSV) plus GDP and population (Excel)
data_bridges = pd.read_csv(
    filepath_or_buffer='../data/filled_bridge_statistic_germany.csv',
    sep=';',
)
data_gdp = pd.read_excel(
    io="../data/gdp_dataset.xlsx",
    sheet_name="1.1",
    skiprows=4,  # the first 4 sheet rows are skipped
)
data_population = pd.read_excel(
    io="../data/population_dataset.xlsx",
    sheet_name="Kreisfreie Städte u. Landkreise",
    skiprows=1,  # the first sheet row is skipped
)

In [85]:
#data_gdp

Because we want to look at the districts (`Kreise`), we delete the states. Furthermore, all rows that do not contain a value in the NUTS 3 code column (`NUTS 3` in the GDP data, `NUTS3` in the population data) are deleted.

In [86]:
# Keep only the rows that carry a NUTS 3 (district) code, i.e. a value that is
# neither missing nor an empty string. The raw frames are left untouched; the
# filtered results are independent copies.
gdp_code = data_gdp['NUTS 3']
data_gdp_modified = data_gdp.loc[gdp_code.notna() & gdp_code.ne('')].copy()

population_code = data_population['NUTS3']
data_population_modified = data_population.loc[
    population_code.notna() & population_code.ne('')
].copy()


We have to normalize the names of the districts because they do not correspond to the ones we are using in our bridge data set (and for the visualisation).

In [87]:

# ---------- building blocks for the district name normalisation ----------

# administrative keywords that are stripped from the lower-cased name
_KEYWORD_PATTERN = re.compile(
    r'\b(landkreis|kreisfreie stadt|stadt|regierungsbezirk|universitätsstadt|wissenschaftsstadt|landeshauptstadt|reg-bez|kfr|hansestadt|stadtkreis|klingenstadt|freie und hansestadt)\b'
)

# a stand-alone "kreis" is stripped too, but not directly after a hyphen
_KREIS_PATTERN = re.compile(r'(?<!-)\bkreis\b')

# short words that are turned back to lower case after title-casing
_SMALL_WORD_PATTERN = re.compile(
    r'\b(Am|An|Im|Auf|Vom|Zum|Zur|Und|Der|In|Bei|Vom|Von|Zu|Mit|Aus|Auf)\b'
)

# (pattern, replacement) pairs for names the generic rules cannot handle;
# they are applied one after another in exactly this order
_SPECIAL_CASES = (
    (r'Oldenburg \(Oldenburg\)', 'Oldenburg'),
    ('Saarbrücken Regionalverband', 'Regionalverband Saarbrücken'),
    ('St Wendel', 'St. Wendel'),
    ('Märkischer', 'Märkischer Kreis'),
    ('Rheinisch-Bergischer', 'Rheinisch-Bergischer Kreis'),
    ('Oberbergischer', 'Oberbergischer Kreis'),
    ('Neustadt/Weinstrasse', 'Neustadt an der Weinstraße'),
    ('Frankfurt Main', 'Frankfurt am Main'),
    ('Hagen der Fernuniversität', 'Hagen'),
    ('Neustadt Adaisch-Bad Windsheim', 'Neustadt a.d. Aisch-Bad Windsheim'),
    ('Neustadt Adwaldnaab', 'Neustadt a.d. Waldnaab'),
    ('Landau In Der Pfalz', 'Landau in der Pfalz'),
    ('Kassel Documenta-', 'Kassel'),
    ('Neumarkt Idopf', 'Neumarkt i.d. OPf.'),
    ('Offenbach am Main', 'Offenbach'),
    ('Bremerhaven', 'Bremen'),
    ('Weiden Idopf', 'Weiden i.d. OPf.'),
    ('Dillingen Addonau', 'Dillingen a.d. Donau'),
    ('Pfaffenhofen Adilm', 'Pfaffenhofen a.d. Ilm'),
    ('Mühldorf Ainn', 'Mühldorf a. Inn'),
    ('Pfaffenhofen Ad Ilm', 'Pfaffenhofen a.d. Ilm'),
    ('Neustadt Ad Waldnaab', 'Neustadt a.d. Waldnaab'),
    ('Neustadt Ad Aisch-Bad Windsheim', 'Neustadt a.d. Aisch-Bad Windsheim'),
    ('Dillingen Ad Donau', 'Dillingen a.d. Donau'),
    ('Weiden Id Opf', 'Weiden i.d. OPf.'),
    ('Wunsiedel Ifichtelgebirge', 'Wunsiedel i. Fichtelgebirge'),
    ('Neumarkt Id Opf', 'Neumarkt i.d. OPf.'),
    ('Mühldorf A Inn', 'Mühldorf a. Inn'),
    ('Wunsiedel I Fichtelgebirge', 'Wunsiedel i. Fichtelgebirge'),
)


def normalize_name(name):
    """Bring a district name into the spelling shared by all datasets.

    The name is lower-cased, spaces around hyphens as well as commas and full
    stops are removed, administrative keywords (e.g. "landkreis", "stadt") are
    stripped, whitespace is collapsed, the result is title-cased, a fixed set
    of short words (e.g. "am", "an", "der") is lower-cased again, and finally
    a table of hand-written special cases is applied.

    Note that stripping the keywords can map a city and the district of the
    same name to the same string.

    Args:
        name: district name as a string, or a missing value.

    Returns:
        The normalised name; a missing value is returned unchanged.
    """
    if pd.isna(name):
        return name

    # lower case first so that the keyword patterns match regardless of casing
    cleaned = re.sub(r'\s*-\s*', '-', name.lower())
    cleaned = cleaned.replace(',', '').replace('.', '')

    cleaned = _KEYWORD_PATTERN.sub('', cleaned)
    cleaned = _KREIS_PATTERN.sub('', cleaned)

    # trim the ends, then squeeze inner runs of whitespace into one space
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())

    cleaned = cleaned.title()
    cleaned = _SMALL_WORD_PATTERN.sub(lambda hit: hit.group(0).lower(), cleaned)

    for pattern, replacement in _SPECIAL_CASES:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

# work on copies so that the filtered inputs stay available unchanged
df_pop = data_population_modified.copy()
df_gdp = data_gdp_modified.copy()
df_bridges = data_bridges.copy()

# name of the district column in each dataset
pop_name_col = 'Kreisfreie Städte und Landkreise'
gdp_name_col = 'Gebietseinheit'
bridge_name_col = 'Kreis'

# ---------- district name normalisation on the datasets ----------
df_pop[pop_name_col] = df_pop[pop_name_col].apply(normalize_name)
df_gdp[gdp_name_col] = df_gdp[gdp_name_col].apply(normalize_name)
df_bridges[bridge_name_col] = df_bridges[bridge_name_col].apply(normalize_name)

# ---------- check whether both dataset contain the same district names and correct for it ----------
print(f"Districts contained only in population data: {set(df_pop[pop_name_col]) - set(df_gdp[gdp_name_col])}")
print(f"District contained only in GDP data: {set(df_gdp[gdp_name_col]) - set(df_pop[pop_name_col])}")

# delete the rows of three regions that do not correspond to districts in the
# population data (every row whose name contains one of these words is removed)
for region in ('Mittelfranken', 'Unterfranken', 'Schwaben'):
    df_pop = df_pop[~df_pop[pop_name_col].str.contains(region, case=False, na=False)]

# check which districts are only contained in the bridge data
print(f"Districts contained only in bridge data: {set(df_bridges[bridge_name_col]) - set(df_gdp[gdp_name_col])}")

# there is one district in the bridge data set (Eisenach) not contained in the
# other datasets due to the fact that it is an old one, Eisenach nowadays
# belongs to Wartburgkreis.
# The replacement is applied to the already normalised column of df_bridges;
# reading from the raw data_bridges here would throw the normalisation away.
df_bridges[bridge_name_col] = df_bridges[bridge_name_col].replace('Eisenach', 'Wartburgkreis')

# check which districts are only contained in the bridge data or not contained in bridge data
print(f"Districts contained only in bridge data: {set(df_bridges[bridge_name_col]) - set(df_gdp[gdp_name_col])}")
print(f"Districts not contained in bridge data: {set(df_gdp[gdp_name_col]) - set(df_bridges[bridge_name_col])}")

# print number of unique districts in each data set
print(f"Number unique districts POPULATION: {len(df_pop[pop_name_col].unique())} (total: {len(df_pop)})")
print(f"Number unique districts GDP: {len(df_gdp[gdp_name_col].unique())} (total: {len(df_gdp)})")
print(f"Number unique districts BRIDGES: {len(df_bridges[bridge_name_col].unique())}")

# names that occur more than once after normalisation
print(f"Districts that are not unique: {df_gdp[gdp_name_col].value_counts()[lambda x: x > 1].index.tolist()}")


Districts contained only in population data: {'Unterfranken', 'Schwaben', 'Mittelfranken'}
District contained only in GDP data: set()
Districts contained only in bridge data: {'Eisenach'}
Districts contained only in bridge data: set()
Districts not contained in bridge data: set()
Number unique districts POPULATION: 375 (total: 400)
Number unique districts GDP: 375 (total: 400)
Number unique districts BRIDGES: 375
Districts that are not unique: ['Bayreuth', 'Karlsruhe', 'Regensburg', 'Rostock', 'Bamberg', 'München', 'Coburg', 'Hof', 'Rosenheim', 'Ansbach', 'Fürth', 'Passau', 'Landshut', 'Kassel', 'Würzburg', 'Kaiserslautern', 'Augsburg', 'Aschaffenburg', 'Osnabrück', 'Leipzig', 'Offenbach', 'Bremen', 'Oldenburg', 'Schweinfurt', 'Heilbronn']


Because some names occur more than once (we removed labels like `Kreis` and `Landkreis`, so a city and the district of the same name now share one name), we have to merge those rows.

In [88]:
df_pop_norm = df_pop.copy()

# rename columns in pop data set, remove unnecessary columns
print(f"Columns in population data: {df_pop_norm.columns}")
df_pop_norm = df_pop_norm.rename(columns={
    'NUTS3': 'Code',
    'Kreisfreie Städte und Landkreise': 'Kreis',
    ' Fläche km² ¹⁾': 'Fläche (qkm)',
    'Bevölkerung auf Grundlage des ZENSUS 2022 ²⁾ insgesamt': 'Bevölkerung (insgesamt)',
    'Bevölkerung auf Grundlage des ZENSUS 2022 ²⁾ männlich': 'Bevölkerung (männlich)',
    'Bevölkerung auf Grundlage des ZENSUS 2022 ²⁾ weiblich': 'Bevölkerung (weiblich)',
    'Bevölkerung auf Grundlage des ZENSUS 2022 ²⁾ je km²': 'Bevölkerung (je qkm)',
})
df_pop_norm = df_pop_norm.drop(columns=['Amtlicher Regionalschlüssel'])
print(f"Columns in population data: {df_pop_norm.columns}")

# Merge rows that share a name after normalisation (e.g. city and district of
# the same name). Only additive quantities may be summed.
columns_to_sum = [
    'Fläche (qkm)',
    'Bevölkerung (insgesamt)',
    'Bevölkerung (männlich)',
    'Bevölkerung (weiblich)',
]

# Group by 'Kreis' and sum the additive columns
# (assumes these columns are numeric - TODO confirm against the Excel sheet)
df_pop_cleaned = df_pop_norm.groupby('Kreis', as_index=False)[columns_to_sum].sum()

# The population density is a ratio and must not be summed: adding the density
# of a city to that of its surrounding district gives a meaningless number.
# It is recomputed from the summed population and the summed area instead.
# NOTE(review): the values are not rounded, so they may differ slightly from
# the published figures for districts that were not merged.
df_pop_cleaned['Bevölkerung (je qkm)'] = (
    df_pop_cleaned['Bevölkerung (insgesamt)'] / df_pop_cleaned['Fläche (qkm)']
)

# save final population dataset
df_pop_cleaned.to_csv('../data/population_dataset_cleaned.csv', sep=',', index=False)

Columns in population data: Index(['Amtlicher Regionalschlüssel', 'Kreisfreie Städte und Landkreise',
       'NUTS3', ' Fläche km² ¹⁾',
       'Bevölkerung auf Grundlage des ZENSUS 2022 ²⁾ insgesamt',
       'Bevölkerung auf Grundlage des ZENSUS 2022 ²⁾ männlich',
       'Bevölkerung auf Grundlage des ZENSUS 2022 ²⁾ weiblich',
       'Bevölkerung auf Grundlage des ZENSUS 2022 ²⁾ je km²'],
      dtype='object')
Columns in population data: Index(['Kreis', 'Code', 'Fläche (qkm)', 'Bevölkerung (insgesamt)',
       'Bevölkerung (männlich)', 'Bevölkerung (weiblich)',
       'Bevölkerung (je qkm)'],
      dtype='object')


To combine the GDP values for names that cover both a district and a city, we need to know the population of each. The combined value is the population-weighted mean of the two:

GDP_combined_district = (GDP_district * Population_district + GDP_city * Population_city) / (Population_district + Population_city)

Therefore, we need to add the information which row corresponds to the district and which to the city if there are multiple rows with the same district name.

In [89]:
df_gdp_norm = df_gdp.copy()

# rename columns in gdp data set and remove unnecessary ones
df_gdp_norm = df_gdp_norm[['Gebietseinheit', 2022, 'EU-Code']]
df_gdp_norm = df_gdp_norm.rename(columns={'Gebietseinheit': 'Kreis', 2022: 'GDP 2022', 'EU-Code': 'Code'})

# list all district names occurring more than once in the gdp data set
districts = df_gdp_norm['Kreis'].value_counts()[lambda x: x > 1].index.tolist()

# ---------- merge GDP of rows that share a name ----------
# Attach the population to every row that has to be merged. Name AND region
# code must agree, so each GDP row is paired with exactly its own population
# row; `validate` makes pandas raise if a key is not unique on either side.
rows_to_merge = df_gdp_norm[df_gdp_norm['Kreis'].isin(districts)]
with_population = rows_to_merge.merge(
    df_pop_norm[['Kreis', 'Code', 'Bevölkerung (insgesamt)']],
    on=['Kreis', 'Code'],
    how='left',
    validate='one_to_one',
)

# without a population there is no weight, so fail loudly instead of silently
# computing the mean from a subset of the rows
without_population = with_population.loc[
    with_population['Bevölkerung (insgesamt)'].isna(), 'Kreis'
].unique().tolist()
if without_population:
    raise ValueError(f"No population found for GDP rows of: {without_population}")

# population-weighted mean per name: sum(GDP * population) / sum(population).
# This works for any number of rows per name, not only for exactly two.
# NOTE(review): a weighted mean is only meaningful if 'GDP 2022' is a
# per-capita style figure; an absolute GDP would have to be summed - confirm
# against the source sheet.
population = pd.to_numeric(with_population['Bevölkerung (insgesamt)'])
weighted_gdp = pd.to_numeric(with_population['GDP 2022']) * population
merged_gdp = (
    weighted_gdp.groupby(with_population['Kreis']).sum()
    / population.groupby(with_population['Kreis']).sum()
)
merged_dict = merged_gdp.to_dict()

# ---------- new GDP table: one row per name, in order of first appearance ----------
df_gdp_cleaned = (
    df_gdp_norm[['Kreis', 'GDP 2022']]
    .drop_duplicates(subset=['Kreis'])
    .reset_index(drop=True)
)
is_merged = df_gdp_cleaned['Kreis'].isin(districts)
# keep the original value for unique names, use the weighted mean otherwise
df_gdp_cleaned['GDP 2022'] = df_gdp_cleaned['GDP 2022'].where(
    ~is_merged, df_gdp_cleaned['Kreis'].map(merged_dict)
)

# save final gdp dataset
df_gdp_cleaned.to_csv('../data/gdp_dataset_cleaned.csv', sep=',', index=False)

Now we merge the corrected bridge dataset with the cleaned population and GDP datasets.

In [90]:

# attach area and total population to the cleaned GDP table; both tables are
# joined on 'Kreis' and only names present in both are kept (inner join)
population_columns = df_pop_cleaned[['Kreis', 'Fläche (qkm)', 'Bevölkerung (insgesamt)']]
gdp_pop = df_gdp_cleaned.merge(population_columns, on='Kreis', how='inner')

# add the district values to the bridge data; rows whose 'Kreis' has no
# partner in gdp_pop are dropped by the inner join
final_dataset = df_bridges.merge(gdp_pop, on='Kreis', how='inner')

# write the result to disk and report the number of rows
final_dataset.to_csv('../data/final_bridge_statistic_germany.csv', sep=';', index=False)

print(len(final_dataset))

46952
