In [1]:
import pandas as pd

In [3]:
# Timetable year as a string; it is interpolated into the input CSV filename
year = "1964"

In [4]:
# Load the east-side line table for the selected year into a DataFrame
input_csv = f'line_data_{year}_east.csv'
df = pd.read_csv(input_csv, encoding='utf-8')

In [5]:
# Swap every non-breaking space (U+00a0) for an ordinary space, frame-wide
df = df.replace('\u00a0', ' ', regex=True)

In [66]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0,line_name,type,east-west,Stops,Frequency_7-30,Length (time),direction,Length (km),year,Info
0,A,u-bahn,east,Pankow (Vinetastr.) - Schönhauser Allee - Dimi...,2.5,19,both,,1963,
1,E,u-bahn,east,Alexanderplatz - Schillingstr. - Strausberger ...,5.0,15,both,,1963,
2,1,Omnibus,east,Köpenickerstrasse Ecke Adalbertstrasse - Alexa...,20.0,6,both,,1963,
3,1,strassenbahn,east,Ostbahnhof - Breslauerstrasse Ecke Andreasstra...,20.0,28,both,,1963,
4,3,strassenbahn,east,S-Bhf. Warschauer Strasse - Holteistrasse Ecke...,12.0,45,both,,1963,


In [67]:
# Text columns: cast to str and trim surrounding whitespace
for text_column in ('line_name', 'Stops', 'type', 'east-west'):
    df[text_column] = df[text_column].astype(str).str.strip()

# The year column is stored as an integer
df['year'] = df['year'].astype(int)

# Numeric columns: values that cannot be parsed (or are missing) become 0,
# then the column is cast to int
for numeric_column in ('Frequency_7-30', 'Length (time)'):
    parsed = pd.to_numeric(df[numeric_column], errors='coerce')
    df[numeric_column] = parsed.fillna(0).astype(int)

In [68]:
# Define a function that normalises dash separators and collapses repeated spaces
def remove_double_whitespace(text):
    """Normalise stop separators and collapse runs of spaces in ``text``.

    Every en dash (U+2013), together with any whitespace around it, is
    rewritten as the separator " - ". Afterwards every run of two or more
    spaces is reduced to a single space, however long the run is. Plain
    hyphens are left untouched, so names such as "S-Bhf." keep their form.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    import re  # local import so the cell does not depend on the import cell

    # Handle the en dash first: replacing it with a bare "-" would lose the
    # information that it was a separator and leave "A-B" unsplittable.
    text = re.sub(r"\s*–\s*", " - ", text)
    return re.sub(r" {2,}", " ", text)

# Apply the function to the 'Stops' column
df['Stops'] = df['Stops'].apply(remove_double_whitespace)
# Rewrite en dashes (with any surrounding whitespace) as the " - " separator.
# NOTE(review): remove_double_whitespace has already replaced every en dash by
# the time this line runs, so this pattern should find nothing left to change.
df['Stops'] = df['Stops'].str.replace(r'\s*–\s*', ' - ', regex=True)


In [69]:
# Rows whose 'Stops' text lacks the " - " separator; a missing result from
# the containment check is treated as "has the separator"
contains_separator = df['Stops'].str.contains(" - ")
mask = contains_separator.fillna(True)
none_df = df.loc[~mask]
none_df

Unnamed: 0,line_name,type,east-west,Stops,Frequency_7-30,Length (time),direction,Length (km),year,Info


In [70]:
# Collect the rows whose 'Stops' value is not a string
not_text = df['Stops'].apply(lambda value: not isinstance(value, str))
non_string_rows = df[not_text]
non_string_rows

Unnamed: 0,line_name,type,east-west,Stops,Frequency_7-30,Length (time),direction,Length (km),year,Info


In [71]:
def extract_first_start_stop(string):
    """Build a label from the first and last entry of a " - "-separated list.

    Args:
        string: Stop names joined by " - ".

    Returns:
        The first and the last entry joined by "<> ". When ``string`` holds
        no separator, the whole string is used as both entries.
    """
    parts = string.split(" - ")
    return "<> ".join((parts[0], parts[-1]))

In [72]:
def create_line_df(df):
    """Assemble the per-line table from the cleaned line frame.

    Args:
        df: Frame providing the columns 'year', 'line_name', 'type', 'Stops',
            'east-west', 'Frequency_7-30' and 'Length (time)'.

    Returns:
        A DataFrame with a 1-based 'line_id', the copied columns
        ('east-west' is exposed as 'east_west') and a 'start_stop' label
        produced by ``extract_first_start_stop`` from each 'Stops' value.
    """
    # Columns are collected in the order in which they appear in the result
    columns = {'line_id': range(1, len(df) + 1)}
    columns.update({name: df[name] for name in ('year', 'line_name', 'type')})
    columns['start_stop'] = df['Stops'].apply(extract_first_start_stop)
    columns['east_west'] = df['east-west']
    columns.update({name: df[name] for name in ('Frequency_7-30', 'Length (time)')})
    return pd.DataFrame(columns)

In [73]:
def _clean_stop_names(names):
    """Return ``names`` (a Series of stop names) with extraction noise removed.

    Non-breaking spaces become ordinary spaces, the literal text "U+00a0" and
    the literal marker "[Früh]" are dropped, runs of dots collapse to a single
    dot and surrounding whitespace is trimmed. ``regex`` is given explicitly
    on every call because its default differs between pandas versions.
    """
    return (
        names
        .str.replace('\xa0', ' ', regex=False)
        .str.replace('U+00a0', '', regex=False)
        # The only real regular expression here: two or more dots -> one dot.
        .str.replace(r'\.{2,}', '.', regex=True)
        # Literal marker; read as a regex it would be a character class that
        # deletes every F, r, ü and h from the names.
        .str.replace('[Früh]', '', regex=False)
        .str.strip()
    )


def create_stops_df(df):
    """Build one row per stop occurrence from the " - "-separated 'Stops' column.

    Args:
        df: Frame providing the columns 'Stops', 'line_name' and 'type'.

    Returns:
        A DataFrame with the columns

        * 'station_id' - 1-based running number of the row (a stop that
          occurs several times therefore receives several ids),
        * the reset original index (named 'index' for an unnamed index),
        * 'stop_name'  - the cleaned stop name,
        * 'line_count' - how many rows carry that cleaned stop name,
        * 'type'       - a dict mapping each transport type to the list of
          line names that contain the stop.

    The same cleaned name is used for the rows, the counts and the
    type-to-lines lookup, so every row finds its lookup entry.
    """
    # One row per (line, stop) occurrence; the original row label is kept as index.
    occurrences = (
        df[['line_name', 'type']]
        .assign(stop_name=df['Stops'].str.split(' - '))
        .explode('stop_name')
        .dropna(subset=['stop_name'])
        .assign(stop_name=lambda frame: _clean_stop_names(frame['stop_name']))
    )

    stops_df = occurrences['stop_name'].reset_index(name='stop_name')

    # Count occurrences on the cleaned names so spelling noise does not split a stop.
    stops_df['line_count'] = stops_df['stop_name'].map(stops_df['stop_name'].value_counts())

    # Running 1-based id that can serve as a key for the rows.
    stops_df = stops_df.rename_axis("station_id").reset_index()
    stops_df["station_id"] += 1

    # stop name -> {transport type -> [line names]}, in order of appearance.
    lines_by_stop = {}
    for stop_name, stop_type, line_name in zip(
        occurrences['stop_name'], occurrences['type'], occurrences['line_name']
    ):
        lines_by_stop.setdefault(stop_name, {}).setdefault(stop_type, []).append(line_name)

    # Every name in stops_df was also fed into the lookup above, so the lookup cannot miss.
    stops_df['type'] = [lines_by_stop[name] for name in stops_df['stop_name']]

    return stops_df


In [74]:
# Build the per-line table and the per-stop table from the cleaned frame
line_df = create_line_df(df)
stops_df = create_stops_df(df)

In [75]:
# Convert a {key: value} mapping into a list of [key, value] pairs
def transform_dict_to_nested_list(d):
    """Turn a dict into a list of ``[key, value]`` lists.

    Args:
        d: The value to convert.

    Returns:
        For a dict, one ``[key, value]`` list per item in iteration order.
        Any other value is printed and handed back unchanged.
    """
    if not isinstance(d, dict):
        print(d)
        return d
    return [[key, value] for key, value in d.items()]

# Turn each 'type' mapping into [type, lines] pairs, then give every pair its own row
stops_df = (
    stops_df
    .assign(type=stops_df['type'].apply(transform_dict_to_nested_list))
    .explode('type')
    .reset_index(drop=True)
)

In [76]:
# Split each exploded [type, lines] pair into a 'row_type' and an 'in_lines' column.
stops_df["row_type"] = None
stops_df["in_lines"] = None
unparsed_rows = []
for index, entry in stops_df["type"].items():
    # A usable entry is a non-empty [type, lines] sequence. Anything else
    # (for example a missing value) is collected and reported below instead
    # of being hidden behind a catch-all exception handler.
    if not isinstance(entry, (list, tuple)) or len(entry) == 0:
        unparsed_rows.append(index)
        continue
    stops_df.at[index, "row_type"] = entry[0]
    if len(entry) > 1:
        # De-duplicate the line names; sorting keeps the saved output
        # identical from run to run (plain set order is not stable).
        stops_df.at[index, "in_lines"] = sorted(set(entry[1]))

# Show the rows that could not be split so they can be inspected.
if unparsed_rows:
    print(stops_df.loc[unparsed_rows])

# Drop the helper columns and expose 'row_type' under the name 'type'.
stops_df = stops_df.drop(columns=["type", "index"]).rename(columns={"row_type": "type"})

In [78]:
# Normalize street-name spellings in stops_df['stop_name']: the abbreviation
# "tr." and the spelling "traße" both become "trasse".
# regex=False makes "tr." a literal match on every pandas version; read as a
# regular expression the dot would match any character and a repeated run
# would corrupt names that are already normalized. One pass is sufficient.
stops_df['stop_name'] = (
    stops_df['stop_name']
    .str.replace('tr.', 'trasse', regex=False)
    .str.replace('traße', 'trasse', regex=False)
)

In [80]:
# Normalize street-name spellings in df['Stops']: the abbreviation "tr." and
# the spelling "traße" both become "trasse".
# regex=False makes "tr." a literal match on every pandas version; read as a
# regular expression the dot would match any character and a repeated run
# would corrupt names that are already normalized. One pass is sufficient.
df['Stops'] = (
    df['Stops']
    .str.replace('tr.', 'trasse', regex=False)
    .str.replace('traße', 'trasse', regex=False)
)

In [81]:
# Write the results under the year that selected the input file, so running
# the notebook for another year does not overwrite the 1963 files.
# NOTE(review): the line table is written back to the same filename it was
# read from; the row index is saved as an unnamed first column - confirm
# both are intended.
stops_df.to_csv(f"stops_df_{year}-initial.csv")
df.to_csv(f"line_data_{year}_east.csv")