In [1]:
import pandas as pd

In [3]:
# Read the line table from 'line_data_1946.csv' (decoded as UTF-8) into `df`.
# The path is relative, so the file must sit in the kernel's working directory.
df = pd.read_csv('line_data_1946.csv', encoding='utf-8')

In [5]:
# Swap every non-breaking space (U+00A0) found inside a cell for a plain space.
# regex=True makes this a substring replacement rather than a whole-cell match.
df = df.replace('\u00a0', ' ', regex=True)

In [43]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,year,Stops,Frequency,Length (time),line_name,type,east-west
0,1946,"Flughafen - Franz-Mehring-Strasse (3, 99) - Ha...",25.0,,1,bus,both/unkown
1,1946,Hackescher Markt - Rosenthaler Strasse - Rosen...,10.0,15.0,1,strassenbahn,both/unkown
2,1946,"Zehlendorf, Rathaus (5) - Potsdamer Chaussee -...",30.0,,17,bus,both/unkown
3,1946,Wiebestrasse - Thälmannstrasse - Thälmannstras...,6.0,15.0,21,strassenbahn,both/unkown
4,1946,Rosenthal - Reichskanzlerdamm - Niederschönhau...,10.0,20.0,23,strassenbahn,both/unkown


In [44]:
# Normalise column types in one pass:
#   - text columns are cast to str and stripped of surrounding whitespace,
#   - 'year' becomes int,
#   - 'Frequency' / 'Length (time)' are parsed as numbers; anything that cannot
#     be parsed (or is missing) becomes 0 before the cast to int,
#   - 'Length (km)' is added as an empty-string placeholder column.
df = df.assign(**{
    'line_name': lambda frame: frame['line_name'].astype(str).str.strip(),
    'Stops': lambda frame: frame['Stops'].astype(str).str.strip(),
    'year': lambda frame: frame['year'].astype(int),
    'Frequency': lambda frame: pd.to_numeric(frame['Frequency'], errors='coerce').fillna(0).astype(int),
    'Length (time)': lambda frame: pd.to_numeric(frame['Length (time)'], errors='coerce').fillna(0).astype(int),
    'type': lambda frame: frame['type'].astype(str).str.strip(),
    'east-west': lambda frame: frame['east-west'].astype(str).str.strip(),
    'Length (km)': "",
})

In [45]:
# Function to remove every parenthesised group (including nested ones) from a string
def remove_all_parentheses_iter(text):
    """Return `text` with every balanced "(...)" group removed.

    Groups are removed innermost-first, so nested parentheses such as
    "a (b (c) d) e" are removed completely. Unbalanced parentheses are left
    untouched: a ")" with no "(" before it, or a "(" that is never closed,
    stays in the result instead of making the loop spin forever.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` without the parenthesised groups. Whitespace around a removed
        group is not touched.
    """
    search_from = 0
    while True:
        # First closing parenthesis we have not already classified as stray.
        end = text.find(')', search_from)
        if end == -1:
            break

        # Its partner is the nearest "(" to the left, i.e. the innermost group.
        start = text.rfind('(', 0, end)
        if start == -1:
            # Stray ")": keep it and continue searching to its right.
            search_from = end + 1
            continue

        text = text[:start] + text[end + 1:]
        # Everything left of `start` is unchanged, so resume from there.
        search_from = start

    return text

# Strip the parenthesised annotations from every entry of the 'Stops' column
df['Stops'] = df['Stops'].map(remove_all_parentheses_iter)

In [46]:
# Define a function that collapses repeated spaces and normalises en dashes
def remove_double_whitespace(text):
    """Collapse runs of spaces to a single space and turn "–" into "-".

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` in which every run of two or more space characters has been
        reduced to one space and every en dash (U+2013) has been replaced by
        an ASCII hyphen. Leading/trailing spaces are reduced to one space but
        not removed.
    """
    # Loop until stable: a single replace pass halves a run, it does not
    # necessarily reduce it to one space (e.g. five spaces -> three -> two).
    while "  " in text:
        text = text.replace("  ", " ")
    return text.replace("–", "-")

# Normalise the stop separators in the 'Stops' column.
# The en-dash pattern must run BEFORE remove_double_whitespace: that helper
# rewrites every "–" to "-", so running the regex afterwards could never match
# and an unspaced "A–B" would end up as "A-B" instead of the " - " separator
# the later split(" - ") calls rely on.
df['Stops'] = (
    df['Stops']
    .str.replace(r'\s*–\s*', ' - ', regex=True)
    .apply(remove_double_whitespace)
)


In [47]:
# Sanity check: list the rows whose 'Stops' value contains no " - " separator.
# na=True treats missing values as "has a separator" (same intent as the former
# .fillna(True)) but yields a real boolean mask, so `~mask` is a logical NOT
# and does not depend on fillna downcasting an object column.
# regex=False: " - " is a literal substring, not a pattern.
mask = df['Stops'].str.contains(" - ", regex=False, na=True)
none_df = df[~mask]
none_df

Unnamed: 0,year,Stops,Frequency,Length (time),line_name,type,east-west,Length (km)


In [48]:
# Sanity check: keep only the rows whose 'Stops' value is not a Python str
is_text = df['Stops'].map(lambda value: isinstance(value, str))
non_string_rows = df[~is_text]
non_string_rows

Unnamed: 0,year,Stops,Frequency,Length (time),line_name,type,east-west,Length (km)


In [49]:
def extract_first_start_stop(string):
    """Return the terminal stops of a line as "<first><> <last>".

    `string` is split on the " - " separator; the first and the last piece are
    joined with the literal "<> ". A string without a separator yields the
    same name on both sides.
    """
    parts = string.split(" - ")
    return parts[0] + "<> " + parts[-1]

In [50]:
def create_line_df(df):
    """Build the per-line table from the cleaned source frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'year', 'line_name', 'type', 'Stops',
        'east-west', 'Frequency', 'Length (time)' and 'Length (km)'.

    Returns
    -------
    pandas.DataFrame
        One row per row of `df`, with a 1-based running 'line_id', the
        terminal stops in 'start_stop' (see extract_first_start_stop) and the
        remaining columns copied over ('east-west' is renamed 'east_west').
    """
    line_ids = range(1, len(df) + 1)
    terminals = df['Stops'].apply(extract_first_start_stop)

    columns = {
        'line_id': line_ids,
        'year': df['year'],
        'line_name': df['line_name'],
        'type': df["type"],
        "start_stop": terminals,
        "east_west": df['east-west'],
        "Frequency": df['Frequency'],
        "Length (time)": df['Length (time)'],
        "Length (km)": df['Length (km)'],
    }
    return pd.DataFrame(columns)

In [51]:
def _clean_stop_name(name):
    """Normalise one stop name; shared by every step of create_stops_df.

    Replaces non-breaking spaces with plain spaces, removes the literal text
    "U+00a0", collapses runs of two or more dots to a single dot, removes the
    literal marker "[Früh]" and strips surrounding whitespace.
    """
    name = name.replace('\xa0', ' ').replace('U+00a0', '')
    # Plain-string equivalent of the regex r'\.{2,}' -> '.'
    while '..' in name:
        name = name.replace('..', '.')
    return name.replace('[Früh]', '').strip()


def create_stops_df(df):
    """Build one row per (line, stop) occurrence from the 'Stops' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Stops' (stop names joined by " - "),
        'line_name' and 'type'.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
        - 'index'      : index label of the originating row of `df`
                         (named after `df.index.name` if that is set),
        - 'stop_name'  : cleaned stop name,
        - 'line_count' : how often that cleaned name occurs over all rows,
        - 'type'       : dict mapping each transport type serving the stop to
                         the list of line names of that type (one entry per
                         occurrence, in row order).

    Notes
    -----
    All string replacements are literal, so the result no longer depends on
    the pandas version's default for `str.replace(regex=...)`. The same
    cleaning is used for the rows, the counts and the per-stop dictionary, so
    the final merge cannot miss because of differently cleaned keys.
    """
    # One row per stop. explode() keeps exactly the stops that exist in each
    # row; dropna() discards rows whose 'Stops' value was not a string.
    stops_df = (
        df['Stops']
        .str.split(' - ')
        .explode()
        .dropna()
        .map(_clean_stop_name)
        .rename('stop_name')
        .reset_index()
    )

    # Count occurrences on the cleaned names so spelling variants are merged.
    stops_df['line_count'] = stops_df['stop_name'].map(stops_df['stop_name'].value_counts())

    # stop name -> {transport type -> [line names]}
    stop_lines_dict = {}
    for _index, row in df.iterrows():
        if not isinstance(row['Stops'], str):
            continue  # consistent with the dropna() above
        line_name = row['line_name']
        stop_type = row['type']
        for stop in row['Stops'].split(' - '):
            lines_by_type = stop_lines_dict.setdefault(_clean_stop_name(stop), {})
            lines_by_type.setdefault(stop_type, []).append(line_name)

    # Explicit columns keep the merge below valid even when there are no stops.
    stops_in_lines_df = pd.DataFrame(
        [
            {'stop_name': stop_name, 'type': lines_by_type}
            for stop_name, lines_by_type in stop_lines_dict.items()
        ],
        columns=['stop_name', 'type'],
    )

    # Attach the per-stop dictionary to every occurrence of the stop.
    stops_df = pd.merge(stops_df, stops_in_lines_df, on='stop_name', how='left')

    return stops_df


In [52]:
# Derive the per-line table and the per-stop table from the cleaned frame.
line_df = create_line_df(df=df)
stops_df = create_stops_df(df=df)

In [53]:
# Function to turn a dict into a list of [key, value] pairs
def transform_dict_to_nested_list(d):
    """Return `d` as a list of [key, value] lists, in the dict's own order.

    Anything that is not a dict is printed (so unexpected cells are visible in
    the notebook output) and returned unchanged.
    """
    if not isinstance(d, dict):
        print(d)
        return d
    return [[key, value] for key, value in d.items()]

# Turn each 'type' dict into [type, lines] pairs, then give every pair its own row
stops_df = (
    stops_df
    .assign(type=stops_df['type'].map(transform_dict_to_nested_list))
    .explode('type')
    .reset_index(drop=True)
)

In [54]:
# Split every [transport_type, line_names] pair in 'type' into two columns:
# the transport type itself and the de-duplicated list of line names.
row_types = []
line_lists = []
for index, row in stops_df.iterrows():
    entry = row["type"]
    row_type, lines = None, None
    try:
        row_type = entry[0]
        if len(entry) > 1:
            # dict.fromkeys de-duplicates while keeping first-seen order;
            # list(set(...)) would give an order that can change between
            # kernel runs and make the exported CSV non-reproducible.
            lines = list(dict.fromkeys(entry[1]))
    except (TypeError, IndexError, KeyError):
        # Entry is not a [type, lines] pair (e.g. NaN from an unmatched
        # merge): show the row and leave the new columns empty for it.
        print(row)
    row_types.append(row_type)
    line_lists.append(lines)

# Replace the helper columns by the final ones; the resulting column order is
# stop_name, line_count, type, in_lines. dtype=object keeps each list as a
# single cell value.
stops_df = stops_df.drop(columns=["type", "index"]).assign(
    type=pd.Series(row_types, index=stops_df.index, dtype=object),
    in_lines=pd.Series(line_lists, index=stops_df.index, dtype=object),
)

In [56]:
# Inspect the last rows of the finished stops table.
stops_df.tail()

Unnamed: 0,stop_name,line_count,type,in_lines
1122,Friedrichsfelde,1,u-bahn,[E]
1123,"Pichelsberg, Stössenseebrücke",1,Fähre,[Wannsee-Pichelsberg]
1124,Kladow,1,Fähre,[Wannsee-Pichelsberg]
1125,Wannsee,3,s-bahn,"[KBS 101, KBS 104a]"
1126,Wannsee,3,Fähre,[Wannsee-Pichelsberg]


In [57]:
# Write the stops table to disk. The DataFrame index is written as well (no
# index=False), so the file starts with an unnamed index column.
stops_df.to_csv("stops_df_1946-initial.csv")