In [84]:
import pandas as pd
import ast
import re

In [85]:
# Run configuration: these two values select the input CSV that is read below
# and are also used to name the CSV written at the end of the notebook.
year = 1961
direction = "east"

In [86]:
# Read the line table for the configured `year` / `direction` into a DataFrame
df = pd.read_csv(f'line_data_{year}_{direction}.csv', encoding='utf-8')

In [87]:
# Swap every non-breaking space (U+00a0) for a plain space, in all columns
df = df.replace('\u00a0', ' ', regex=True)

In [88]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0.1,Unnamed: 0,line_name,Stops,Frequency,Length (time),type,east-west,year
0,0,1,Ostbahnhof - Breslauer Strasse Ecke Andreasstr...,20.0,39.0,strassenbahn,Ost,1961
1,3,3,Warschauer Strasse - Warschauer Brücke - Koper...,12.0,46.0,strassenbahn,Ost,1961
2,4,4,Warschauer Strasse - Warschauer Brücke - Koper...,8.0,25.0,strassenbahn,Ost,1961
3,5,11,Heinrich-Heine-Strasse - S-Bhf. Jannowitzbrück...,20.0,19.0,strassenbahn,Ost,1961
4,6,13,"Klingenberg, Blockdammweg - Kraftwerk Klingenb...",8.0,35.0,strassenbahn,Ost,1961


In [89]:
# Text columns: cast to str and trim surrounding whitespace
for text_col in ('line_name', 'Stops', 'type', 'east-west'):
    df[text_col] = df[text_col].astype(str).str.strip()

# Year must already be a clean integer column
df['year'] = df['year'].astype(int)

# Numeric columns: anything that cannot be parsed becomes 0
for num_col in ('Frequency', 'Length (time)'):
    df[num_col] = pd.to_numeric(df[num_col], errors='coerce').fillna(0).astype(int)

In [90]:
# Define a function that normalises separators and spacing in a stop list
def remove_double_whitespace(text):
    """Normalise the stop separators and spacing of one raw stop-list string.

    Parameters
    ----------
    text : str
        Raw stop list, stops separated by a dash.

    Returns
    -------
    str
        ``text`` where every en dash ("–"), together with any whitespace
        around it, has become the canonical separator " - ", and every run
        of two or more spaces has been collapsed to a single space.
    """
    # Handle the en dash first so that "A–B" also ends up as "A - B";
    # plain hyphens inside names (e.g. "Heinrich-Heine-Strasse") are untouched.
    text = re.sub(r'\s*–\s*', ' - ', text)
    # A regex collapses runs of any length, not just two or three spaces.
    return re.sub(r' {2,}', ' ', text)

# Apply the function to the 'Stops' column
df['Stops'] = df['Stops'].apply(remove_double_whitespace)
# Turn any remaining en dash (with optional surrounding whitespace) into the " - " separator
df['Stops'] = df['Stops'].str.replace(r'\s*–\s*', ' - ', regex=True)


In [91]:
# Sanity check: list the rows whose 'Stops' text contains no " - " separator.
# Missing results are filled with True, so such rows are not reported here.
mask = df['Stops'].str.contains(" - ").fillna(True)
none_df = df[~mask]
none_df

Unnamed: 0.1,Unnamed: 0,line_name,Stops,Frequency,Length (time),type,east-west,year


In [92]:
# Sanity check: show the rows whose 'Stops' value is not a Python string
stops_is_text = df['Stops'].map(lambda value: isinstance(value, str))
non_string_rows = df.loc[~stops_is_text]
non_string_rows

Unnamed: 0.1,Unnamed: 0,line_name,Stops,Frequency,Length (time),type,east-west,year


In [93]:
def _clean_stop_name(name):
    """Normalise one raw stop name (single source of truth for name cleaning).

    Replaces non-breaking spaces with plain spaces, removes the literal text
    "U+00a0" and the literal tag "[Früh]", collapses runs of two or more dots
    into one dot and strips surrounding whitespace.
    """
    name = name.replace('\xa0', ' ').replace("U+00a0", "")
    # Explicit regex: only the dot-collapsing step is a pattern ...
    name = re.sub(r'\.{2,}', '.', name)
    # ... while "[Früh]" is removed as literal text, never as a character class.
    return name.replace("[Früh]", "").strip()


def create_stops_df(df):
    """Build a one-row-per-(line, stop) table from the line table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'line_name', 'Stops' (stops joined by
        " - ") and 'type'.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
        'station_id' (1-based running number), the original row label of
        ``df`` (named 'index' for an unnamed index), 'stop_name' (cleaned),
        'line_count' (how often the cleaned name occurs over all lines) and
        'type' (dict mapping each transport type to the list of line names
        that serve the stop).
    """
    # One row per stop, keeping the label of the line row it came from.
    stops_df = (
        df['Stops']
        .str.split(' - ')
        .explode()
        .dropna()
        .reset_index(name='stop_name')
    )

    # Clean first, then count: the counts and the merge key below must refer
    # to the same, final spelling of each stop name.
    stops_df['stop_name'] = stops_df['stop_name'].map(_clean_stop_name)
    stops_df['line_count'] = stops_df['stop_name'].map(stops_df['stop_name'].value_counts())

    # Running 1-based id so that it can be used as a foreign key
    stops_df = stops_df.rename_axis("station_id").reset_index()
    stops_df["station_id"] += 1

    # stop name -> {transport type -> [line names]}, keyed by the same cleaned
    # names as 'stop_name' so that every stop finds its entry in the merge.
    stop_lines_dict = {}
    for line_name, stops, stop_type in zip(df['line_name'], df['Stops'], df['type']):
        for stop in stops.split(' - '):
            lines_by_type = stop_lines_dict.setdefault(_clean_stop_name(stop), {})
            lines_by_type.setdefault(stop_type, []).append(line_name)

    stops_in_lines_df = pd.DataFrame(
        [
            {'stop_name': stop_name, 'type': lines_by_type}
            for stop_name, lines_by_type in stop_lines_dict.items()
        ],
        columns=['stop_name', 'type'],
    )

    # Attach the per-stop line information to every stop row
    return pd.merge(stops_df, stops_in_lines_df, on='stop_name', how='left')


In [94]:
# Build the per-stop table from the cleaned line table
stops_df = create_stops_df(df)

In [95]:
# Function to turn a dict into a list of [key, value] pairs
def transform_dict_to_nested_list(d):
    """Return ``[[key, value], ...]`` for a dict, in the dict's own order.

    Any value that is not a dict is printed and handed back unchanged.
    """
    if not isinstance(d, dict):
        print(d)
        return d
    return [[key, value] for key, value in d.items()]

# Apply the transformation to the 'type' column
stops_df['type'] = stops_df['type'].apply(transform_dict_to_nested_list)
# explode() then gives every [key, value] pair of a stop its own row
stops_df = stops_df.explode('type').reset_index(drop=True)

In [96]:
# Split every exploded [transport_type, [line, ...]] pair of the 'type' column
# into a flat 'type' column and an 'in_lines' column (de-duplicated line list).
row_types = []
line_lists = []
unmatched_rows = 0
for entry in stops_df["type"]:
    if isinstance(entry, list) and entry:
        row_types.append(entry[0])
        # De-duplicate the line names; a pair without a second element has no lines.
        line_lists.append(list(set(entry[1])) if len(entry) > 1 else None)
    else:
        # Not a [type, lines] pair (e.g. a missing value): keep both cells
        # empty instead of hiding the situation behind a bare `except`.
        unmatched_rows += 1
        row_types.append(None)
        line_lists.append(None)

if unmatched_rows:
    print(f"{unmatched_rows} stop rows carry no type / line information")

# Replace the helper columns by the two flat ones (same column order as before).
stops_df = stops_df.drop(columns=["type", "index"])
stops_df["type"] = pd.Series(row_types, index=stops_df.index, dtype=object)
stops_df["in_lines"] = pd.Series(line_lists, index=stops_df.index, dtype=object)

In [97]:
# Custom aggregation: merge several collections into a single set
def combine_lists(lists):
    """Return the set of all items found in any of the iterables in ``lists``."""
    return {item for lst in lists for item in lst}

In [98]:
# Collapse the table to one row per (stop_name, type) pair; the 'in_lines'
# values of each group are merged into one set by combine_lists.
aggregation_functions = {
'in_lines': combine_lists
}

# Only 'in_lines' is aggregated, so the result holds stop_name, type and in_lines
stops_df = stops_df.groupby(['stop_name', 'type'], group_keys=False).agg(aggregation_functions).reset_index()

In [99]:
# Turn the positional index into an explicit 'stop_id' column
stops_df = stops_df.reset_index().rename(columns={'index': 'stop_id'})


In [100]:
# Sanity check: list the stops whose aggregated 'in_lines' collection is empty
empty_in_lines = stops_df[stops_df['in_lines'].apply(lambda x: len(x) == 0)]
empty_in_lines
# a few types of data errors that need to be fixed manually, done to df

Unnamed: 0,stop_id,stop_name,type,in_lines


In [101]:
# Make 'stop_id' 1-based (vectorised; no per-row loop needed)
stops_df["stop_id"] = stops_df["stop_id"] + 1

In [102]:
# Function to convert the string representation of a collection to a list
def convert_to_list(value):
    """Return ``value`` as a list.

    * A string holding a set / list / tuple / dict literal (for example
      ``"{'A', 'E'}"``) is parsed with ``ast.literal_eval`` and its items
      are returned as a list.
    * Everything else is returned unchanged inside a one-element list: a
      string that is not such a literal (``"A46"``, ``"12"``) and any value
      that is not a string at all (for example an actual ``set``).

    NOTE(review): because real sets are wrapped rather than unpacked, joining
    the result with ``str`` yields the set's repr (``"{'A', 'E'}"``). That is
    the behaviour this function always had; confirm whether the consumers of
    the saved file rely on it before changing it.
    """
    if not isinstance(value, str):
        # literal_eval only parses text. Non-strings used to end up here via
        # its ValueError; the check makes that outcome explicit.
        return [value]
    try:
        # ast.literal_eval safely evaluates literals only (no arbitrary code)
        items = ast.literal_eval(value)
    except (SyntaxError, ValueError, TypeError):
        # Not a Python literal at all (e.g. a plain line name)
        return [value]
    if isinstance(items, (set, list, tuple, dict)):
        return list(items)
    # A scalar literal such as "12" or "'AB'" is one item: do not try to
    # iterate it (ints would raise, strings would fall apart into characters).
    return [value]

# Apply the function to the 'in_lines' column
# (at this point the column holds the sets returned by combine_lists)
stops_df['in_lines'] = stops_df['in_lines'].apply(convert_to_list)

In [103]:
# Flatten each list into one comma-separated string (every item goes through str())
stops_df['in_lines'] = stops_df['in_lines'].apply(lambda x: ','.join(map(str, x)))

In [104]:
# Preview the final per-stop table before it is written to disk
stops_df.head()

Unnamed: 0,stop_id,stop_name,type,in_lines
0,1,Adlershof,s-bahn,"{'KBS 103', 'KBS 106'}"
1,2,"Ahrensfeld, Kirche",bus,{'A46'}
2,3,Albrechtshof,s-bahn,{'KBS 103'}
3,4,Alexanderplatz,s-bahn,"{'KBS 101', 'KBS 102', 'KBS 103'}"
4,5,Alexanderplatz,u-bahn,"{'A', 'E'}"


In [105]:
# Save the result; the file name mirrors the year/direction of the input.
# to_csv is called with its defaults, so the row index is written as an
# additional unnamed first column.
stops_df.to_csv(f"stops_df_{year}_{direction}.csv")