In [1]:
import pandas as pd
import ast
import re

In [2]:
# Read the CSV files into DataFrames
df = pd.read_csv('line_data_1964.csv', encoding='utf-8')

In [3]:
# Replace U+00a0 with regular whitespace in the entire DataFrame
df.replace('\u00a0', ' ', regex=True, inplace=True)

In [4]:
df.head()

Unnamed: 0,line_name,Stops,Frequency,Length (time),type,east-west,year
0,1,Ostbahnhof - U-Bhf. Strausberger Platz - Lenin...,20.0,28.0,strassenbahn,Ost,1964
1,A1P,Adalbertstrasse - S-Bhf. Jannowitzbrücke - Ale...,10.0,6.0,bus,Ost,1964
2,3,S-Bhf. Warschauer Strasse - Kopernikusstrasse ...,12.0,45.0,strassenbahn,Ost,1964
3,4,S-Bhf. Warschauer Strasse - Grünberger Strasse...,12.0,26.0,strassenbahn,Ost,1964
4,11,Heinrich-Heine-Strasse - S-Bhf. Jannowitzbrück...,20.0,27.0,strassenbahn,Ost,1964


In [5]:
df['line_name'] = df['line_name'].astype(str).str.strip()
df['Stops'] = df['Stops'].astype(str).str.strip()
df['year'] = df['year'].astype(int)
df['Frequency'] = pd.to_numeric(df['Frequency'], errors='coerce').fillna(0).astype(int)
df['Length (time)'] = pd.to_numeric(df['Length (time)'], errors='coerce').fillna(0).astype(int)
df['type'] = df['type'].astype(str).str.strip()
df['east-west'] = df['east-west'].astype(str).str.strip()

In [6]:
# Define a function to remove leftover parentheses from a column
def remove_double_whitespace(text):
    text = text.replace("   ", " ")
    text = text.replace("  ", " ")
    text = text.replace("–", "-")
    text = text.replace(" - ", " - ")
    return text

# Apply the function to the 'line_stops' column
df['Stops'] = df['Stops'].apply(remove_double_whitespace)
df['Stops'] = df['Stops'].str.replace(r'\s*–\s*', ' - ', regex=True)


In [7]:
mask = df['Stops'].str.contains(" - ").fillna(True)
none_df = df[~mask]
none_df

Unnamed: 0,line_name,Stops,Frequency,Length (time),type,east-west,year


In [8]:
# Filter rows where 'line_stops' is not a string
non_string_rows = df[~df['Stops'].apply(lambda x: isinstance(x, str))]
non_string_rows

Unnamed: 0,line_name,Stops,Frequency,Length (time),type,east-west,year


In [9]:
def extract_first_start_stop(string):
    stations = string.split(" - ")
    first_station = stations[0]
    last_station = stations[-1]
    return f"{first_station}<> {last_station}"

In [10]:
def create_line_df(df):
    line_df = pd.DataFrame({
        'line_id': range(1, 1+len(df)),
        'year': df['year'],
        'line_name': df['line_name'],
        'type': df["type"],
        "start_stop": df['Stops'].apply(extract_first_start_stop),
        "east_west": df['east-west'],
        "Frequency": df['Frequency'],
        "Length (time)": df['Length (time)']
    })
    return line_df

In [11]:
def create_stops_df(df):
    # Split line1 into individual stops and stack them into a new dataframe
    stops_df = df['Stops'].str.split(' - ', expand=True).stack().reset_index(level=1, drop=True).reset_index(name='stop_name')
        
    # Count the occurrences of each stop_name
    stop_counts = stops_df['stop_name'].value_counts().reset_index()
    stop_counts.columns = ['stop_name', 'line_count']

    # Merge the count information back into the original DataFrame
    stops_df = pd.merge(stops_df, stop_counts, on='stop_name', how='left')

    # Clean the 'Stop Name' column by removing whitespace and non-breaking spaces
    stops_df['stop_name'] = stops_df['stop_name'].str.replace(u'\xa0', ' ').str.replace(u"U+00a0", "").str.replace(r'\.{2,}', '.').str.replace("[Früh]", "").str.strip()

    # reset index so that it can be used for foreign key
    stops_df = stops_df.rename_axis("station_id").reset_index()
    stops_df["station_id"] += 1

    # Create a dictionary where the keys are the stop names and the values are lists of line names that contain the stop.
    stop_lines_dict = {}

    # Iterate over the rows in the line_stops dataframe and update the dictionary with the line name for each stop in the line
    for index, row in df.iterrows():
        line_name = row['line_name']
        stops = row['Stops'].split(' - ')
        stop_type = row["type"]
        for stop in stops:
            stop_name = stop.replace(u'\xa0', ' ').replace(u"U+00a0", "").strip()
            if stop_name in stop_lines_dict:
                if stop_type in stop_lines_dict[stop_name]:
                    stop_lines_dict[stop_name][stop_type].append(line_name)
                else:
                    stop_lines_dict[stop_name][stop_type] = [line_name]
            else:
                stop_lines_dict[stop_name] = {stop_type: [line_name]}

    # Create a new DataFrame with the stops and their corresponding lines
    stops_data = []
    for stop_name, lines_by_type in stop_lines_dict.items():
        row = {'stop_name': stop_name, 'type': lines_by_type}
        stops_data.append(row)
    stops_in_lines_df = pd.DataFrame(stops_data)

    # Merge the count information back into the original DataFrame
    stops_df = pd.merge(stops_df, stops_in_lines_df, on='stop_name', how='left')

    return stops_df


In [12]:
line_df = create_line_df(df)
stops_df = create_stops_df(df)

In [13]:
# Function to transform dictionary values to nested lists
def transform_dict_to_nested_list(d):
    if isinstance(d, dict):
        nested_list = []
        for key, value in d.items():
            nested_list.append([key, value])
        return nested_list
    else:
        print(d)
        return d

# Apply the transformation to the 'type' column
stops_df['type'] = stops_df['type'].apply(transform_dict_to_nested_list)
stops_df = stops_df.explode('type').reset_index(drop=True)

In [14]:
stops_df["row_type"] = None
stops_df["in_lines"] = None
for index, row in stops_df.iterrows():
    try:
        row_type = row["type"][0]
        stops_df.at[index, "row_type"] = row_type
        if len(row["type"]) > 1:
            lines = row["type"][1]
            lines = list(set(lines))
            stops_df.at[index, "in_lines"] = lines
    except:
        print(row)
stops_df.drop(columns="type", inplace=True)
stops_df.drop(columns="index", inplace=True)

# Rename the 'row_type' column to 'type'
stops_df.rename(columns={'row_type': 'type'}, inplace=True)

In [15]:
# Define a custom aggregation function to combine lists into a set
def combine_lists(lists):
    combined_set = set()
    for lst in lists:
        combined_set.update(lst)
    return combined_set

In [16]:
# Group by the 'stop_name' column and aggregate other columns as needed
aggregation_functions = {
'in_lines': combine_lists
}

# Assuming stops_df is your DataFrame
stops_df = stops_df.groupby(['stop_name', 'type'], group_keys=False).agg(aggregation_functions).reset_index()

In [17]:
# Reset the index and rename the index column to 'stop_id'
stops_df.reset_index(inplace=True)
stops_df.rename(columns={'index': 'stop_id'}, inplace=True)


In [18]:
empty_in_lines = stops_df[stops_df['in_lines'].apply(lambda x: len(x) == 0)]
empty_in_lines
# a few types of data errors that need to be fixed manually, done to df

Unnamed: 0,stop_id,stop_name,type,in_lines


In [20]:
for index, row in stops_df.iterrows():
    stops_df.at[index, "stop_id"] = row["stop_id"] + 1

In [21]:
# Function to convert string representation of set to list
def convert_to_list(value):
    try:
        # Use ast.literal_eval to safely convert string to set
        items = ast.literal_eval(value)
        # Convert the set to a list
        return list(items)
    except (SyntaxError, ValueError):
        # Handle the case where literal_eval fails (e.g., invalid string)
        return [value]

# Apply the function to the 'in_lines' column
stops_df['in_lines'] = stops_df['in_lines'].apply(convert_to_list)

In [22]:
stops_df['in_lines'] = stops_df['in_lines'].apply(lambda x: ','.join(map(str, x)))

In [23]:
stops_df.head()

Unnamed: 0,stop_id,stop_name,type,in_lines
0,1,4. Ring,bus,{'A53'}
1,2,Adalbertstrasse,bus,{'A1P'}
2,3,Adamstrasse,bus,"{'A34', 'A35'}"
3,4,Adlershof,s-bahn,"{'KBS 103', 'KBS 103a', 'KBS 106a'}"
4,5,Adolf-Scheidt-Platz,strassenbahn,{'96'}


In [61]:
def create_line_stops_df(df):
    line_stops = df['Stops'].str.split(' - ', expand=True).stack().reset_index(level=1, drop=True).reset_index(name='stop_name')

    line_stops['stop_order'] = line_stops.groupby('index').cumcount()
    #index starts from 0 so it looks like 1 row is missing but this is not true

    # Clean the 'Stop Name' column by removing whitespace and non-breaking spaces
    line_stops['stop_name'] = line_stops['stop_name'].str.replace(u'\xa0', ' ').str.strip()

    # reset index so that it can be used for foreign key

    return line_stops

In [62]:
line_stops_df = create_line_stops_df(df)

In [63]:
for index, row in line_stops_df.iterrows():
    line_stops_df.at[index, "index"] = row["index"] + 1
line_stops_df.rename(columns={'index': 'line_id'}, inplace=True)

line_stops_df

Unnamed: 0,line_id,stop_name,stop_order
0,1,Ostbahnhof,0
1,1,Breslauer Strasse,1
2,1,U-Bhf. Strausberger Platz,2
3,1,Leninplatz,3
4,1,"Alexanderplatz, Memhardstr.",4
...,...,...,...
1939,162,Konsum Wernsdorf,2
1940,162,Konsum Ziegenhals,3
1941,162,"Ziegenhals, VA Dienststelle",4
1942,163,Kladow,0


In [64]:
line_df.loc[133]

line_id                               134
year                                 1961
line_name                         KBS 100
type                               s-bahn
start_stop    Gesundbrunnen<> Sonnenallee
east_west                            West
Frequency                               0
Name: 133, dtype: object

In [65]:
line_df.loc[132, "start_stop"] = "Alexanderplatz<>Friedrichsfelde (Tierpark)"

In [66]:
# get a list of unique Line IDs in the line_stops DataFrame
line_ids = line_stops_df['line_id'].unique()

# loop through each unique Line ID
for line_id in line_ids:
    # get the first and last Stop Name for this Line ID
    first_stop_name = line_stops_df.loc[line_stops_df['line_id'] == line_id, 'stop_name'].iloc[0]
    last_stop_name = line_stops_df.loc[line_stops_df['line_id'] == line_id, 'stop_name'].iloc[-1]
    
    # get the corresponding Start-Stop value from line_df for this Line ID
    start_stop = line_df.loc[line_df['line_id'] == line_id, 'start_stop'].iloc[0]
    
    # check if the first and last Stop Names are contained in the Start-Stop value
    if first_stop_name not in start_stop or last_stop_name not in start_stop:
        print(f"Quality control check failed for line_id {line_id}.")

# I checked that this works using the following code:
# line_stops_df.loc[0, "stop_name"] = "test"
# and got the right result: Quality control check failed for line_id 1001.


In [67]:
def add_type(line_stops, line_df):
    # Assuming line_id is the common column between line_stops and line_df
    merged_df = pd.merge(line_stops, line_df[['line_id', 'type', "line_name"]], on='line_id', how='left')
    
    # Rename the 'type' column from line_df to 'type_from_line_df' to avoid conflicts
    merged_df.rename(columns={'type': 'type'}, inplace=True)
    
    # Drop the 'type_from_line_df' column if it's not needed in the final result
    # merged_df.drop(columns=['type_from_line_df'], inplace=True)
    
    return merged_df

In [68]:
line_stops_df = add_type(line_stops_df, line_df)

In [69]:
line_stops_df

Unnamed: 0,line_id,stop_name,stop_order,type,line_name
0,1,Ostbahnhof,0,strassenbahn,1
1,1,Breslauer Strasse,1,strassenbahn,1
2,1,U-Bhf. Strausberger Platz,2,strassenbahn,1
3,1,Leninplatz,3,strassenbahn,1
4,1,"Alexanderplatz, Memhardstr.",4,strassenbahn,1
...,...,...,...,...,...
1939,162,Konsum Wernsdorf,2,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...
1940,162,Konsum Ziegenhals,3,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...
1941,162,"Ziegenhals, VA Dienststelle",4,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...
1942,163,Kladow,0,Fähre,Wannsee<>Kladow (Betrieb durch Stern und Kreis...


In [70]:
def add_fk(line_stops_df, stops_df):
    # Create a new dataframe with the Stop Name and Stop ID columns
    stop_id_df = stops_df[['stop_name', 'stop_id', 'type', 'in_lines']]

    # Merge the line_stops_df and stop_id_df dataframes based on matching stop names and line_name condition
    line_stops_df = line_stops_df.merge(stop_id_df,
                                        left_on=['stop_name', 'type'],
                                        right_on=['stop_name', 'type'],
                                        how='left')

    # Filter the rows based on the condition that line_name is contained in in_lines
    line_stops_df = line_stops_df[line_stops_df.apply(lambda row: str(row['line_name']) in str(row['in_lines']), axis=1)]

    return line_stops_df

# Assuming line_stops_df and df_stops are your dataframes
# Replace 'stop_name', 'stop_id', 'type', and 'in_lines' with the actual column names you have

line_stops_df = add_fk(line_stops_df, stops_df)


In [71]:
line_stops_df

Unnamed: 0,line_id,stop_name,stop_order,type,line_name,stop_id,in_lines
0,1,Ostbahnhof,0,strassenbahn,1,852,"{'82', '1'}"
1,1,Breslauer Strasse,1,strassenbahn,1,213,{'1'}
2,1,U-Bhf. Strausberger Platz,2,strassenbahn,1,1231,{'1'}
3,1,Leninplatz,3,strassenbahn,1,700,"{'64', '63', '1'}"
4,1,"Alexanderplatz, Memhardstr.",4,strassenbahn,1,16,{'1'}
...,...,...,...,...,...,...,...
1939,162,Konsum Wernsdorf,2,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,629,{'Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Sc...
1940,162,Konsum Ziegenhals,3,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,630,{'Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Sc...
1941,162,"Ziegenhals, VA Dienststelle",4,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,1357,{'Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Sc...
1942,163,Kladow,0,Fähre,Wannsee<>Kladow (Betrieb durch Stern und Kreis...,606,{'Wannsee<>Kladow (Betrieb durch Stern und Kre...


In [72]:
# convert 'Stop ID' column to numeric values, coercing errors to NaN
line_stops_df['stop_id'] = pd.to_numeric(line_stops_df['stop_id'], errors='coerce')

# check if all values in 'Stop ID' column are numeric
if line_stops_df['stop_id'].notnull().all():
    print("All values in 'stop_id' column are numeric")
else:
    print("There are non-numeric values in 'stop_id' column")
    print(line_stops_df[line_stops_df['stop_id'].isnull()])


All values in 'stop_id' column are numeric


In [24]:
# line_stops_df.to_csv("line_stops_1961.csv")
line_df.to_csv("line_df_1964.csv")
stops_df.to_csv("stops_df_1964.csv")