In [140]:
import pandas as pd
import re

In [141]:
year = 1961

In [142]:
df_stops = pd.read_csv(f"{year}-stops-localised-final.csv")  

In [143]:
df_stops.head()

Unnamed: 0.1,Unnamed: 0,stop_name,type,location,in_lines,identifier,east-west
0,0,4. Ring,bus,"52.42250630689789, 13.2869490566549",A53,,West
1,1,Adamstrasse,bus,"52.521029117218866, 13.19780699419913","A34,A35",,West
2,2,Adlershof,s-bahn,"52.434722222222,13.541388888889","KBS 103,KBS 100a,KBS 106",Q323551,Ost
3,3,Adolf-Scheidt-Platz,strassenbahn,"52.478194856980814, 13.37836902238447",96,,West
4,4,Afrikanische Str.,u-bahn,"52.560027777778,13.334633333333",C I,Q559239,West


In [144]:
df_stops = df_stops.fillna('').groupby(['stop_name', 'type', 'location']).agg({
    'in_lines': ', '.join,
    'identifier': ', '.join
}).reset_index()

df_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1380 entries, 0 to 1379
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   stop_name   1380 non-null   object
 1   type        1380 non-null   object
 2   location    1380 non-null   object
 3   in_lines    1380 non-null   object
 4   identifier  1380 non-null   object
dtypes: object(5)
memory usage: 54.0+ KB


In [145]:
split_df = pd.concat([df_stops[col].astype(str).str.split(';', expand=True).stack().str.strip() for col in df_stops.columns], axis=1, keys=df_stops.columns)
df_stops = split_df.groupby(level=0).ffill().reset_index(drop=True)

In [146]:
df_line = pd.read_csv("../initial/line_data_1961.csv")  

In [147]:
# Create a dictionary to map stop names to their respective east-west values
stop_to_east_west = {}
for index, row in df_line.iterrows():
    stops = row['Stops'].split(' - ')
    for stop in stops:
        stop_to_east_west[stop] = row['east-west']

# Create the east-west column in df
df_stops['east-west'] = df_stops['stop_name'].apply(lambda x: stop_to_east_west.get(x, 'Unknown'))

print(df_stops[['stop_name', 'east-west']])

                                 stop_name east-west
0                                  4. Ring      West
1                              Adamstrasse      West
2                                Adlershof       Ost
3                      Adolf-Scheidt-Platz      West
4                        Afrikanische Str.      West
...                                    ...       ...
1375                   Zoologischer Garten      West
1376                      Zossener Strasse      West
1377  Zossener Strasse Ecke Blücherstrasse      West
1378                        Zum Heckeshorn      West
1379                   ZweiBrücker Strasse      West

[1380 rows x 2 columns]


In [148]:
# Check for invalid values in the 'east-west' column
invalid_values = df_stops[~df_stops['east-west'].isin(['Ost', 'West'])]

# Print the rows with invalid values
print(invalid_values)

                                              stop_name          type  \
38                                             Am Fließ           bus   
81                  Baumschulenstrasse Ecke Südostallee           bus   
124                                   Bhf. Groß Ziethen           bus   
194                                       Blumenstrasse           bus   
201                                    Borsigwalder Weg           bus   
299                                             Dorfaue           bus   
314                                          Eichendamm           bus   
348                                      Fasanenstrasse           bus   
418                                       Goethestrasse           bus   
577                           Judenstrasse, Molkenmarkt  strassenbahn   
651                                Krankenhaus Neukölln  strassenbahn   
715                                      Lerchenstrasse           bus   
721   Lichtenrade, Blohmstrasse Ecke Wünsdorfer Str

In [149]:
df_stops.to_csv(f"{year}-stops-localised-final.csv", encoding="utf-8")

In [150]:
# Split the dataframe into east and west based on the 'east-west' column
df_stops_east = df_stops[df_stops['east-west'] == 'Ost']
df_stops_west = df_stops[df_stops['east-west'] == 'West']

# Save the dataframes to CSV files
df_stops_east.to_csv(f"../east/final/{year}-stops-localised-final-east.csv", index=False, encoding="utf-8")
df_stops_west.to_csv(f"../west/final/{year}-stops-localised-final-west.csv", index=False, encoding="utf-8")

In [151]:
# Reset the index
df_stops.reset_index(inplace=True)
df_stops.rename(columns={'index': 'stop_id'}, inplace=True)

df_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1380 entries, 0 to 1379
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   stop_id     1380 non-null   int64 
 1   stop_name   1380 non-null   object
 2   type        1380 non-null   object
 3   location    1380 non-null   object
 4   in_lines    1380 non-null   object
 5   identifier  1380 non-null   object
 6   east-west   1380 non-null   object
dtypes: int64(1), object(6)
memory usage: 75.6+ KB


In [152]:
for index, row in df_line.iterrows():
    # Check if the regex pattern (lowercase 's' or 'x' at the end) is found in the "in_lines" column
    if re.search(r'[sx]$', row["line_name"].lower()):
        # Delete the section called for in the regex
        df_line.at[index, "line_name"] = re.sub(r'[sx]$', '', row["line_name"])


In [153]:
# Replace U+00a0 with regular whitespace in the entire DataFrame
df_line.replace('\u00a0', ' ', regex=True, inplace=True)

In [154]:
df_line.head(2)

Unnamed: 0,line_name,Stops,Frequency_7-30,Length (time),type,east-west,year
0,1,Ostbahnhof - Breslauer Strasse Ecke Andreasstr...,20.0,39.0,strassenbahn,Ost,1961
1,2,Bernauer Strasse Ecke Wolliner Strasse - Volta...,15.0,60.0,strassenbahn,West,1961


In [155]:
df_stops.reset_index(inplace=True)
df_stops.rename(columns={'index': 'stop_id'}, inplace=True)

df_stops['stop_id'] = '1961' + df_stops['stop_id'].astype(str)
df_stops.head(2)

Unnamed: 0,stop_id,stop_id.1,stop_name,type,location,in_lines,identifier,east-west
0,19610,19610,4. Ring,bus,"52.42250630689789, 13.2869490566549",A53,,West
1,19611,19611,Adamstrasse,bus,"52.521029117218866, 13.19780699419913","A34,A35",,West


In [156]:
df_line['line_name'] = df_line['line_name'].astype(str).str.strip()
df_line['Stops'] = df_line['Stops'].astype(str).str.strip()
df_line['year'] = df_line['year'].astype(int)
df_line['Frequency_7-30'] = pd.to_numeric(df_line['Frequency_7-30'], errors='coerce').fillna(0).astype(int)
df_line['Length (time)'] = pd.to_numeric(df_line['Length (time)'], errors='coerce').fillna(0).astype(int)

In [157]:
# Define a function to remove leftover parentheses from a column
def remove_double_whitespace(text):
    text = text.replace("   ", " ")
    text = text.replace("  ", " ")
    text = text.replace("–", "-")
    text = text.replace(" - ", " - ")
    return text

# Apply the function to the 'line_stops' column
df_line['Stops'] = df_line['Stops'].apply(remove_double_whitespace)
df_line['Stops'] = df_line['Stops'].str.replace(r'\s*–\s*', ' - ', regex=True)

In [158]:
mask = df_line['Stops'].str.contains(" - ").fillna(True)
none_df = df_line[~mask]
none_df

Unnamed: 0,line_name,Stops,Frequency_7-30,Length (time),type,east-west,year


In [159]:
# Filter rows where 'line_stops' is not a string
non_string_rows = df_line[~df_line['Stops'].apply(lambda x: isinstance(x, str))]
non_string_rows

Unnamed: 0,line_name,Stops,Frequency_7-30,Length (time),type,east-west,year


In [160]:
### Creating related tables

In [161]:
def extract_first_start_stop(string):
    stations = string.split(" - ")
    first_station = stations[0]
    last_station = stations[-1]
    return f"{first_station}<> {last_station}"

In [162]:
def create_line_df(df):
    line_df = pd.DataFrame({
        'line_id': range(1, 1+len(df)),
        'year': df['year'],
        'line_name': df['line_name'],
        'type': df["type"],
        "start_stop": df['Stops'].apply(extract_first_start_stop),
        "Length (time)": df['Length (time)'],
        "east_west": df['east-west'],
        "Frequency_7-30": df['Frequency_7-30']
    })
    return line_df

In [163]:
line_df = create_line_df(df_line)

In [164]:
# Assuming df is your DataFrame and 'stop_id' is the column you want to modify
line_df['line_id'] = '1961' + line_df['line_id'].astype(str)
line_df.head(2)

Unnamed: 0,line_id,year,line_name,type,start_stop,Length (time),east_west,Frequency_7-30
0,19611,1961,1,strassenbahn,Ostbahnhof<> Am Kupfergraben,39,Ost,20
1,19612,1961,2,strassenbahn,Bernauer Strasse Ecke Wolliner Strasse<> Goten...,60,West,15


In [165]:
# # Reset the index and rename the index column to 'stop_id'
# df_stops.reset_index(inplace=True)
# df_stops.rename(columns={'index': 'stop_id'}, inplace=True)
# df_stops.rename(columns={'original_name': 'stop_name'}, inplace=True)


In [166]:
# for index, row in df_stops.iterrows():
#     df_stops.at[index, "stop_id"] = row["stop_id"] + 1

In [167]:
# df_stops.to_csv("stops_1960.csv")

In [168]:
df_stops.head(2)

Unnamed: 0,stop_id,stop_id.1,stop_name,type,location,in_lines,identifier,east-west
0,19610,19610,4. Ring,bus,"52.42250630689789, 13.2869490566549",A53,,West
1,19611,19611,Adamstrasse,bus,"52.521029117218866, 13.19780699419913","A34,A35",,West


In [169]:
def create_line_stops_df(df):
    line_stops = df['Stops'].str.split(' - ', expand=True).stack().reset_index(level=1, drop=True).reset_index(name='stop_name')

    line_stops['stop_order'] = line_stops.groupby('index').cumcount()
    #index starts from 0 so it looks like 1 row is missing but this is not true

    # Clean the 'Stop Name' column by removing whitespace and non-breaking spaces
    line_stops['stop_name'] = line_stops['stop_name'].str.replace(u'\xa0', ' ').str.strip()

    # reset index so that it can be used for foreign key

    return line_stops

In [170]:
line_stops_df = create_line_stops_df(df_line)

In [171]:
for index, row in line_stops_df.iterrows():
    line_stops_df.at[index, "index"] = row["index"] + 1
line_stops_df.rename(columns={'index': 'line_id'}, inplace=True)

line_stops_df

Unnamed: 0,line_id,stop_name,stop_order
0,1,Ostbahnhof,0
1,1,Breslauer Strasse Ecke Andreasstrasse,1
2,1,U-Bhf. Strausberger Platz,2
3,1,Leninplatz,3
4,1,"Alexanderplatz, Memhardstr.",4
...,...,...,...
1939,162,Konsum Wernsdorf,2
1940,162,Konsum Ziegenhals,3
1941,162,"Ziegenhals, VA Dienststelle",4
1942,163,Kladow,0


In [172]:
# Assuming df is your DataFrame and 'stop_id' is the column you want to modify
line_stops_df['line_id'] = '1961' + line_stops_df['line_id'].astype(str)
line_stops_df.head(2)

Unnamed: 0,line_id,stop_name,stop_order
0,19611,Ostbahnhof,0
1,19611,Breslauer Strasse Ecke Andreasstrasse,1


In [173]:
# get a list of unique Line IDs in the line_stops DataFrame
line_ids = line_stops_df['line_id'].unique()

# loop through each unique Line ID
for line_id in line_ids:
    # get the first and last Stop Name for this Line ID
    first_stop_name = line_stops_df.loc[line_stops_df['line_id'] == line_id, 'stop_name'].iloc[0]
    last_stop_name = line_stops_df.loc[line_stops_df['line_id'] == line_id, 'stop_name'].iloc[-1]
    
    # get the corresponding Start-Stop value from line_df for this Line ID
    start_stop = line_df.loc[line_df['line_id'] == line_id, 'start_stop'].iloc[0]
    
    # check if the first and last Stop Names are contained in the Start-Stop value
    if first_stop_name not in start_stop or last_stop_name not in start_stop:
        print(f"Quality control check failed for line_id {line_id}.")

# I checked that this works using the following code:
# line_stops_df.loc[0, "stop_name"] = "test"
# and got the right result: Quality control check failed for line_id 1001.


In [174]:
line_df.head(3)

Unnamed: 0,line_id,year,line_name,type,start_stop,Length (time),east_west,Frequency_7-30
0,19611,1961,1,strassenbahn,Ostbahnhof<> Am Kupfergraben,39,Ost,20
1,19612,1961,2,strassenbahn,Bernauer Strasse Ecke Wolliner Strasse<> Goten...,60,West,15
2,19613,1961,3,strassenbahn,Grüntaler Straße Ecke Osloer Strasse<> Elsenst...,87,West,10


In [175]:
def add_type(line_stops, line_df):
    # Assuming line_id is the common column between line_stops and line_df
    merged_df = pd.merge(line_stops, line_df[['line_id', 'type', "line_name"]], on='line_id', how='left')
    
    # Rename the 'type' column from line_df to 'type_from_line_df' to avoid conflicts
    merged_df.rename(columns={'type': 'type'}, inplace=True)
    
    # Drop the 'type_from_line_df' column if it's not needed in the final result
    # merged_df.drop(columns=['type_from_line_df'], inplace=True)
    
    return merged_df

In [176]:
line_stops_df = add_type(line_stops_df, line_df)

In [177]:
line_stops_df.iloc[591:600]

Unnamed: 0,line_id,stop_name,stop_order,type,line_name
591,196147,Amtsgerichsplatz,5,bus,A10
592,196147,Kurfürstendamm,6,bus,A10
593,196147,Bhf. Halensee,7,bus,A10
594,196147,Elsterplatz,8,bus,A10
595,196147,Hundekehlestrasse,9,bus,A10
596,196147,Königin-Luise-Strasse,10,bus,A10
597,196147,U-Bhf. Thielplatz,11,bus,A10
598,196147,Dahlemer Weg,12,bus,A10
599,196147,Teltower Damm,13,bus,A10


In [178]:
duplicate_stop_ids = df_stops[df_stops.duplicated(subset='stop_id', keep=False)]
print(duplicate_stop_ids)

Empty DataFrame
Columns: [stop_id, stop_id, stop_name, type, location, in_lines, identifier, east-west]
Index: []


In [179]:
def add_fk(line_stops_df, df_stops):
    # Create a new dataframe with the Stop Name and Stop ID columns
    stop_id_df = df_stops[['stop_name', 'stop_id', 'type', 'in_lines']]

    # Merge the line_stops_df and stop_id_df dataframes based on matching stop names and line_name condition
    line_stops_df = line_stops_df.merge(stop_id_df,
                                        left_on=['stop_name', 'type'],
                                        right_on=['stop_name', 'type'],
                                        how='left')

    # Filter the rows based on the condition that line_name is contained in in_lines
    line_stops_df = line_stops_df[line_stops_df.apply(lambda row: str(row['line_name']) in str(row['in_lines']), axis=1)]

    return line_stops_df

line_stops_df = add_fk(line_stops_df, df_stops)
line_stops_df.head(2)

Unnamed: 0,line_id,stop_name,stop_order,type,line_name,stop_id,stop_id.1,in_lines
0,19611,Ostbahnhof,0,strassenbahn,1,1961859,1961859,821
1,19611,Breslauer Strasse Ecke Andreasstrasse,1,strassenbahn,1,1961215,1961215,1


In [180]:
# Rename one of the stop_id columns
line_stops_df.rename(columns={'stop_id': 'stop_id_2'}, inplace=True)

# Check for rows where the values of the two columns are not the same
mismatched_rows = line_stops_df[line_stops_df['stop_id'] != line_stops_df['stop_id_2']]

# Print the mismatched rows
print(mismatched_rows)

KeyError: 'stop_id'

In [145]:
# Calculate the difference between consecutive 'stop_order' values
line_stops_df['diff'] = line_stops_df['stop_order'].diff()

# Identify faulty rows where the difference is not 1 digit behind
faulty_rows = line_stops_df[(line_stops_df['diff'] != 1) & (line_stops_df['stop_order'] != 0)]
faulty_rows

Unnamed: 0,line_id,stop_name,stop_order,type,line_name,stop_id,stop_id.1,in_lines,diff


In [146]:
faulty_rows.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   line_id     0 non-null      object 
 1   stop_name   0 non-null      object 
 2   stop_order  0 non-null      int64  
 3   type        0 non-null      object 
 4   line_name   0 non-null      object 
 5   stop_id     0 non-null      object 
 6   stop_id     0 non-null      object 
 7   in_lines    0 non-null      object 
 8   diff        0 non-null      float64
dtypes: float64(1), int64(1), object(7)
memory usage: 0.0+ bytes


In [147]:
line_stops_df

Unnamed: 0,line_id,stop_name,stop_order,type,line_name,stop_id,stop_id.1,in_lines,diff
0,19611,Ostbahnhof,0,strassenbahn,1,1961859,1961859,821,
1,19611,Breslauer Strasse Ecke Andreasstrasse,1,strassenbahn,1,1961215,1961215,1,1.0
2,19611,U-Bhf. Strausberger Platz,2,strassenbahn,1,19611241,19611241,1,1.0
3,19611,Leninplatz,3,strassenbahn,1,1961708,1961708,64631,1.0
4,19611,"Alexanderplatz, Memhardstr.",4,strassenbahn,1,196115,196115,"1, 63,11,72,74E,22,74",1.0
...,...,...,...,...,...,...,...,...,...
2011,1961162,Konsum Wernsdorf,2,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,1961635,1961635,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,1.0
2012,1961162,Konsum Ziegenhals,3,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,1961636,1961636,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,1.0
2013,1961162,"Ziegenhals, VA Dienststelle",4,bus,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,19611370,19611370,Schmöckwitz<>Ziegenhals (VEB Kraftverkehr Schö...,1.0
2014,1961163,Kladow,0,Fähre,Wannsee<>Kladow (Betrieb durch Stern und Kreis...,1961612,1961612,Wannsee<>Kladow (Betrieb durch Stern und Kreis...,-4.0


In [104]:
# convert 'Stop ID' column to numeric values, coercing errors to NaN
line_stops_df['stop_id'] = pd.to_numeric(line_stops_df['stop_id'], errors='coerce')

# check if all values in 'Stop ID' column are numeric
if line_stops_df['stop_id'].notnull().all():
    print("All values in 'stop_id' column are numeric")
else:
    print("There are non-numeric values in 'stop_id' column")
    print(line_stops_df[line_stops_df['stop_id'].isnull()])


TypeError: arg must be a list, tuple, 1-d array, or Series

In [39]:
line_stops_df.drop(['line_name', "in_lines", "type", "diff"], axis=1, inplace=True)

In [40]:
line_stops_df.to_csv("line_stops_1961-final.csv")
line_df.to_csv("line_df_1961-final.csv")
df_stops.to_csv("stops_df_1961-final.csv")