The goal of this file is to clean the CSV files. Specifically:
- standardise missing values
- split the start_stop column into start and stop columns in lines.csv
- standardise station names in stations.csv
- fix lines in lines.csv whose names look like O30, O37, O40: they are Oberleitungsbusse, not Omnibusse
- standardise the spelling of vehicle types to German: fähre, strassenbahn, autobus, u-bahn, s-bahn, oberleitungsbus, omnibus
- remove stop_description column in stations.csv
- remove Unnamed: 0 column in lines.csv and line_stops.csv
- remove stop_name from line_stops.csv


Then I want to create a list of all the columns in every table.

In [27]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
import os

In [28]:
# 2. Load the three raw tables from ./raw into DataFrames
raw_paths = (
    './raw/stations.csv',
    './raw/lines.csv',
    './raw/line_stops.csv',
)
# One read_csv call per path, unpacked in the same order as raw_paths.
stations, lines, line_stops = (pd.read_csv(path) for path in raw_paths)


In [29]:
# 3. Standardise Missing Values (replace common missing value markers with np.nan)
missing_markers = ['', 'NA', 'N/A', 'na', '-', '--', 'null', 'None']
for df in [stations, lines, line_stops]:
    # In-place so the notebook-level names keep pointing at the cleaned frames.
    df.replace(missing_markers, np.nan, inplace=True)

# In lines.csv a length of exactly 0 is also turned into NaN.
# This step only touches `lines`, so it runs once here rather than once per
# table inside the loop above.
for col in ['Length (km)', 'Length (time)']:
    if col in lines.columns:
        lines.loc[lines[col] == 0, col] = np.nan


In [30]:
# 4. Split start_stop column in lines.csv, strip whitespace, and print rows without '<>'
if 'start_stop' in lines.columns:
    # Report rows that cannot be split. na=False makes missing values count
    # as "no separator", so they are listed as well.
    has_separator = lines['start_stop'].str.contains('<>', na=False)
    no_split = lines[~has_separator]
    if not no_split.empty:
        print("Rows where 'start_stop' does not contain '<>':")
        print(no_split[['start_stop']])

    # Split and strip whitespace
    split_df = lines['start_stop'].str.split('<>', expand=True)
    # expand=True only creates as many columns as the longest split needs.
    # If no row contains '<>' there is no column 1 (and for an empty table
    # possibly no column 0), so guard each lookup instead of raising KeyError.
    for position, new_col in enumerate(('start', 'stop')):
        if position in split_df.columns:
            lines[new_col] = split_df[position].str.strip()
        else:
            lines[new_col] = np.nan
    lines.drop('start_stop', axis=1, inplace=True)

In [31]:
# 5. Standardise station names in stations.csv (strip whitespace, title case)
# The column listing printed by this notebook shows that stations.csv calls
# its name column 'stop_name'. The old check for 'station_name' never matched,
# so this step was silently skipped. 'station_name' is still accepted in case
# an input file uses that header.
# NOTE(review): str.title() also capitalises small words inside a name
# (e.g. "der", "des") — confirm that is acceptable for the station names.
for name_col in ('stop_name', 'station_name'):
    if name_col in stations.columns:
        stations[name_col] = stations[name_col].str.strip().str.title()


In [32]:
# 6. Fix Oberleitungsbusse in lines.csv
# The column listing printed by this notebook shows that lines.csv calls its
# name column 'line_name'. The old check for 'line' never matched, so no row
# was ever re-labelled. 'line' is still accepted as a fallback header.
line_col = next((c for c in ('line_name', 'line') if c in lines.columns), None)
if line_col is not None and 'type' in lines.columns:
    # astype(str) keeps .str usable even if pandas parsed the names as numbers.
    # Missing names become the text 'nan', which the pattern does not match.
    # The pattern is anchored at the start only: "O", optional space, 2+ digits.
    mask = lines[line_col].astype(str).str.match(r'O\s?\d{2,}', na=False)
    lines.loc[mask, 'type'] = 'oberleitungsbus'

In [33]:
# 7. Standardise vehicle type spellings in lines.csv
# Keys are the lower-cased spellings found in the data; values are the
# spelling written to the cleaned file.
type_map = {
    # English names -> German
    'ferry': 'fähre',
    'bus': 'autobus',
    'tram': 'straßenbahn',
    # spelling variant
    'strassenbahn': 'straßenbahn',
    # identity entries: listed so the full target vocabulary is visible here
    'autobus': 'autobus',
    'u-bahn': 'u-bahn',
    's-bahn': 's-bahn',
    'oberleitungsbus': 'oberleitungsbus',
    'omnibus': 'omnibus',
}
if 'type' in lines.columns:
    # Lower-case first so the lookup is case-insensitive. Values without an
    # entry in type_map pass through unchanged (apart from the lower-casing).
    lowered_types = lines['type'].str.lower()
    lines['type'] = lowered_types.replace(type_map)


In [34]:
# 8. Remove unnecessary columns
# Each table is paired with the columns it should lose. Columns that are not
# present are skipped, so the cell can be re-run safely.
columns_to_remove = (
    (stations, ['stop_description']),
    (lines, ['Unnamed: 0']),
    (line_stops, ['Unnamed: 0', 'stop_name']),
)
for frame, unwanted in columns_to_remove:
    for column in unwanted:
        if column in frame.columns:
            frame.drop(column, axis=1, inplace=True)

In [35]:
# Print the remaining column names of each table, one line per table.
for heading, frame in (
    ("Stations columns:", stations),
    ("Lines columns:", lines),
    ("Line_stops columns:", line_stops),
):
    print(heading, frame.columns.tolist())

Stations columns: ['stop_id', 'stop_name', 'type', 'location', 'in_lines', 'identifier']
Lines columns: ['line_id', 'year', 'line_name', 'type', 'Length (time)', 'east_west', 'Frequency', 'Length (km)', 'start', 'stop']
Line_stops columns: ['line_id', 'stop_order', 'stop_id']


In [36]:
# Rename columns in lines.csv to German
# Maps current header -> German header. Columns not listed keep their name.
rename_map = {
    # descriptive columns
    'line_name': 'linien_name',
    'type': 'typ',
    'year': 'jahr',
    'east_west': 'ost_west',
    # measurements
    'Length (km)': 'länge (km)',
    'Length (time)': 'dauer (min)',
    'Frequency': 'frequenz',
    # endpoints ('start' maps to itself, so that column is unchanged)
    'start': 'start',
    'stop': 'ziel',
}
lines.rename(columns=rename_map, inplace=True)

In [37]:
# 9. Preview the first and last rows of every table
# The headings are built from preview_rows so they always state the number of
# rows actually printed. They used to say "4 rows" while 2 were shown.
preview_rows = 2
for label, frame in (
    ("Stations", stations),
    ("Lines", lines),
    ("Line_stops", line_stops),
):
    print(f"\n{label} (first {preview_rows} rows):")
    print(frame.head(preview_rows))
    print(f"{label} (last {preview_rows} rows):")
    print(frame.tail(preview_rows))



Stations (first 4 rows):
   stop_id     stop_name     type                               location  \
0    19460     Adlershof   s-bahn        52.434722222222,13.541388888889   
1    19461  Akazienallee  omnibus  52.47849809201335, 13.607179083668733   

                   in_lines identifier  
0  ['KBS 100a', 'KBS 100c']    Q323551  
1                     ['D']        NaN  
Stations (last 4 rows):
        stop_id        stop_name     type  \
19756  19892666  Öschebronnerweg  autobus   
19757  19892667         Übergang  autobus   

                                     location in_lines identifier  
19756   52.61154475375042, 13.337669105320359   {'20'}        NaN  
19757  52.543078667825824, 13.157884773521127   {'63'}        NaN  

Lines (first 4 rows):
   line_id  jahr linien_name          typ  dauer (min)     ost_west  frequenz  \
0    19461  1946           1      autobus          NaN  both/unkown        25   
1    19462  1946           1  straßenbahn         15.0  both/unkown      

In [38]:
# 10. Save cleaned data to CSV files
# to_csv does not create missing directories, so make sure ./data exists.
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs('./data', exist_ok=True)
# index=False keeps the DataFrame index out of the files.
stations.to_csv('./data/stations.csv', index=False)
lines.to_csv('./data/lines.csv', index=False)
line_stops.to_csv('./data/line_stops.csv', index=False)