# TFL API - Connection for Tube Station Dimesion

### Notebook Set Up

In [41]:
import os
import glob
from pathlib import Path
import pandas as pd

In [None]:
# Primero vemos los files para escoger los datos que necestiamos para creae

In [43]:
import os
print(os.getcwd())

/Users/nicoletondu/Desktop/data-science-thesis-2025/eda_notebooks


In [45]:
# Ruta relativa desde tu notebook
data_dir = "../data/tfl-stationdata-detailed/"

# Buscar todos los CSV en la carpeta
csv_files = glob.glob(os.path.join(data_dir, "*.csv"))

# Cargar cada CSV en un diccionario
dfs = {}
for f in csv_files:
    name = os.path.splitext(os.path.basename(f))[0]  # nombre sin .csv
    try:
        df = pd.read_csv(f, sep=None, engine="python", encoding="utf-8")
    except UnicodeDecodeError:
        df = pd.read_csv(f, sep=None, engine="python", encoding="latin-1")
    dfs[name] = df
    print(f"{name}: {df.shape[0]} filas, {df.shape[1]} columnas")


StationPoints: 4066 filas, 8 columnas
ModesAndLines: 23 filas, 2 columnas
RampRoutes: 416 filas, 2 columnas
Toilets: 398 filas, 9 columnas
Stations: 509 filas, 14 columnas
Lifts: 550 filas, 12 columnas
SameLevelPaths: 7843 filas, 2 columnas
Platforms: 1480 filas, 10 columnas
StepFreeIntechangeInfo: 114 filas, 3 columnas
PlatformServices: 1771 filas, 15 columnas
FeedInfo: 1 filas, 4 columnas


In [47]:
# nombras los dataframes para acceder facilamente
station_points_df = dfs["StationPoints"]
modes_and_lines_df = dfs["ModesAndLines"]
ramp_routes_df = dfs["RampRoutes"]
toilets_df = dfs["Toilets"]
stations_df = dfs["Stations"]
lifts_df = dfs["Lifts"]
same_level_paths_df = dfs["SameLevelPaths"]
platforms_df = dfs["Platforms"]
step_free_interchange_info_df = dfs["StepFreeIntechangeInfo"]
platform_services_df = dfs["PlatformServices"]

### Construccion de Tabla de Dimensiones para las estaciones de Tube

In [49]:
platform_services_df.head()

Unnamed: 0,PlatformUniqueId,StopAreaNaptanCode,Line,DirectionTowards,MinGap,MaxGap,AverageGap,MinStep,MaxStep,AverageStep,DesignatedLevelAccessPoint,LocationOfLevelAccess,LevelAccessByManualRamp,AdditionalAccessibilityInformation,GroupName
0,HUBABW-Plat01-WB-national-rail,910GABWD,national-rail,,,,,,,,False,,True,,
1,HUBABW-Plat02-EB-national-rail,910GABWD,national-rail,,,,,,,,False,,True,,
2,HUBABW-Plat03-WB-elizabeth,910GABWD,elizabeth,,,,,,,,False,,True,,
3,HUBABW-Plat04-WB-elizabeth,910GABWD,elizabeth,,,,,,,,False,,True,,
4,910GACTNCTL-Plat01-WB-london-overground,910GACTNCTL,mildmay,Richmond,,,,,,,False,,True,Step-free platform interchange requires a 50m ...,


In [51]:
# Solo necesitamos las columnas de 'StopAreaNaptanCode' y 'Line'. 
# Abstraemos estas dos columnas, eliminando duplicados. Este df nos servira como base de mapeo

station_line_df = platform_services_df[["StopAreaNaptanCode", "Line"]].drop_duplicates()
station_line_df.head()

Unnamed: 0,StopAreaNaptanCode,Line
0,910GABWD,national-rail
2,910GABWD,elizabeth
4,910GACTNCTL,mildmay
6,910GACTONML,elizabeth
8,910GANERLEY,windrush


In [53]:
# vemos modes and lines
modes_and_lines_df.head()

Unnamed: 0,Mode,Name
0,cableCar,london-cable-car
1,dlr,dlr
2,nationalRail,thameslink
3,nationalRail,national-rail
4,overground,liberty


In [55]:
# Merge para mapear Line ↔ Name
line_mapping_df = station_line_df.merge(
    modes_and_lines_df,
    left_on="Line",
    right_on="Name",
    how="left"
)

line_mapping_df = line_mapping_df[["StopAreaNaptanCode","Mode","Line"]].drop_duplicates()
line_mapping_df.head()

Unnamed: 0,StopAreaNaptanCode,Mode,Line
0,910GABWD,nationalRail,national-rail
1,910GABWD,elizabeth-line,elizabeth
2,910GACTNCTL,overground,mildmay
3,910GACTONML,elizabeth-line,elizabeth
4,910GANERLEY,overground,windrush


In [59]:
# Resumen de nulos por columna
line_mapping_df.isnull().sum()

StopAreaNaptanCode    0
Mode                  0
Line                  0
dtype: int64

In [61]:
# Ahora vemos el dataframe de stations
stations_df.head()

Unnamed: 0,UniqueId,Name,FareZones,HubNaptanCode,Wifi,OutsideStationUniqueId,BlueBadgeCarParking,BlueBadgeCarParkSpaces,TaxiRanksOutsideStation,MainBusInterchange,PierInterchange,NationalRailInterchange,AirportInterchange,EmiratesAirLineInterchange
0,HUBABW,Abbey Wood,4,HUBABW,False,HUBABW-Outside,False,,False,,,,,
1,910GACTNCTL,Acton Central,3,,True,910GACTNCTL-Outside,False,,False,,,,,
2,910GACTONML,Acton Main Line,3,,False,910GACTONML-Outside,False,,False,,,,,
3,910GANERLEY,Anerley,4,,True,910GANERLEY-Outside,False,,False,,,,,
4,910GBCKNHMH,Beckenham Hill,4,,False,910GBCKNHMH-Outside,False,,False,,,,,


In [63]:
# Seleccionar solo las columnas de interés
station_cols_df = stations_df[[
    "UniqueId",
    "Name",
    "FareZones",
    "HubNaptanCode",
    "Wifi",
    "AirportInterchange",
    "BlueBadgeCarParking",
    "BlueBadgeCarParkSpaces"
]]

station_cols_df.head()

Unnamed: 0,UniqueId,Name,FareZones,HubNaptanCode,Wifi,AirportInterchange,BlueBadgeCarParking,BlueBadgeCarParkSpaces
0,HUBABW,Abbey Wood,4,HUBABW,False,,False,
1,910GACTNCTL,Acton Central,3,,True,,False,
2,910GACTONML,Acton Main Line,3,,False,,False,
3,910GANERLEY,Anerley,4,,True,,False,
4,910GBCKNHMH,Beckenham Hill,4,,False,,False,


In [65]:
station_line_joined_df = line_mapping_df.merge(
    station_cols_df,
    left_on="StopAreaNaptanCode",
    right_on="UniqueId",
    how="left"
)

station_line_joined_df.head()

Unnamed: 0,StopAreaNaptanCode,Mode,Line,UniqueId,Name,FareZones,HubNaptanCode,Wifi,AirportInterchange,BlueBadgeCarParking,BlueBadgeCarParkSpaces
0,910GABWD,nationalRail,national-rail,,,,,,,,
1,910GABWD,elizabeth-line,elizabeth,,,,,,,,
2,910GACTNCTL,overground,mildmay,910GACTNCTL,Acton Central,3.0,,True,,False,
3,910GACTONML,elizabeth-line,elizabeth,910GACTONML,Acton Main Line,3.0,,False,,False,
4,910GANERLEY,overground,windrush,910GANERLEY,Anerley,4.0,,True,,False,


In [67]:
# vemos toilets df
toilets_df.head()

Unnamed: 0,StationUniqueId,Id,IsAccessible,HasBabyChanging,IsInsideGateLine,Location,IsFeeCharged,Type,IsManagedByTfL
0,910GACTONML,1,True,False,True,Located in ticket hall,False,Unisex,True
1,910GBHILLPK,1,False,False,True,,False,Male,True
2,910GBHILLPK,2,False,False,True,,False,Female,True
3,910GBHILLPK,3,True,True,True,,False,Unisex,True
4,910GBNHAM,1,False,False,True,Located on platform 2,False,Male,True


In [69]:
stations_dim_df = station_line_joined_df.merge(
    toilets_df[["StationUniqueId", "IsAccessible", "IsFeeCharged", "Id"]],
    left_on="UniqueId",
    right_on="StationUniqueId",
    how="left"
)

In [71]:
stations_dim_df

Unnamed: 0,StopAreaNaptanCode,Mode,Line,UniqueId,Name,FareZones,HubNaptanCode,Wifi,AirportInterchange,BlueBadgeCarParking,BlueBadgeCarParkSpaces,StationUniqueId,IsAccessible,IsFeeCharged,Id
0,910GABWD,nationalRail,national-rail,,,,,,,,,,,,
1,910GABWD,elizabeth-line,elizabeth,,,,,,,,,,,,
2,910GACTNCTL,overground,mildmay,910GACTNCTL,Acton Central,3,,True,,False,,,,,
3,910GACTONML,elizabeth-line,elizabeth,910GACTONML,Acton Main Line,3,,False,,False,,910GACTONML,TRUE,False,1.0
4,910GANERLEY,overground,windrush,910GANERLEY,Anerley,4,,True,,False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,910GWCHAPEL,overground,windrush,,,,,,,,,,,,
942,910GWCHAPXR,elizabeth-line,elizabeth,,,,,,,,,,,,
943,940GZZBPSUST,tube,northern,940GZZBPSUST,Battersea Power Station,1,,True,,False,,940GZZBPSUST,TRUE,False,1.0
944,940GZZNEUGST,tube,northern,940GZZNEUGST,Nine Elms,1,,True,,False,,940GZZNEUGST,TRUE,False,1.0


### Connect to the API

In [82]:
import zipfile, io, requests

url = "https://api.tfl.gov.uk/stationdata/tfl-stationdata-detailed.zip"
r = requests.get(url)

with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    print(z.namelist())  # 👈 esto lista todos los archivos dentro del ZIP


['FeedInfo.csv', 'Lifts.csv', 'ModesAndLines.csv', 'Platforms.csv', 'PlatformServices.csv', 'RampRoutes.csv', 'SameLevelPaths.csv', 'StationPoints.csv', 'Stations.csv', 'StepFreeIntechangeInfo.csv', 'Toilets.csv']


In [84]:
import requests, zipfile, io
import pandas as pd

url = "https://api.tfl.gov.uk/stationdata/tfl-stationdata-detailed.zip"
r = requests.get(url)

with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    # Listar archivos dentro del ZIP
    print(z.namelist())

    # Cargar cada CSV directamente a un DataFrame
    stations_df = pd.read_csv(z.open("Stations.csv"))
    modes_lines_df = pd.read_csv(z.open("ModesAndLines.csv"))
    platforms_df = pd.read_csv(z.open("Platforms.csv"))
    toilets_df = pd.read_csv(z.open("Toilets.csv"))

['FeedInfo.csv', 'Lifts.csv', 'ModesAndLines.csv', 'Platforms.csv', 'PlatformServices.csv', 'RampRoutes.csv', 'SameLevelPaths.csv', 'StationPoints.csv', 'Stations.csv', 'StepFreeIntechangeInfo.csv', 'Toilets.csv']
      UniqueId             Name FareZones HubNaptanCode   Wifi  \
0       HUBABW       Abbey Wood         4        HUBABW  False   
1  910GACTNCTL    Acton Central         3           NaN   True   
2  910GACTONML  Acton Main Line         3           NaN  False   
3  910GANERLEY          Anerley         4           NaN   True   
4  910GBCKNHMH   Beckenham Hill         4           NaN  False   

  OutsideStationUniqueId  BlueBadgeCarParking  BlueBadgeCarParkSpaces  \
0         HUBABW-Outside                False                     NaN   
1    910GACTNCTL-Outside                False                     NaN   
2    910GACTONML-Outside                False                     NaN   
3    910GANERLEY-Outside                False                     NaN   
4    910GBCKNHMH-Outside