This notebook checks the station reference file to see how stations are assigned to DfT categories.

In [4]:
import json
import os
import sys

# Make the project root importable so that `data.reference` resolves.
# A notebook has no __file__, so the kernel's working directory is used instead.
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# When the notebook is run from the `tests` folder, the project root is its parent.
# Compare the final path component exactly: a plain string-suffix check would also
# match unrelated folders such as `unit_tests` and pick the wrong root.
running_from_tests = os.path.basename(current_dir) == 'tests'
project_root = os.path.dirname(current_dir) if running_from_tests else current_dir
print(f"Project root: {project_root}")

if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Python path includes: {project_root}")

# Project-local import: must stay below the sys.path adjustment above.
from data.reference import reference_files

# Get the station codes file path
station_codes_path = reference_files["all dft categories"]
print(f"Station codes file path: {station_codes_path}")

# Load the station codes JSON file.
# JSON is UTF-8 by specification; stating the encoding explicitly stops the platform
# default (e.g. cp1252 on Windows) from mis-decoding non-ASCII station names.
with open(station_codes_path, 'r', encoding='utf-8') as file:
    station_codes = json.load(file)

print(f"Loaded {len(station_codes)} station records")

# Show the field names of the first record; guard against an empty dataset,
# where indexing the first record would raise IndexError.
if station_codes:
    print(list(station_codes[0].keys()))
else:
    print("No station records loaded; cannot list record keys.")

Current working directory: c:\Users\39342\University of Glasgow\Ji-Eun Byun - MZ-JB\RDM_analysis\tests
Project root: c:\Users\39342\University of Glasgow\Ji-Eun Byun - MZ-JB\RDM_analysis
Python path includes: c:\Users\39342\University of Glasgow\Ji-Eun Byun - MZ-JB\RDM_analysis
Station codes file path: C:\Users\39342\University of Glasgow\Ji-Eun Byun - MZ-JB\MSci (Research) 2024-25\reference data\stations_ref_with_dft.json
Loaded 6104 station records
['location_id', 'name', 'description', 'tiploc', 'crs', 'nlc', 'stanox', 'notes', 'longitude', 'latitude', 'isOffNetwork', 'timingPointType', 'dft_category']


In [6]:
# Tally how many stations fall into each DfT category
from collections import Counter

# Collect the category of every station that has one; falsy values
# (missing key, None, empty string) are dropped by filter(None, ...)
dft_categories = list(
    filter(None, (record.get('dft_category') for record in station_codes))
)

# Frequency table: category -> number of stations
category_counts = Counter(dft_categories)

total_with_category = sum(category_counts.values())
total_stations = len(station_codes)

print("DfT Category Counts:")
print("-" * 50)
# Categories are listed in sorted order
for category in sorted(category_counts):
    print(f"{category}: {category_counts[category]}")

print(f"\nTotal unique categories: {len(category_counts)}")
print(f"Total stations with DfT category: {total_with_category}")
print(f"Total stations in dataset: {total_stations}")

# Report the uncategorised remainder only when there is one
stations_without_category = total_stations - total_with_category
if stations_without_category > 0:
    print(f"Stations without DfT category: {stations_without_category}")

DfT Category Counts:
--------------------------------------------------
A: 21
B: 76
C1: 98
C2: 199

Total unique categories: 4
Total stations with DfT category: 394
Total stations in dataset: 6104
Stations without DfT category: 5710


Sample station record structure:
First station record:
  location_id: 3785
  name: Birkenhead Iom
  description: Birkenhead 12 Quays
  tiploc: BRKNIOM
  crs: BKI
  nlc: 24000
  stanox: None
  notes: None
  longitude: -3.017415729
  latitude: 53.40251232
  isOffNetwork: FALSE
  timingPointType: O
  dft_category: None

No platform-related keys found in the first 10 station records.

All available keys in station data:
['crs', 'description', 'dft_category', 'isOffNetwork', 'latitude', 'location_id', 'longitude', 'name', 'nlc', 'notes', 'stanox', 'timingPointType', 'tiploc']
