This notebook analyzes the reference station files in order to clean them and produce corrected reference files.
AIM:
- Keep all entries, including non-station locations such as signals, because they may appear as incident locations.
- Remove duplicate entries among the locations that carry a DfT category, so that the data analyzed and saved into 'processed data' is not duplicated either.

-> For context: as of 31 March 2024, there were 2,585 open mainline stations in Great Britain.

In [12]:
import pandas as pd

# Read both reference files into DataFrames: the coordinates-only file and
# the variant that additionally carries a DfT category.
coords_path = '../data/reference provided/stations_ref_coordinates.json'
dft_path = '../data/reference provided/stations_ref_with_dft.json'
stations_coords = pd.read_json(coords_path)
stations_dft = pd.read_json(dft_path)

# Summary of the coordinates file: column names, (rows, columns) shape, row count.
# NOTE: the "Total stanox entries" figure is len(frame), i.e. the number of rows,
# not the number of distinct stanox values.
print(
    "Columns in stations_ref_coordinates.json:",
    stations_coords.columns.tolist(),
    f"\nShape: {stations_coords.shape}",
    f"\nTotal stanox entries: {len(stations_coords)}",
    sep="\n",
)

# Visual separator between the two summaries.
print("\n" + "="*50)

# Same summary for the file that includes the DfT category.
print(
    "\nColumns in stations_ref_with_dft.json:",
    stations_dft.columns.tolist(),
    f"\nShape: {stations_dft.shape}",
    f"\nTotal stanox entries: {len(stations_dft)}",
    sep="\n",
)

Columns in stations_ref_coordinates.json:
['location_id', 'name', 'description', 'tiploc', 'crs', 'nlc', 'stanox', 'notes', 'longitude', 'latitude', 'isOffNetwork', 'timingPointType']

Shape: (54386, 12)

Total stanox entries: 54386


Columns in stations_ref_with_dft.json:
['location_id', 'name', 'description', 'tiploc', 'crs', 'nlc', 'stanox', 'notes', 'longitude', 'latitude', 'isOffNetwork', 'timingPointType', 'dft_category']

Shape: (6104, 13)

Total stanox entries: 6104


In [13]:
# Tally how many rows fall into each DfT category.
# value_counts() skips missing values by default, so rows without a category
# are not represented in this tally.
category_counts = stations_dft['dft_category'].value_counts()
print(category_counts)

dft_category
C2    199
C1     98
B      76
A      21
Name: count, dtype: int64


In [15]:
# Count the rows whose stanox value already appeared in an earlier row.
# duplicated() defaults to keep='first', so the first occurrence of each
# value is not included in the count.
# NOTE(review): rows with a missing stanox, if any, are presumably treated as
# duplicates of one another here — confirm against the data.
repeated_stanox_rows = stations_dft['stanox'].duplicated()
print(f"Duplicate stanox entries: {repeated_stanox_rows.sum()}")

Duplicate stanox entries: 576


In [18]:
# Select every row that shares its stanox with at least one other row.
# keep=False flags all members of a repeated group, not only the later ones.
shares_stanox = stations_dft['stanox'].duplicated(keep=False)
duplicates = stations_dft[shares_stanox].sort_values('stanox')
print(f"Total duplicate entries: {len(duplicates)}\n")

# List the repeated rows with their DfT category so the groups can be compared
# side by side; to_string() prints the full table without truncation.
inspection_columns = ['stanox', 'tiploc', 'description', 'dft_category']
print("Duplicate stanox entries with their dft_category:")
print(duplicates[inspection_columns].to_string())

Total duplicate entries: 803

Duplicate stanox entries with their dft_category:
       stanox   tiploc                     description dft_category
5080   1053.0   IVRGRD                     Invergordon         None
5082   1053.0  IVRGRCE                Invergordon C.E.         None
5103   1071.0  KYLELSH                Kyle Of Lochalsh         None
5105   1071.0  KYLELCE       Section Man Dingwall Conc         None
5083   1100.0  IVRNESS                       Inverness         None
5087   1100.0  IVRNMRT             Motor Rail Terminal         None
5248   2041.0  PELPTAT         Tait Thomas & Sons Mill         None
5247   2041.0     PELP                Port Elphinstone         None
5251   2111.0     NWHL           Sig Meter(Newtonhill)         None
5253   2111.0  NWHLURS               Newtonhill U.R.S.         None
5228   3003.0   CRAIGO                      Craigo S B         None
5227   3003.0  CRAIGOC                       Ce Siding         None
5316   3014.0  MONTDTS    Montrose D