In [183]:
import pandas as pd
import numpy as np

In [184]:
# Read the cleaned rail snapshot into the working frame `df`.
# NOTE(review): the digits in the file name look like a 2023-07-28 13:23:00 export stamp — confirm.
rail_parquet_path = 'rail_data_cleaned_20230728132300.parquet'
df = pd.read_parquet(rail_parquet_path)

### Dropping columns we don't need

In [185]:
# Names of the columns that are removed from `df` before cleaning starts.
dropped = [
    'working_time_pass',
    'pass',
    'train_length',
]

In [186]:
# Remove the columns listed in `dropped` from the working frame.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError for columns already removed.
# Trade-off: a misspelt name in `dropped` is silently skipped.
df = df.drop(columns=dropped, errors='ignore')

## Round 1
#### Removing rows that share a `unique_id` and `train_platform` with another row and have both actual arrival and actual departure times null

In [187]:
# Columns whose combined values decide whether two rows count as duplicates.
subset_columns = ['unique_id', 'train_platform']

# True for every row whose (unique_id, train_platform) pair occurs more than
# once; keep=False flags all members of a duplicated group, not only repeats.
duplicates = df.duplicated(subset=subset_columns, keep=False)

# True where both actual timestamps are missing on the same row.
null_rows = df[['actual_arrival_time', 'actual_departure_time']].isna().all(axis=1)

# A row is discarded only when it is duplicated AND has no actual times at all.
rows_to_remove = duplicates & null_rows

# Everything not flagged above survives into the cleaned frame.
# NOTE(review): when every row of a duplicated group lacks actuals, the whole
# group is removed — confirm that is the intended outcome.
df_cleaned = df.loc[~rows_to_remove]


#### Looking at a busy route to evaluate cleaning

In [188]:
# Slice the cleaned frame down to the rows whose unique_id is 'Y55129'.
popular_id = df_cleaned.loc[df_cleaned['unique_id'].eq('Y55129')]

In [189]:
# Render the 'Y55129' subset; as the last expression of the cell it is shown as the cell output.
popular_id

Unnamed: 0,route_id,unique_id,service_start_date,update_origin,train_platform,working_time_arrival,working_time_departure,planned_time_arrival,planned_time_departure,platform,actual_arrival_time,actual_departure_time,is_delayed_arrival,is_delayed_departure
24918,202307288955129,Y55129,2023-07-28,Trust,BRENTX,2023-07-28 01:13:30,2023-07-28 01:14:00,NaT,NaT,1,2023-07-28 01:19:00,2023-07-28 01:20:00,False,False
24919,202307288955129,Y55129,2023-07-28,Trust,CRKLWD,2023-07-28 01:15:30,2023-07-28 01:17:00,2023-07-28 01:16:00,2023-07-28 01:17:00,1,2023-07-28 01:21:00,2023-07-28 01:22:00,False,False
24920,202307288955129,Y55129,2023-07-28,Trust,WHMPSTM,2023-07-28 01:20:00,2023-07-28 01:20:30,2023-07-28 01:20:00,2023-07-28 01:20:00,1,2023-07-28 01:24:00,2023-07-28 01:25:00,False,False
24922,202307288955129,Y55129,2023-07-28,Trust,KNTSHTN,2023-07-28 01:24:00,2023-07-28 01:24:30,2023-07-28 01:24:00,2023-07-28 01:24:00,1,2023-07-28 01:28:00,2023-07-28 01:29:00,False,False
24924,202307288955129,Y55129,2023-07-28,Trust,STPXBOX,2023-07-28 01:30:00,2023-07-28 01:32:00,2023-07-28 01:30:00,2023-07-28 01:32:00,A,2023-07-28 01:33:00,2023-07-28 01:34:00,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92756,202307288955129,Y55129,2023-07-28,Darwin,SELHRST,2023-07-28 02:02:30,2023-07-28 02:03:00,NaT,NaT,1,2023-07-28 02:04:00,2023-07-28 02:04:00,False,False
92758,202307288955129,Y55129,2023-07-28,Darwin,ECROYDN,2023-07-28 02:06:00,2023-07-28 02:07:00,2023-07-28 02:06:00,2023-07-28 02:07:00,5,2023-07-28 02:06:00,2023-07-28 02:07:00,False,False
92761,202307288955129,Y55129,2023-07-28,Darwin,PURLEY,2023-07-28 02:12:00,2023-07-28 02:13:00,2023-07-28 02:12:00,2023-07-28 02:13:00,2,2023-07-28 02:12:00,2023-07-28 02:13:00,False,False
92768,202307288955129,Y55129,2023-07-28,Darwin,HORLEY,2023-07-28 02:25:30,2023-07-28 02:26:00,2023-07-28 02:26:00,2023-07-28 02:26:00,2,2023-07-28 02:26:00,2023-07-28 02:26:00,False,False


In [190]:
# List the distinct train_platform values present in the 'Y55129' subset

popular_id['train_platform'].unique()

array(['BRENTX', 'CRKLWD', 'WHMPSTM', 'KNTSHTN', 'STPXBOX', 'FRNDNLT',
       'BLFR', 'SELHRST', 'ECROYDN', 'PURLEY', 'HORLEY', 'GTWK',
       'LEAGRVE', 'LUTON', 'LUTOAPY', 'HRPNDN', 'STALBCY', 'RADLETT',
       'ELTR', 'MLHB', 'HDON', 'FLITWCK', 'HRLG'], dtype=object)

In [191]:
# Count the distinct train_platform values in the subset (nunique skips NaN by default).
popular_id['train_platform'].nunique()

23

## Round 2
### Keeping rows with earliest and latest times

#### Looking at a busy route to evaluate cleaning