# Initial

## Constants

In [1]:
# Constants for easy reference and modification

# Column whose value identifies one group of rows; splitting and sequence
# building both group by it
GROUPBY_COL = 'unique_id'

# Timestamp column used to order the rows inside each group
DATETIME_COL = 'datetime'

# Column holding the class label
TARGET_COL = 'vehicle_type'

# Per-row feature columns: these are scaled and stacked into the sequences
FEATURE_COLS = [
    'vehicle_speed',
    'vehicle_angle_sine', 'vehicle_angle_cosine',
    'vehicle_x', 'vehicle_y', 'vehicle_z'
]

# Number of sequences per DataLoader batch
BATCH_SIZE = 32

## Imports

In [2]:
# Standard library imports
import pprint
import re

# Third-party imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, RobustScaler

# PyTorch and related imports
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, IterableDataset

## Helper Functions and Class Definitions

In [3]:
def train_val_test_split_custom(df, test_size=0.2, val_size=0.1, min_per_class=1, random_state=0):
    """
    Split ``df`` into training, validation and test sets by group ID.

    Every ID in ``GROUPBY_COL`` is assigned to exactly one set, so all rows of
    an ID stay together. For each class in ``TARGET_COL``, up to
    ``min_per_class`` IDs are first reserved for train, then validation, then
    test; a class with too few IDs therefore fills train first. The IDs left
    over are shuffled and divided by ``test_size`` / ``val_size``, with the
    remainder going to train.

    Assumes every ID carries a single class; an ID that appears under two
    classes could be reserved for more than one set.

    Args:
        df: DataFrame containing ``GROUPBY_COL`` and ``TARGET_COL``.
        test_size: Fraction of the non-reserved IDs assigned to the test set.
        val_size: Fraction of the non-reserved IDs assigned to the validation set.
        min_per_class: IDs per class reserved for each set before the random split.
        random_state: Seed of the private random generator used for shuffling.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` holding independent copies of the
        matching rows of ``df``.
    """
    # Private generator: reproducible without reseeding NumPy's global state
    rng = np.random.RandomState(random_state)

    # Get unique IDs (in order of first appearance) and classes
    unique_ids = df[GROUPBY_COL].unique()
    classes = df[TARGET_COL].unique()

    # Initialize lists to store IDs for each dataset
    train_ids, val_ids, test_ids = [], [], []

    for cls in classes:
        ids_for_class = df.loc[df[TARGET_COL] == cls, GROUPBY_COL].unique()
        rng.shuffle(ids_for_class)

        # Reserve up to min_per_class IDs per set, filling train, then val, then test
        class_count = len(ids_for_class)
        train_end = min(min_per_class, class_count)
        val_end = min(train_end + min_per_class, class_count)
        test_end = min(val_end + min_per_class, class_count)

        train_ids.extend(ids_for_class[:train_end])
        val_ids.extend(ids_for_class[train_end:val_end])
        test_ids.extend(ids_for_class[val_end:test_end])

    # Keep the remaining IDs in their original order before shuffling: iterating
    # a set of strings depends on per-process hash randomization, which would
    # make the seeded shuffle differ between kernel sessions
    reserved_ids = set(train_ids) | set(val_ids) | set(test_ids)
    remaining_ids = [uid for uid in unique_ids if uid not in reserved_ids]
    rng.shuffle(remaining_ids)

    # Calculate split sizes for remaining IDs
    total_remaining = len(remaining_ids)
    test_count = int(total_remaining * test_size)
    val_count = int(total_remaining * val_size)

    # Assign remaining IDs to test and validation sets; the rest go to train
    test_ids += remaining_ids[:test_count]
    val_ids += remaining_ids[test_count:test_count + val_count]
    train_ids += remaining_ids[test_count + val_count:]

    # Return copies so callers can modify a split without SettingWithCopyWarning
    train_df = df[df[GROUPBY_COL].isin(train_ids)].copy()
    val_df = df[df[GROUPBY_COL].isin(val_ids)].copy()
    test_df = df[df[GROUPBY_COL].isin(test_ids)].copy()

    return train_df, val_df, test_df

def create_sequences(df):
    """
    Build one ``(features, label, group_id)`` tuple per group of ``df``.

    Rows are grouped by ``GROUPBY_COL`` and ordered by ``DATETIME_COL``. The
    features are the ``FEATURE_COLS`` values of the ordered rows as a 2-D
    array, and the label is the ``TARGET_COL`` value of the earliest row.

    Returns:
        List of tuples, one per group, in the order ``groupby`` yields them.
    """
    def _to_sequence(group_id, frame):
        # Order the group's rows chronologically before extracting anything
        ordered = frame.sort_values(by=[DATETIME_COL]).copy()
        first_row = ordered.iloc[0]
        return ordered[FEATURE_COLS].values, first_row[TARGET_COL], group_id

    return [_to_sequence(group_id, frame) for group_id, frame in df.groupby(GROUPBY_COL)]

class VehicleDataset(Dataset):
    """Map-style dataset holding one pre-built sequence per group of the input frame."""

    def __init__(self, df):
        # All (features, label, id) tuples are built up front, so indexing
        # later is a plain list lookup
        self.sequences = create_sequences(df)

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(float32 feature tensor, int64 label tensor, id)`` for entry ``idx``."""
        features, target, group_id = self.sequences[idx]
        feature_tensor = torch.tensor(features, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return feature_tensor, target_tensor, group_id

def collate_fn(batch):
    """
    Merge ``(sequence_tensor, label, vehicle_id)`` items into one padded batch.

    Returns:
        Tuple ``(padded_sequences, labels_tensor, vehicle_ids, lengths)``:
        the sequences zero-padded to the longest one (batch dimension first),
        the labels as an int64 tensor, the IDs as a tuple, and each
        sequence's unpadded length.
    """
    seq_tensors, targets, vehicle_ids = zip(*batch)

    # Unpadded lengths, taken before padding hides them
    lengths = torch.tensor(list(map(len, seq_tensors)))
    labels_tensor = torch.tensor(targets, dtype=torch.long)

    # Pad detached copies so the padded batch shares nothing with the inputs
    detached = [tensor.clone().detach() for tensor in seq_tensors]
    padded_sequences = pad_sequence(detached, batch_first=True)

    return padded_sequences, labels_tensor, vehicle_ids, lengths

### Archive

In [4]:
######################## USING GENERATORS ########################
def create_sequences_generator(df):
    """
    Lazily yield one ``(features, label, group_id)`` tuple per group of ``df``.

    Rows are grouped by ``GROUPBY_COL``; within each group ``DATETIME_COL`` is
    parsed with ``pd.to_datetime`` and the rows are ordered by it. The
    features are the ``FEATURE_COLS`` values of the ordered rows as a 2-D
    array, and the label is the ``TARGET_COL`` value of the earliest row.

    Yields:
        Tuples ``(sequence_features, label, vehicle_id)``.
    """
    for vehicle_id, group in df.groupby(GROUPBY_COL):
        # Build a new, sorted frame instead of assigning into / sorting the
        # sub-frame handed out by groupby in place: that is the
        # chained-assignment pattern behind SettingWithCopyWarning
        parsed_times = pd.to_datetime(group[DATETIME_COL])
        ordered = group.assign(**{DATETIME_COL: parsed_times}).sort_values(by=[DATETIME_COL])

        sequence_features = ordered[FEATURE_COLS].values
        label = ordered.iloc[0][TARGET_COL]
        yield sequence_features, label, vehicle_id

class VehicleIterableDataset(IterableDataset):
    """Iterable-style dataset that produces its sequences lazily from the stored frame."""

    def __init__(self, df):
        # Column configuration captured from the module-level constants
        self.target_col = TARGET_COL
        self.feature_cols = FEATURE_COLS
        self.groupby_col = GROUPBY_COL
        # The raw frame is kept so a new generator can be created on every pass
        self.df = df

    def __iter__(self):
        # A generator can be consumed only once, so each call builds a new one
        fresh_generator = create_sequences_generator(self.df)
        return fresh_generator

def collate_fn_generator(batch):
    """
    Merge raw ``(sequence, label, vehicle_id)`` items into one padded batch.

    Unlike ``collate_fn``, the sequences arrive as array-likes and are
    converted to float32 tensors here.

    Returns:
        Tuple ``(padded_sequences, labels_tensor, vehicle_ids, lengths)``.
    """
    raw_sequences, raw_labels, vehicle_ids = zip(*batch)

    # Convert each raw sequence to a float32 tensor, then zero-pad to a common length
    as_tensors = []
    for raw in raw_sequences:
        as_tensors.append(torch.tensor(raw, dtype=torch.float32))
    padded_sequences = pad_sequence(as_tensors, batch_first=True)

    labels_tensor = torch.tensor(raw_labels, dtype=torch.long)

    # Unpadded lengths of the original sequences
    lengths = torch.tensor([len(raw) for raw in raw_sequences])

    return padded_sequences, labels_tensor, vehicle_ids, lengths

## Load resampled data into df

In [5]:
# Read the resampled CSV and convert DATETIME_COL to pandas datetimes
df = pd.read_csv('resampled_vehicle_data.csv')
df[DATETIME_COL] = pd.to_datetime(df[DATETIME_COL])
# Print column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1228962 entries, 0 to 1228961
Data columns (total 12 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   unique_id             1228962 non-null  object        
 1   datetime              1228962 non-null  datetime64[ns]
 2   vehicle_id            1228962 non-null  object        
 3   timestep_time         1228962 non-null  float64       
 4   vehicle_type          1228962 non-null  object        
 5   vehicle_speed         1228962 non-null  float64       
 6   vehicle_x             1228962 non-null  float64       
 7   vehicle_y             1228962 non-null  float64       
 8   vehicle_z             1228962 non-null  float64       
 9   vehicle_angle         1228962 non-null  float64       
 10  vehicle_angle_sine    1228962 non-null  float64       
 11  vehicle_angle_cosine  1228962 non-null  float64       
dtypes: datetime64[ns](1), float64(8), object(3

## 0. Re-labelling classes

Following [this notebook on GitHub](https://github.com/pgrandinetti/standard-traffic-data/blob/main/knowledge/Urban_Traffic_Data_Statistical_Analysis.ipynb), we:

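- Fill in null values with `pedestrian` (not needed here: `vehicle_type` has no null values in the resampled data, so the code below skips this step)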
- Rename all vehicle types that start with `passenger` as `car`, as these instances are a subclass of car
- Rename vehicle types suffixed with bicycle (`fastbicycle`, `avgbicycle`, `slowbicycle`) as `bicycle`
- Rename `trailer` and `truck` as `lorry` as trailer is just a variation of truck
- Rename `uber` and `taxi` as `hire` as they represent the same concept
- Rename `army`, `authority`, `emergency` vehicles as `special`
- Drop all vehicle types starting with `hw_`, as these vehicles run on motorways and are hence not suitable for urban traffic analysis
- Drop rows containing `train` as they are not suitable for urban traffic analysis

In [6]:
# List the vehicle types present before any relabelling
df[TARGET_COL].unique()

array(['hw_trailer', 'hw_delivery', 'hw_passenger3', 'hw_truck',
       'hw_passenger2b', 'hw_coach', 'hw_motorcycle', 'hw_passenger1',
       'hw_passenger4', 'hw_passenger2a', 'bus', 'delivery', 'coach',
       'truck', 'trailer', 'avgbicycle', 'slowbicycle', 'fastbicycle',
       'motorcycle', 'taxi', 'moped', 'uber', 'passenger3', 'passenger4',
       'passenger2a', 'passenger1', 'passenger2b', 'emergency',
       'authority', 'army', 'train'], dtype=object)

In [7]:
# Rename vehicle types: each regex maps a family of raw names onto one merged class
relabel_patterns = {
    '^passenger.*': 'car',
    '.*bicycle$': 'bicycle',
    '^(trailer|truck)$': 'lorry',
    '^(uber|taxi)$': 'hire',
    '^(army|authority|emergency)$': 'special'
}
df[TARGET_COL] = df[TARGET_COL].replace(regex=relabel_patterns)

# Drop rows where vehicle types start with 'hw_' or are 'train', using one combined mask
bef = len(df)
is_highway = df[TARGET_COL].str.startswith('hw_')
is_train = df[TARGET_COL] == 'train'
df = df[~(is_highway | is_train)]
aft = len(df)
print(f"{bef - aft} rows with vehicle_type starting with 'hw_' or are 'train' dropped. {aft} rows remain.")

20675 rows with vehicle_type starting with 'hw_' or are 'train' dropped. 1208287 rows remain.


In [8]:
# Sanity check: only the merged class names should remain, with no 'hw_*' and no 'train'
df[TARGET_COL].unique()

array(['bus', 'delivery', 'coach', 'lorry', 'bicycle', 'motorcycle',
       'hire', 'moped', 'car', 'special'], dtype=object)

In [9]:
# Sort by 'unique_id' and 'datetime' to ensure the correct order;
# reset_index(drop=True) renumbers the rows, replacing the gapped index left by the row drops above
df = df.sort_values(by=[GROUPBY_COL, DATETIME_COL]).reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1208287 entries, 0 to 1208286
Data columns (total 12 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   unique_id             1208287 non-null  object        
 1   datetime              1208287 non-null  datetime64[ns]
 2   vehicle_id            1208287 non-null  object        
 3   timestep_time         1208287 non-null  float64       
 4   vehicle_type          1208287 non-null  object        
 5   vehicle_speed         1208287 non-null  float64       
 6   vehicle_x             1208287 non-null  float64       
 7   vehicle_y             1208287 non-null  float64       
 8   vehicle_z             1208287 non-null  float64       
 9   vehicle_angle         1208287 non-null  float64       
 10  vehicle_angle_sine    1208287 non-null  float64       
 11  vehicle_angle_cosine  1208287 non-null  float64       
dtypes: datetime64[ns](1), float64(8), object(3

## 1. Label Encoding

In [10]:
# Encode the class names as integer codes; values are passed through str() first
le = LabelEncoder()
df[TARGET_COL] = le.fit_transform(df[TARGET_COL].map(str))

In [11]:
# Integer code -> class name, following the order of the encoder's classes_
label_mapping = dict(enumerate(le.classes_))
pprint.pprint(label_mapping)

{0: 'bicycle',
 1: 'bus',
 2: 'car',
 3: 'coach',
 4: 'delivery',
 5: 'hire',
 6: 'lorry',
 7: 'moped',
 8: 'motorcycle',
 9: 'special'}


## 2. Train val test split

In [12]:
# Split by unique_id with the function defaults (test_size=0.2, val_size=0.1, min_per_class=1)
train_df, val_df, test_df = train_val_test_split_custom(df)

### Inspect balance of classes

#### df

In [12]:
# Size of the full dataset: distinct unique_ids, rows and distinct classes
print(f"There are {df[GROUPBY_COL].nunique()} vehicle_ids and {len(df)} rows in df.")
print(f"{df[TARGET_COL].nunique()} classes are represented in df.\n")

There are 5886 vehicle_ids and 1208287 rows in df.
10 classes are represented in df.



In [13]:
# Class balance of df, measured in distinct unique_ids per class (largest first)
print('unique_ids')
print(df.groupby(TARGET_COL)[GROUPBY_COL].nunique().sort_values(ascending=False))

unique_ids
vehicle_type
2    2922
8    1000
0     465
7     451
4     317
5     240
1     192
9     191
6      79
3      29
Name: unique_id, dtype: int64


In [14]:
# Class balance of df, measured in rows per class
print('num_rows')
print(df[TARGET_COL].value_counts())

num_rows
vehicle_type
2    744232
8    210160
7     91608
1     39480
0     37562
4     36238
9     20933
5     14208
6     11160
3      2706
Name: count, dtype: int64


#### train_df

In [15]:
# Size of the training split: distinct unique_ids, rows and distinct classes
print(f"There are {train_df[GROUPBY_COL].nunique()} vehicle_ids and {len(train_df)} rows in train_df.")
print(f"{train_df[TARGET_COL].nunique()} classes are represented in train_df.\n")

There are 4110 vehicle_ids and 829958 rows in train_df.
10 classes are represented in train_df.



In [16]:
# Class balance of train_df, measured in distinct unique_ids per class (largest first)
print('unique_ids')
print(train_df.groupby(TARGET_COL)[GROUPBY_COL].nunique().sort_values(ascending=False))

unique_ids
vehicle_type
2    2042
8     674
0     330
7     319
4     237
5     172
9     133
1     130
6      56
3      17
Name: unique_id, dtype: int64


In [17]:
# Class balance of train_df, measured in rows per class
print('num_rows')
print(train_df[TARGET_COL].value_counts())

num_rows
vehicle_type
2    516604
8    137658
7     63953
0     26798
4     26711
1     25681
9     13695
5      9857
6      7502
3      1499
Name: count, dtype: int64


#### val_df

In [18]:
# Size of the validation split: distinct unique_ids, rows and distinct classes
print(f"There are {val_df[GROUPBY_COL].nunique()} vehicle_ids and {len(val_df)} rows in val_df.")
print(f"{val_df[TARGET_COL].nunique()} classes are represented in val_df.\n")

There are 595 vehicle_ids and 126543 rows in val_df.
10 classes are represented in val_df.



In [19]:
# Class balance of val_df, measured in distinct unique_ids per class (largest first)
print('unique_ids')
print(val_df.groupby(TARGET_COL)[GROUPBY_COL].nunique().sort_values(ascending=False))

unique_ids
vehicle_type
2    277
8    111
7     53
0     46
4     28
1     25
5     21
9     20
6      8
3      6
Name: unique_id, dtype: int64


In [20]:
# Class balance of val_df, measured in rows per class
print('num_rows')
print(val_df[TARGET_COL].value_counts())

num_rows
vehicle_type
2    72938
8    24916
7    10983
1     5054
0     3747
4     3128
9     2157
6     1686
5     1524
3      410
Name: count, dtype: int64


#### test_df

In [21]:
# Size of the test split: distinct unique_ids, rows and distinct classes
print(f"There are {test_df[GROUPBY_COL].nunique()} vehicle_ids and {len(test_df)} rows in test_df.")
print(f"{test_df[TARGET_COL].nunique()} classes are represented in test_df.\n")

There are 1181 vehicle_ids and 251786 rows in test_df.
10 classes are represented in test_df.



In [22]:
# Class balance of test_df, measured in distinct unique_ids per class (largest first)
print('unique_ids')
print(test_df.groupby(TARGET_COL)[GROUPBY_COL].nunique().sort_values(ascending=False))

unique_ids
vehicle_type
2    603
8    215
0     89
7     79
4     52
5     47
9     38
1     37
6     15
3      6
Name: unique_id, dtype: int64


In [23]:
# Class balance of test_df, measured in rows per class
print('num_rows')
print(test_df[TARGET_COL].value_counts())

num_rows
vehicle_type
2    154690
8     47586
7     16672
1      8745
0      7017
4      6399
9      5081
5      2827
6      1972
3       797
Name: count, dtype: int64


## 3. Balance classes if necessary

In [13]:
# TODO: downsample the majority class and upsample the minority classes (not implemented yet)

## 4. Feature normalisation

In [13]:
scaler = RobustScaler()

# The splits are boolean-mask selections of df. Take explicit copies first so
# the assignments below write to independent frames instead of raising
# SettingWithCopyWarning.
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

# Fit the scaler on the training data only, then apply the same transform to
# train, val and test.
# NOTE: re-running this cell rescales data that is already scaled; re-run the
# split cell first to start again from unscaled values.
scaler.fit(train_df[FEATURE_COLS])
for split_df in (train_df, val_df, test_df):
    split_df[FEATURE_COLS] = scaler.transform(split_df[FEATURE_COLS])

In [None]:
# Sort every split by 'unique_id' and 'datetime' to ensure the correct order, then renumber its rows
sort_keys = [GROUPBY_COL, DATETIME_COL]
train_df, val_df, test_df = (
    split.sort_values(by=sort_keys).reset_index(drop=True)
    for split in (train_df, val_df, test_df)
)

## 5. Save train, val, and test dfs

In [17]:
# Write each split to its own CSV file, without the index column
for split_df, csv_path in (
    (train_df, 'train_data.csv'),
    (val_df, 'val_data.csv'),
    (test_df, 'test_data.csv'),
):
    split_df.to_csv(csv_path, index=False)
print("Train, val, test data saved as .csv file.")

Train, val, test data saved as .csv file.


## 6. Reshaping data for input to LSTM

In [16]:
# Creating datasets: one pre-built sequence per unique_id in each split
train_dataset = VehicleDataset(train_df)
val_dataset = VehicleDataset(val_df)
test_dataset = VehicleDataset(test_df)

# Creating dataloaders: all three share batch size and collate function;
# only the training loader shuffles
loader_kwargs = {'batch_size': BATCH_SIZE, 'collate_fn': collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [15]:
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
# test_loader1 = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# assert(len(test_loader) == len(test_loader1))
# for i, batch in enumerate(test_loader):
#     for j, batch1 in enumerate(test_loader1):
#         if (i == j):
#             assert(torch.all(torch.eq(batch[0], batch1[0])))
#             assert(torch.all(torch.eq(batch[1], batch1[1])))
#             assert(batch[2] == batch1[2])
#             assert(torch.all(torch.eq(batch[3], batch1[3])))
#         else:
#             continue