# Initial

## Constants

In [3]:
# Column-name configuration shared by the cells below; change it here, in one place.
GROUPBY_COL = 'unique_id'     # rows sharing this value form one sequence
DATETIME_COL = 'datetime'     # rows of a sequence are ordered by this column
TARGET_COL = 'vehicle_type'   # label column

# Per-timestep input features, in the column order used when building sequences.
FEATURE_COLS = [
    'vehicle_speed',
    'vehicle_angle_sine',
    'vehicle_angle_cosine',
    'vehicle_x',
    'vehicle_y',
    'vehicle_z',
]

## Imports

In [8]:
# Standard library imports
import pprint
import re

# Third-party imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, RobustScaler

# PyTorch and related imports
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, IterableDataset

## Helper Functions and Class Definitions

In [4]:
def train_val_test_split_custom(df, test_size=0.2, val_size=0.1, min_per_class=1, seed=0):
    """
    Split ``df`` into train, validation and test DataFrames by ``GROUPBY_COL`` ID.

    Every ID is assigned to exactly one split, so all rows of an ID stay together.
    For each class in ``TARGET_COL``, up to ``min_per_class`` IDs are reserved for
    train first, then validation, then test. A class with too few IDs is therefore
    represented in train before validation, and in validation before test.
    The IDs that are not reserved are shuffled and divided according to
    ``test_size`` and ``val_size``; the rest go to train.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``GROUPBY_COL`` and ``TARGET_COL`` columns.
    test_size, val_size : float
        Fractions of the non-reserved IDs assigned to the test / validation split.
        Both must be non-negative and sum to at most 1.
    min_per_class : int
        Number of IDs per class reserved for each split before the random split.
    seed : int
        Seed of the private random generator used for every shuffle.

    Returns
    -------
    (train_df, val_df, test_df) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If the fractions are negative or sum to more than 1.
    """
    # Small tolerance so that float round-off in e.g. 0.9 + 0.1 is not rejected.
    if test_size < 0 or val_size < 0 or test_size + val_size > 1 + 1e-9:
        raise ValueError(
            "test_size and val_size must be non-negative and sum to at most 1, "
            f"got test_size={test_size}, val_size={val_size}"
        )

    # A private generator yields the same stream as np.random.seed(seed) would,
    # without overwriting the global NumPy random state used by other cells.
    rng = np.random.RandomState(seed)

    # Unique IDs and classes, both in order of first appearance
    unique_ids = df[GROUPBY_COL].unique().tolist()
    classes = df[TARGET_COL].unique()

    # IDs assigned to each split
    train_ids, val_ids, test_ids = [], [], []

    for cls in classes:
        ids_for_class = df.loc[df[TARGET_COL] == cls, GROUPBY_COL].unique().tolist()
        rng.shuffle(ids_for_class)

        # Reserve up to min_per_class IDs for train, then val, then test
        class_count = len(ids_for_class)
        train_end = min(min_per_class, class_count)
        val_end = min(train_end + min_per_class, class_count)
        test_end = min(val_end + min_per_class, class_count)

        train_ids.extend(ids_for_class[:train_end])
        val_ids.extend(ids_for_class[train_end:val_end])
        test_ids.extend(ids_for_class[val_end:test_end])

    # IDs not reserved above. Keep them in order of first appearance: going through
    # set() would make the pre-shuffle order depend on string hashing, which differs
    # between Python processes and would defeat the seed.
    reserved_ids = set(train_ids) | set(val_ids) | set(test_ids)
    remaining_ids = [uid for uid in unique_ids if uid not in reserved_ids]
    rng.shuffle(remaining_ids)

    # Split sizes for the non-reserved IDs
    total_remaining = len(remaining_ids)
    test_count = int(total_remaining * test_size)
    val_count = int(total_remaining * val_size)

    # Assign the non-reserved IDs: test first, then validation, remainder to train
    test_ids += remaining_ids[:test_count]
    val_ids += remaining_ids[test_count:test_count + val_count]
    train_ids += remaining_ids[test_count + val_count:]

    # Create split DataFrames
    train_df = df[df[GROUPBY_COL].isin(train_ids)]
    val_df = df[df[GROUPBY_COL].isin(val_ids)]
    test_df = df[df[GROUPBY_COL].isin(test_ids)]

    return train_df, val_df, test_df

def create_sequences(df):
    """
    Build one ``(features, label, id)`` tuple per ``GROUPBY_COL`` group of ``df``.

    Within each group the rows are ordered by ``DATETIME_COL``. ``features`` is the
    array of ``FEATURE_COLS`` values in that order, ``label`` is the ``TARGET_COL``
    value of the earliest row, and ``id`` is the group key.

    Returns
    -------
    list of tuple
    """
    sequences = []
    for vehicle_id, group in df.groupby(GROUPBY_COL):
        # sort_values already returns a new frame, so no extra copy is needed
        ordered = group.sort_values(by=[DATETIME_COL])
        sequence_features = ordered[FEATURE_COLS].values
        # Select the column first, then the row: materialising the whole row as a
        # Series would coerce the label to the row's common dtype.
        label = ordered[TARGET_COL].iloc[0]
        sequences.append((sequence_features, label, vehicle_id))
    return sequences

class VehicleDataset(Dataset):
    """Map-style dataset with one item per sequence returned by ``create_sequences``."""

    def __init__(self, df):
        # All sequences are built once, up front, and kept in memory.
        self.sequences = create_sequences(df)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Tensors are created on access; the stored sequence is left untouched.
        features, target, vehicle_id = self.sequences[idx]
        features_tensor = torch.tensor(features, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return features_tensor, target_tensor, vehicle_id

def collate_fn(batch):
    """
    Collate ``(sequence_tensor, label, vehicle_id)`` items into one padded batch.

    Parameters
    ----------
    batch : sequence of tuple
        Items whose first element is a tensor with the time steps along dim 0.

    Returns
    -------
    padded_sequences : torch.Tensor
        Sequences zero-padded to the longest one, batch dimension first.
    labels_tensor : torch.Tensor
        Labels as a ``torch.long`` tensor.
    vehicle_ids : tuple
        IDs in batch order.
    lengths : torch.Tensor
        Unpadded length of each sequence.
    """
    sequences, labels, vehicle_ids = zip(*batch)

    # pad_sequence copies every sequence into a freshly allocated tensor, so a
    # defensive clone() per sequence is a wasted copy. detach() alone keeps the
    # output out of any autograd graph without copying data.
    padded_sequences = pad_sequence([seq.detach() for seq in sequences], batch_first=True)

    labels_tensor = torch.tensor(labels, dtype=torch.long)
    lengths = torch.tensor([len(seq) for seq in sequences])

    return padded_sequences, labels_tensor, vehicle_ids, lengths

### Archive

In [None]:
######################## USING GENERATORS ########################
def create_sequences_generator(df):
    """
    Lazily yield one ``(features, label, id)`` tuple per ``GROUPBY_COL`` group.

    For each group, ``DATETIME_COL`` is parsed with ``pd.to_datetime`` and the rows
    are ordered by it. ``features`` is the array of ``FEATURE_COLS`` values in that
    order, ``label`` is the ``TARGET_COL`` value of the earliest row, and ``id`` is
    the group key.
    """
    for vehicle_id, group in df.groupby(GROUPBY_COL):
        # Work on a derived frame instead of assigning into / sorting the groupby
        # slice in place, which can trigger SettingWithCopyWarning.
        ordered = (
            group
            .assign(**{DATETIME_COL: pd.to_datetime(group[DATETIME_COL])})
            .sort_values(by=[DATETIME_COL])
        )

        sequence_features = ordered[FEATURE_COLS].values
        # Column first, then row, so the label keeps its own dtype
        label = ordered[TARGET_COL].iloc[0]
        yield sequence_features, label, vehicle_id

class VehicleIterableDataset(IterableDataset):
    """Iterable-style dataset that builds sequences on demand via ``create_sequences_generator``."""

    def __init__(self, df):
        # Only the raw frame and the column configuration are stored here;
        # no sequence is built until the dataset is iterated.
        self.df = df
        self.groupby_col, self.feature_cols, self.target_col = (
            GROUPBY_COL,
            FEATURE_COLS,
            TARGET_COL,
        )

    def __iter__(self):
        # A generator is spent after one full pass, so each call creates a new one.
        fresh_generator = create_sequences_generator(self.df)
        return fresh_generator

def collate_fn_generator(batch):
    """
    Collate ``(sequence, label, vehicle_id)`` items into one padded batch.

    Each sequence is converted to a float32 tensor and zero-padded to the longest
    sequence in the batch (batch dimension first). Returns the padded tensor, the
    labels as a long tensor, the tuple of IDs, and a tensor of unpadded lengths.
    """
    raw_sequences, raw_labels, vehicle_ids = zip(*batch)

    # Convert each sequence and record its unpadded length in a single pass
    as_tensors = []
    seq_lengths = []
    for raw in raw_sequences:
        as_tensors.append(torch.tensor(raw, dtype=torch.float32))
        seq_lengths.append(len(raw))

    padded_sequences = pad_sequence(as_tensors, batch_first=True)
    labels_tensor = torch.tensor(raw_labels, dtype=torch.long)
    lengths = torch.tensor(seq_lengths)

    return padded_sequences, labels_tensor, vehicle_ids, lengths

## Load resampled data into `df`

In [5]:
# Read the resampled data and parse the timestamp column in one chained step.
df = pd.read_csv('resampled_vehicle_data.csv').assign(
    **{DATETIME_COL: lambda frame: pd.to_datetime(frame[DATETIME_COL])}
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1228962 entries, 0 to 1228961
Data columns (total 12 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   unique_id             1228962 non-null  object        
 1   datetime              1228962 non-null  datetime64[ns]
 2   vehicle_id            1228962 non-null  object        
 3   timestep_time         1228962 non-null  float64       
 4   vehicle_type          1228962 non-null  object        
 5   vehicle_speed         1228962 non-null  float64       
 6   vehicle_x             1228962 non-null  float64       
 7   vehicle_y             1228962 non-null  float64       
 8   vehicle_z             1228962 non-null  float64       
 9   vehicle_angle         1228962 non-null  float64       
 10  vehicle_angle_sine    1228962 non-null  float64       
 11  vehicle_angle_cosine  1228962 non-null  float64       
dtypes: datetime64[ns](1), float64(8), object(3)

## 0. Re-labelling classes

Following [this notebook on GitHub](https://github.com/pgrandinetti/standard-traffic-data/blob/main/knowledge/Urban_Traffic_Data_Statistical_Analysis.ipynb), we:

- Fill in null values with `pedestrian`
- Rename all vehicle types that start with `passenger` as `car`, as these instances are subclasses of car
- Rename vehicle types suffixed with bicycle (`fastbicycle`, `avgbicycle`, `slowbicycle`) as `bicycle`
- Rename `trailer` and `truck` as `lorry` as trailer is just a variation of truck
- Rename `uber` and `taxi` as `hire` as they represent the same concept
- Rename `army`, `authority`, `emergency` vehicles as `special`
- Drop all vehicle types starting with `hw_`, as these vehicles run on motorways and are hence not suitable for urban traffic analysis
- Drop rows containing `train` as they are not suitable for urban traffic analysis

In [None]:
# Vehicle types present before the renaming and dropping done in the next cell
df[TARGET_COL].unique()

In [None]:
# Fill missing vehicle types with 'pedestrian', then merge related types.
# Filling first also keeps the string tests below free of NaN, which would
# otherwise make the negated `startswith` mask raise.
df[TARGET_COL] = df[TARGET_COL].fillna('pedestrian').replace(regex={
    '^passenger.*': 'car',
    '.*bicycle$': 'bicycle',
    '^(trailer|truck)$': 'lorry',
    '^(uber|taxi)$': 'hire',
    '^(army|authority|emergency)$': 'special'
})

# Drop rows where vehicle types start with 'hw_' or are 'train' (single pass, one mask)
is_highway = df[TARGET_COL].str.startswith('hw_')
is_train = df[TARGET_COL] == 'train'
df = df[~(is_highway | is_train)]

In [None]:
# Sanity check: list the vehicle types left after the rename/drop cell above
# (no 'hw_*' or 'train' values should remain)
df[TARGET_COL].unique()

In [None]:
# Order rows by ID, then by timestamp, and renumber the index from 0
df = df.sort_values(by=[GROUPBY_COL, DATETIME_COL], ignore_index=True)


## 1. Label Encoding

In [6]:
# Encode the vehicle types as integer class ids.
# Guard against re-running this cell: once the column is integer-typed it has
# already been encoded. Encoding again would turn the ids into strings
# ('0', '1', '10', ...), refit on those, remap the ids in lexicographic order
# and replace le.classes_ with digit strings instead of the class names.
if not pd.api.types.is_integer_dtype(df[TARGET_COL]):
    le = LabelEncoder()
    df[TARGET_COL] = le.fit_transform(df[TARGET_COL].astype(str))

In [9]:
# Encoded id -> original class name, in the encoder's class order
label_mapping = dict(enumerate(le.classes_))
pprint.pprint(label_mapping)

{0: 'army',
 1: 'authority',
 2: 'avgbicycle',
 3: 'bus',
 4: 'coach',
 5: 'delivery',
 6: 'emergency',
 7: 'fastbicycle',
 8: 'hw_coach',
 9: 'hw_delivery',
 10: 'hw_motorcycle',
 11: 'hw_passenger1',
 12: 'hw_passenger2a',
 13: 'hw_passenger2b',
 14: 'hw_passenger3',
 15: 'hw_passenger4',
 16: 'hw_trailer',
 17: 'hw_truck',
 18: 'moped',
 19: 'motorcycle',
 20: 'passenger1',
 21: 'passenger2a',
 22: 'passenger2b',
 23: 'passenger3',
 24: 'passenger4',
 25: 'slowbicycle',
 26: 'taxi',
 27: 'trailer',
 28: 'train',
 29: 'truck',
 30: 'uber'}


## 2. Train val test split

There are 6138 vehicle_ids and 1228962 rows in df.
31 classes are represented in df.

vehicle_ids
vehicle_type
19    1000
22     596
24     588
21     583
23     582
20     573
18     451
5      317
3      192
2      177
7      160
25     128
30     125
6      119
26     115
1       70
27      54
12      34
4       29
16      26
11      26
9       26
8       26
29      25
15      24
13      24
17      20
14      18
10      16
28      12
0        2
Name: unique_id, dtype: int64

num_rows
vehicle_type
19    210160
22    156408
23    153694
20    147510
24    145906
21    140714
18     91608
3      39480
5      36238
2      14106
6      12624
25     12255
7      11201
27      8236
1       8182
30      7526
26      6682
28      5404
29      2924
4       2706
12      1990
8       1813
16      1801
11      1632
13      1539
9       1537
15      1485
17      1376
14      1107
10       991
0        127
Name: count, dtype: int64


In [13]:
# Split by ID; the arguments are spelled out so the split sizes are visible here
train_df, val_df, test_df = train_val_test_split_custom(
    df,
    test_size=0.2,
    val_size=0.1,
    min_per_class=1,
)

### Inspect balance of classes

#### df

In [14]:
# Headline counts for the full frame: distinct IDs, rows, and classes present
print(
    f"There are {df[GROUPBY_COL].nunique()} vehicle_ids and {len(df)} rows in df.",
    f"{df[TARGET_COL].nunique()} classes are represented in df.\n",
    sep="\n",
)

There are 6138 vehicle_ids and 1228962 rows in df.
31 classes are represented in df.



In [15]:
# Class balance in df: distinct unique_ids per class, largest class first
print(
    'unique_ids',
    f"{df.groupby(TARGET_COL)[GROUPBY_COL].nunique().sort_values(ascending=False)}",
    sep="\n",
)

vehicle_ids
vehicle_type
19    1000
22     596
24     588
21     583
23     582
20     573
18     451
5      317
3      192
2      177
7      160
25     128
30     125
6      119
26     115
1       70
27      54
12      34
4       29
16      26
11      26
9       26
8       26
29      25
15      24
13      24
17      20
14      18
10      16
28      12
0        2
Name: unique_id, dtype: int64


In [16]:
# Class balance in df: row count per class
print(
    'num_rows',
    f"{df[TARGET_COL].value_counts()}",
    sep="\n",
)

num_rows
vehicle_type
19    210160
22    156408
23    153694
20    147510
24    145906
21    140714
18     91608
3      39480
5      36238
2      14106
6      12624
25     12255
7      11201
27      8236
1       8182
30      7526
26      6682
28      5404
29      2924
4       2706
12      1990
8       1813
16      1801
11      1632
13      1539
9       1537
15      1485
17      1376
14      1107
10       991
0        127
Name: count, dtype: int64


#### train_df

In [17]:
# Headline counts for the training split: distinct IDs, rows, and classes present
print(
    f"There are {train_df[GROUPBY_COL].nunique()} vehicle_ids and {len(train_df)} rows in train_df.",
    f"{train_df[TARGET_COL].nunique()} classes are represented in train_df.\n",
    sep="\n",
)

There are 4264 vehicle_ids and 857233 rows in train_df.
31 classes are represented in train_df.



In [18]:
# Class balance in train_df: distinct unique_ids per class, largest class first
print(
    'unique_ids',
    f"{train_df.groupby(TARGET_COL)[GROUPBY_COL].nunique().sort_values(ascending=False)}",
    sep="\n",
)

unique_ids
vehicle_type
19    685
22    432
24    416
21    412
20    411
23    394
18    305
5     206
3     138
2     129
7     113
30     87
25     84
26     81
6      78
1      46
27     34
12     25
4      22
16     19
29     19
9      19
13     17
11     16
8      16
17     15
15     14
14     12
10     11
28      7
0       1
Name: unique_id, dtype: int64


In [19]:
# Class balance in train_df: row count per class
print(
    'num_rows',
    f"{train_df[TARGET_COL].value_counts()}",
    sep="\n",
)

num_rows
vehicle_type
19    143905
22    116451
24    106055
20    105165
23    102836
21     99261
18     58721
3      28956
5      23348
2      10764
25      8398
6       8149
7       7729
30      5239
1       5038
26      4713
27      4386
28      3299
4       2176
29      2130
12      1542
16      1306
9       1141
8       1124
13      1078
17      1033
11       995
15       817
14       776
10       674
0         28
Name: count, dtype: int64


#### val_df

In [20]:
# Headline counts for the validation split: distinct IDs, rows, and classes present
print(
    f"There are {val_df[GROUPBY_COL].nunique()} vehicle_ids and {len(val_df)} rows in val_df.",
    f"{val_df[TARGET_COL].nunique()} classes are represented in val_df.\n",
    sep="\n",
)

There are 635 vehicle_ids and 128574 rows in val_df.
31 classes are represented in val_df.



In [21]:
# Class balance in val_df: distinct unique_ids per class, largest class first
print(
    'unique_ids',
    f"{val_df.groupby(TARGET_COL)[GROUPBY_COL].nunique().sort_values(ascending=False)}",
    sep="\n",
)

unique_ids
vehicle_type
19    114
23     72
21     63
24     59
20     51
18     47
22     47
5      40
7      19
3      18
2      16
25     16
6      15
26     13
27      6
30      5
15      4
1       4
11      3
9       3
8       3
4       3
17      2
14      2
13      2
12      2
28      2
16      1
10      1
29      1
0       1
Name: unique_id, dtype: int64


In [22]:
# Class balance in val_df: row count per class
print(
    'num_rows',
    f"{val_df[TARGET_COL].value_counts()}",
    sep="\n",
)

num_rows
vehicle_type
19    26219
23    20662
21    14921
24    14212
20    13393
18    10235
22    10149
5      4779
3      3618
6      1634
7      1433
25     1294
2      1049
26      836
27      772
28      560
1       514
4       382
30      283
15      267
8       205
9       201
11      193
17      138
14      130
13      120
0        99
12       89
10       72
16       69
29       46
Name: count, dtype: int64


#### test_df

In [23]:
# Headline counts for the test split: distinct IDs, rows, and classes present
print(
    f"There are {test_df[GROUPBY_COL].nunique()} vehicle_ids and {len(test_df)} rows in test_df.",
    f"{test_df[TARGET_COL].nunique()} classes are represented in test_df.\n",
    sep="\n",
)

There are 1239 vehicle_ids and 243155 rows in test_df.
30 classes are represented in test_df.



In [24]:
# Class balance in test_df: distinct unique_ids per class, largest class first
print(
    'unique_ids',
    f"{test_df.groupby(TARGET_COL)[GROUPBY_COL].nunique().sort_values(ascending=False)}",
    sep="\n",
)

unique_ids
vehicle_type
19    201
22    117
23    116
24    113
20    111
21    108
18     99
5      71
3      36
30     33
2      32
7      28
25     28
6      26
26     21
1      20
27     14
12      7
11      7
8       7
15      6
16      6
13      5
29      5
14      4
10      4
9       4
4       4
17      3
28      3
Name: unique_id, dtype: int64


In [25]:
# Class balance in test_df: row count per class
print(
    'num_rows',
    f"{test_df[TARGET_COL].value_counts()}",
    sep="\n",
)

num_rows
vehicle_type
19    40036
23    30196
22    29808
20    28952
21    26532
24    25639
18    22652
5      8111
3      6906
27     3078
6      2841
1      2630
25     2563
2      2293
7      2039
30     2004
28     1545
26     1133
29      748
8       484
11      444
16      426
15      401
12      359
13      341
10      245
17      205
14      201
9       195
4       148
Name: count, dtype: int64
