# Collision stats

__May 2025__

Answering 2 key questions:
- What fraction of serious or fatal collisions happen at a junction?
- What fraction of casualties are female?

In [None]:
import yaml
import pandas as pd

from yaml import Loader

## Q1 - What fraction of serious or fatal collisions happen at a junction?

This data has the junction filter removed.

In [None]:
params = yaml.load(open("../params.yaml", 'r'), Loader=Loader)
df = pd.read_csv('../data/pedestrian-and-cyclist-collisions-all.csv')

print('Junction types:')
print(params['valid_junction_types'])

df.head()

Junction types:
['roundabout', 'mini_roundabout', 't_or_staggered_junction', 'slip_road', 'crossroads', 'multi_junction', 'other_junction', 'unknown']


Unnamed: 0,raw_collision_id,borough,easting,northing,location,collision_severity,junction_detail,date,time,year,...,serious_cyclist_casualties,slight_cyclist_casualties,max_cyclist_severity,fatal_pedestrian_casualties,serious_pedestrian_casualties,slight_pedestrian_casualties,max_pedestrian_severity,recency_weight,is_cyclist_collision,is_pedestrian_collision
0,1230419171,MERTON,525060.0,170416.0,"ON GLADSTONE ROAD, NEAR THE JUNCTION WITH SIR ...",slight,other_junction,2023-01-01,01:24:00,2023,...,,,,0.0,0.0,1.0,slight,1.0,False,True
1,1230419191,BARNET,520341.0,190175.0,"ON BURNT OAK BROADWAY, NEAR THE JUNCTION WITH ...",slight,t_or_staggered_junction,2023-01-01,02:13:00,2023,...,0.0,1.0,slight,,,,,1.0,True,False
2,1230419198,BRENT,524780.0,184471.0,"ON KILBURN HIGH ROAD, 30 METRES SOUTH OF THE J...",slight,no_junction_in_20m,2023-01-01,02:10:00,2023,...,,,,0.0,0.0,1.0,slight,1.0,False,True
3,1230419201,SOUTHWARK,532189.0,179517.0,"ON BOROUGH ROAD, NEAR THE JUNCTION WITH BOROUG...",slight,crossroads,2023-01-01,03:00:00,2023,...,0.0,1.0,slight,,,,,1.0,True,False
4,1230419209,HARINGEY,533656.0,188929.0,"ON HIGH ROAD, 28 METRES NORTH OF THE JUNCTION ...",slight,no_junction_in_20m,2023-01-01,07:23:00,2023,...,,,,0.0,0.0,1.0,slight,1.0,False,True


### Percentage of serious or fatal collisions at a junction?

Cyclist and last 5 years.

In [22]:
mask = (
    df['is_cyclist_collision']
    &
    ((df['max_cyclist_severity'] == 'serious') | (df['max_cyclist_severity'] == 'fatal'))
)

(
    df[mask & df['junction_detail'].isin(params['valid_junction_types'])]
    .raw_collision_id.nunique()
    /
    df[mask].raw_collision_id.nunique()
) * 100

77.75603392041748

### Above, but cut by year

In [71]:
# check the trend
(
    df[mask & df['junction_detail'].isin(params['valid_junction_types'])].groupby('year')['raw_collision_id'].nunique()
    /
    df[mask].groupby('year')['raw_collision_id'].nunique()
    *
    100
)

year
2019    75.515464
2020    77.739331
2021    76.653307
2022    79.060665
2023    79.380342
Name: raw_collision_id, dtype: float64

### Finally, do the stats move much if we filter out unknown?

In [32]:
# exclude unknown
junction_types = [
    'roundabout', 'mini_roundabout', 't_or_staggered_junction',
    'slip_road', 'crossroads', 'multi_junction', 'other_junction'
]

(
    df[mask & df['junction_detail'].isin(junction_types)]
    .raw_collision_id.nunique()
    /
    df[mask].raw_collision_id.nunique()
) * 100

74.40747988693194

## Q2 - What fraction of casualties are female?

In [52]:
casualties = pd.read_csv('../data/casualties.csv')
casualties['year'] = casualties['collision_id'].apply(lambda x: int(str(x)[0:4]))

casualties.head()

Unnamed: 0,raw_collision_id,casualty_id,casualty_class,casualty_gender,number_of_casualties,casualty_severity,mode_of_travel,collision_id,year
0,1230419171,1,pedestrian,Female,1,slight,pedestrian,2023010419171,2023
1,1230419183,1,driver_or_rider,Male,1,slight,car,2023010419183,2023
2,1230419183,2,passenger,Female,1,slight,car,2023010419183,2023
3,1230419189,1,driver_or_rider,Male,1,slight,car,2023010419189,2023
4,1230419191,1,driver_or_rider,Male,1,slight,pedal_cycle,2023010419191,2023


In [43]:
casualty_mask = casualties['mode_of_travel'] == 'pedal_cycle'
severity_mask = (
    (casualties['casualty_severity'] == 'serious') |
    (casualties['casualty_severity'] == 'fatal')
)

### % Female, for all severities

In [46]:
display(
    casualties[casualty_mask].groupby('casualty_gender')['raw_collision_id'].count()
)

(
    casualties[casualty_mask]
    .groupby('casualty_gender')['raw_collision_id']
    .count()
    /
    casualties[casualty_mask]['raw_collision_id']
    .count()
    *
    100
)

casualty_gender
Female      5632
Male       18497
Unknown      464
Name: raw_collision_id, dtype: int64

casualty_gender
Female     22.900825
Male       75.212459
Unknown     1.886716
Name: raw_collision_id, dtype: float64

#### Above, but broken down by year

In [66]:
gender_by_year = (
    casualties[casualty_mask]
    .groupby(['year', 'casualty_gender'])
    ['raw_collision_id']
    .count()
    .reset_index(name='count')
    .pivot(columns='casualty_gender', index='year', values='count')
)

gender_by_year['%_female'] = gender_by_year['Female'] / gender_by_year.sum(axis=1)
gender_by_year['%_male'] = gender_by_year['Male'] / gender_by_year.sum(axis=1)

gender_by_year

casualty_gender,Female,Male,Unknown,%_female,%_male
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019,1140,3441,53,0.246008,0.742516
2020,1140,3586,63,0.238046,0.748762
2021,1194,3971,112,0.226265,0.752479
2022,1124,3855,112,0.220782,0.757186
2023,1034,3644,124,0.215327,0.758816


### % Female, for serious or fatal

In [48]:
display(
    casualties[casualty_mask & severity_mask].groupby('casualty_gender')['raw_collision_id'].count()
)

(
    casualties[casualty_mask & severity_mask]
    .groupby('casualty_gender')['raw_collision_id']
    .count()
    /
    casualties[casualty_mask & severity_mask]['raw_collision_id']
    .count()
    *
    100
)

casualty_gender
Female     1072
Male       3529
Unknown      12
Name: raw_collision_id, dtype: int64

casualty_gender
Female     23.238673
Male       76.501192
Unknown     0.260134
Name: raw_collision_id, dtype: float64

#### Above, but broken down by year

In [67]:
gender_by_year = (
    casualties[casualty_mask & severity_mask]
    .groupby(['year', 'casualty_gender'])
    ['raw_collision_id']
    .count()
    .reset_index(name='count')
    .pivot(columns='casualty_gender', index='year', values='count')
)

gender_by_year['%_female'] = gender_by_year['Female'] / gender_by_year.sum(axis=1)
gender_by_year['%_male'] = gender_by_year['Male'] / gender_by_year.sum(axis=1)

gender_by_year

casualty_gender,Female,Male,Unknown,%_female,%_male
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019,175,602,1,0.224936,0.773555
2020,209,658,1,0.240783,0.757854
2021,238,758,3,0.238238,0.758578
2022,234,789,4,0.227848,0.768087
2023,216,722,3,0.229543,0.767082


In [72]:
casualties[casualty_mask].to_csv('../data/cyclist_casualties.csv', index=False)