# Accidents in Île-de-France

Dataset: https://www.onisr.securite-routiere.gouv.fr/recherche-statistique-des-accidents

In [41]:
import re
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

In [48]:
# Read the semicolon-separated accident export and preview its first rows
df = pd.read_csv("data/accidents_dataset.csv", sep=';')
df.head()

Unnamed: 0,Dépt,Commune,Date,Tué(s),Blessés,dont hospitalisés,dont légers,Milieu,Autoroute,Adresse,Latitude,Longitude,Résumé
0,75,75119 - Paris 19e Arrondissement,01/01/2023 - 00h25,0,1,0,1,En agglomération,Non,Avenue de Flandre,2379,48892,"Accident Léger non mortel, En agglomération, E..."
1,92,92072 - Sèvres,01/01/2023 - 01h50,0,4,0,4,Hors agglomération,Non,N118,2219,48821,"Accident Léger non mortel, Hors agglomération,..."
2,75,75112 - Paris 12e Arrondissement,01/01/2023 - 01h55,0,2,0,2,En agglomération,Non,BOULEVARD DE BERCY,2382,4884,"Accident Léger non mortel, En agglomération, E..."
3,93,93066 - Saint-Denis,01/01/2023 - 03h00,0,1,0,1,En agglomération,Non,Avenue du Colonel Fabien,2354,48945,"Accident Léger non mortel, En agglomération, H..."
4,77,77384 - Réau,01/01/2023 - 03h45,0,2,1,1,Hors agglomération,Oui,A5b,263,4862,"Accident Grave non mortel, Hors agglomération,..."


In [49]:
# Description of the dataset: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Dépt               3000 non-null   int64 
 1   Commune            3000 non-null   object
 2   Date               3000 non-null   object
 3   Tué(s)             3000 non-null   int64 
 4   Blessés            3000 non-null   int64 
 5   dont hospitalisés  3000 non-null   int64 
 6   dont légers        3000 non-null   int64 
 7   Milieu             3000 non-null   object
 8   Autoroute          3000 non-null   object
 9   Adresse            2996 non-null   object
 10  Latitude           3000 non-null   object
 11  Longitude          3000 non-null   object
 12  Résumé             2999 non-null   object
dtypes: int64(5), object(8)
memory usage: 304.8+ KB


#### Data cleaning & Preprocessing

In [50]:
# The coordinate columns are loaded as text with a decimal comma; swap the comma for a dot
# and convert to float.
# The dtype guard makes this cell safe to re-run: once a column is numeric, the `.str`
# accessor would raise, so already-converted columns are skipped.
# `regex=False` makes the literal replacement explicit (the default changed across pandas versions).
# NOTE(review): in the preview above, 'Latitude' values start with 2 and 'Longitude' values
# start with 48, which looks swapped for this region — TODO confirm before any mapping.
for coord_column in ('Latitude', 'Longitude'):
    if not pd.api.types.is_numeric_dtype(df[coord_column]):
        df[coord_column] = (
            df[coord_column]
            .str.replace(',', '.', regex=False)
            .astype(float)
        )

In [51]:
# Parse the 'Date' strings into timestamps once, then derive the time-of-day and weekday features
accident_time = pd.to_datetime(df['Date'], format='%d/%m/%Y - %Hh%M')
df['Date'] = accident_time
df['Hour'] = accident_time.dt.hour
df['DayOfWeek'] = accident_time.dt.dayofweek  # Monday=0, Sunday=6

In [52]:
# Create new variable 'Severity' (can be used for classification later)
def define_severity(row):
    """Classify one accident by its worst casualty outcome.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the counts
            'Tué(s)', 'dont hospitalisés' and 'dont légers'.

    Returns:
        'Fatal', 'Serious' or 'Light' for the first of those counts (checked in
        that order) that is greater than zero, otherwise 'Unharmed'.
    """
    # Ordered from most to least severe: the first positive count decides the label
    severity_by_column = (
        ('Tué(s)', 'Fatal'),
        ('dont hospitalisés', 'Serious'),
        ('dont légers', 'Light'),
    )
    for column, label in severity_by_column:
        if row[column] > 0:
            return label
    # No deaths, hospitalisations or light injuries recorded
    return 'Unharmed'

# Label every accident row with its severity class
severity_labels = df.apply(define_severity, axis=1)
df['Severity'] = severity_labels

In [53]:
# Handle missing values (simple drop for now): discard any row that lacks
# a coordinate, an address or a summary, then re-check the column overview
df = df.dropna(
    subset=['Latitude', 'Adresse', 'Longitude', 'Résumé'],
    how='any',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2995 entries, 0 to 2999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Dépt               2995 non-null   int64         
 1   Commune            2995 non-null   object        
 2   Date               2995 non-null   datetime64[ns]
 3   Tué(s)             2995 non-null   int64         
 4   Blessés            2995 non-null   int64         
 5   dont hospitalisés  2995 non-null   int64         
 6   dont légers        2995 non-null   int64         
 7   Milieu             2995 non-null   object        
 8   Autoroute          2995 non-null   object        
 9   Adresse            2995 non-null   object        
 10  Latitude           2995 non-null   float64       
 11  Longitude          2995 non-null   float64       
 12  Résumé             2995 non-null   object        
 13  Hour               2995 non-null   int32         
 14  DayOfWeek    

#### Feature engineering

In [None]:
# Résumé column engineering

# Regex patterns that pull structured fields out of the free-text summary.
# Lazy quantifiers (.*?) and non-capturing groups (?:...) stop each capture at the
# first delimiter instead of swallowing the rest of the sentence.
# NOTE(review): the preview printed at the end of this cell shows 'Unknown' for
# Weather and Road_Surface on all five rows, so these two patterns may not match the
# actual 'Résumé' wording — TODO confirm against a few full summaries.
pattern_weather = r'météo (.*?)(?: et |,|avec)'
pattern_road = r'surface chaussée : ([^.]+)'
pattern_light = r'(Nuit .*? éclairage .*?)(?:,|\.$)|(Jour)'

# Single-group patterns: expand=False yields one Series of captured text per pattern
df['Weather'] = df['Résumé'].str.extract(
    pattern_weather, flags=re.IGNORECASE, expand=False
)
df['Road_Surface'] = df['Résumé'].str.extract(
    pattern_road, flags=re.IGNORECASE, expand=False
)

# The lighting pattern has two alternative capture groups (night description or day),
# so the extract has two columns; keep the first and fall back to the second where it is missing
light_extract = df['Résumé'].str.extract(pattern_light, flags=re.IGNORECASE)
df['Lighting'] = light_extract[0].combine_first(light_extract[1])

# Clean up extracted data (strip whitespace, fill NaNs)
def clean_feature(x):
    """Normalise one extracted value.

    Args:
        x: The extracted cell value; a string when the pattern matched,
            anything else (e.g. NaN) when it did not.

    Returns:
        The string with leading/trailing whitespace removed, or the literal
        'Unknown' for any non-string input.
    """
    return x.strip() if isinstance(x, str) else 'Unknown'

# Apply the same clean-up to each engineered column, then preview the result
feature_columns = ['Weather', 'Road_Surface', 'Lighting']
for column in feature_columns:
    df[column] = df[column].apply(clean_feature)

print(df[feature_columns].head())

   Weather Road_Surface                               Lighting
0  Unknown      Unknown     Nuit avec éclairage public  allumé
1  Unknown      Unknown  Nuit avec éclairage public non allumé
2  Unknown      Unknown     Nuit avec éclairage public  allumé
3  Unknown      Unknown     Nuit avec éclairage public  allumé
4  Unknown      Unknown             Nuit sans éclairage public


#### Feature Engineering & Geospatial Analysis