# Road Accident Classification - Local News Articles

This notebook analyzes the local_news_articles.csv dataset to identify and flag road accidents vs non-accidents.

In [214]:
import re
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the raw article export into memory
df = pd.read_csv('../../../data/local_news_articles.csv')

# Summarise the frame: dimensions, column names, then a preview of the first rows
print(
    f"Dataset shape: {df.shape}",
    f"\nColumn names:",
    df.columns.tolist(),
    f"\nFirst few rows:",
    sep="\n",
)
df.head()

Dataset shape: (321, 14)

Column names:
['article_id', 'url', 'source_name', 'source_url', 'title', 'subtitle', 'author_name', 'publish_date', 'content', 'top_image_url', 'top_image_caption', 'created_at', 'tags', 'categories']

First few rows:


Unnamed: 0,article_id,url,source_name,source_url,title,subtitle,author_name,publish_date,content,top_image_url,top_image_caption,created_at,tags,categories
0,4208,https://timesofmalta.com/article/driver-stuck-...,Times of Malta,https://timesofmalta.com,Driver stuck in traffic says speeding LESA car...,‘I was shocked at that moment but more so frus...,Emma Borg,2024-12-07,A motorist claims his car mirror was shattered...,https://cdn-attachments.timesofmalta.com/706da...,The broken car mirror. Photo: Frank Xerri De Caro,2025-07-03 15:14:21.554132+00,"{Accident,Lesa,National}",{}
1,4167,https://timesofmalta.com/article/pn-slams-gove...,Times of Malta,https://timesofmalta.com,PN slams government for diverting EU bus funds...,"'By encouraging the use of private cars, the g...",Times of Malta,2024-12-09,The PN on Monday slammed the government for di...,https://cdn-attachments.timesofmalta.com/d9afe...,"PN spokespeople Ryan Callus, Mark Anthony Samm...",2025-07-03 15:14:10.643172+00,"{""Climate Change"",Environment,""European Union""...",{}
2,4093,https://timesofmalta.com/article/motorcyclist-...,Times of Malta,https://timesofmalta.com,Motorcyclist seriously hurt in St Paul's Bay b...,Residents complained several times about inade...,Times of Malta,2024-12-11,A motorcyclist was rushed to hospital in a cri...,https://cdn-attachments.timesofmalta.com/633f6...,Photo: Malta Police Force,2025-07-03 15:13:50.605708+00,"{Accident,National,""St Paul’S Bay"",Traffic}",{}
3,4110,https://timesofmalta.com/article/skip-involved...,Times of Malta,https://timesofmalta.com,Skip involved in horror St Paul’s Bay bypass c...,Motorcyclist hurt in crash on Wednesday evenin...,Emma Borg,2024-12-12,A private contractor who placed a skip on St P...,https://cdn-attachments.timesofmalta.com/fc23e...,A 54-year-old man was seriously injured when h...,2025-07-03 15:13:54.812813+00,"{Accident,National,""St Paul’S Bay""}",{}
4,4066,https://timesofmalta.com/article/two-people-in...,Times of Malta,https://timesofmalta.com,"Two people, including teenage girl, critically...",Incidents in Mellieħa and Gudja on Friday even...,Times of Malta,2024-12-14,A 29-year-old man and 17-year-old girl were cr...,https://cdn-attachments.timesofmalta.com/f1761...,The Ford Fiesta involved in the Gudja collisio...,2025-07-03 15:13:43.83839+00,"{Accident,Gudja,Mellieħa,National,Traffic}",{}


## Road Accident Classification

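The classifier below flags an article as a road accident by combining several signals:
- Accident keywords (crash, collision, injured, ...) in the title and content
- The 'Accident' tag (the 'Traffic' tag on its own is not treated as evidence of an accident)
- Mentions of vehicles or road users (car, motorcycle, cyclist, pedestrian, ...)
- Exclusion rules for policy coverage and non-accident traffic stories (speeding, potholes, ...)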

In [215]:
# Vehicle nouns are matched as whole words (singular or plural). Plain substring
# tests made 'car' fire on "care"/"carried", 'bus' on "business" and 'van' on
# "relevant"/"advantage", which turned unrelated injury stories into road accidents.
_VEHICLE_PATTERN = re.compile(
    r'\b(?:cars?|(?:mini)?bus(?:es)?|trucks?|vans?|motorcycles?|(?:motor)?bikes?|'
    r'bicycles?|scooters?|vehicles?)\b'
)


def classify_road_accident(row):
    """Classify whether an article is about a road accident.

    Args:
        row: Mapping-like record (DataFrame row or dict) providing the
            'title', 'content' and 'tags' fields.

    Returns:
        1 if the article is judged to describe a road accident, otherwise 0.

    Decision order:
        1. Articles mentioning three or more policy keywords are rejected.
        2. Articles mentioning any non-accident traffic keyword are rejected.
        3. An article is accepted when it carries the 'accident' tag together with
           any road signal (vehicle, accident keyword or road user), or when an
           accident keyword co-occurs with a vehicle or road user.
    """
    text = (str(row['title']) + ' ' + str(row['content'])).lower()
    tags = str(row['tags']).lower()

    # Exclude policy articles (government, budget, legislation, etc.)
    policy_keywords = ['government', 'minister', 'policy', 'budget', 'funds', 'legislation', 'parliament', 'proposal', 'grant', 'incentive', 'subsidy']
    if sum(1 for k in policy_keywords if k in text) >= 3:
        return 0

    # Exclude non-accident traffic incidents
    non_accident_keywords = ['speeding', 'speed gun', 'caught doing', 'clocked at', 'pothole', 'flat tyre', 'flat tire', 'road damage', 'traffic violation', 'employer']
    if any(k in text for k in non_accident_keywords):
        return 0

    person_vehicle_terms = ['motorcyclist', 'cyclist', 'pedestrian']
    # Substring matching is kept here on purpose so inflections such as
    # "crashed" or "collisions" still count.
    accident_keywords = ['crash', 'collision', 'injured', 'grievously injured', 'seriously injured', 'hit by', 'overturned', 'lost control', 'hit-and-run', 'run over']

    has_person_vehicle = any(k in text for k in person_vehicle_terms)
    has_accident_keyword = any(k in text for k in accident_keywords)
    has_vehicle = _VEHICLE_PATTERN.search(text) is not None

    if 'accident' in tags and (has_vehicle or has_accident_keyword or has_person_vehicle):
        return 1
    if has_accident_keyword and (has_vehicle or has_person_vehicle):
        return 1
    # The former third rule (road user + accident keyword/tag) could never fire:
    # both of its cases are already accepted by the two rules above.

    return 0

# Label every article, then keep an independent copy of the positive cases
df['is_road_accident'] = df.apply(classify_road_accident, axis=1)
accident_mask = df['is_road_accident'].eq(1)
accidents_df = df.loc[accident_mask].copy()

# Report how many articles were flagged and their share of the dataset
accident_count = len(accidents_df)
accident_share = accident_count / len(df) * 100
print(f"Accidents identified: {accident_count} ({accident_share:.1f}%)")

Accidents identified: 235 (73.2%)


## Extract Accident Date and Time

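Let's extract the time the accident occurred from the article content and infer its date from weekday or relative-day mentions ("on Friday", "yesterday"), anchored to the article's publish date.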

In [216]:
def parse_time_to_datetime(date_obj, time_str):
    """Combine a calendar date with a clock-time string taken from article text.

    Args:
        date_obj: The accident date (pandas Timestamp, datetime or date).
            None/NaT yields None.
        time_str: Clock time such as '7pm', '7.30pm', '7:30pm' or the 24-hour
            form '19:30'. A '.' separator is treated like ':'.

    Returns:
        A ``datetime.datetime`` on the given date at the parsed time, or None when
        either input is missing or the time string cannot be parsed.
    """
    if not time_str or pd.isna(date_obj):
        return None

    cleaned = str(time_str).lower().strip().replace('.', ':')
    if cleaned.endswith(('am', 'pm')):
        time_format = '%I:%M%p' if ':' in cleaned else '%I%p'
    else:
        # Times without an am/pm suffix (e.g. '14:30') are 24-hour values; forcing
        # a 12-hour format on them used to discard every such match.
        time_format = '%H:%M'

    try:
        time_obj = datetime.strptime(cleaned, time_format).time()
        date_only = date_obj.date() if hasattr(date_obj, 'date') else date_obj
        return datetime.combine(date_only, time_obj)
    except (ValueError, TypeError):
        # Unparseable time or unusable date: stay best-effort, but no longer hide
        # unrelated failures (e.g. a missing import) behind a bare except.
        return None

# Accepted publish_date layouts, tried in order: ISO (the layout stored in the CSV)
# first, then the day-first layout this notebook originally assumed.
_PUBLISH_DATE_FORMATS = ('%Y-%m-%d', '%d/%m/%Y')


def _parse_publish_date(value):
    """Return ``value`` as a pandas Timestamp, or None when it cannot be parsed."""
    if isinstance(value, str):
        value = value.strip()
        candidate_formats = _PUBLISH_DATE_FORMATS
    else:
        candidate_formats = (None,)  # let pandas convert date-like objects itself

    for fmt in candidate_formats:
        try:
            parsed = pd.to_datetime(value, format=fmt)
        except (ValueError, TypeError):
            continue
        if pd.notna(parsed):
            return parsed
    return None


def extract_accident_datetime(row):
    """Infer when an accident happened from the article text and publish date.

    Args:
        row: Mapping-like record providing 'content' and 'publish_date'.

    Returns:
        Tuple ``(extracted_time, extracted_day, accident_date, accident_datetime)``:
        - extracted_time: raw time string found in the text (e.g. '7.30pm') or None.
        - extracted_day: 'Monday'..'Sunday', 'today', 'yesterday' or None.
        - accident_date: publish date shifted back to the mentioned day; the
          publish date itself when no day is mentioned; None if the publish date
          is unparseable.
        - accident_datetime: accident_date combined with the extracted time, or
          accident_date at midnight when no time was found, or None.
    """
    text = str(row['content']).lower()
    # The CSV stores ISO dates (YYYY-MM-DD); parsing only with '%d/%m/%Y' failed for
    # every row and silently left all accident dates empty.
    pub_date = _parse_publish_date(row['publish_date'])

    # Tried in order; the first pattern that matches wins. The previous third
    # pattern ("reported/occurred/happened at ...") was dropped because any text it
    # matched was already matched by the first pattern.
    time_patterns = [
        r'at (?:around |about )?(\d{1,2}(?:\.\d{2})?(?:am|pm))',
        r'at (?:around |about )?(\d{1,2}:\d{2}(?:am|pm)?)',
    ]

    extracted_time = None
    for pattern in time_patterns:
        match = re.search(pattern, text)
        if match:
            extracted_time = match.group(1)
            break

    # Pattern -> (label, weekday index). Checked in this order, not in text order.
    day_patterns = {
        r'on monday': ('Monday', 0), r'on tuesday': ('Tuesday', 1), r'on wednesday': ('Wednesday', 2),
        r'on thursday': ('Thursday', 3), r'on friday': ('Friday', 4), r'on saturday': ('Saturday', 5),
        r'on sunday': ('Sunday', 6), r'this (?:morning|afternoon|evening)': ('today', None),
        r'(?:yesterday|last night)': ('yesterday', None),
    }

    extracted_day = None
    accident_date = None

    for pattern, (day_value, weekday) in day_patterns.items():
        if re.search(pattern, text):
            extracted_day = day_value
            if pub_date is not None:
                if day_value == 'today':
                    accident_date = pub_date
                elif day_value == 'yesterday':
                    accident_date = pub_date - timedelta(days=1)
                elif weekday is not None:
                    # Most recent occurrence of that weekday on or before publication
                    days_back = (pub_date.weekday() - weekday) % 7
                    accident_date = pub_date - timedelta(days=days_back)
            break

    # No day mentioned: assume the accident happened on the publish date
    if accident_date is None and pub_date is not None:
        accident_date = pub_date

    if extracted_time:
        accident_datetime = parse_time_to_datetime(accident_date, extracted_time)
    else:
        accident_datetime = pd.Timestamp(accident_date) if accident_date is not None else None

    return extracted_time, extracted_day, accident_date, accident_datetime


(
    accidents_df['accident_time'],
    accidents_df['accident_day'],
    accidents_df['accident_date'],
    accidents_df['accident_datetime'],
) = zip(*accidents_df.apply(extract_accident_datetime, axis=1))

# Confidence reflects how much of the timestamp came from the text itself
accidents_df['time_confidence'] = accidents_df.apply(
    lambda row: 'High' if pd.notna(row['accident_time']) else ('Medium' if pd.notna(row['accident_day']) else 'Low'), axis=1
)

# Hour is only meaningful when a clock time was extracted; date-only rows carry a
# midnight placeholder that would otherwise be reported as hour 0.
accidents_df['accident_hour'] = accidents_df.apply(
    lambda row: row['accident_datetime'].hour
    if pd.notna(row['accident_time']) and pd.notna(row['accident_datetime'])
    else None,
    axis=1,
)

accidents_df['accident_is_weekend'] = accidents_df['accident_datetime'].apply(
    lambda dt: 1 if (pd.notna(dt) and dt.weekday() >= 5) else (0 if pd.notna(dt) else None)
)

# Hours between the accident and the (midnight) publish date, floored at 0.
# Uses the same tolerant date parsing as above: a strict '%d/%m/%Y' parse raises on
# the ISO dates in this dataset as soon as accident_datetime is populated.
publish_timestamps = accidents_df['publish_date'].apply(_parse_publish_date)
accidents_df['publication_delay_hours'] = [
    max(0, (published - occurred).total_seconds() / 3600)
    if pd.notna(published) and pd.notna(occurred)
    else None
    for published, occurred in zip(publish_timestamps, accidents_df['accident_datetime'])
]

def categorize_time_of_day(dt):
    """Map a timestamp to one of four six-hour day-part labels.

    Args:
        dt: Object exposing an ``hour`` attribute, or a missing value (None/NaT).

    Returns:
        The label of the six-hour window containing ``dt.hour``, or None when
        ``dt`` is missing.
    """
    if pd.isna(dt):
        return None

    # One label per consecutive six-hour window, indexed by hour // 6
    day_parts = (
        'Night (00:00-06:00)',
        'Morning (06:00-12:00)',
        'Afternoon (12:00-18:00)',
        'Evening (18:00-00:00)',
    )
    return day_parts[dt.hour // 6]

# Only bucket rows whose clock time was actually extracted from the article.
# Date-only rows hold a midnight placeholder in accident_datetime and would
# otherwise all be labelled 'Night (00:00-06:00)'.
accidents_df['time_of_day_category'] = accidents_df.apply(
    lambda row: categorize_time_of_day(row['accident_datetime']) if pd.notna(row['accident_time']) else None,
    axis=1,
)

# Add holiday/event/school indicators
def get_holiday_event_status(dt):
    """
    Determine if date is a holiday, event, or school holiday period.

    Args:
        dt: Accident timestamp (pandas Timestamp, datetime or date). Missing
            values (None/NaT) yield ('no', 'no', 'no').

    Returns: (is_holiday, is_event, is_school_holiday)
        - is_holiday: 'yes' on a listed holiday, 'eve of' on the day before one
          ('eve of' takes priority when both apply), otherwise 'no'.
        - is_event: 'yes' inside a listed event period for that year, else 'no'.
        - is_school_holiday: 'yes' inside an approximate school-holiday period,
          else 'no'.
    """
    if pd.isna(dt):
        return 'no', 'no', 'no'

    date = dt.date() if hasattr(dt, 'date') else dt
    year = date.year
    # (month, day) tuples compare chronologically within a year, which avoids
    # building throw-away date objects for every range check.
    month_day = (date.month, date.day)

    # Maltese public holidays (fixed dates)
    # These are official public holidays that may affect traffic patterns due to
    # increased leisure travel, celebrations, and reduced commercial activity
    # NOTE(review): Boxing Day is listed here but may not be an official public
    # holiday in Malta - confirm before relying on it (it also makes Christmas Day
    # report 'eve of').
    holidays = [
        (1, 1),   # New Year's Day
        (2, 10),  # St Paul's Shipwreck (Feast of St Paul's Shipwreck)
        (3, 19),  # St Joseph's Day
        (3, 31),  # Freedom Day (Jum il-Ħelsien)
        (5, 1),   # Workers' Day (May Day)
        (6, 7),   # Sette Giugno (Commemoration of 1919 riots)
        (6, 29),  # St Peter & St Paul (L-Imnarja - major feast)
        (8, 15),  # Assumption of Mary (Santa Marija - mid-summer holiday)
        (9, 8),   # Victory Day (Our Lady of Victories)
        (9, 21),  # Independence Day
        (12, 8),  # Immaculate Conception
        (12, 13), # Republic Day
        (12, 25), # Christmas Day
        (12, 26), # Boxing Day
    ]

    # Variable holidays for 2024-2025 (change yearly based on lunar calendar)
    # Easter Sunday fell on 31 March 2024 and 20 April 2025; the previous entries
    # (30 March / 19 April) were the Saturdays before Easter.
    variable_holidays = {
        2024: [(3, 29), (3, 31)],  # Good Friday, Easter Sunday 2024
        2025: [(4, 18), (4, 20)],  # Good Friday, Easter Sunday 2025
    }

    # School holidays periods (approximate)
    # School holidays may affect traffic patterns due to families traveling,
    # reduced rush-hour congestion, and increased daytime leisure traffic
    school_holidays = [
        ('summer', 6, 20, 9, 15),    # Summer break: June 20 - Sept 15 (longest holiday)
        ('christmas', 12, 20, 1, 7), # Christmas break: Dec 20 - Jan 7 (crosses year boundary)
        ('easter', 4, 10, 4, 20),    # Easter break: April 10-20 (approx, varies with Easter date)
    ]

    # Notable events that affect traffic patterns (festa season, carnival, etc.)
    # These events involve road closures, processions, increased pedestrian traffic,
    # and visitors traveling to/from villages.
    events = {
        2024: [
            (2, 10, 2, 13),  # Carnival 2024 (weekend of celebrations, street parties)
            (6, 1, 9, 30),   # Festa season: June-Sept (village feasts throughout Malta)
        ],
        2025: [
            (3, 1, 3, 4),    # Carnival 2025 (weekend of celebrations, street parties)
            (6, 1, 9, 30),   # Festa season: June-Sept (village feasts throughout Malta)
        ]
    }

    def _is_listed_holiday(day):
        """True when ``day`` is a fixed holiday or a variable holiday of its own year."""
        key = (day.month, day.day)
        return key in holidays or key in variable_holidays.get(day.year, [])

    # Public holiday check; 'eve of' deliberately overrides 'yes' when the next
    # day is also a holiday. The eve lookup uses the next day's own year so that
    # 31 December is resolved against the following year's table.
    is_holiday = 'yes' if _is_listed_holiday(date) else 'no'
    if _is_listed_holiday(date + timedelta(days=1)):
        is_holiday = 'eve of'

    # Event periods are defined per year and never cross a year boundary
    is_event = 'no'
    for start_m, start_d, end_m, end_d in events.get(year, []):
        if (start_m, start_d) <= month_day <= (end_m, end_d):
            is_event = 'yes'
            break

    # School holidays. A period whose start is later in the year than its end
    # wraps over New Year, so it matches dates on/after the start OR on/before the
    # end. The previous date-based comparison used "Jan 7 of next year" as the end
    # for January-May dates, which flagged every one of them as school holiday.
    is_school = 'no'
    for _holiday_name, start_m, start_d, end_m, end_d in school_holidays:
        period_start, period_end = (start_m, start_d), (end_m, end_d)
        if period_start <= period_end:
            in_period = period_start <= month_day <= period_end
        else:
            in_period = month_day >= period_start or month_day <= period_end
        if in_period:
            is_school = 'yes'
            break

    return is_holiday, is_event, is_school

# Expand the (holiday, event, school) tuple returned per row into three columns
holiday_columns = ['is_holiday', 'is_event', 'is_school_holiday']
accidents_df[holiday_columns] = accidents_df['accident_datetime'].apply(
    lambda dt: pd.Series(get_holiday_event_status(dt))
)

# Summarise how many rows fall into each time-confidence tier
confidence = accidents_df['time_confidence']
high_count, medium_count, low_count = (
    (confidence == level).sum() for level in ('High', 'Medium', 'Low')
)
print(f"High: {high_count} | Medium: {medium_count} | Low: {low_count}")

High: 144 | Medium: 64 | Low: 27


In [217]:
# Columns derived for accident rows that are mirrored back onto the full frame
derived_columns = [
    'accident_datetime', 'accident_time', 'accident_day', 'accident_date', 'time_confidence',
    'accident_hour', 'accident_is_weekend', 'publication_delay_hours', 'time_of_day_category',
    'is_holiday', 'is_event', 'is_school_holiday',
]
for col in derived_columns:
    # Start from an all-None column, then fill only the rows classified as accidents
    df[col] = None
    df.loc[accidents_df.index, col] = accidents_df[col]

# Helper columns that are left out of the exported file
excluded_from_export = ('accident_day', 'accident_hour')
columns_to_save = [col for col in accidents_df.columns if col not in excluded_from_export]
accidents_output_path = '../../../data/processed/road_accidents_with_datetime.csv'
accidents_df.to_csv(accidents_output_path, columns=columns_to_save, index=False)

print(f"✓ Saved {len(accidents_df)} accidents to {accidents_output_path}")

✓ Saved 235 accidents to ../../../data/processed/road_accidents_with_datetime.csv
