In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from datetime import datetime
import string

In [None]:
#we don't need this
#import httplib2
#from bs4 import BeautifulSoup, SoupStrainer
#import requests

In [None]:
#from spacy.cli import download
#print(download('en_core_web_sm'))

#import spacy # spaCy is a python module to work with NLP
#import en_core_web_sm
#nlp = spacy.load('en_core_web_sm') # loads english NLP model (small)

In [19]:
local_news_df = pd.read_csv('local_news_articles.csv', usecols=['publish_date', 'content'])
local_news_df

Unnamed: 0,publish_date,content
0,07/12/2024,A motorist claims his car mirror was shattered...
1,09/12/2024,The PN on Monday slammed the government for di...
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...
3,12/12/2024,A private contractor who placed a skip on St P...
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...
...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...
317,13/10/2025,The following are the top stories in Malta's n...
318,13/10/2025,"Traffic, parking and public transport-related ..."
319,14/10/2025,A court has sharply criticised the police and ...


In [20]:
#flagging the rows with multiple accidents

patterns = [
    r"\b(two|three|\d+)\s+(separate\s+)?(accidents|crashes|collisions|incidents)\b",
    r"\bin a separate\s+(accident|incident|collision|crash)\b",
    r"\bin\s+separate\s+(accidents|crashes|collisions|incidents)\b",
    r"\bin\s+(another|a\s+second|a\s+third)\s+(accident|incident|collision|crash)\b",
    r"\bthe\s+(second|third|another)\s+(accident|incident|collision|crash)\b",
    r"\b(two|three)\s+(traffic\s+)?(incidents|accidents|collisions|crashes)\b",
]
compiled = [re.compile(p, flags=re.IGNORECASE) for p in patterns]

flags = []
for content in local_news_df['content'].astype(str):
    is_multi = any(rx.search(content) for rx in compiled)
    flags.append('1' if is_multi else '')

# Add the flag column
local_news_df['multi_accident_flag'] = flags

#output_file = "local_news_articles_multiflag.csv"
#local_news_df.to_csv(output_file, index=False)

In [21]:
#splitting rows with multiple accidents into one accident per row
#flag detection
def is_flag_one(x):
    try:
        if pd.isna(x):
            return False
        if isinstance(x, (int, float)):
            return float(x) == 1.0
        s = str(x).strip()
        return s in {'1','1.0','True','true'}
    except Exception:
        return False

flagged_mask = local_news_df['multi_accident_flag'].apply(is_flag_one)

# Exact markers 
markers = [
    r"in a separate accident",
    r"another accident",
    r"the second accident",
    r"hours later",  # will also catch 'two hours later', 'an hour later' as substring
]
pattern = re.compile(r"(?i)" + r"|".join(markers))

rows_out = []

for idx, row in local_news_df.iterrows():
    content = str(row['content']) if pd.notna(row['content']) else ''
    if flagged_mask.iloc[idx]:
        matches = list(pattern.finditer(content))
        if matches:
            # Compute segment boundaries
            starts = [0] + [m.end() for m in matches]
            ends = [matches[0].start()] + [matches[i+1].start() if i+1 < len(matches) else len(content) for i in range(len(matches))]
            segments = []
            # First segment (before first marker): first accident
            first_seg = content[starts[0]:ends[0]].strip()
            if first_seg:
                segments.append(('before_marker', first_seg))
            # Subsequent segments: text AFTER each marker
            for i, m in enumerate(matches):
                seg_text = content[starts[i+1]:ends[i+1]].strip()
                if seg_text:
                    segments.append((m.group(0).lower(), seg_text))
            total = len(segments)
            for part_no, (marker_name, seg_text) in enumerate(segments, start=1):
                new_row = row.copy()
                new_row['content'] = seg_text
                new_row['accident_part_number'] = part_no
                new_row['accident_parts_total'] = total
                rows_out.append(new_row)
        else:
            rows_out.append(row)
    else:
        rows_out.append(row)

out_df = pd.DataFrame(rows_out)

local_news_df = out_df
local_news_df
#outfile = 'local_news_articles_multiflag__accidents_split_hours_later.csv'
#out_df.to_csv(outfile, index=False)


Unnamed: 0,publish_date,content,multi_accident_flag,accident_part_number,accident_parts_total
0,07/12/2024,A motorist claims his car mirror was shattered...,,,
1,09/12/2024,The PN on Monday slammed the government for di...,,,
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,,,
3,12/12/2024,A private contractor who placed a skip on St P...,,,
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,1.0,2.0
...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,,,
317,13/10/2025,The following are the top stories in Malta's n...,,,
318,13/10/2025,"Traffic, parking and public transport-related ...",,,
319,14/10/2025,A court has sharply criticised the police and ...,,,


In [22]:
local_news_df.drop(['multi_accident_flag', 'accident_part_number', 'accident_parts_total'], axis=1, inplace=True)
local_news_df

Unnamed: 0,publish_date,content
0,07/12/2024,A motorist claims his car mirror was shattered...
1,09/12/2024,The PN on Monday slammed the government for di...
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...
3,12/12/2024,A private contractor who placed a skip on St P...
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...
...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...
317,13/10/2025,The following are the top stories in Malta's n...
318,13/10/2025,"Traffic, parking and public transport-related ..."
319,14/10/2025,A court has sharply criticised the police and ...


In [23]:

# flagging the rows that do not report traffic accidents

# Accident indicator patterns (traffic context)
ACCIDENT_PATTERNS = [
    r"\b(accident|incident|collision|crash|pile[- ]?up|hit and run|hit\b|run over|overturned|skidded|lost control|fell off|injured|hospitalised|critically|grievously|serious injuries)\b",
    r"\b(motorcyclist|motorcycle|bike|cyclist|bicycle|scooter|moped|pedestrian|driver|van|bus|truck|car)\b",
    r"\b(driving to|while driving|while riding|while cycling|while walking)\b",
]
ACCIDENT_RX = re.compile('|'.join(ACCIDENT_PATTERNS), re.IGNORECASE)

UPDATED_HDR = re.compile(
    r"Updated\s+[A-Za-z]*\s*\d{1,2}[:.]?\d{0,2}\s?(?:am|pm)|Updated\s+\d{1,2}[:.]?\d{0,2}\s?(?:am|pm)",
    re.IGNORECASE
)

def strip_updated(text: str) -> str:
    if not isinstance(text, str):
        return ''
    return UPDATED_HDR.sub('', text)

def refers_to_accident(text: str) -> int:
    t = strip_updated(text)
    return 1 if ACCIDENT_RX.search(t) else 0

# Apply flag
local_news_df['accident_flag'] = local_news_df['content'].apply(refers_to_accident)
local_news_df['accident_flag'].value_counts()
#local_news_df

accident_flag
1    304
0     24
Name: count, dtype: int64

In [24]:
#extracting time in 24h format (used Copilot to help with the code)

# Strip leading "Updated …" banners
UPDATED_HDR = re.compile(r"^Updated\s+[A-Za-z]*\s*\d{1,2}[:.]?\d{0,2}\s?(?:am|pm)\b[:,]?\s*", re.IGNORECASE)
def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ''
    return UPDATED_HDR.sub('', s).strip()


# Helper to normalize various time strings to 24-hour HH:MM

def to_24h(t):
    if not isinstance(t, str) or not t.strip():
        return ''
    s = t.strip().lower()
    # Replace dot separator with colon for minutes (e.g., 5.30pm -> 5:30pm)
    s = re.sub(r'(\d{1,2})\.(\d{2})', r'\1:\2', s)
    # Ensure there's no extra spaces
    s = re.sub(r'\s+', ' ', s)

    try:
        # Cases with am/pm and possibly minutes
        if re.search(r'(am|pm)', s):
            # Add :00 if only hour is present (e.g., 5pm -> 5:00pm)
            if re.match(r'^\d{1,2}\s?(am|pm)$', s):
                s = re.sub(r'^(\d{1,2})\s?(am|pm)$', r'\1:00\2', s)
            # Remove spaces before am/pm: '2.15 pm' -> '2:15pm'
            s = re.sub(r'\s?(am|pm)$', r'\1', s)
            dt = datetime.strptime(s, '%I:%M%p')
            return dt.strftime('%H:%M')
        else:
            # 24h format like 10:30 or 7:05
            if re.match(r'^\d{1,2}:\d{2}$', s):
                # Normalize to two-digit hour
                h, m = s.split(':')
                return f"{int(h):02d}:{m}"
    except Exception:
        return ''
    return ''

# Create normalized column
if 'accident_time' not in local_news_df.columns:
    # Fallback: compute from content if previous step wasn't run
    import re
    time_patterns = [
        r"\b(?:at|around|about|reported at|occurred at|happened at|the accident was reported at|the incident was reported at|was reported at|was informed .* at|crash was reported at|reported to the police at|police (?:said|reported) .* at|the police (?:said|were informed).* at)\s*(\d{1,2}[:\.]?\d{0,2}\s?(?:am|pm))\b",
        r"\b(?:at|around|about|reported at|occurred at|happened at|the accident was reported at|the incident was reported at|was reported at|was informed .* at|crash was reported at|reported to the police at|police (?:said|reported) .* at|the police (?:said|were informed).* at)\s*(\d{1,2}:\d{2})\b",
        r"\b(?:at|around|about|reported at|occurred at|happened at)\s*(\d{1,2}\s?(?:am|pm))\b",
    ]
    compiled = [re.compile(p, flags=re.IGNORECASE) for p in time_patterns]
    def extract_time(text: str) -> str:
        if not isinstance(text, str):
            return ''
        text2 = re.sub(r"Updated\s+[A-Za-z]*\s*\d{1,2}[:\.]?\d{0,2}\s?(?:am|pm)\b[:,]?\s*", "", text, flags=re.IGNORECASE)
        text2 = re.sub(r"Updated\s+\d{1,2}[:\.]?\d{0,2}\s?(?:am|pm)\b[:,]?\s*", "", text2, flags=re.IGNORECASE)
        for rx in compiled:
            m = rx.search(text2)
            if m:
                return m.group(1).strip()
        m2 = re.search(r"\b(\d{1,2}[:\.]\d{2}\s?(?:am|pm))\b", text2, flags=re.IGNORECASE)
        if m2:
            return m2.group(1).strip()
        m3 = re.search(r"\b(\d{1,2}:\d{2})\b", text2)
        if m3:
            return m3.group(1).strip()
        return ''
    local_news_df['accident_time'] = local_news_df['content'].apply(extract_time)


local_news_df['accident_time_24'] = local_news_df['accident_time'].apply(to_24h)

# extracting time that's mentioned as around noon/midnight etc
# Phrase → 24h mapping
PHRASE_MAP = [
    (re.compile(r"\b(?:around|about)\s+(?:noon|midday)\b", re.IGNORECASE), '12:00'),
    (re.compile(r"\b(?:noon|midday)\b", re.IGNORECASE), '12:00'),
    (re.compile(r"\b(?:around|about)\s+midnight\b", re.IGNORECASE), '00:00'),
    (re.compile(r"\bmidnight\b", re.IGNORECASE), '00:00'),
    (re.compile(r"\b(?:just\s+|shortly\s+)?before\s+midnight\b", re.IGNORECASE), '23:30'),
    (re.compile(r"\b(?:just\s+|shortly\s+)?before\s+(?:noon|midday)\b", re.IGNORECASE), '11:30'),
    (re.compile(r"\baround\s+the\s+stroke\s+of\s+midnight\b", re.IGNORECASE), '00:00'),
]

def extract_approx_time(text: str) -> str:
    t = clean_text(text)
    for rx, hhmm in PHRASE_MAP:
        if rx.search(t):
            return hhmm
    return ''

idx_sel_rows = local_news_df.loc[((local_news_df['accident_time_24']== ' '))].index #identifying rows without time
local_news_df.loc[idx_sel_rows, 'accident_time_24'] = local_news_df.loc[idx_sel_rows, 'content'].apply(extract_approx_time)
local_news_df.drop('accident_time', axis=1, inplace=True)
#local_news_df['accident_time_24'] = (pd.to_datetime(local_news_df['accident_time_24'].astype(str), errors='coerce',format='%H:%M').dt.time)

local_news_df

#output_file = "extracted_time_test.csv"
#local_news_df.to_csv(output_file, index=False)



Unnamed: 0,publish_date,content,accident_flag,accident_time_24
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00
1,09/12/2024,The PN on Monday slammed the government for di...,1,
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30
...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,
317,13/10/2025,The following are the top stories in Malta's n...,0,
318,13/10/2025,"Traffic, parking and public transport-related ...",0,
319,14/10/2025,A court has sharply criticised the police and ...,1,


In [25]:
#detecting whether the accident occured on the same day as reported
same_day_pattern = re.compile(r"\b(today|same day)\b", re.IGNORECASE)
previous_day_pattern = re.compile(r"\b(yesterday|previous day|last night|before midnight)\b", re.IGNORECASE)
weekday_pattern = re.compile(r"\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b", re.IGNORECASE)

accident_day_list = []
for idx, row in local_news_df.iterrows():
    content = str(row['content'])
    publish_date = pd.to_datetime(row['publish_date'], errors='coerce', dayfirst=True)
    classification = "undetermined"

    if same_day_pattern.search(content):
        classification = "same day"
    elif previous_day_pattern.search(content):
        classification = "previous day"
    else:
        weekday_match = weekday_pattern.search(content)
        if weekday_match and publish_date is not pd.NaT:
            mentioned_day = weekday_match.group(0).lower()
            weekdays = {'monday': 0, 'tuesday': 1, 'wednesday': 2, 'thursday': 3, 'friday': 4, 'saturday': 5, 'sunday': 6}
            mentioned_num = weekdays.get(mentioned_day)
            publish_num = publish_date.weekday()
            if mentioned_num == publish_num:
                classification = "same day"
            elif (publish_num - mentioned_num) % 7 == 1:
                classification = "previous day"
    accident_day_list.append(classification)

local_news_df['accident_day'] = accident_day_list
local_news_df

Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00,undetermined
1,09/12/2024,The PN on Monday slammed the government for di...,1,,same day
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day
...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,,same day
317,13/10/2025,The following are the top stories in Malta's n...,0,,same day
318,13/10/2025,"Traffic, parking and public transport-related ...",0,,undetermined
319,14/10/2025,A court has sharply criticised the police and ...,1,,undetermined


In [26]:
# detect injury severity
patterns = {
    'no injuries': re.compile(r"\b(no injuries|uninjured|escaped injury)\b", re.IGNORECASE),
    'minor': re.compile(r"\b(minor|minor injuries|slight injuries|light injuries)\b", re.IGNORECASE),
    'serious': re.compile(r"\b(serious|seriously injured)\b", re.IGNORECASE),
    'grievous': re.compile(r"\b(grievous|grievously injured)\b", re.IGNORECASE),
    'critical': re.compile(r"\b(critical|critically injured|critical condition)\b", re.IGNORECASE),
    'death': re.compile(r"\b(died|death|dead|fatal|succumbed|killed)\b", re.IGNORECASE)
}

severity_list = []
for content in local_news_df['content'].astype(str):
    severity = 'unknown'
    for category, pattern in patterns.items():
        if pattern.search(content):
            severity = category
            break
    severity_list.append(severity)

local_news_df['injury_severity'] = severity_list
local_news_df

#output_file = "injuiry_severity_Python.csv"
#local_news_df.to_csv(output_file, index=False)

Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00,undetermined,unknown
1,09/12/2024,The PN on Monday slammed the government for di...,1,,same day,unknown
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious
...,...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,,same day,unknown
317,13/10/2025,The following are the top stories in Malta's n...,0,,same day,unknown
318,13/10/2025,"Traffic, parking and public transport-related ...",0,,undetermined,unknown
319,14/10/2025,A court has sharply criticised the police and ...,1,,undetermined,serious


In [27]:
#detecting the victim of the accident

UPDATED_HDR = re.compile(r"Updated\s+[A-Za-z]*\s*\d{1,2}[:.]?\d{0,2}\s?(?:am|pm)|Updated\s+\d{1,2}[:.]?\d{0,2}\s?(?:am|pm)", re.IGNORECASE)

# terms
PED_TERMS = r"\b(pedestrian|walker|passer[- ]?by|passerby|foot\s*traveller|on foot)\b"
CYCLIST_TERMS = r"\b(cyclist|bicyclist|bicycle|push\s?bike|pedal\s?cycle|e\-?bike|ebike)\b"
MOTORCYCLE_TERMS = r"\b(motorcyclist|motorcycle|motor\s?bike|motorbike|bike\b(?!\s?ride)|biker)\b"

# phrases indicating being hurt
HURT_TERMS = r"\b(hurt|injur(?:ed|ies)|grievously|critically|serious(?:ly)?|run over|hit|struck|knocked down|hospitalis(?:ed|ed))\b"
# traffic context keywords
TRAFFIC_TERMS = r"\b(road|street|triq|lane|bypass|tunnel|roundabout|junction|seafront|coast road|regional road|traffic|collision|accident|crash)\b"

PED_RX = re.compile(PED_TERMS, re.IGNORECASE)
CYCLIST_RX = re.compile(CYCLIST_TERMS, re.IGNORECASE)
MOTO_RX = re.compile(MOTORCYCLE_TERMS, re.IGNORECASE)

HURT_RX = re.compile(HURT_TERMS, re.IGNORECASE)
TRAFFIC_RX = re.compile(TRAFFIC_TERMS, re.IGNORECASE)

def strip_updated(text: str) -> str: #removing Updated from the text
    if not isinstance(text, str):
        return ''
    return UPDATED_HDR.sub('', text)

def mark_motorcyclist(text: str) -> str:
    t = strip_updated(text)
    if MOTO_RX.search(t) and HURT_RX.search(t) and TRAFFIC_RX.search(t) and not CYCLIST_RX.search(t):
        return 'motorcyclist'
    return ''

def mark_cyclist(text: str) -> str:
    t = strip_updated(text)
    # Must mention a cyclist and harm in traffic context, but NOT a motorcyclist
    if CYCLIST_RX.search(t) and HURT_RX.search(t) and TRAFFIC_RX.search(t) and not MOTO_RX.search(t):
        return 'cyclist'
    # Catch constructions like "was cycling / on a bicycle ... and was hit"
    cycling_phrase = re.search(
        r"\b(cycling|on\s+a\s+bicycle|on\s+his\s+bicycle|on\s+her\s+bicycle|on\s+a\s+push\s?bike)\b",
        t, re.IGNORECASE
    )
    if cycling_phrase and HURT_RX.search(t) and TRAFFIC_RX.search(t) and not MOTO_RX.search(t):
        return 'cyclist'
    return ''

def mark_pedestrian(text: str) -> str:
    t = strip_updated(text)
    if PED_RX.search(t) and HURT_RX.search(t) and TRAFFIC_RX.search(t):
        return 'pedestrian'
    # also catch constructions like "a woman was run over" without the word pedestrian
    if re.search(r"\b(run over|knocked down|hit|struck)\b", t, re.IGNORECASE) and \
       re.search(r"\b(man|woman|boy|girl|elderly|child|teenager|youth|person)\b", t, re.IGNORECASE) and \
       TRAFFIC_RX.search(t):
        # ensure it's not inside vehicle context (driver etc.) by simple heuristic
        if not re.search(r"\b(driver|motorist|cyclist|motorcyclist)\b", t, re.IGNORECASE):
            return 'pedestrian'
    return ''

# Apply
local_news_df['pedestrian'] = local_news_df['content'].apply(mark_pedestrian)
local_news_df['motorcyclist'] = local_news_df['content'].apply(mark_motorcyclist)
local_news_df['cyclist'] = local_news_df['content'].apply(mark_cyclist)

local_news_df
#idx_sel_rowsC



Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,pedestrian,motorcyclist,cyclist
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00,undetermined,unknown,,,
1,09/12/2024,The PN on Monday slammed the government for di...,1,,same day,unknown,,,
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious,,motorcyclist,
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious,,motorcyclist,
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious,pedestrian,,
...,...,...,...,...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,,same day,unknown,,,
317,13/10/2025,The following are the top stories in Malta's n...,0,,same day,unknown,,,
318,13/10/2025,"Traffic, parking and public transport-related ...",0,,undetermined,unknown,,,
319,14/10/2025,A court has sharply criticised the police and ...,1,,undetermined,serious,,,


In [28]:
#placing all the above results into one column

idx_pedestrian = local_news_df.loc[(local_news_df['pedestrian'] == 'pedestrian') & (local_news_df['motorcyclist'] != 'motorcyclist') & (local_news_df['cyclist'] != 'cyclist')].index
#idx_pedestrian
local_news_df.loc[idx_pedestrian, 'affected_party'] = 'pedestrian'

idx_motorcyclist = local_news_df.loc[(local_news_df['motorcyclist'] == 'motorcyclist')].index
local_news_df.loc[idx_motorcyclist, 'affected_party'] = 'motorcyclist'

idx_cyclist = local_news_df.loc[(local_news_df['cyclist'] == 'cyclist') & (local_news_df['motorcyclist'] != 'motorcyclist')].index
local_news_df.loc[idx_cyclist, 'affected_party'] = 'cyclist'

idx_driver = local_news_df.loc[(local_news_df['pedestrian'] != 'pedestrian') & (local_news_df['motorcyclist'] != 'motorcyclist') & (local_news_df['cyclist'] != 'cyclist') & (local_news_df['accident_flag'] == 1)].index
local_news_df.loc[idx_driver, 'affected_party'] = 'driver'

local_news_df.drop(['pedestrian', 'motorcyclist', 'cyclist'], axis=1, inplace=True)

local_news_df

Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00,undetermined,unknown,driver
1,09/12/2024,The PN on Monday slammed the government for di...,1,,same day,unknown,driver
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious,motorcyclist
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious,motorcyclist
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious,pedestrian
...,...,...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,,same day,unknown,driver
317,13/10/2025,The following are the top stories in Malta's n...,0,,same day,unknown,
318,13/10/2025,"Traffic, parking and public transport-related ...",0,,undetermined,unknown,
319,14/10/2025,A court has sharply criticised the police and ...,1,,undetermined,serious,driver


In [None]:
#type of accident (Copilot coding help)

from typing import List

# ---------------------------------------------------------------------
# 1) Lexicons and regex patterns
## ---------------------------------------------------------------------

# Vehicles: car types, brands, and common vehicle nouns
VEHICLES = r"""
car|van|truck|pickup|vehicle|bus|jeep|coach|minivan|school\s*van|
mercedes|bmw|toyota|nissan|peugeot|skoda|mazda|ford|renault|citroen|
suzuki|isuzu|honda|kymco|aprilia|yamaha|benelli|dacia|smart|passo|
sprinter|master|fiesta|civic|swift|fit|starlet|funcargo|aygo|hilux|
volvo|jaguar|optare|otokar|dac|vitz|demio|cx-?3|c180|xf|b\s*max|
gpd125-?a|tweet|vespa|piaggio
"""

# Fixed obstacles (roadside infrastructure and static objects)
OBSTACLES = r"""
wall|tree|tunnel|light\s*post|lamp\s*post|barrier|crash\s*barrier|bollard|
building|house|signpost|pillar|electricity\s*pole|skip|stationary\s*skip|
rubble\s*wall|bridge|guardrail|canopy|
"""

# Person terms likely to indicate pedestrians (on foot)
PERSON_WORDS = r"""
pedestrian|man|woman|boy|girl|child|toddler|elderly|teen(?:ager)?|
people|passer\s*by|passers\s*by|\d{1,3}\s*-?\s*year\s*-?\s*old
"""

# Rider/driver terms (exclude these from pedestrian struck logic)
RIDER_WORDS = r"""
cyclist|bicycl(?:e|ist)|motorcyclist|rider|driver
"""

# Accident/impact verbs and phrases
V_CRASH = r"crash(?:ed|es|ing)?|collid(?:e|ed|ing)|smashed\s+into|ramm(?:ed|ing)|hit|struck|slammed|clip(?:ped|s)"
V_RUN_OVER = r"run(?:\s*-)over|ran\s*over|run\s+over"
V_LOST_CONTROL = r"lost\s+control|went\s+out\s+of\s+control|careen(?:ed|ing)|skidd(?:ed|ing)|went\s+off\s+the\s+road|fell\s+off\s+(?:the\s+)?(?:bike|motorcycle|motorbike)"
V_OVERTURN = r"overturn(?:ed|s|ing)|rolled\s+over|flip(?:ped|s|ping)|ended\s+up\s+on\s+its\s+side"

# Compile regexes
P_VEHICLE = re.compile(VEHICLES, re.I | re.X)
P_OBSTACLE = re.compile(OBSTACLES, re.I | re.X)
P_PERSON = re.compile(PERSON_WORDS, re.I | re.X)
P_RIDER = re.compile(RIDER_WORDS, re.I | re.X)
P_CRASH = re.compile(V_CRASH, re.I)
P_RUN_OVER = re.compile(V_RUN_OVER, re.I)
P_LOST_CONTROL = re.compile(V_LOST_CONTROL, re.I)
P_OVERTURN = re.compile(V_OVERTURN, re.I)
P_GENERIC_ACCIDENT = re.compile(r"\b(accident|incident|injur(?:y|ies))\b", re.I)

# ---------------------------------------------------------------------
# 2) Sentence splitter (simple, rule-based)
# ---------------------------------------------------------------------
def split_sentences(t: str) -> List[str]:
    if not isinstance(t, str):
        return []
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+|\n+", t) if s.strip()]

# ---------------------------------------------------------------------
# 3) Per-sentence classification
#    Returns granular flags for the sentence.
# ---------------------------------------------------------------------
def classify_sentence(s: str):
    s_low = s.lower()
    has_vehicle = bool(P_VEHICLE.search(s_low))
    has_obstacle = bool(P_OBSTACLE.search(s_low))
    has_person = bool(P_PERSON.search(s_low))
    has_rider = bool(P_RIDER.search(s_low))
    has_crash = bool(P_CRASH.search(s_low))
    has_run_over = bool(P_RUN_OVER.search(s_low))
    has_lost_control = bool(P_LOST_CONTROL.search(s_low))
    has_overturn = bool(P_OVERTURN.search(s_low))

    # Pedestrian struck/run-over: person + (hit/struck/run-over) + vehicle, excluding rider contexts
    running_over_pedestrian = 1 if (has_person and (has_run_over or has_crash) and has_vehicle and not has_rider) else 0

    # Collision with fixed object: crash verb + obstacle
    crushed_into_obstacle = 1 if (has_crash and has_obstacle) else 0

    # Overturned: rollover phrases
    overturned = 1 if has_overturn else 0

    # Lost control: phrases directly
    lost_control = 1 if has_lost_control else 0

    # Vehicle–vehicle collision: crash verb + >= 2 mentions of vehicles in the sentence
    vehicle_mentions = len(re.findall(VEHICLES, s_low, re.I | re.X))
    collision_vv = 1 if (has_crash and vehicle_mentions >= 2 and not running_over_pedestrian and not crushed_into_obstacle) else 0

    # Generic collision: crash verb + some vehicle/rider context, excluding pedestrian/obstacle cases
    collision_generic = 1 if (has_crash and (has_vehicle or has_rider) and not (running_over_pedestrian or crushed_into_obstacle)) else 0

    return {
        'collision_vv': collision_vv,
        'collision_generic': collision_generic,
        'running_over_pedestrian': running_over_pedestrian,
        'lost_control': lost_control,
        'crushed_into_obstacle': crushed_into_obstacle,
        'overturned': overturned
    }

# ---------------------------------------------------------------------
# 4) Per-article aggregation + precedence rules
#    Combines sentence flags and applies differentiation rules.
# ---------------------------------------------------------------------
def classify_article(text: str):
    sentences = split_sentences(text)
    flags = {
        'collision': 0,
        'running_over_pedestrian': 0,
        'lost_control': 0,
        'crushed_into_obstacle': 0,
        'overturned': 0,
        'other': 0
    }

    any_accident_mention = False
    saw_vv_collision = False

    for s in sentences:
        if P_GENERIC_ACCIDENT.search(s):
            any_accident_mention = True
        sf = classify_sentence(s)

        # accumulate non-collision flags
        flags['running_over_pedestrian'] |= sf['running_over_pedestrian']
        flags['lost_control'] |= sf['lost_control']
        flags['crushed_into_obstacle'] |= sf['crushed_into_obstacle']
        flags['overturned'] |= sf['overturned']

        # collision handling (prefer vehicle–vehicle over generic)
        if sf['collision_vv']:
            flags['collision'] = 1
            saw_vv_collision = True
        elif sf['collision_generic']:
            flags['collision'] |= 1

    # If obstacle crash detected and collision flagged only by generic sentences, drop collision unless vv also seen
    if flags['crushed_into_obstacle'] and flags['collision'] == 1 and not saw_vv_collision:
        flags['collision'] = 0

    # "Other" only if there is accident context but no specific type matched
    if (flags['collision'] + flags['running_over_pedestrian'] + flags['lost_control'] +
        flags['crushed_into_obstacle'] + flags['overturned'] == 0) and any_accident_mention:
        flags['other'] = 1

    return flags

# ---------------------------------------------------------------------
# 5) Apply improved detection (v2) and save
# ---------------------------------------------------------------------
cols = ['collision','running_over_pedestrian','lost_control','crushed_into_obstacle','overturned','other']
new_flags = local_news_df['content'].apply(classify_article)
for c in cols:
    local_news_df[c] = new_flags.apply(lambda d: d[c])

#OUT_V2 = 'local_news_articles_with_accident_types.csv'
#local_news_df.to_csv(OUT_V2, index=False)

# ---------------------------------------------------------------------
# 6) Create mutually exclusive primary_accident_type (hierarchy)
#    Order can be adjusted to your preference.
# ---------------------------------------------------------------------
# Ensure numeric
for c in cols:
    local_news_df[c] = pd.to_numeric(local_news_df[c], errors='coerce').fillna(0).astype(int)

# Hierarchy / precedence for single-label assignment
precedence = [
    ('running_over_pedestrian', 'running over pedestrian'),
    ('overturned', 'overturned'),
    ('crushed_into_obstacle', 'crushed into an obstacle'),
    ('collision', 'collision'),
    ('lost_control', 'lost control of the vehicle'),
    ('other', 'other')
]

labels = []
for _, row in local_news_df.iterrows():
    label = 'none'
    for col, name in precedence:
        if int(row[col]) == 1:
            label = name
            break
    labels.append(label)

local_news_df['primary_accident_type'] = labels

code_map = {
    'running over pedestrian': 'PED',
    'overturned': 'ROLL',
    'crushed into an obstacle': 'FXOBJ',
    'collision': 'COLL',
    'lost control of the vehicle': 'LOC',
    'other': 'OTH',
    'none': 'NONE'
}

local_news_df['primary_accident_type_code'] = local_news_df['primary_accident_type'].map(code_map)
local_news_df

#OUT_REFINED = 'local_news_articles_with_accident_types.csv'
#local_news_df.to_csv(OUT_REFINED, index=False)



Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party,collision,running_over_pedestrian,lost_control,crushed_into_obstacle,overturned,other,primary_accident_type,primary_accident_type_code
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00,undetermined,unknown,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ
1,09/12/2024,The PN on Monday slammed the government for di...,1,,same day,unknown,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious,motorcyclist,0,1,1,1,0,0,running over pedestrian,PED
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious,motorcyclist,0,0,1,1,0,0,crushed into an obstacle,FXOBJ
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious,pedestrian,0,1,0,1,0,0,running over pedestrian,PED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,,same day,unknown,driver,0,0,0,0,0,0,none,NONE
317,13/10/2025,The following are the top stories in Malta's n...,0,,same day,unknown,,0,0,0,0,0,0,none,NONE
318,13/10/2025,"Traffic, parking and public transport-related ...",0,,undetermined,unknown,,0,0,0,0,0,0,none,NONE
319,14/10/2025,A court has sharply criticised the police and ...,1,,undetermined,serious,driver,0,0,0,0,0,1,other,OTH


In [30]:
# corrected date
from datetime import datetime, timedelta

fmt = "%d/%m/%Y"

# Parse publish_date
def parse_date(val):
    if pd.isna(val):
        return pd.NaT
    s = str(val).strip()
    return datetime.strptime(s, fmt).date()


pub_dates = local_news_df['publish_date'].apply(parse_date)

# Compute corrected_date per instruction
corr_dates = []
for i, row in local_news_df.iterrows():
    pub = pub_dates.iloc[i]
    day = str(row.get('accident_day', '')).strip().lower()
    if pd.isna(pub):
        corr_dates.append('')
    else:
        if day == 'previous day':
            corr_dates.append((pub - timedelta(days=1)).isoformat())
        else:
            corr_dates.append(pub.isoformat())

local_news_df['corrected_date'] = corr_dates
#local_news_df['corrected_date'] = pd.to_datetime(local_news_df['corrected_date'], format='%Y-%m-%d %H:%M:%S')
local_news_df['corrected_date'] = pd.to_datetime(local_news_df['corrected_date'])
local_news_df['day_of_week'] = local_news_df['corrected_date'].dt.day_name()
local_news_df


Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party,collision,running_over_pedestrian,lost_control,crushed_into_obstacle,overturned,other,primary_accident_type,primary_accident_type_code,corrected_date,day_of_week
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00,undetermined,unknown,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2024-12-07,Saturday
1,09/12/2024,The PN on Monday slammed the government for di...,1,,same day,unknown,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2024-12-09,Monday
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious,motorcyclist,0,1,1,1,0,0,running over pedestrian,PED,2024-12-11,Wednesday
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious,motorcyclist,0,0,1,1,0,0,crushed into an obstacle,FXOBJ,2024-12-11,Wednesday
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious,pedestrian,0,1,0,1,0,0,running over pedestrian,PED,2024-12-13,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,,same day,unknown,driver,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday
317,13/10/2025,The following are the top stories in Malta's n...,0,,same day,unknown,,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday
318,13/10/2025,"Traffic, parking and public transport-related ...",0,,undetermined,unknown,,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday
319,14/10/2025,A court has sharply criticised the police and ...,1,,undetermined,serious,driver,0,0,0,0,0,1,other,OTH,2025-10-09,Thursday


In [32]:
#flagging public holidays

# Malta public holidays for 2024 and 2025
holidays = {
    # 2024 (Transport Malta Information Notice 53)
    datetime(2024,1,1): "New Year's Day",
    datetime(2024,2,10): "Feast of St. Paul's Shipwreck",
    datetime(2024,3,19): "Feast of St. Joseph",
    datetime(2024,3,29): "Good Friday",
    datetime(2024,3,31): "Freedom Day",
    datetime(2024,5,1): "Workers' Day",
    datetime(2024,6,7): "Sette Giugno",
    datetime(2024,6,29): "Feast of St. Peter and St. Paul",
    datetime(2024,8,15): "Assumption (Santa Marija)",
    datetime(2024,9,8): "Victory Day",
    datetime(2024,9,21): "Independence Day",
    datetime(2024,12,8): "Immaculate Conception",
    datetime(2024,12,13): "Republic Day",
    datetime(2024,12,25): "Christmas Day",
    # 2025 (gov.mt official list)
    datetime(2025,1,1): "New Year's Day",
    datetime(2025,2,10): "Feast of St. Paul's Shipwreck",
    datetime(2025,3,19): "Feast of St. Joseph",
    datetime(2025,3,31): "Freedom Day",
    datetime(2025,4,18): "Good Friday",
    datetime(2025,5,1): "Workers' Day",
    datetime(2025,6,7): "Sette Giugno",
    datetime(2025,6,29): "Feast of St. Peter and St. Paul",
    datetime(2025,8,15): "Assumption (Santa Marija)",
    datetime(2025,9,8): "Victory Day",
    datetime(2025,9,21): "Independence Day",
    datetime(2025,12,8): "Immaculate Conception",
    datetime(2025,12,13): "Republic Day",
    datetime(2025,12,25): "Christmas Day",
}

holiday_dates = set(holidays.keys())
# Eves: the day before each holiday
holiday_eves = {d - timedelta(days=1): holidays[d] for d in holiday_dates}

# Prepare flagging functions

def flag_holiday(date_val):
   # if pd.isna(date_val):
     #   return False
    return date_val.normalize() in holiday_dates

def holiday_name(date_val):
    #if pd.isna(date_val):
      #  return None
    return holidays.get(date_val.normalize())

def flag_eve(date_val):
    #if pd.isna(date_val):
     #   return False
    return date_val.normalize() in holiday_eves

def eve_name(date_val):
    #if pd.isna(date_val):
       # return None
    return holiday_eves.get(date_val.normalize())

# Create columns
local_news_df['is_ph'] = local_news_df['corrected_date'].apply(flag_holiday)
local_news_df['is_eve_ph'] = local_news_df['corrected_date'].apply(flag_eve)

local_news_df

Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party,collision,running_over_pedestrian,lost_control,crushed_into_obstacle,overturned,other,primary_accident_type,primary_accident_type_code,corrected_date,day_of_week,is_ph,is_eve_ph
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00,undetermined,unknown,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2024-12-07,Saturday,False,True
1,09/12/2024,The PN on Monday slammed the government for di...,1,,same day,unknown,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2024-12-09,Monday,False,False
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious,motorcyclist,0,1,1,1,0,0,running over pedestrian,PED,2024-12-11,Wednesday,False,False
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious,motorcyclist,0,0,1,1,0,0,crushed into an obstacle,FXOBJ,2024-12-11,Wednesday,False,False
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious,pedestrian,0,1,0,1,0,0,running over pedestrian,PED,2024-12-13,Friday,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,,same day,unknown,driver,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday,False,False
317,13/10/2025,The following are the top stories in Malta's n...,0,,same day,unknown,,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday,False,False
318,13/10/2025,"Traffic, parking and public transport-related ...",0,,undetermined,unknown,,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday,False,False
319,14/10/2025,A court has sharply criticised the police and ...,1,,undetermined,serious,driver,0,0,0,0,0,1,other,OTH,2025-10-09,Thursday,False,False


In [33]:
#marking day of week in numbers from 0 to 6 (Monday to Sunday). Weekdays that fall on public holidays are marked 6, like Sundays. Eve of public holiday is marked 4 like Friday

map_week = {'Monday':0,'Tuesday':1,'Wednesday':2,'Thursday':3,'Friday':4,'Saturday':5,'Sunday':6}

def to_num(day):
    return map_week.get(str(day).strip(), pd.NA)

local_news_df['day_of_week_num'] = local_news_df['day_of_week'].apply(to_num)

# helpers
def is_true(val):
    if isinstance(val,bool):
        return val
    return str(val).strip().lower() in ['true','1','yes']

is_weekend = local_news_df['day_of_week'].astype(str).str.strip().isin(['Saturday','Sunday'])

# Apply eve_ph first (Friday=4) only if not weekend
mask_eve = local_news_df['is_eve_ph'].apply(is_true) & (~is_weekend)
local_news_df.loc[mask_eve, 'day_of_week_num'] = 4

# Apply ph (Sunday=6) only if not weekend
mask_ph = local_news_df['is_ph'].apply(is_true) & (~is_weekend)
local_news_df.loc[mask_ph, 'day_of_week_num'] = 6
local_news_df

Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party,collision,running_over_pedestrian,lost_control,crushed_into_obstacle,overturned,other,primary_accident_type,primary_accident_type_code,corrected_date,day_of_week,is_ph,is_eve_ph,day_of_week_num
0,07/12/2024,A motorist claims his car mirror was shattered...,1,17:00,undetermined,unknown,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2024-12-07,Saturday,False,True,5
1,09/12/2024,The PN on Monday slammed the government for di...,1,,same day,unknown,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2024-12-09,Monday,False,False,0
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious,motorcyclist,0,1,1,1,0,0,running over pedestrian,PED,2024-12-11,Wednesday,False,False,2
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious,motorcyclist,0,0,1,1,0,0,crushed into an obstacle,FXOBJ,2024-12-11,Wednesday,False,False,2
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious,pedestrian,0,1,0,1,0,0,running over pedestrian,PED,2024-12-13,Friday,True,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,12/10/2025,The Msida flyover will open by the end of the ...,1,,same day,unknown,driver,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday,False,False,2
317,13/10/2025,The following are the top stories in Malta's n...,0,,same day,unknown,,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday,False,False,2
318,13/10/2025,"Traffic, parking and public transport-related ...",0,,undetermined,unknown,,0,0,0,0,0,0,none,NONE,2025-10-08,Wednesday,False,False,2
319,14/10/2025,A court has sharply criticised the police and ...,1,,undetermined,serious,driver,0,0,0,0,0,1,other,OTH,2025-10-09,Thursday,False,False,3


In [34]:
local_news_df.dtypes

publish_date                          object
content                               object
accident_flag                          int64
accident_time_24                      object
accident_day                          object
injury_severity                       object
affected_party                        object
collision                              int32
running_over_pedestrian                int32
lost_control                           int32
crushed_into_obstacle                  int32
overturned                             int32
other                                  int32
primary_accident_type                 object
primary_accident_type_code            object
corrected_date                datetime64[ns]
day_of_week                           object
is_ph                                   bool
is_eve_ph                               bool
day_of_week_num                        int64
dtype: object

In [35]:
#dropping rows that report no accidents or have missing time value
idx_drop_rows = local_news_df.loc[(local_news_df['accident_flag']== 0) | (local_news_df['injury_severity']== 'unknown') | (local_news_df['affected_party'].isna())].index #identifying rows to be dropped
local_news_df = local_news_df.drop(idx_drop_rows)

local_news_df

Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party,collision,running_over_pedestrian,lost_control,crushed_into_obstacle,overturned,other,primary_accident_type,primary_accident_type_code,corrected_date,day_of_week,is_ph,is_eve_ph,day_of_week_num
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious,motorcyclist,0,1,1,1,0,0,running over pedestrian,PED,2024-12-11,Wednesday,False,False,2
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious,motorcyclist,0,0,1,1,0,0,crushed into an obstacle,FXOBJ,2024-12-11,Wednesday,False,False,2
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious,pedestrian,0,1,0,1,0,0,running over pedestrian,PED,2024-12-13,Friday,True,False,6
4,14/12/2024,", police were busy responding to another serio...",1,19:45,undetermined,serious,pedestrian,0,1,0,1,0,0,running over pedestrian,PED,2024-12-14,Saturday,False,False,5
5,14/12/2024,"Dieter Vink was a true gentleman, said friends...",1,,previous day,death,motorcyclist,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2024-12-13,Friday,True,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,09/10/2025,Malta’s traffic woes were brought home to TV v...,1,,same day,minor,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2025-10-07,Tuesday,False,False,1
313,09/10/2025,A motorcyclist was left with serious injuries ...,1,09:30,same day,serious,motorcyclist,0,0,0,0,0,1,other,OTH,2025-10-07,Tuesday,False,False,1
314,09/10/2025,A man was driving on the Floriana main road wh...,1,,same day,no injuries,driver,0,0,0,0,0,0,none,NONE,2025-10-07,Tuesday,False,False,1
315,10/10/2025,The Mayor of Għajnsielem on Friday called on t...,1,09:00,same day,minor,driver,0,0,0,0,0,1,other,OTH,2025-10-08,Wednesday,False,False,2


In [36]:
blank_time = local_news_df[local_news_df['accident_time_24'].isna() | (local_news_df['accident_time_24'].astype(str).str.strip() == '')]
blank_indices = blank_time.index.tolist()
#blank_indices

local_news_df = local_news_df.drop(blank_indices)
local_news_df


Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party,collision,running_over_pedestrian,lost_control,crushed_into_obstacle,overturned,other,primary_accident_type,primary_accident_type_code,corrected_date,day_of_week,is_ph,is_eve_ph,day_of_week_num
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00,same day,serious,motorcyclist,0,1,1,1,0,0,running over pedestrian,PED,2024-12-11,Wednesday,False,False,2
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00,previous day,serious,motorcyclist,0,0,1,1,0,0,crushed into an obstacle,FXOBJ,2024-12-11,Wednesday,False,False,2
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30,previous day,serious,pedestrian,0,1,0,1,0,0,running over pedestrian,PED,2024-12-13,Friday,True,False,6
4,14/12/2024,", police were busy responding to another serio...",1,19:45,undetermined,serious,pedestrian,0,1,0,1,0,0,running over pedestrian,PED,2024-12-14,Saturday,False,False,5
7,16/12/2024,A motorcyclist was seriously injured on Sunday...,1,20:30,previous day,serious,motorcyclist,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2024-12-13,Friday,True,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,07/10/2025,A crowdfunding campaign for a nurse left in a ...,1,08:00,undetermined,death,pedestrian,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2025-10-04,Saturday,False,False,5
309,08/10/2025,The families of the two couples killed in a ho...,1,09:30,undetermined,minor,driver,0,0,0,1,0,0,crushed into an obstacle,FXOBJ,2025-10-05,Sunday,False,False,6
311,08/10/2025,A Polish man has died after getting into diffi...,1,13:00,same day,death,driver,0,0,0,0,0,1,other,OTH,2025-10-06,Monday,False,False,0
313,09/10/2025,A motorcyclist was left with serious injuries ...,1,09:30,same day,serious,motorcyclist,0,0,0,0,0,1,other,OTH,2025-10-07,Tuesday,False,False,1


In [38]:
#creating a timestamp to remove duplicates at a later stage
#time_formatted = datetime.strptime(time, '%I.%M%p')
local_news_df['accident_time_24'] = pd.to_datetime(local_news_df['accident_time_24'], format = '%H:%M')
local_news_df['accident_time_24'] = local_news_df['accident_time_24'].dt.time
local_news_df['corrected_date'] = local_news_df['corrected_date'].dt.date
local_news_df['timestamp'] = [datetime.combine(a, b) for a, b in zip(local_news_df['corrected_date'], local_news_df['accident_time_24'])]
local_news_df

Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party,collision,running_over_pedestrian,lost_control,...,overturned,other,primary_accident_type,primary_accident_type_code,corrected_date,day_of_week,is_ph,is_eve_ph,day_of_week_num,timestamp
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00:00,same day,serious,motorcyclist,0,1,1,...,0,0,running over pedestrian,PED,2024-12-11,Wednesday,False,False,2,2024-12-11 17:00:00
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00:00,previous day,serious,motorcyclist,0,0,1,...,0,0,crushed into an obstacle,FXOBJ,2024-12-11,Wednesday,False,False,2,2024-12-11 13:00:00
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30:00,previous day,serious,pedestrian,0,1,0,...,0,0,running over pedestrian,PED,2024-12-13,Friday,True,False,6,2024-12-13 17:30:00
4,14/12/2024,", police were busy responding to another serio...",1,19:45:00,undetermined,serious,pedestrian,0,1,0,...,0,0,running over pedestrian,PED,2024-12-14,Saturday,False,False,5,2024-12-14 19:45:00
7,16/12/2024,A motorcyclist was seriously injured on Sunday...,1,20:30:00,previous day,serious,motorcyclist,0,0,0,...,0,0,crushed into an obstacle,FXOBJ,2024-12-13,Friday,True,False,6,2024-12-13 20:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,07/10/2025,A crowdfunding campaign for a nurse left in a ...,1,08:00:00,undetermined,death,pedestrian,0,0,0,...,0,0,crushed into an obstacle,FXOBJ,2025-10-04,Saturday,False,False,5,2025-10-04 08:00:00
309,08/10/2025,The families of the two couples killed in a ho...,1,09:30:00,undetermined,minor,driver,0,0,0,...,0,0,crushed into an obstacle,FXOBJ,2025-10-05,Sunday,False,False,6,2025-10-05 09:30:00
311,08/10/2025,A Polish man has died after getting into diffi...,1,13:00:00,same day,death,driver,0,0,0,...,0,1,other,OTH,2025-10-06,Monday,False,False,0,2025-10-06 13:00:00
313,09/10/2025,A motorcyclist was left with serious injuries ...,1,09:30:00,same day,serious,motorcyclist,0,0,0,...,0,1,other,OTH,2025-10-07,Tuesday,False,False,1,2025-10-07 09:30:00


In [40]:
# Define rush hour ranges
morning_start = datetime.strptime("07:30", "%H:%M").time()
morning_end = datetime.strptime("10:00", "%H:%M").time()
afternoon_start = datetime.strptime("16:00", "%H:%M").time()
afternoon_end = datetime.strptime("19:00", "%H:%M").time()

# Function to check if time falls in rush hour
def is_rush_hour(row):
    time_val = row['accident_time_24']
    day_num = row['day_of_week_num']
    if time_val and 0 <= day_num <= 4:  # Weekdays only
        if morning_start <= time_val <= morning_end or afternoon_start <= time_val <= afternoon_end:
            return 1
    return 0

# Apply function to create rush_hour column
local_news_df['rush_hour'] = local_news_df.apply(is_rush_hour, axis=1)
local_news_df


Unnamed: 0,publish_date,content,accident_flag,accident_time_24,accident_day,injury_severity,affected_party,collision,running_over_pedestrian,lost_control,...,other,primary_accident_type,primary_accident_type_code,corrected_date,day_of_week,is_ph,is_eve_ph,day_of_week_num,timestamp,rush_hour
2,11/12/2024,A motorcyclist was rushed to hospital in a cri...,1,17:00:00,same day,serious,motorcyclist,0,1,1,...,0,running over pedestrian,PED,2024-12-11,Wednesday,False,False,2,2024-12-11 17:00:00,1
3,12/12/2024,A private contractor who placed a skip on St P...,1,13:00:00,previous day,serious,motorcyclist,0,0,1,...,0,crushed into an obstacle,FXOBJ,2024-12-11,Wednesday,False,False,2,2024-12-11 13:00:00,0
4,14/12/2024,A 29-year-old man and 17-year-old girl were cr...,1,17:30:00,previous day,serious,pedestrian,0,1,0,...,0,running over pedestrian,PED,2024-12-13,Friday,True,False,6,2024-12-13 17:30:00,0
4,14/12/2024,", police were busy responding to another serio...",1,19:45:00,undetermined,serious,pedestrian,0,1,0,...,0,running over pedestrian,PED,2024-12-14,Saturday,False,False,5,2024-12-14 19:45:00,0
7,16/12/2024,A motorcyclist was seriously injured on Sunday...,1,20:30:00,previous day,serious,motorcyclist,0,0,0,...,0,crushed into an obstacle,FXOBJ,2024-12-13,Friday,True,False,6,2024-12-13 20:30:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,07/10/2025,A crowdfunding campaign for a nurse left in a ...,1,08:00:00,undetermined,death,pedestrian,0,0,0,...,0,crushed into an obstacle,FXOBJ,2025-10-04,Saturday,False,False,5,2025-10-04 08:00:00,0
309,08/10/2025,The families of the two couples killed in a ho...,1,09:30:00,undetermined,minor,driver,0,0,0,...,0,crushed into an obstacle,FXOBJ,2025-10-05,Sunday,False,False,6,2025-10-05 09:30:00,0
311,08/10/2025,A Polish man has died after getting into diffi...,1,13:00:00,same day,death,driver,0,0,0,...,1,other,OTH,2025-10-06,Monday,False,False,0,2025-10-06 13:00:00,0
313,09/10/2025,A motorcyclist was left with serious injuries ...,1,09:30:00,same day,serious,motorcyclist,0,0,0,...,1,other,OTH,2025-10-07,Tuesday,False,False,1,2025-10-07 09:30:00,1


In [119]:
local_news_df.dtypes

publish_date        object
content             object
accident_flag        int64
accident_time_24    object
accident_day        object
injury_severity     object
affected_party      object
accident_type       object
corrected_date      object
day_of_week         object
is_ph                 bool
is_eve_ph             bool
day_of_week_num      int64
dtype: object

In [124]:
#saving this output into a csv file to double-check the results
local_news_df.to_csv('local_news_test2.csv', index=False)

In [None]:
#extracted_df['AccidentTimes'].dtype

dtype('O')