In [1]:
import pandas as pd
import jsonlines
import math
import word2number.w2n


### Load Expected Data

In [2]:
orig_expected_df = pd.read_csv('../data/expected.csv', delimiter=';')


In [3]:
expected_df = orig_expected_df[[
    "<primary_vehicle_involved>",
    "<secondary_vehicle_involved>",
    "<tertiary_vehicle_involved>",
    "<any_more_vehicles_involved>",
    "<url>", 
    "<available_ages_of_the_deceased>", 
    "<total_number_of_people_injured>",
    "<is_reason_or_cause_for_the_accident_ploughed_or_ram_or_hit_or_collision_or_breakfail_or_others>",
    "<total_number_of_people_killed>",
    "<accident_datetime_from_url>",
    "<day_of_the_week_of_the_accident>",
    "<location metadata>",
]].rename(columns={
    "<url>": "url", 
    "<available_ages_of_the_deceased>": "expectedAgeOfCasualties", 
    "<total_number_of_people_injured>": "expectedInjuries",
    "<is_reason_or_cause_for_the_accident_ploughed_or_ram_or_hit_or_collision_or_breakfail_or_others>": "expectedReason",
    "<total_number_of_people_killed>": "expectedCasualties",
    "<accident_datetime_from_url>": "expectedDate",
    "<day_of_the_week_of_the_accident>": "expectedDayOfWeek",
    "<location metadata>": "expectedPlace",
})

def combine_vehicles(row):
    vehicles = []
    if pd.notna(row['<primary_vehicle_involved>']):
        vehicles.append(row['<primary_vehicle_involved>'])
    if pd.notna(row['<secondary_vehicle_involved>']):
        vehicles.append(row['<secondary_vehicle_involved>'])
    if pd.notna(row['<tertiary_vehicle_involved>']):
        vehicles.append(row['<tertiary_vehicle_involved>'])
    if pd.notna(row['<any_more_vehicles_involved>']):
        vehicles.append(row['<any_more_vehicles_involved>'])

    return vehicles

expected_df['expectedVehicles'] = expected_df.apply(combine_vehicles, axis=1)

def clean_vehicles(vehicles):
    return [vehicle.replace(">", "").replace("<", "").lower() for vehicle in vehicles]

expected_df['expectedVehicles'] = expected_df['expectedVehicles'].apply(clean_vehicles)

expected_df = expected_df.drop(columns=[
    "<primary_vehicle_involved>",
    "<secondary_vehicle_involved>",
    "<tertiary_vehicle_involved>",
    "<any_more_vehicles_involved>",
])

expected_df['url'] = expected_df['url'].apply(lambda x: (x.replace(">", "").replace("<", ""))).astype(str)
expected_df = expected_df.drop_duplicates(['url'])

print(expected_df.shape)
expected_df.head()


(1670, 9)


Unnamed: 0,url,expectedAgeOfCasualties,expectedInjuries,expectedReason,expectedCasualties,expectedDate,expectedDayOfWeek,expectedPlace,expectedVehicles
0,https://www.unb.com.bd/category/Bangladesh/tru...,,<0>,<collision>,<3>,<20221008 01:02>,<Friday>,<Jashore>,"[truck, motorcycle]"
1,https://www.unb.com.bd/category/Bangladesh/2-k...,,<4>,<collision>,<2>,<20221012 12:24>,<Wednesday>,<N’ganj>,"[bus, cng]"
2,https://www.unb.com.bd/category/Bangladesh/fou...,"<(45, 32, 34)>",<0>,<collision>,<4>,<20221015 10:25>,<Saturday>,<Gazipur>,"[cycle van, bus, bus, pedestrian]"
3,https://www.unb.com.bd/category/Bangladesh/kar...,"<(60, 55)>",<0>,<capsize>,<7>,<20221015 15:02>,<Tuesday>,<Chattogram>,[trawler]
4,https://www.unb.com.bd/category/Bangladesh/tou...,<32>,<6>,<collision>,<1>,<20221019 18:12>,<Wednesday>,<Rangamati>,[chander gari]


### Load Processed Data

In [4]:
data: list[dict] = []

with jsonlines.open("../data/data_processed.jsonl") as reader:
    data = list(reader.iter())

filtered_data = list(map(
    lambda x: {
        "url" : x["url"],
        "ageOfCasualties": x["ageOfCasualties"],
        "injuries": x["injuries"],
        "reason": x["reason"],
        "vehicles": x["vehicles"],
        "casualties": x["casualties"],
        "date": x["date"],
        "place": x["place"]["name"],
    }, 
    data,
))

processed_df = pd.DataFrame(filtered_data)
processed_df["url"] = processed_df["url"].astype(str)
processed_df['vehicles'] = processed_df['vehicles'].apply(clean_vehicles)


print(processed_df.shape)
processed_df.head()


(864, 8)


Unnamed: 0,url,ageOfCasualties,injuries,reason,vehicles,casualties,date,place
0,https://www.unb.com.bd/category/Bangladesh/rec...,[],several,reckless driving,[private car],0,,chattogram
1,https://www.unb.com.bd/category/Bangladesh/out...,[],10,crashed into the other truck,"[microbus, dhaka-chattogram highway, goods-lad...",0,2024-02-09,chattogram
2,https://www.unb.com.bd/category/Bangladesh/3-d...,[],two,battery-run autorickshaw,"[battery-run autorickshaw, tangail&ndashbound ...",0,2024-02-19,gazipur
3,https://www.unb.com.bd/category/Bangladesh/mot...,[45],died,Amzad lost control of his motorcycle and fell ...,[truck],1,2024-01-15,bogura
4,https://www.unb.com.bd/category/Bangladesh/tee...,[13],Dhaka Medical College morgue for an autopsy.,under the flyover,[unidentified vehicle],1,2024-01-04,dhaka


### Joined dataframes

In [5]:
joined_df = processed_df.merge(expected_df, on='url', how='left')
joined_df.shape


(864, 16)

### Score on Age Of Casualties

In [8]:
total = list(joined_df.iterrows()).__len__()
correct = 0

for index, row in joined_df.iterrows():
    try:
        if type(row['expectedAgeOfCasualties']) == float and math.isnan(row["expectedAgeOfCasualties"]) and row["ageOfCasualties"] == []:
            correct += 1
        elif type(row['expectedAgeOfCasualties']) == str and row['expectedAgeOfCasualties'] == "NA":
            if row["ageOfCasualties"] == []:
                correct += 1 
        elif type(row['expectedAgeOfCasualties']) == str and type(eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", ""))) in (tuple, list):
            if set(eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", ""))).issubset(set(row["ageOfCasualties"])):
                correct += 1
        elif type(row['expectedAgeOfCasualties']) == str and type(eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", ""))) == int:
            if eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", "")) in row["ageOfCasualties"]:
                correct += 1

    except Exception as e:
        pass


print(f"Correct ageOfCasualties: {correct}/{total} ({correct/total*100:.2f}%)")

Correct ageOfCasualties: 244/864 (28.24%)


### Score on Injuries

In [9]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    try:
        expected = int(row['expectedInjuries'].replace(">", "").replace("<", ""))
    except Exception as e:
        continue


    try:
        actual = word2number.w2n.word_to_num(row["injuries"])
    except ValueError:
        actual = 0

    if expected == actual:
        correct += 1

print(f"Correct injuries: {correct}/{total} ({correct/total*100:.2f}%)")


Correct injuries: 382/864 (44.21%)


### Score on Reason

In [10]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    if type(row['expectedReason']) == str and row['expectedReason'].replace(">", "").replace("<", "") in row["reason"]:
        correct += 1

print(f"Correct reason: {correct}/{total} ({correct/total*100:.2f}%)")


Correct reason: 181/864 (20.95%)


### Score on Vehicles

In [11]:
total = len(joined_df)
correct = 0

# for index, row in joined_df.iterrows():
#     print(row['expectedVehicles'], row['vehicles'])
#     if type(row['expectedVehicles']) == list and set(row['expectedVehicles']) == set(row['vehicles']):
#         correct += 1
for index, row in joined_df.iterrows():
    print(row['expectedVehicles'], row['vehicles'])
    if type(row['expectedVehicles']) == list:
        if len(row['expectedVehicles']) == 0:
            if len(row['vehicles']) == 0:
                correct += 1
            continue

        correct_vehicles = sum(expected in vehicle for expected in row['expectedVehicles'] for vehicle in row['vehicles'])
        correct += correct_vehicles / len(row['expectedVehicles'])



print(f"Correct vehicles: {correct}/{total} ({correct/total*100:.2f}%)")

['car', 'microbus'] ['private car']
['truck', '20240209 18:59'] ['microbus', 'dhaka-chattogram highway', 'goods-laden truck']
['truck', 'auto rickshaw'] ['battery-run autorickshaw', 'tangail&ndashbound truck']
['truck', 'motorcycle'] ['truck']
['unknown'] ['unidentified vehicle']
['motorcycle'] ['motorcycle']
['bus', 'pedestrian'] ['real coach']
['car'] ['sport utility vehicle (suv)']
['train', 'pedestrian'] ['speedy train']
['cng', 'truck'] ['cng autorickshaw', 'truck', 'cng']
['bus'] ['dhaka-bound bus', 'bangabandhu expressway']
['motorcycle', 'motorcycle'] ['motorcycle']
['(truck, mahindra)'] ['sand-laden truck']
['motorcycle', 'pickup'] ['pick-up van', 'fish-laden pick-up van']
['truck', 'cng', 'shallow engine'] ['truck', 'cng run auto-rickshaw', 'shallow engine', 'cng-run auto-rickshaw']
['microbus', 'cng', 'truck', 'truck'] ['dhaka-sylhet highway', 'truck', 'microbus', 'cng-run auto-rickshaw']
['truck', 'cng', 'bus'] ['sand-laden truck', 'bus', 'cng-run autorickshaw']
['motorcycl

### Score on Casualties

In [12]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    expected = row["expectedCasualties"]
    
    if pd.isna(expected) and row["casualties"] == 0:
        correct += 1
        continue
    
    if type(expected) is float:
        if math.isnan(expected) and row["casualties"] == 0:
            correct += 1    
        continue
    
    try:
        expected = int(row['expectedCasualties'].replace(">", "").replace("<", ""))
    except Exception as e:
        continue
    
    try:
        actual = word2number.w2n.word_to_num(row["casualties"])
    except ValueError:
        actual = 0

    if expected == actual:
        correct += 1

print(f"Correct casualties: {correct}/{total} ({correct/total*100:.2f}%)")


Correct casualties: 87/864 (10.07%)


### Score on date

In [13]:
from dateutil.parser import parse

total = len(joined_df)
correct = 0

day_mapping = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}

for index, row in joined_df.iterrows():
    if type(row['expectedDate']) != str:
        continue
    if type(row['date']) != str:
        continue

    try:
        expectedDate = parse(row['expectedDate'].replace("<", "").replace(">", "")).date()
        actualDate = parse(row['date']).date()
        if expectedDate == actualDate:
            correct += 1
        elif abs((expectedDate - actualDate).days) < 7:
            expectedDay = day_mapping[row["expectedDayOfWeek"].replace("<", "").replace(">", "")]
            actualDay = parse(row['date']).weekday()   
            if expectedDay == actualDay:
                correct += 0.75
    except Exception as e:
        continue
        # print(f"Error: {e}")
        # print(f"Row: {row}")
        # print(f"Index: {index}")

print(f"Correct dates: {correct}/{total} ({correct/total*100:.2f}%)")

Correct dates: 460.5/864 (53.30%)


### Score on place

In [14]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    if type(row['expectedPlace']) == str and row['expectedPlace'].replace(">", "").replace("<", "").lower() == row["place"]:
        correct += 1
    else:
        #print(row['expectedPlace'], row["place"])
        pass

print(f"Correct place: {correct}/{total} ({correct/total*100:.2f}%)")

Correct place: 343/864 (39.70%)
