In [34]:
import pandas as pd
import jsonlines
import math
import word2number.w2n


### Load Expected Data

In [2]:
orig_expected_df = pd.read_csv('../data/expected.csv', delimiter=';')
orig_expected_df.head()


Unnamed: 0,<url>,<publication date metadata>,<location metadata>,<title>,<raw text>,<number_of_accidents_occured>,<is_the_accident_data_yearly_monthly_or_daily>,<day_of_the_week_of_the_accident>,<exact_location_of_accident>,<area_of_accident>,...,<total_number_of_people_killed>,<total_number_of_people_injured>,<is_reason_or_cause_for_the_accident_ploughed_or_ram_or_hit_or_collision_or_breakfail_or_others>,<primary_vehicle_involved>,<secondary_vehicle_involved>,<tertiary_vehicle_involved>,<any_more_vehicles_involved>,<available_ages_of_the_deceased>,<accident_datetime_from_url>,file_name
0,<https://www.unb.com.bd/category/Bangladesh/tr...,<20221008>,<Jashore>,<Truck crushes three twelfth-graders of same i...,<Three twelfth-graders of the same educational...,<1>,<D>,<Friday>,<Natunhat Gazir Dorga on the Benapole-Jashore ...,<Jashore Sadar>,...,<3>,<0>,<collision>,<Truck>,<Motorcycle>,,,,<20221008 01:02>,SuheybBecerek_TrifebiShinaSabrila_20240219_202...
1,<https://www.unb.com.bd/category/Bangladesh/2-...,<20221012>,<N’ganj>,"<2 killed, 4 injured as bus hits auto in N’ganj>","<Two people, including a college student, were...",<1>,<D>,<Wednesday>,<Dhaka-Sylhet highway in Araihazar upazila>,<Araihazar>,...,<2>,<4>,<collision>,<Bus>,<CNG>,,,,<20221012 12:24>,SuheybBecerek_TrifebiShinaSabrila_20240219_202...
2,<https://www.unb.com.bd/category/Bangladesh/fo...,<20221015>,<Gazipur>,<Four die in horrific road crash in Gazipur>,<Four people died when a cycle van was sandwic...,<1>,<D>,<Saturday>,<Telipara area of Dhaka-Mymensingh highway>,<Gazipur>,...,<4>,<0>,<collision>,<Cycle Van>,<Bus>,<Bus>,<Pedestrian>,"<(45, 32, 34)>",<20221015 10:25>,SuheybBecerek_TrifebiShinaSabrila_20240219_202...
3,<https://www.unb.com.bd/category/Bangladesh/ka...,<20221015>,<Chattogram>,<Karnaphuli trawler capsize: 2 more bodies rec...,<The death toll from the trawler capsize in th...,<1>,<D>,<Tuesday>,<River Karnaphuli in Patenga area>,<River Karnaphuli in Patenga area>,...,<7>,<0>,<capsize>,<Trawler>,,,,"<(60, 55)>",<20221015 15:02>,SuheybBecerek_TrifebiShinaSabrila_20240219_202...
4,<https://www.unb.com.bd/category/Bangladesh/to...,<20221019>,<Rangamati>,<Tourist killed in Rangamati road accident>,<A tourist was killed and six others were inju...,<1>,<D>,<Wednesday>,<House Para from Sajek’s Ruilui Para>,<Sajek>,...,<1>,<6>,<collision>,<Chander Gari>,,,,<32>,<20221019 18:12>,SuheybBecerek_TrifebiShinaSabrila_20240219_202...


In [3]:
expected_df = orig_expected_df[[
    "<url>", 
    "<available_ages_of_the_deceased>", 
    "<total_number_of_people_injured>",
    "<is_reason_or_cause_for_the_accident_ploughed_or_ram_or_hit_or_collision_or_breakfail_or_others>",
]].rename(columns={
    "<url>": "url", 
    "<available_ages_of_the_deceased>": "expectedAgeOfCasualties", 
    "<total_number_of_people_injured>": "expectedInjuries",
    "<is_reason_or_cause_for_the_accident_ploughed_or_ram_or_hit_or_collision_or_breakfail_or_others>": "expectedReason",
})
expected_df.head()

Unnamed: 0,url,expectedAgeOfCasualties,expectedInjuries,expectedReason
0,<https://www.unb.com.bd/category/Bangladesh/tr...,,<0>,<collision>
1,<https://www.unb.com.bd/category/Bangladesh/2-...,,<4>,<collision>
2,<https://www.unb.com.bd/category/Bangladesh/fo...,"<(45, 32, 34)>",<0>,<collision>
3,<https://www.unb.com.bd/category/Bangladesh/ka...,"<(60, 55)>",<0>,<capsize>
4,<https://www.unb.com.bd/category/Bangladesh/to...,<32>,<6>,<collision>


### Load Processed Data

In [4]:
data: list[dict] = []

with jsonlines.open("../data/data_processed_short.jsonl") as reader:
    data = list(reader.iter())

filtered_data = list(map(
    lambda x: {
        "url" : x["url"],
        "ageOfCasualties": x["ageOfCasualties"],
        "injuries": x["injuries"],
        "reason": x["reason"],
    }, 
    data,
))

processed_df = pd.DataFrame(filtered_data)
processed_df.head()


Unnamed: 0,url,ageOfCasualties,injuries,reason
0,https://www.unb.com.bd/category/Bangladesh/rec...,[],several,reckless driving
1,https://www.unb.com.bd/category/Bangladesh/out...,[],10,crashed into the other truck
2,https://www.unb.com.bd/category/Bangladesh/3-d...,[],two,battery-run autorickshaw
3,https://www.unb.com.bd/category/Bangladesh/mot...,[45],died,Amzad lost control of his motorcycle and fell ...
4,https://www.unb.com.bd/category/Bangladesh/tee...,[13],Dhaka Medical College morgue for an autopsy.,under the flyover


### Joined dataframes

In [5]:
joined_df = pd.merge(expected_df, processed_df, on="url")
joined_df.head()


Unnamed: 0,url,expectedAgeOfCasualties,expectedInjuries,expectedReason,ageOfCasualties,injuries,reason
0,https://www.unb.com.bd/category/Bangladesh/3-d...,,<2>,<hit>,[],two,battery-run autorickshaw


### Score on Age Of Casualties

In [27]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    if type(row['expectedAgeOfCasualties']) == float and math.isnan(row["expectedAgeOfCasualties"]) and row["ageOfCasualties"] == []:
        correct += 1
    elif type(row['expectedAgeOfCasualties']) == str and row['expectedAgeOfCasualties'] == "NA":
        if row["ageOfCasualties"] == []:
            correct += 1 
    elif type(row['expectedAgeOfCasualties']) == str and set(eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", ""))).issubset(set(row["ageOfCasualties"])):
        correct += 1



print(f"Correct: {correct}/{total} ({correct/total*100:.2f}%)")

Correct: 1/1 (100.00%)


### Score on Age Of Casualties

In [46]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    expected = int(row['expectedInjuries'].replace(">", "").replace("<", ""))
    
    try:
        actual = word2number.w2n.word_to_num(row["injuries"])
    except ValueError:
        actual = 0

    if expected == actual:
        correct += 1

print(f"Correct: {correct}/{total} ({correct/total*100:.2f}%)")


Correct: 1/1 (100.00%)


### Score on Reason

In [49]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    if row['expectedReason'] in row["reason"]:
        correct += 1

print(f"Correct: {correct}/{total} ({correct/total*100:.2f}%)")

Correct: 0/1 (0.00%)
