In [9]:
import pandas as pd
import jsonlines
import math
import word2number.w2n


### Load Expected Data

In [10]:
orig_expected_df = pd.read_csv('../data/expected.csv', delimiter=';')


In [11]:
expected_df = orig_expected_df[[
    "<url>", 
    "<available_ages_of_the_deceased>", 
    "<total_number_of_people_injured>",
    "<is_reason_or_cause_for_the_accident_ploughed_or_ram_or_hit_or_collision_or_breakfail_or_others>",
]].rename(columns={
    "<url>": "url", 
    "<available_ages_of_the_deceased>": "expectedAgeOfCasualties", 
    "<total_number_of_people_injured>": "expectedInjuries",
    "<is_reason_or_cause_for_the_accident_ploughed_or_ram_or_hit_or_collision_or_breakfail_or_others>": "expectedReason",
})

expected_df['url'] = expected_df['url'].apply(lambda x: (x.replace(">", "").replace("<", ""))).astype(str)
expected_df = expected_df.drop_duplicates(['url'])

print(expected_df.shape)
expected_df.head()


(1670, 4)


Unnamed: 0,url,expectedAgeOfCasualties,expectedInjuries,expectedReason
0,https://www.unb.com.bd/category/Bangladesh/tru...,,<0>,<collision>
1,https://www.unb.com.bd/category/Bangladesh/2-k...,,<4>,<collision>
2,https://www.unb.com.bd/category/Bangladesh/fou...,"<(45, 32, 34)>",<0>,<collision>
3,https://www.unb.com.bd/category/Bangladesh/kar...,"<(60, 55)>",<0>,<capsize>
4,https://www.unb.com.bd/category/Bangladesh/tou...,<32>,<6>,<collision>


### Load Processed Data

In [12]:
data: list[dict] = []

with jsonlines.open("../data/data_processed.jsonl") as reader:
    data = list(reader.iter())

filtered_data = list(map(
    lambda x: {
        "url" : x["url"],
        "ageOfCasualties": x["ageOfCasualties"],
        "injuries": x["injuries"],
        "reason": x["reason"],
    }, 
    data,
))

processed_df = pd.DataFrame(filtered_data)
processed_df["url"] = processed_df["url"].astype(str)

print(processed_df.shape)
processed_df.head()


(864, 4)


Unnamed: 0,url,ageOfCasualties,injuries,reason
0,https://www.unb.com.bd/category/Bangladesh/rec...,[],several,reckless driving
1,https://www.unb.com.bd/category/Bangladesh/out...,[],10,crashed into the other truck
2,https://www.unb.com.bd/category/Bangladesh/3-d...,[],two,battery-run autorickshaw
3,https://www.unb.com.bd/category/Bangladesh/mot...,[45],died,Amzad lost control of his motorcycle and fell ...
4,https://www.unb.com.bd/category/Bangladesh/tee...,[13],Dhaka Medical College morgue for an autopsy.,under the flyover


### Joined dataframes

In [13]:
joined_df = processed_df.merge(expected_df, on='url', how='left')
joined_df.shape


(864, 7)

### Score on Age Of Casualties

In [28]:
total = list(joined_df.iterrows()).__len__()
correct = 0

for index, row in joined_df.iterrows():
    try:
        if type(row['expectedAgeOfCasualties']) == float and math.isnan(row["expectedAgeOfCasualties"]) and row["ageOfCasualties"] == []:
            correct += 1
        elif type(row['expectedAgeOfCasualties']) == str and row['expectedAgeOfCasualties'] == "NA":
            if row["ageOfCasualties"] == []:
                correct += 1 
        elif type(row['expectedAgeOfCasualties']) == str and type(eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", ""))) in (tuple, list):
            if set(eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", ""))).issubset(set(row["ageOfCasualties"])):
                correct += 1
        elif type(row['expectedAgeOfCasualties']) == str and type(eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", ""))) == int:
            if eval(row['expectedAgeOfCasualties'].replace(">", "").replace("<", "")) in row["ageOfCasualties"]:
                correct += 1

    except Exception as e:
        print(f"Error: {e}")
        print(f"Row: {row}")
        print(f"Index: {index}")



print(f"Correct ageOfCasualties: {correct}/{total} ({correct/total*100:.2f}%)")

Error: invalid syntax (<string>, line 1)
Row: url                        https://www.unb.com.bd/category/Bangladesh/col...
ageOfCasualties                                                         [50]
injuries                                                            10:30 am
reason                                               while crossing the road
expectedAgeOfCasualties                                     <20230524 12:38>
expectedInjuries                                                       <hit>
expectedReason                                                         <Bus>
Name: 41, dtype: object
Index: 41
Error: name 'NA' is not defined
Row: url                        https://www.unb.com.bd/category/Bangladesh/lyr...
ageOfCasualties                                                           []
injuries                                                                 one
reason                     a covered van hit the motorbike while he was h...
expectedAgeOfCasualties             

### Score on Injuries

In [24]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    try:
        expected = int(row['expectedInjuries'].replace(">", "").replace("<", ""))
    except Exception as e:
        continue


    try:
        actual = word2number.w2n.word_to_num(row["injuries"])
    except ValueError:
        actual = 0

    if expected == actual:
        correct += 1

print(f"Correct injuries: {correct}/{total} ({correct/total*100:.2f}%)")


Correct injuries: 382/864 (44.21%)


### Score on Reason

In [29]:
total = len(joined_df)
correct = 0

for index, row in joined_df.iterrows():
    if type(row['expectedReason']) == str and row['expectedReason'].replace(">", "").replace("<", "") in row["reason"]:
        correct += 1

print(f"Correct reason: {correct}/{total} ({correct/total*100:.2f}%)")


Correct reason: 181/864 (20.95%)
