In [None]:
! pip install --upgrade pip
! pip install -r ../requirements.txt


In [1]:
import datetime
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta, MO

import pandas as pd
from transformers import Pipeline, pipeline
from gliner import GLiNER


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the article dataset; the CSV is semicolon-delimited rather than comma-delimited.
df = pd.read_csv('../data/data.csv', delimiter=';')


In [3]:
# Preview the first ten rows of the loaded frame.
df.head(10)

Unnamed: 0,Publish,Update,Place,HTML_TEXT,RAW_TEXT
0,"January 22, 2024","January 22, 2024",Kishoreganj,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Two people died and one other was injured afte...
1,"December 28, 2023","December 28, 2023",Dhaka,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Police have arrested the driver of a sport uti...
2,"January 15, 2024","January 15, 2024",Bogura,"b'<html lang=""en""> <head> <meta charset=""utf-8...",A motorcyclist died after a truck ran him over...
3,"February 19, 2024","February 19, 2024",Gazipur,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Three people were killed and two others injure...
4,"December 17, 2023","December 17, 2023",Sylhet,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Two men were killed in a head-on collision bet...
5,"February 11, 2024","February 11, 2024",Chattogram,"b'<html lang=""en""> <head> <meta charset=""utf-8...","In a distressing incident last night, reckless..."
6,"January 04, 2024","January 04, 2024",Dhaka,"b'<html lang=""en""> <head> <meta charset=""utf-8...",A teenager was killed after being run over by ...
7,"February 09, 2024","February 09, 2024",Chattogram,"b'<html lang=""en""> <head> <meta charset=""utf-8...",A truck driver's helper was killed and 10 othe...
8,"December 26, 2023","December 26, 2023",Tangail,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Two people died and three others were injured ...
9,"February 15, 2024","February 15, 2024",Sylhet,"b'<html lang=""en""> <head> <meta charset=""utf-8...",During a routine vehicle inspection on the Syl...


In [4]:
# Grab the article at positional index 40 as a single sample row.
# NOTE(review): no later visible cell reads this value — presumably a leftover
# from interactive exploration; confirm before relying on it.
row = df.iloc[40]


In [5]:
def ask_questions(qa: Pipeline, context: str):
    """Build a five-step event sequence by chaining QA prompts over ``context``.

    ``qa`` is called with a ``{'question', 'context'}`` dict and the ``"answer"``
    entry of its result is used. The central event is asked for first; each
    "before"/"after" prompt then embeds the previously obtained answer.

    Returns a list of five answers ordered as
    ``[two steps before, one step before, main event, one step after, two steps after]``.
    """

    def answer_to(question: str):
        # Every prompt is posed against the same context passage.
        return qa({'question': question, 'context': context})["answer"]

    # The call order matters: each follow-up prompt is built from an earlier answer.
    main_event = answer_to('What happened?')

    one_before = answer_to(f'What happened before {main_event}?')
    two_before = answer_to(f'What happened before {one_before}?')

    one_after = answer_to(f'What happened after {main_event}?')
    two_after = answer_to(f'What happened after {one_after}?')

    return [two_before, one_before, main_event, one_after, two_after]

def _resolve_accident_date(publish: str, day_text: str):
    """Resolve a weekday mention to a calendar date on or before the publish date.

    Only the first space-separated word of ``day_text`` is parsed, and only its
    weekday is used. The result is the most recent date on or before ``publish``
    that falls on that weekday, formatted as ``YYYY-MM-DD``; ``None`` is returned
    when either string cannot be parsed.
    """
    try:
        publish_date = parse(publish)
        accident_weekday = parse(day_text.split(" ")[0]).weekday()
    except (ValueError, OverflowError, TypeError):
        # Best-effort: unparseable mentions (or a missing publish date) simply
        # leave the date unresolved instead of aborting the row.
        return None

    # Modulo 7 keeps the offset in 0..6, so an accident weekday that is "later in
    # the week" than the publish weekday maps to the previous week rather than to
    # a date after publication.
    # NOTE(review): if the first word is not a weekday name (e.g. a month), the
    # parser presumably fills missing fields from today's date, making the
    # weekday arbitrary — confirm against the data.
    days_back = (publish_date.weekday() - accident_weekday) % 7
    return (publish_date - datetime.timedelta(days=days_back)).strftime("%Y-%m-%d")


def process_row(row: pd.Series, ner: GLiNER, qa: Pipeline):
    """Extract structured accident facts from one article row.

    Reads ``row["Publish"]``, ``row["Place"]`` and ``row["RAW_TEXT"]``, runs
    ``ner.predict_entities`` over the text with a fixed label set, then asks
    ``qa`` for the crash reason, the number of injured and (through
    ``ask_questions``) the sequence of events.

    Returns a dict with the keys ``place``, ``date``, ``time``, ``vehicles``,
    ``casualties``, ``ageOfCasualties``, ``injuries``, ``reason`` and
    ``sequenceOfEvents``. Notes on the values:

    * ``date`` is derived from the first resolvable day mention, or ``None``.
    * ``time`` is the first time mention, verbatim, or ``None``.
    * ``vehicles`` holds distinct lower-cased vehicle mentions in order seen.
    * ``casualties`` counts distinct (case-insensitive) casualty mentions; it is
      not a parsed head-count.
    * ``injuries`` and ``reason`` are the raw answer strings returned by ``qa``.
    """
    LABELS = ["day when accident happened", "time when accident happened", "vehicle", "casualties", "age of people who died"]

    publish, place, text = row["Publish"], row["Place"], row["RAW_TEXT"]

    entities = ner.predict_entities(text, LABELS)

    results = {
        "place": place,
        "date": None,
        "time": None,
        "vehicles": [],
        "casualties": 0,
        "ageOfCasualties": [],
        "injuries": 0,
        "reason": "",
        "sequenceOfEvents": [],
    }

    # Lower-cased casualty mentions already counted.
    seen_casualties = set()

    for entity in entities:
        label, value = entity["label"], entity["text"]

        if label == "day when accident happened" and not results["date"]:
            # Stays None on failure, so a later day mention gets another chance.
            results["date"] = _resolve_accident_date(publish, value)

        elif label == "time when accident happened" and not results["time"]:
            results["time"] = value

        elif label == "vehicle" and value.lower() not in results["vehicles"]:
            results["vehicles"].append(value.lower())

        elif label == "casualties" and value.lower() not in seen_casualties:
            results["casualties"] += 1
            seen_casualties.add(value.lower())

        elif label == "age of people who died":
            results["ageOfCasualties"].append(value)

    answer_reason = qa({
        'question': 'How did the vehicle crash?',
        'context': text
    })
    answer_injuries = qa({
        'question': 'How many people were injured?',
        'context': text
    })

    results["injuries"] = answer_injuries["answer"]
    results["reason"] = answer_reason["answer"]
    results["sequenceOfEvents"] = ask_questions(qa, text)

    return results


In [7]:
# Load the entity extractor and the extractive question-answering pipeline.
ner = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")
qa_model_name = "deepset/roberta-base-squad2"
qa = pipeline('question-answering', model=qa_model_name, tokenizer=qa_model_name)

# Number of leading rows to process in this run.
N_ROWS = 10
sample = df.head(N_ROWS)

results = []

# Report progress by position within the processed subset: the index label is
# not guaranteed to be a 0-based position, and the total must match the rows
# actually processed, not the whole frame.
for position, (_, row) in enumerate(sample.iterrows(), start=1):
    print(f"Processing row {position}/{len(sample)}")
    results.append(process_row(row, ner, qa))

results[:N_ROWS]


Processing row 1/1022
Processing row 2/1022
Processing row 3/1022
Processing row 4/1022
Processing row 5/1022
Processing row 6/1022
Processing row 7/1022
Processing row 8/1022
Processing row 9/1022
Processing row 10/1022


[{'place': 'Kishoreganj',
  'date': '2024-01-22',
  'time': '4:45 pm',
  'vehicles': ['motorcycle'],
  'casualties': 0,
  'ageOfCasualties': ['25'],
  'injuries': 'three',
  'reason': 'hit a road pillar',
  'sequenceOfEvents': ['hit a road pillar',
   'motorcycle',
   'hit a road pillar',
   'Two people died and one other was injured',
   'the motorcycle they were on lost control and hit a road pillar']},
 {'place': 'Dhaka',
  'date': '2023-12-28',
  'time': None,
  'vehicles': ['sport utility vehicle (suv)'],
  'casualties': 0,
  'ageOfCasualties': ['8'],
  'injuries': 'one',
  'reason': 'a passenger shed',
  'sequenceOfEvents': ['ploughed into a passenger shed',
   'sport utility vehicle',
   'ploughed into a passenger shed',
   'killing three people and injuring one',
   'Police have arrested']},
 {'place': 'Bogura',
  'date': '2024-01-15',
  'time': None,
  'vehicles': ['truck'],
  'casualties': 0,
  'ageOfCasualties': ['45'],
  'injuries': 'died',
  'reason': 'Amzad lost control o