In [None]:
! pip install --upgrade pip
! pip install -r ../requirements.txt


In [None]:
import datetime
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta, MO

import pandas as pd
from transformers import pipeline
from gliner import GLiNER


In [4]:
df = pd.read_csv('../data/data.csv', delimiter=';')


In [5]:
df.head()

Unnamed: 0,Publish,Update,Place,HTML_TEXT,RAW_TEXT
0,"January 22, 2024","January 22, 2024",Kishoreganj,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Two people died and one other was injured afte...
1,"December 28, 2023","December 28, 2023",Dhaka,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Police have arrested the driver of a sport uti...
2,"January 15, 2024","January 15, 2024",Bogura,"b'<html lang=""en""> <head> <meta charset=""utf-8...",A motorcyclist died after a truck ran him over...
3,"February 19, 2024","February 19, 2024",Gazipur,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Three people were killed and two others injure...
4,"December 17, 2023","December 17, 2023",Sylhet,"b'<html lang=""en""> <head> <meta charset=""utf-8...",Two men were killed in a head-on collision bet...


In [6]:
row = df.iloc[40]


In [7]:
def process_row(row: pd.Series, model: GLiNER):
    LABELS = ["day when accident happened", "time when accident happened", "vehicle", "casualties"]

    publish, place, text = row["Publish"], row["Place"], row["RAW_TEXT"]

    entities = model.predict_entities(text, LABELS)

    results = {
        "place": place,
        "date": None,
        "time": None,
        "vehicles": [],
        "casualties": 0,
    }

    seen_casualities = []

    for entity in entities:
        label, value = entity["label"], entity["text"]

        if label == "day when accident happened" and not results["date"]:
            try:
                publish_weekday = parse(publish).weekday()
                accident_weekday = parse(value.split(" ")[0]).weekday()
                results["date"] = (parse(publish) - datetime.timedelta(days=(publish_weekday - accident_weekday))).strftime("%Y-%m-%d")
            except:
                pass

        elif label == "time when accident happened" and not results["time"]:
            results["time"] = value

        elif label == "vehicle" and value.lower() not in results["vehicles"]:
            results["vehicles"].append(value.lower())

        elif label == "casualties" and value.lower() not in seen_casualities:
            results["casualties"] += 1
            seen_casualities.append(value.lower())

    return results


In [8]:
model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")

results = []

for i, row in df.iterrows():
    print(f"Processing row {i + 1}/{len(df)}")
    results.append(process_row(row, model))

results[:10]




Processing row 1/1022
Processing row 2/1022
Processing row 3/1022
Processing row 4/1022
Processing row 5/1022
Processing row 6/1022
Processing row 7/1022
Processing row 8/1022
Processing row 9/1022
Processing row 10/1022
Processing row 11/1022
Processing row 12/1022
Processing row 13/1022
Processing row 14/1022
Processing row 15/1022
Processing row 16/1022
Processing row 17/1022
Processing row 18/1022
Processing row 19/1022
Processing row 20/1022
Processing row 21/1022
Processing row 22/1022
Processing row 23/1022
Processing row 24/1022
Processing row 25/1022
Processing row 26/1022
Processing row 27/1022
Processing row 28/1022
Processing row 29/1022
Processing row 30/1022
Processing row 31/1022
Processing row 32/1022
Processing row 33/1022
Processing row 34/1022
Processing row 35/1022
Processing row 36/1022
Processing row 37/1022
Processing row 38/1022
Processing row 39/1022
Processing row 40/1022
Processing row 41/1022
Processing row 42/1022
Processing row 43/1022
Processing row 44/10

[{'place': 'Kishoreganj',
  'date': '2024-01-22',
  'time': '4:45 pm',
  'vehicles': ['motorcycle'],
  'casualties': 3},
 {'place': 'Dhaka',
  'date': '2023-12-28',
  'time': None,
  'vehicles': ['sport utility vehicle (suv)'],
  'casualties': 0},
 {'place': 'Bogura',
  'date': '2024-01-15',
  'time': None,
  'vehicles': ['truck'],
  'casualties': 0},
 {'place': 'Gazipur',
  'date': '2024-02-19',
  'time': 'morning',
  'vehicles': ['battery-run autorickshaw', 'tangail&ndashbound truck'],
  'casualties': 2},
 {'place': 'Sylhet',
  'date': '2023-12-16',
  'time': None,
  'vehicles': ['motorcycle'],
  'casualties': 2},
 {'place': 'Chattogram',
  'date': '2024-02-05',
  'time': None,
  'vehicles': ['private car'],
  'casualties': 0},
 {'place': 'Dhaka',
  'date': '2024-01-04',
  'time': None,
  'vehicles': ['unidentified vehicle'],
  'casualties': 0},
 {'place': 'Chattogram',
  'date': '2024-02-09',
  'time': None,
  'vehicles': ['microbus', 'dhaka-chattogram highway', 'goods-laden truck']