# Génération du jeu de données

## Données d'entraînement

In [1]:
import pandas as pd
import json

train_df = pd.read_csv("train.csv")
train_df = train_df.set_index("id")
train_df.entities = train_df.entities.apply(json.loads)
train_df.relations = train_df.relations.apply(json.loads)
print(train_df.shape)
train_df.head()

(800, 3)


Unnamed: 0_level_0,text,entities,relations
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
181,"Anam Destresse, président de l'ONG ""Ma passion...","[{'id': 0, 'mentions': [{'value': 'accident', ...","[[0, STARTED_IN, 9], [7, IS_LOCATED_IN, 9], [5..."
31669,"À Paris, le 8 avril 2022, l'usine de déodorant...","[{'id': 0, 'mentions': [{'value': 'explosé', '...","[[9, IS_LOCATED_IN, 8], [11, OPERATES_IN, 8], ..."
51470,"En Espagne, dans une région agricole, une cont...","[{'id': 0, 'mentions': [{'value': 'contaminati...","[[7, IS_PART_OF, 8], [9, OPERATES_IN, 1], [0, ..."
51332,Un important incendie a fait des ravages dans ...,"[{'id': 0, 'mentions': [{'value': 'incendie', ...","[[12, IS_IN_CONTACT_WITH, 5], [0, IS_LOCATED_I..."
1131,« Je coule » : onze heures après avoir envoyé ...,"[{'id': 0, 'mentions': [{'value': 'renversé', ...","[[9, IS_LOCATED_IN, 2], [0, START_DATE, 17], [..."


## Extraction des relations annotées

In [2]:
def get_annotated_relations(data):
    possible_relations = set()
    relations = data.relations
    rows = []
    for relation in relations:
        text = data.text
        entities = data.entities
        possible_relations.add((entities[relation[0]]["type"], relation[1], entities[relation[2]]["type"]))
        head_entity = ", ".join(list(set([m["value"] for m in entities[relation[0]]["mentions"]])))
        relation_type = relation[1].lower()
        tail_entity = ", ".join(list(set([m["value"] for m in entities[relation[2]]["mentions"]])))
        relation = str(relation)
        rows.append({"text": text, 
                     "head_entity": head_entity, 
                     "tail_entity": tail_entity, 
                     "type": relation_type, 
                     "relation": relation, 
                     "answer": "true"})
    return possible_relations, rows

possible_relation_profiles = set()
true_examples = []
count = 0 
for uid, data in train_df.iterrows():
    relation_profiles, true_rows = get_annotated_relations(data)
    for profile in relation_profiles:
        possible_relation_profiles.add(profile)
    for row in true_rows:
        row["uid"] = uid
        true_examples.append(row)
    count += 1
    if count == 700:
        split_index_true = len(true_examples)
print(len(true_examples), split_index_true)
profile_df = pd.DataFrame(list(possible_relation_profiles))
profile_df.columns = ["head", "type", "tail"]
profile_df.head()

31469 27657


Unnamed: 0,head,type,tail
0,CIVILIAN,DIED_IN,SUICIDE
1,CRIMINAL_ARREST,START_DATE,TIME_FUZZY
2,NON_GOVERNMENTAL_ORGANISATION,HAS_CONTROL_OVER,PLACE
3,NON_GOVERNMENTAL_ORGANISATION,IS_AT_ODDS_WITH,GROUP_OF_INDIVIDUALS
4,NON_MILITARY_GOVERNMENT_ORGANISATION,IS_IN_CONTACT_WITH,TERRORIST_OR_CRIMINAL


## Extraction de relations synthétiques selon les combinaisons de types d'entités possibles

In [3]:
from tqdm import tqdm

def get_synthetic_relations(data, test=False):
    text = data.text
    true_relations = [] if test else data.relations
    entities = data.entities
    rows = []
    for head_entity in entities:
        head_id = head_entity['id']
        head_type = head_entity['type']
        df = profile_df[profile_df['head'] == head_type]
        for tail_entity in entities:
            tail_id = tail_entity['id']
            tail_type = tail_entity['type']
            for _, relation in df.iterrows():
                if relation['tail'] == tail_type:
                    possible_relation = [head_id, relation['type'], tail_id]
                    if test or possible_relation not in true_relations:
                        head_entity = ", ".join(list(set([m["value"] for m in entities[head_id]["mentions"]])))
                        relation_type = possible_relation[1].lower()
                        tail_entity = ", ".join(list(set([m["value"] for m in entities[tail_id]["mentions"]])))
                        rows.append({"text": text, 
                                     "head_entity": head_entity, 
                                     "tail_entity": tail_entity, 
                                     "type": relation_type, 
                                     "relation": str([head_id, possible_relation[1], tail_id]), 
                                     "answer": "false"})
    return rows
false_examples = []
uids = []
count = 0
for uid, data in tqdm(train_df.iterrows()):
    false_rows = get_synthetic_relations(data)
    for row in false_rows:
        row["uid"] = uid
        false_examples.append(row)
    count += 1
    if count == 700:
        split_index_false = len(false_examples)
len(false_examples), split_index_false

800it [04:45,  2.80it/s]


(279750, 244454)

## Données de test

In [4]:
test_df = pd.read_csv("test_01-07-2024.csv")
test_df = test_df.set_index("id")
test_df.entities = test_df.entities.apply(json.loads)
print(test_df.shape)

(400, 2)


In [5]:
test_queries = []
for uid, data in tqdm(test_df.iterrows()):
    query_rows = get_synthetic_relations(data, test=True)
    for row in query_rows:
        row["uid"] = uid
        test_queries.append(row)
len(test_queries)

400it [02:06,  3.15it/s]


135012

In [6]:
import jsonlines
import random

train_rows = true_examples[:split_index_true]+false_examples[:split_index_false]
random.seed(0)
random.shuffle(train_rows)
with jsonlines.open("dataset/train.jsonl", "w") as writer:
    writer.write_all(train_rows)
with jsonlines.open("dataset/val.jsonl", "w") as writer:
    writer.write_all(true_examples[split_index_true:]+false_examples[split_index_false:])
with jsonlines.open("dataset/test.jsonl", "w") as writer:
    writer.write_all(test_queries)