# Datasets

`Melina Paxinou`

MA Linguistics, Text Mining - Vrije Universiteit Amsterdam

June 27, 2025

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import pandas as pd
import spacy
import numpy as np
nlp = spacy.load("en_core_web_md")
from openpyxl.utils.exceptions import IllegalCharacterError
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
from transformers import RobertaTokenizerFast
import random
import os
from rapidfuzz.fuzz import ratio
from functools import lru_cache
from tqdm import tqdm
import xml.etree.ElementTree as ET
import re
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
import itertools
from itertools import combinations
import krippendorff
import random
from sklearn.model_selection import train_test_split
import networkx as nx
from collections import defaultdict
from spacy.tokens import Doc
import json
import ast
from difflib import SequenceMatcher

In [4]:
polywords = [
    "a bit", "a capella", "a fortiori", "a good deal", "a great deal",
    "a heck of a lot", "a hell of a lot", "à la", "à la carte", "à la mode",
    "à la provençale", "a little", "a little bit", "a lot", "a posteriori",
    "a priori", "a propos", "ab initio", "according as", "according to",
    "ad astra", "ad hoc", "ad hominem", "ad infinitum", "ad lib", "ad libs",
    "ad nauseam", "ad valorem", "adjacent to", "affaire d'honneur",
    "affaire de coeur", "affaire du coeur", "affaires d'honneur",
    "affaires de coeur", "agent provocateur", "agents provocateurs",
    "agnus dei", "ahead of", "aide de camp", "aide memoire", "aides de camp",
    "al dente", "al fresco", "all at once", "all but", "all of a sudden",
    "all right", "all the same", "alla breve", "alma mater", "alma maters",
    "along with", "alter ego", "alter egos", "an awful lot", "an' all",
    "ancien régime", "and so forth", "and so on", "anno dom", "anno domini",
    "annus horribilis", "annus mirabilis", "ante meridiem", "any longer",
    "anything but", "apart from", "aqua vitae", "art deco", "art nouveau",
    "as against", "as between", "as for", "as from", "as if", "as it were",
    "as long as", "as of", "as opposed to", "as regards", "as soon as",
    "as though", "as to", "as usual", "as well as", "as yet", "aside from",
    "asti spumante", "at all", "at best", "at first", "at large", "at last", "at least", 
    "at length", "at long last", "at long length", "at most", "at once", "at present", 
    "at random", "at worst", "au contraire", "au fait", "au naturel", "au pair", "au pairs", 
    "au revoir", "auf Wiedersehen", "aurora australis", "aurora borealis", "avant garde", 
    "away from", "bar mitzvah", "bar mitzvahs", "basso continuo", "basso profundo", "beau monde", 
    "beaujolais nouveau", "because of", "belles lettres", "bête noir", "bête noire", "bêtes noires", 
    "billet doux", "billets doux", "bon appetit", "bon jour", "bon mot", "bon soir", "bon vivant", 
    "bon viveur", "bon voyage", "bona fide", "bona fides", "bons mots", "bons vivants", "bons viveurs", 
    "bouquet garni", "brand new", "bric à brac", "but for", "by and by", "by and large", "by far", 
    "by far and away", "by means of", "by no means", "by now", "by reason of", "by the by", "by way of", 
    "café au lait", "camera obscura", "camera obscuras", "carte blanche", "casus belli", "casus omissus", 
    "cause célèbre", "causes célèbres", "caveat emptor", "ceteris paribus", "chaise longue", 
    "chaise longues", "chaises longues", "chargé d'affaires", "chargés d'affaires", "check outs", 
    "chef d'oeuvre", "chez moi", "chez nous", "chez vous", "chilli con carne", "chop suey", "chow mein", 
    "clamp down", "close to", "compos mentis", "con brio", "con fuoco", "con moto", "considering that", 
    "contrary to", "cordon bleu", "cordon sanitaire", "corps de ballet", "corpus delicti", "corpus juris", 
    "coup d'état", "coup de foudre", "coup de grâce", "coup de théâtre", "coups d'état", "coups de grâce", 
    "coups de théâtre", "crème brulée", "crème de la crème", "crème de menthe", "crème fraîche", 
    "cri de coeur", "crime passionel", "crimes passionels", "cris de coeur", "croix de guerre", "cul de sac", 
    "curriculum vitae", "danse macabre", "danse ronde", "de facto", "de jure", "de luxe", "de profundis", 
    "de rigeur", "de rigueur", "de trop", "decree nisi", "dei gratia", "déjà vu", "delirium tremens", 
    "demi monde", "depending on", "deus ex machina", "double entendre", "double entendres", "doubles entendres", 
    "dramatis personae", "due to", "each other", "eau de cologne", "eminence grise", "en bloc", "en famille", 
    "en masse", "en passant", "en route", "en suite", "enfant terrible", "enfants terribles", "entente cordiale", 
    "esprit de corps", "et al", "et cetera", "even if", "even so", "even though", "even when", "ever so", 
    "every so often", "ex ante", "ex army", "ex cathedra", "ex gratia", "ex hypothesi", "ex libris", "ex officio", 
    "ex parte", "ex post", "ex post facto", "ex silentio", "ex tempore", "ex turpi causa", "ex vitro", "ex vivo", 
    "except for", "except that", "excepting for", "fair do's", "fait accompli", "far from", "far off", 
    "faute de mieux", "faux ami", "faux amis", "faux pas", "fed up", "femme fatale", "femmes fatales", 
    "film noir", "films noirs", "fin de siècle", "fines herbes", "foie gras", "follow up", "for certain", 
    "for ever", "for example", "for fear of", "for good", "for instance", "for keeps", "for long", "for once", 
    "for sure", "for the most part", "for the time being", "force majeure", "from now on", "from time to time", 
    "fromage frais", "gee whizz", "genius loci", "getting on for", "given that", "grand mal", "grand prix", 
    "grande dame", "grands prix", "grown up", "grown ups", "guardian ad litem", "gung ho", "habeas corpus", 
    "half way", "hara kiri", "hard up", "hasta la vista", "hasta luego", "haute couture", "haute cuisine", 
    "have nots", "heave ho", "hey presto", "higgledy piggledy", "hocus pocus", "hoi polloi",
    "hoity toity","homo sapiens","hooray henry","hooray henrys","hors d'oeuvre","hors d'oeuvres", "hors de combat",
    "hotch potch","hush hush","hysteron proteron","idée fixe","ignis fatuus","in absentia","in accord with","in accordance with",
    "in addition","in addition to", "in aid of","in answer to","in as much as","in association with",
    "in back of", "in between", "in brief","in camera","in case","in case of","in charge of", "in co-operation with","in common",
    "in common with", "in comparison with", "in conjunction with","in connection with", "in consultation with", "in contact with", "in cooperation with", "in course with",
    "in defence of", "in defiance of","in excess of","in extremis","in face of","in favor of","in favour of",
    "in flagrante delicto","in front of","in full","in general","in keeping with","in lieu of","in light of",
    "in line with","in loco parentis","in medias res","in memoriam","in need of","in order","in part",
    "in particular", "in perpetuum", "in place of", "in possession of", "in private", "in proportion to", "in propria persona",
    "in public","in pursuit of","in quest of","in receipt of","in regard to","in relation to","in reply to",
    "in respect of","in response to","in return for","in search of","in short","in situ","in so far as",
    "in spite of","in support of","in terms of","in that","in the light of","in the main","in the order of",
    "in toto","in touch with","in vain","in view of","in vino veritas","in vitro","in vivo","inasmuch as","infra dig",
    "infra dignitatem","inside out","insofar as","insomuch as",
    "instead of", "inter alia", "into line with", "ipso facto", "irrespective of", "je ne sais quoi","joie de vivre",
    "just about", "kind of", "know how", "kung fu", "la dolce vita", "laissez faire", "lasagne verde","le mot juste","less than",
    "let 's", "let alone","lingua franca", "lo and behold", "loc cit", "locum tenens", "locus classicus",
    "long-term wise", "magna carta", "magna cum laude", "magnum opus", "maître d", "maître d'","maître d'hôtel",
    "mal de mer", "mardi gras", "matter of fact","mea culpa","Médecins sans Frontières",
    "ménage a trois","mezzo soprano",  "modus operandi", "modus vivendi", "more than", "mot juste",
    "mumbo jumbo", "mutatis mutandis", "near to", "nearer to","nearest to", "nem con", "next to",
    "nigh on","nitty gritty","no doubt","no longer",
    "no matter how", "no matter what", "no matter when", "no matter where", 
    "no matter which", "no matter who", "no matter whom", "no matter whose", 
    "no one", "noblesse oblige", "nom de guerre", "nom de plume", "noms de guerre", 
    "noms de plume", "non compos mentis", "non sequitur", "non sequiturs", 
    "none other", "none the", "none the less", "not withstanding", "nouveau rich", 
    "nouveau riche", "nouveaux riches", "nouvelle cuisine", "now that", "o' course", 
    "obiter dictum", "objet d'art", "objets d'art", "of course", "off guard", 
    "off of", "oft times", "okey doke", "okey dokey", "old fashioned", 
    "on account of", "on behalf of", "on board", "on the part of", "on to", 
    "on top of", "once again", "once and for all", "once more", "one 's", 
    "one another", "op cit", "other than", "out front", "out of", "out of date", 
    "out of line with", "out of touch with", "outside of", "over here", "over there", 
    "owing to", "papier mâché", "par excellence", "pari passu", "pas de deux", 
    "pâté de foie gras", "pax americana", "pax britannica", "pax romana", 
    "per annum", "per capita", "per cent", "per diem", "per se", "persona non grata", 
    "personae non gratae", "pertaining to", "petit bourgeois", "petit four", 
    "petit mal", "petite bougeoisie", "petits bourgeois", "pièce de résistance", 
    "pied à terre", "pina colada", "pina coladas", "pince nez", "poco a poco", 
    "point blank", "porte cochère", "post hoc", "post meridiem", "post mortem", 
    "post mortems", "poste restante", "pot pourri", "prima donna", "prima donnas", 
    "prima facie", "primus inter pares", "prior to", "pro forma", "pro rata", 
    "pro tem", "provided that", "providing that", "pursuant to", "qui vive", 
    "quid pro quo", "raison d'être", "raisons d'être", "reductio ad absurdum", 
    "relative to", "rigor mortis", "roman à clef", "sang froid", "save for", 
    "save that", "savoir faire", "savoir vivre", "seeing as", "seeing that", 
    "semper fidelis", "shin bet", "shish kebab", "shish kebabs", "sine die", 
    "sine qua non", "sinn fein", "so 's", "so as", "so called", "so long as", 
    "so much as", "so that", "some one",
    "son et lumière", "sort of", "sotto voce", "spaghetti bolognese", "spina bifida", 
    "spot on", "sq feet/metres/etc", "sq ft/m/cm/etc.", "status quo", "straight away", 
    "straight forward", "sub judice", "sub poena", "sub rosa", "subject to", 
    "subsequent to", "such as", "such that", "sui generis", "sui juris", 
    "summa cum laude", "super duper", "supposing that", "table d'hôte", "tabula rasa", 
    "tai chi", "tai kwan do", "teeny weeny", "terra firma", "terra incognita", 
    "thanks to", "that is", "that is to say", "through thick and thin", "time and again", 
    "time and again", "tittle tattle", "to and fro", "topsy turvy", "tour de force", 
    "tours de force", "tout court", "tout de suite", "tutti frutti", "ultra vires", 
    "under way", "up front", "up to", "up to date", "up to the minute", "up until", 
    "upside down", "upward of", "upwards of", "vice versa", "vin de table", 
    "vin ordinaire", "vis à vis", "viva voce", "void ab initio", "vol au vent", 
    "vols au vent", "volte face", "vox pop", "vox pops", "vox populi", "well being", 
    "well off", "whether or not", "wiener schnitzel", "wiener schnitzels", 
    "with a view to", "with reference to", "with regard to", "with relation to", 
    "with respect to"
]


In [5]:
def clean_illegal_chars(text):
    return ''.join(c for c in text if c.isprintable() or c in '\n\t')

def is_polyword(tokens):
    phrase = ' '.join([token.text.lower() for token in tokens])
    return phrase in polywords

def process_text_file(input_txt, output_excel):
    with open(input_txt, 'r', encoding='utf-8') as f:
        raw_lines = [(i + 1, line.strip()) for i, line in enumerate(f) if line.strip()]
    
    line_map = {}
    for lineno, line in raw_lines:
        line_map[lineno] = line

    full_text = ' '.join(line for _, line in raw_lines)
    doc = nlp(full_text)

    rows = []
    sent_id = 0
    token_id = 0 
    current_line_idx = 0

    print("Processing...")

    for sent in doc.sents:
        sent_id += 1  
        sentence_tokens = list(sent)  
        
        polyword_flags = [None] * len(sentence_tokens)  

        for token_idx in range(len(sentence_tokens) - 1):  
            potential_polyword = sentence_tokens[token_idx:token_idx + 2]
            if is_polyword(potential_polyword):
                polyword_flags[token_idx] = 'Maybe'
                polyword_flags[token_idx + 1] = 'Maybe'

        for token_idx, token in enumerate(sentence_tokens):
            if token.pos_ == "SPACE":
                continue  
            
            polyword_flag = polyword_flags[token_idx] if polyword_flags[token_idx] else ''

            try:
                token_text = clean_illegal_chars(token.text)
                pos = token.pos_
    
                row = {
                    'sent_id': sent_id,
                    'token_id': token_id + 1,  
                    'token_text': token_text,
                    'pos': pos,
                    'polyword': polyword_flag,  
                    'metaphor': '',
                    'motivation': '',
                    'comment': ''
                }
                rows.append(row)
                token_id += 1
    
            except IllegalCharacterError:
                print(f"\nIllegal character in token '{token.text}' (sentence {sent_id}, token {token_id + 1})")
                print("  Likely source lines nearby:")
                for offset in range(-1, 2):
                    idx = current_line_idx + offset
                    if 0 <= idx < len(raw_lines):
                        print(f"    Line {raw_lines[idx][0]}: {raw_lines[idx][1]}")
                print("  Skipping this token.\n")
                continue

        current_line_idx += 1

    df = pd.DataFrame(rows, columns=[
        'sent_id', 'token_id', 'token_text', 'pos',
        'polyword', 'metaphor', 'motivation', 'comment'
    ])
    df.to_excel(output_excel, index=False)

# process_text_file('epidemic.txt', 'epidemic.xlsx')
# process_text_file('grief_counsellor.txt', 'grief_counsellor.xlsx')
# process_text_file('lung_cancer.txt', 'lung_cancer.xlsx')
# process_text_file('radiation_therapy.txt', 'radiation_therapy.xlsx')
# process_text_file('sniffler.txt', 'sniffler.xlsx')

## Inter-annotator Agreement

In [45]:
grief_counsellor = pd.read_excel("annotations/grief_counsellor.xlsx") 
radiation_therapy = pd.read_excel('annotations/urte_radiation_therapy.xlsx')
immuno1_combined = pd.read_excel('annotations/immuno1_combined.xlsx')
epidemic = pd.read_excel('annotations/epidemic.xlsx')
lung_cancer = pd.read_excel('annotations/lung_cancer.xlsx')

In [55]:
def compute_annotation_agreement(df, annot_cols):
    """
    Computes pairwise Cohen's Kappa scores and Krippendorff's Alpha for given annotation columns.
    
    Parameters:
        df (pd.DataFrame): The input dataframe containing annotations.
        annot_cols (list of str): The columns with annotations.
        
    Returns:
        dict: Dictionary with pairwise kappa scores, average kappa, and Krippendorff's alpha.
    """
    df = df.copy()
    df[annot_cols] = df[annot_cols].replace('_', pd.NA).astype("float")

    df_nonempty = df.dropna(subset=annot_cols, how='all')

    pairs = list(combinations(annot_cols, 2))
    kappas = {}

    for a1, a2 in pairs:
        pair_df = df_nonempty[[a1, a2]].dropna()
        if not pair_df.empty:
            score = cohen_kappa_score(pair_df[a1], pair_df[a2])
            kappas[f"{a1} vs {a2}"] = score

    avg_kappa = sum(kappas.values()) / len(kappas) if kappas else None

    alpha_data = df_nonempty[annot_cols].T.to_numpy()
    alpha = krippendorff.alpha(reliability_data=alpha_data, level_of_measurement='nominal')

    print("\nPairwise Cohen's Kappa scores:")
    for pair, score in kappas.items():
        print(f"{pair}: {score:.3f}")
    if avg_kappa is not None:
        print(f"\nAverage pairwise Cohen’s Kappa: {avg_kappa:.3f}")
    else:
        print("No valid annotation pairs found.")
    print(f"\nKrippendorff’s Alpha: {alpha:.3f}")

    return {
        'pairwise_kappas': kappas,
        'average_kappa': avg_kappa,
        'krippendorff_alpha': alpha
    }


In [57]:
grief_counsellor_results = compute_annotation_agreement(grief_counsellor, ['urte', 'basti', 'meli'])


Pairwise Cohen's Kappa scores:
urte vs meli: 0.583
basti vs meli: 0.629

Average pairwise Cohen’s Kappa: 0.606

Krippendorff’s Alpha: 0.606


In [59]:
radiation_therapy_results = compute_annotation_agreement(radiation_therapy, ['Urte', 'Meli'])


Pairwise Cohen's Kappa scores:
Urte vs Meli: 0.597

Average pairwise Cohen’s Kappa: 0.597

Krippendorff’s Alpha: 0.595


In [61]:
immuno1_combined_results = compute_annotation_agreement(immuno1_combined, ['Urte', 'Melina', 'Basti'])


Pairwise Cohen's Kappa scores:
Urte vs Melina: 0.606
Urte vs Basti: 0.628
Melina vs Basti: 0.647

Average pairwise Cohen’s Kappa: 0.627

Krippendorff’s Alpha: 0.621


In [63]:
epidemic_results = compute_annotation_agreement(epidemic, ['meli', 'basti'])


Pairwise Cohen's Kappa scores:
meli vs basti: 0.692

Average pairwise Cohen’s Kappa: 0.692

Krippendorff’s Alpha: 0.692


In [65]:
lung_cancer_results = compute_annotation_agreement(lung_cancer, ['meli', 'basti'])


Pairwise Cohen's Kappa scores:
meli vs basti: 0.654

Average pairwise Cohen’s Kappa: 0.654

Krippendorff’s Alpha: 0.654


# Preprocessing

In [290]:
file1 = pd.read_csv("i_tried_one_last_time.tsv", sep="\t")
file2 = pd.read_csv("vua_texts_with_mflag.tsv", sep="\t")

file1['is_it_relevant'] = file1['metaphor'].apply(lambda x: 1 if x == 1 else 0)
file1 = file1[['text_id', 'sentence_id', 'token_id', 'token_text', 'metaphor', 'is_it_relevant']]

file2 = file2.rename(columns={'sentence_number': 'sentence_id', 'token': 'token_text'})
file2['token_id'] = file2.groupby(['text_id', 'sentence_id']).cumcount() + 1
file2['is_it_relevant'] = 0  # always 0 for file2
file2 = file2[['text_id', 'sentence_id', 'token_id', 'token_text', 'metaphor', 'is_it_relevant']]

def split_file_by_text_id(df):
    unique_text_ids = df['text_id'].unique()
    train_ids, temp_ids = train_test_split(unique_text_ids, test_size=0.4, random_state=42)
    dev_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)
    
    df_train = df[df['text_id'].isin(train_ids)]
    df_dev = df[df['text_id'].isin(dev_ids)]
    df_test = df[df['text_id'].isin(test_ids)]
    
    return df_train, df_dev, df_test

file1_train, file1_dev, file1_test = split_file_by_text_id(file1)
file2_train, file2_dev, file2_test = split_file_by_text_id(file2)

train_combined = pd.concat([file1_train, file2_train], ignore_index=True)
dev_combined = pd.concat([file1_dev, file2_dev], ignore_index=True)
test_combined = pd.concat([file1_test, file2_test], ignore_index=True)

for df in [train_combined, dev_combined, test_combined]:
    df['text_id'] = df['text_id'].astype(str).str.strip().str.replace('\ufeff', '').str.replace('\u200b', '')

train_combined.to_csv("train_combined.tsv", sep="\t", index=False)
dev_combined.to_csv("dev_combined.tsv", sep="\t", index=False)
test_combined.to_csv("test_combined.tsv", sep="\t", index=False)

def print_split_stats(df, name):
    num_texts = df['text_id'].nunique()
    num_tokens = len(df)
    print(f"{name}: {num_texts} texts, {num_tokens} tokens")

print("\n--- FILE 1 SPLITS ---")
print_split_stats(file1_train, "File1 Train")
print_split_stats(file1_dev, "File1 Dev")
print_split_stats(file1_test, "File1 Test")

print("\n--- FILE 2 SPLITS ---")
print_split_stats(file2_train, "File2 Train")
print_split_stats(file2_dev, "File2 Dev")
print_split_stats(file2_test, "File2 Test")

print("\n--- COMBINED SPLITS ---")
print_split_stats(train_combined, "Combined Train")
print_split_stats(dev_combined, "Combined Dev")
print_split_stats(test_combined, "Combined Test")

  file1 = pd.read_csv("i_tried_one_last_time.tsv", sep="\t")



--- FILE 1 SPLITS ---
File1 Train: 273 texts, 1917792 tokens
File1 Dev: 91 texts, 556296 tokens
File1 Test: 91 texts, 755118 tokens

--- FILE 2 SPLITS ---
File2 Train: 24 texts, 70512 tokens
File2 Dev: 8 texts, 16014 tokens
File2 Test: 9 texts, 19745 tokens

--- COMBINED SPLITS ---
Combined Train: 297 texts, 1988304 tokens
Combined Dev: 99 texts, 572310 tokens
Combined Test: 100 texts, 774863 tokens


In [11]:
train_final = "train.tsv"
dev_final = "dev.tsv"
test_final = "test.tsv"

train_combined = pd.read_csv(train_final, sep="\t")
dev_combined = pd.read_csv(dev_final, sep="\t")
test_combined = pd.read_csv(test_final, sep="\t")

extra_file1 = "data_creation/grief_counselor_final.xlsx" 
extra_file2 = "data_creation/radiation_therapy_final.xlsx"
extra_file3 = "data_creation/wonder_cure_final.xlsx"
extra_file4 = "data_creation/lung_cancer_final.xlsx"
extra_file5 = "data_creation/epidemic_final.xlsx"

existing_text_ids = set(pd.concat([train_combined, dev_combined, test_combined])['text_id'].unique())

def add_extra_file_to_combined(extra_file, output_file, existing_text_ids=None, starting_extra_id=1):
    try:
        combined = pd.read_csv(output_file, sep="\t")
    except FileNotFoundError:
        combined = pd.DataFrame(columns=['text_id', 'sentence_id', 'token_id', 'token_text', 'metaphor', 'is_it_relevant'])

    if existing_text_ids is None:
        existing_text_ids = set(combined['text_id'].unique())

    next_extra_id = starting_extra_id

    def generate_unique_text_id(existing_ids):
        nonlocal next_extra_id
        new_id = f"extra_{next_extra_id:03}"
        while new_id in existing_ids:
            next_extra_id += 1
            new_id = f"extra_{next_extra_id:03}"
        existing_ids.add(new_id)
        return new_id

    df = pd.read_excel(extra_file)

    df_clean = df[['sentence_id', 'token_id', 'token_text', 'FINAL', 'is_it_relevant']].copy()
    df_clean = df_clean.rename(columns={'FINAL': 'metaphor'})

    df_clean['metaphor'] = df_clean['metaphor'].apply(lambda x: 1 if str(x).strip() == '1' else 0).astype(int)

    df_clean['is_it_relevant'] = df_clean['is_it_relevant'].fillna(0).astype(int)

    new_text_id = generate_unique_text_id(existing_text_ids)
    df_clean['text_id'] = new_text_id

    df_clean = df_clean[['text_id', 'sentence_id', 'token_id', 'token_text', 'metaphor', 'is_it_relevant']]

    combined = pd.concat([combined, df_clean], ignore_index=True)
    combined.to_csv(output_file, sep="\t", index=False)
    print(f"Added '{extra_file}' as '{new_text_id}' into '{output_file}'")

    return combined, existing_text_ids


# train_combined, existing_text_ids = add_extra_file_to_combined(extra_file1, train_final, 
#                                                                existing_text_ids=existing_text_ids, starting_extra_id=1)
# train_combined, existing_text_ids = add_extra_file_to_combined(extra_file2, train_final, 
#                                                                existing_text_ids=existing_text_ids, starting_extra_id=1)
# test_combined, existing_text_ids = add_extra_file_to_combined(extra_file3, test_final, 
#                                                                existing_text_ids=existing_text_ids, starting_extra_id=1)
test_combined, existing_text_ids = add_extra_file_to_combined(extra_file4, test_final, 
                                                               existing_text_ids=existing_text_ids, starting_extra_id=1)
test_combined, existing_text_ids = add_extra_file_to_combined(extra_file5, test_final, 
                                                               existing_text_ids=existing_text_ids, starting_extra_id=1)

  train_combined = pd.read_csv(train_final, sep="\t")
  dev_combined = pd.read_csv(dev_final, sep="\t")
  test_combined = pd.read_csv(test_final, sep="\t")
  combined = pd.read_csv(output_file, sep="\t")


Added 'data_creation/lung_cancer_final.xlsx' as 'extra_004' into 'test.tsv'


  combined = pd.read_csv(output_file, sep="\t")


Added 'data_creation/epidemic_final.xlsx' as 'extra_005' into 'test.tsv'


## Full-text datasets

In [5]:
dataset = '../data/20221220 Metaforen IT_DEF.xlsx'

In [15]:
def is_similar(a, b, threshold=85):
    return ratio(a, b) >= threshold

def remove_whitespace(text):
    return ''.join(text.split())

@lru_cache(maxsize=512)
def find_full_text(fragment, txt_folders):
    """
    Finds a full text containing the given fragment, ignoring whitespace differences.
    """
    fragment_clean = remove_whitespace(fragment.strip())
    if not fragment_clean:
        return None

    for folder in txt_folders:
        for filename in os.listdir(folder):
            if filename.endswith('.txt'):
                filepath = os.path.join(folder, filename)
                with open(filepath, 'r', encoding='utf-8') as f:
                    full_text = f.read()
                    full_text_clean = remove_whitespace(full_text)

                    if fragment_clean in full_text_clean:
                        return full_text

                    words = full_text.split()
                    frag_words = fragment.split()
                    window_size = len(frag_words)

                    for i in range(len(words) - window_size + 1):
                        window = ' '.join(words[i:i+window_size])
                        window_clean = remove_whitespace(window)

                        if abs(len(window_clean) - len(fragment_clean)) > 20:
                            continue

                        if is_similar(fragment_clean, window_clean):
                            return full_text
    return None


def preprocess_excel_and_texts(input_excel, txt_folders, output_tsv):
    df = pd.read_excel(input_excel)

    rows = []
    token_id = 1  

    for text_num, row in tqdm(enumerate(df.itertuples(index=False), start=1), total=len(df), desc="Processing texts"):
        fragment = str(row.Text1).strip() if pd.notna(row.Text1) else ''
        metaphor_word = str(row.Metafoor).strip().lower() if pd.notna(row.Metafoor) else None

        full_text = find_full_text(fragment, tuple(txt_folders))
        if full_text is None:
            print(f"Warning: Fragment not found for text {text_num}. Skipping.")
            continue

        doc = nlp(full_text)

        for sent_id, sent in enumerate(doc.sents, start=1):
            for token in sent:
                if token.is_space:
                    continue

                token_text = token.text
                is_metaphor = 1 if metaphor_word and token_text.lower() == metaphor_word else 0

                rows.append({
                    'text_id': text_num,
                    'sentence_id': sent_id,
                    'token_id': token_id,
                    'token_text': token_text,
                    'metaphor': is_metaphor,
                    'Categorie': row.Categorie if is_metaphor else '',
                    'Domein1': row.Domein1 if is_metaphor else '',
                    'Domein2': row.Domein2 if is_metaphor else ''
                })
                token_id += 1

    df_out = pd.DataFrame(rows)
    df_out.to_csv(output_tsv, sep='\t', index=False)
    print(f"Preprocessing complete! Saved to {output_tsv}")


In [None]:
txt_folders = ['../data/Corpus_IT_UK_news/Corpus_IT_UK_news', '../data/Dataset wetenschappelijke artikelen txt/txt']
preprocess_excel_and_texts('../data/20221220 Metaforen IT_DEF.xlsx', txt_folders, 'full_texts_recall.tsv')

## Sentence-level datasets

In [78]:
def normalize_tokens_with_hyphen_handling(tokens):
    merged_tokens = []
    merged_indices = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        if i + 1 < len(tokens) and token.endswith('-') and len(token) > 1:
            next_token = tokens[i + 1]
            if next_token.isalpha() and next_token[0].islower():
                merged_tokens.append(token[:-1] + next_token)
                merged_indices.append([i, i + 1])
                i += 2
                continue
        merged_tokens.append(token)
        merged_indices.append([i])
        i += 1
    return merged_tokens, merged_indices

def clean_token_text(token):
    token = token.replace('\u201c', '"').replace('\u201d', '"')  
    token = token.replace('\u2018', "'").replace('\u2019', "'")  

    token = re.sub(r'[^\w\s\.\,\:\;\-\(\)\'\"\/\[\]\{\}\!\?\u00B0]', '', token, flags=re.UNICODE)

    token = re.sub(r'[\u00A0\u200B\u200C\u200D\uFEFF]', '', token)

    return token

def preprocess_sentences(row, text_num, token_id_start):
    rows = []
    text1 = str(row.Text1)
    text2 = str(row.Text2)

    sent1 = list(nlp(text1).sents)[-1].text if list(nlp(text1).sents) else ''
    sent2 = list(nlp(text2).sents)[0].text if list(nlp(text2).sents) else ''
    full_sent = sent1 + " " + sent2
    doc = nlp(full_sent)

    tokens = [t.text for t in doc if not t.is_space]
    norm_tokens, norm_to_orig = normalize_tokens_with_hyphen_handling(tokens)

    norm_tokens = [clean_token_text(t) for t in norm_tokens]

    metaphors_raw = str(row.Metafoor)
    metaphors = [m.strip().lower() for m in metaphors_raw.split(',') if m.strip()]

    if not any(
        any(metaphor in nt.lower() for nt in norm_tokens)
        for metaphor in metaphors
    ):
        print(f"Metaphor(s) '{', '.join(metaphors)}' not found in row {text_num}. Skipping.")
        return rows, token_id_start

    metaphor_token_indices = set()
    for metaphor in metaphors:
        metaphor_parts = metaphor.split()
        for idx in range(len(norm_tokens) - len(metaphor_parts) + 1):
            window = norm_tokens[idx:idx + len(metaphor_parts)]
            if [t.lower() for t in window] == metaphor_parts:
                for offset in range(len(metaphor_parts)):
                    metaphor_token_indices.add(idx + offset)

    token_id = token_id_start
    sentence_id = 1
    for idx, norm_token in enumerate(norm_tokens):
        token_text = ' '.join(tokens[i] for i in norm_to_orig[idx])
        token_text = clean_token_text(token_text) 
        is_metaphor = 1 if idx in metaphor_token_indices else 0
        rows.append({
            'text_id': text_num,
            'sentence_id': sentence_id,
            'token_id': token_id,
            'token_text': token_text,
            'metaphor': is_metaphor,
            'Categorie': row.Categorie if is_metaphor else '',
            'Domein1': row.Domein1 if is_metaphor else '',
            'Domein2': row.Domein2 if is_metaphor else ''
        })
        token_id += 1

    return rows, token_id

def process_excel(input_excel, output_tsv):
    df = pd.read_excel(input_excel)
    all_rows = []
    token_id = 1

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
        rows, token_id = preprocess_sentences(row, idx + 1, token_id)
        all_rows.extend(rows)

    out_df = pd.DataFrame(all_rows)
    out_df.to_csv(output_tsv, sep='\t', index=False)
    print(f"Saved processed file to {output_tsv}")


In [None]:
process_excel('../data/20221220 Metaforen IT_DEF.xlsx', 'sentences_immuno.tsv')

## Find sentences with metaphors in VU corpus

In [45]:
def extract_metaphor_sentences(xml_path, output_tsv_with_mflag, output_tsv_without_mflag, direct_metaphor_limit=40):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    namespace = {'tei': 'http://www.tei-c.org/ns/1.0'}
    
    texts = root.findall('.//tei:text', namespace)
    
    sentences_with_mflag = []
    sentences_without_mflag = []
    
    for text in texts:
        text_id = text.attrib.get('{http://www.w3.org/XML/1998/namespace}id', '').strip()
        if not text_id:
            continue
        
        for s in text.findall('.//tei:s', namespace):
            sentence_number = s.attrib.get('n', '').strip()
            tokens = []
            has_mflag = False
            
            for word in s.findall('tei:w', namespace):
                seg_els = word.findall('tei:seg', namespace)
                
                token_text = None
                for seg_el in seg_els:
                    if seg_el.text and seg_el.text.strip():
                        token_text = seg_el.text.strip()
                        break
                if not token_text:
                    token_text = word.text.strip() if word.text else ''
                
                lemma = word.attrib.get('lemma', '').strip()
                
                mflag = 0
                for seg in seg_els:
                    if seg.attrib.get('function') == 'mFlag':
                        mflag = 1
                        has_mflag = True
                
                tokens.append({
                    'text_id': text_id,
                    'sentence_number': sentence_number,
                    'token_text': token_text,
                    'lemma': lemma,
                    'mflag': mflag,
                    'seg_els': seg_els
                })
            
            if has_mflag:
                sentences_with_mflag.append(tokens)
            else:
                sentences_without_mflag.append(tokens)
    
    for tokens in sentences_with_mflag:
        for token in tokens:
            token['metaphor'] = 0
            for seg in token['seg_els']:
                if seg.attrib.get('function') == 'mrw' and seg.attrib.get('type') == 'lit':
                    token['metaphor'] = 1
                    break
            del token['seg_els']
    
    selected_sentences_without_mflag = []
    metaphor_sentence_count = 0
    
    for tokens in sentences_without_mflag:
        metaphor_candidates = []
        for idx, token in enumerate(tokens):
            token['metaphor'] = 0
            for seg in token['seg_els']:
                if seg.attrib.get('function') == 'mrw' and seg.attrib.get('type') == 'met':
                    metaphor_candidates.append(idx)
        
        if metaphor_candidates:
            if metaphor_sentence_count < direct_metaphor_limit:
                chosen_idx = random.choice(metaphor_candidates)
                tokens[chosen_idx]['metaphor'] = 1
                selected_sentences_without_mflag.append(tokens)
                metaphor_sentence_count += 1
        
        for token in tokens:
            if 'seg_els' in token:
                del token['seg_els']
    
    print(f"Sentences WITH mflag: {len(sentences_with_mflag)}")
    print(f"Sentences WITHOUT mflag selected (limit {direct_metaphor_limit}): {len(selected_sentences_without_mflag)}")
    print(f"Total direct metaphors marked (mrw + type=met) in sentences without mflag: {metaphor_sentence_count}")
    
    rows_with = []
    sentence_id = 0
    for sent_tokens in sentences_with_mflag:
        sentence_id += 1
        for token in sent_tokens:
            rows_with.append({
                'sentence_id': sentence_id,
                'sentence_number': token['sentence_number'],
                'token_text': token['token_text'],
                'lemma': token['lemma'],
                'metaphor': token['metaphor'],
                'mflag': token['mflag']
            })
    df_with = pd.DataFrame(rows_with)
    df_with.to_csv(output_tsv_with_mflag, sep='\t', index=False)
    
    rows_without = []
    sentence_id = 0
    for sent_tokens in selected_sentences_without_mflag:
        sentence_id += 1
        for token in sent_tokens:
            rows_without.append({
                'sentence_id': sentence_id,
                'sentence_number': token['sentence_number'],
                'token_text': token['token_text'],
                'lemma': token['lemma'],
                'metaphor': token['metaphor'],
                'mflag': token['mflag']
            })
    df_without = pd.DataFrame(rows_without)
    df_without.to_csv(output_tsv_without_mflag, sep='\t', index=False)
    
    print(f'[DONE] Saved {len(sentences_with_mflag)} sentences WITH mflag to {output_tsv_with_mflag}')
    print(f'[DONE] Saved {len(selected_sentences_without_mflag)} sentences WITHOUT mflag to {output_tsv_without_mflag}')

extract_metaphor_sentences(
    '../data/vu dataset/VUAMC.xml',
    'metaphors_with_mflag.tsv',
    'metaphors_without_mflag.tsv',
    direct_metaphor_limit=90
)

Sentences WITH mflag: 120
Sentences WITHOUT mflag selected (limit 90): 90
Total direct metaphors marked (mrw + type=met) in sentences without mflag: 90
[DONE] Saved 120 sentences WITH mflag to metaphors_with_mflag.tsv
[DONE] Saved 90 sentences WITHOUT mflag to metaphors_without_mflag.tsv


## Split

### Creation of train, dev, test splits of sentence level files

In [119]:
df_train_small = pd.read_csv("sentences_immuno.tsv", sep="\t") 

In [121]:
print(df_train_small.columns)

Index(['text_id', 'sentence_id', 'token_id', 'token_text', 'metaphor',
       'Categorie', 'Domein1', 'Domein2'],
      dtype='object')


In [51]:
unique_text_ids = df_train_small['text_id'].unique().tolist()

random.seed(42)
random.shuffle(unique_text_ids)

n_total = len(unique_text_ids)
n_train = int(n_total * 0.6)
n_dev = int(n_total * 0.2)
n_test = n_total - n_train - n_dev 

train_ids = unique_text_ids[:n_train]
dev_ids = unique_text_ids[n_train:n_train + n_dev]
test_ids = unique_text_ids[n_train + n_dev:]

train_small = df_train_small[df_train_small['text_id'].isin(train_ids)]
dev_small = df_train_small[df_train_small['text_id'].isin(dev_ids)]
test_small = df_train_small[df_train_small['text_id'].isin(test_ids)]

train_small.to_csv("train_small.tsv", sep="\t", index=False)
dev_small.to_csv("dev_small.tsv", sep="\t", index=False)
test_small.to_csv("test_small.tsv", sep="\t", index=False)


In [55]:
train_small = pd.read_csv("train_small.tsv", sep="\t")
dev_small = pd.read_csv("dev_small.tsv", sep="\t")
test_small = pd.read_csv("test_small.tsv", sep="\t")

for df in [train_small, dev_small, test_small]:
    if 'sentence_id' in df.columns:
        df.rename(columns={'sentence_id': 'sentence_number'}, inplace=True)

keep_cols = ['text_id', 'sentence_number', 'token_text', 'metaphor']
train_small = train_small[keep_cols]
dev_small = dev_small[keep_cols]
test_small = test_small[keep_cols]

train_small['is_it_relevant'] = (train_small['metaphor'] == 1).astype(int)
dev_small['is_it_relevant'] = (dev_small['metaphor'] == 1).astype(int)
test_small['is_it_relevant'] = (test_small['metaphor'] == 1).astype(int)

mflag_df = pd.read_csv("metaphors_with_mflag.tsv", sep="\t")
nomflag_df = pd.read_csv("metaphors_without_mflag.tsv", sep="\t")

mflag_df.rename(columns={'sentence_id': 'sentence_number'}, inplace=True)
nomflag_df.rename(columns={'sentence_id': 'sentence_number'}, inplace=True)

mflag_df = mflag_df[keep_cols]
nomflag_df = nomflag_df[keep_cols]

mflag_df['is_it_relevant'] = 0
nomflag_df['is_it_relevant'] = 0

max_text_id = max(
    train_small['text_id'].max(),
    dev_small['text_id'].max(),
    test_small['text_id'].max()
)
max_sentence_number = max(
    train_small['sentence_number'].max(),
    dev_small['sentence_number'].max(),
    test_small['sentence_number'].max()
)

mflag_df['text_id'] += max_text_id + 1
mflag_df['sentence_number'] += max_sentence_number + 1

nomflag_df['text_id'] += max_text_id + 10001
nomflag_df['sentence_number'] += max_sentence_number + 10001

def split_by_sentence(df, proportions, seed=42):
    random.seed(seed)
    sentence_ids = df['sentence_number'].unique().tolist()
    random.shuffle(sentence_ids)

    n = len(sentence_ids)
    n_train = int(n * proportions[0])
    n_dev = int(n * proportions[1])

    train_ids = sentence_ids[:n_train]
    dev_ids = sentence_ids[n_train:n_train + n_dev]
    test_ids = sentence_ids[n_train + n_dev:]

    return (
        df[df['sentence_number'].isin(train_ids)],
        df[df['sentence_number'].isin(dev_ids)],
        df[df['sentence_number'].isin(test_ids)]
    )

total_sentences = sum([
    train_small['sentence_number'].nunique(),
    dev_small['sentence_number'].nunique(),
    test_small['sentence_number'].nunique()
])

proportions = [
    train_small['sentence_number'].nunique() / total_sentences,
    dev_small['sentence_number'].nunique() / total_sentences
]

mflag_train, mflag_dev, mflag_test = split_by_sentence(mflag_df, proportions)
nomflag_train, nomflag_dev, nomflag_test = split_by_sentence(nomflag_df, proportions)

train_all = pd.concat([train_small, mflag_train, nomflag_train])
dev_all = pd.concat([dev_small, mflag_dev, nomflag_dev])
test_all = pd.concat([test_small, mflag_test, nomflag_test])

train_all.to_csv("train_small_updated.tsv", sep="\t", index=False)
dev_all.to_csv("dev_small_updated.tsv", sep="\t", index=False)
test_all.to_csv("test_small_updated.tsv", sep="\t", index=False)

print(f"[DONE] Updated datasets saved: {len(train_all)} train, {len(dev_all)} dev, {len(test_all)} test tokens.")

[DONE] Updated datasets saved: 11189 train, 5316 dev, 4967 test tokens.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_small['is_it_relevant'] = (test_small['metaphor'] == 1).astype(int)


### Duplicating extra files to add to sentence-level datasets

In [139]:
def clean_token_text(token):
    if not isinstance(token, str):
        return ''
    
    token = token.replace('\u201c', '"').replace('\u201d', '"')  
    token = token.replace('\u2018', "'").replace('\u2019', "'")  
    
    try:
        token = token.encode('latin1').decode('utf-8')
    except:
        pass
    
    token = ''.join(c for c in token if c.isprintable() or c in '\n\t')
    
    return token


extra_files = [
    "data_creation/grief_counselor_final.xlsx",
    "data_creation/radiation_therapy_final.xlsx",
    "data_creation/wonder_cure_final.xlsx",
    "data_creation/epidemic_final.xlsx",
    "data_creation/lung_cancer_final.xlsx"
]

output_rows = []

for file in extra_files:
    print(f"Processing {file}...")
    df = pd.read_excel(file)

    df['FINAL'] = df['FINAL'].fillna(0).astype(int)
    if 'is_it_relevant' not in df.columns:
        df['is_it_relevant'] = df['FINAL']
    else:
        df['is_it_relevant'] = df['is_it_relevant'].fillna(0).astype(int)

    df['token_text'] = df['token_text'].apply(clean_token_text)
    df = df.rename(columns={'FINAL': 'metaphor'})

    for sent_id, group in df.groupby("sentence_id"):
        group = group.sort_values("token_id").copy().reset_index(drop=True)

        metaphor_spans = []
        current_span = []

        for i, row in group.iterrows():
            if row["metaphor"] == 1:
                current_span.append(i)
            elif current_span:
                metaphor_spans.append(current_span)
                current_span = []
        if current_span:
            metaphor_spans.append(current_span)

        if not metaphor_spans:
            continue  

        for idx, span in enumerate(metaphor_spans):
            duplicated = group.copy()

            duplicated["metaphor"] = 0
            duplicated["is_it_relevant"] = 0

            duplicated.loc[span, "metaphor"] = 1
            duplicated.loc[span, "is_it_relevant"] = group.loc[span, "is_it_relevant"]

            duplicated["text_id"] = f"extra_{file.split('/')[-1].split('_')[0]}_{sent_id}_{idx:02}"
            output_rows.append(duplicated)

if output_rows:
    final_df = pd.concat(output_rows, ignore_index=True)
    final_df = final_df[["text_id", "sentence_id", "token_text", "pos", "metaphor", "is_it_relevant"]]
    final_df.to_csv("extra_sentences.tsv", sep="\t", index=False)
    print(f"\nSaved duplicated sentences with metaphors to 'extra_sentences.tsv'")
else:
    print("No metaphor spans found in any file.")


Processing data_creation/grief_counselor_final.xlsx...
Processing data_creation/radiation_therapy_final.xlsx...
Processing data_creation/wonder_cure_final.xlsx...
Processing data_creation/epidemic_final.xlsx...
Processing data_creation/lung_cancer_final.xlsx...

Saved duplicated sentences with metaphors to 'extra_sentences.tsv'


### Creation of train, dev, test splits of complete text files

In [86]:
df_train = pd.read_csv("full_texts_recall.tsv", sep="\t") 

  df_train = pd.read_csv("i_tried_one_more_time.tsv", sep="\t")


In [31]:
unique_text_ids = df_train['text_id'].unique().tolist()

random.seed(42)
random.shuffle(unique_text_ids)

n_total = len(unique_text_ids)
n_train = int(n_total * 0.6)
n_dev = int(n_total * 0.2)
n_test = n_total - n_train - n_dev 

train_ids = unique_text_ids[:n_train]
dev_ids = unique_text_ids[n_train:n_train + n_dev]
test_ids = unique_text_ids[n_train + n_dev:]

train = df_train[df_train['text_id'].isin(train_ids)]
dev = df_train[df_train['text_id'].isin(dev_ids)]
test = df_train[df_train['text_id'].isin(test_ids)]

train.to_csv("train.tsv", sep="\t", index=False)
dev.to_csv("dev.tsv", sep="\t", index=False)
test.to_csv("test.tsv", sep="\t", index=False)


### Combined sentence-level datasets

In [3]:
train_file = '../train_small_updated.tsv'
dev_file = '../dev_small_updated.tsv'
test_file = '../test_small_updated.tsv'

In [4]:
anno_file = '../test_sentences_anno.xlsx'

In [5]:
train = pd.read_csv(train_file, sep="\t", header=0)
dev = pd.read_csv(dev_file, sep="\t", header=0)
test = pd.read_csv(test_file, sep="\t", header=0)
anno = pd.read_excel(anno_file, header=0)

In [6]:
column_names = [
        "text_id",
        "sentence_number",
        "token_text",
        "metaphor",
        "is_it_relevant",
    ]

dev_df = pd.read_csv(
        dev_file,
        sep="\t",
        names=column_names,
        encoding="utf-8",
        header=0,
        na_filter=False, 
    )

train_df = pd.read_csv(
        train_file,
        sep="\t",
        names=column_names,
        encoding="utf-8",
        header=0,
        na_filter=False,
    )

test_df = pd.read_csv(
        test_file,
        sep="\t",
        names=column_names,
        encoding="utf-8",
        header=0,
        na_filter=False,
    )



In [7]:
def convert_token_level_to_sentence_level(tsv_path, output_path=None):
    """
    Convert token-level metaphor annotations into sentence-level,
    duplicating the sentence for each metaphor token that passes POS filtering.

    Parameters:
    - tsv_path: path to input TSV
    - output_path: path to save output TSV (optional)

    Returns:
    - pd.DataFrame with columns ['text_id' (if available), 'sentence_id', 'sentence_tokens',
      'target_token_index', 'is_metaphor', 'is_it_relevant', 'pos']
    """

    _, ext = os.path.splitext(tsv_path)
    if ext.lower() in ['.xls', '.xlsx']:
        df = pd.read_excel(tsv_path)
    elif ext.lower() in ['.tsv', '.txt']:
        df = pd.read_csv(tsv_path, sep='\t')
    else:
        raise ValueError("Unsupported file format.")

    sentence_col = 'sentence_number' if 'sentence_number' in df.columns else 'sentence_id'
    metaphor_col = 'metaphor' if 'metaphor' in df.columns else 'FINAL'
    text_id_available = 'text_id' in df.columns

    required_columns = {sentence_col, metaphor_col, 'token_text', 'is_it_relevant'}
    if not required_columns.issubset(df.columns):
        missing = required_columns - set(df.columns)
        raise ValueError(f"Missing required columns: {missing}")

    group_cols = ['text_id', sentence_col] if text_id_available else [sentence_col]

    sentence_level_data = []
    grouped = df.groupby(group_cols, sort=False)

    for group_keys, group in grouped:
        tokens = [
            token if pd.notna(token) and str(token).strip() != '' else '[empty]'
            for token in group['token_text'].tolist()
        ]
        metaphors = group[metaphor_col].tolist()
        relevances = group['is_it_relevant'].tolist()

        for i, is_metaphor in enumerate(metaphors):
            if is_metaphor == 1:
                sentence_info = {
                    sentence_col: group[sentence_col].iloc[0],
                    'sentence_tokens': tokens,
                    'target_token_index': i,
                    'is_metaphor': 1,
                    'is_it_relevant': int(relevances[i]),
                    # 'pos': pos_tag,
                }
                if text_id_available:
                    sentence_info['text_id'] = group['text_id'].iloc[0]

                sentence_level_data.append(sentence_info)

    result_df = pd.DataFrame(sentence_level_data)

    if output_path is not None:
        result_df.to_csv(output_path, sep='\t', index=False)

    return result_df


test_df_sentence = convert_token_level_to_sentence_level(test_file, 'test_sentences.tsv')
train_df_sentence = convert_token_level_to_sentence_level(train_file, 'train_sentences.tsv')
dev_df_sentence = convert_token_level_to_sentence_level(dev_file, 'dev_sentences.tsv')
anno_df_sentence = convert_token_level_to_sentence_level(anno_file, 'anno_sentences.tsv')