In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import re

In [10]:
def clean_text(text):
    """Normalize whitespace in ``text``.

    Every run of one or more whitespace characters is replaced by a single
    space, then leading and trailing whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Parse the corpus file and grab the document root.
tree = ET.parse("VUAMC.xml")
root = tree.getroot()

# Prefix map so the "tei:" paths below resolve to the TEI namespace.
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

# One (sentence, label, metaphors) tuple per <s> element.
data = []

for s in root.iterfind('.//tei:s', ns):
    # Whole sentence as plain text, whitespace-normalized.
    sentence_text = clean_text(''.join(s.itertext()))

    # Cleaned text of each <seg function="mrw" type="met"> that is a direct
    # child of any <w> inside this sentence.
    marked_segments = (
        clean_text(''.join(seg.itertext()))
        for w in s.iterfind('.//tei:w', ns)
        for seg in w.iterfind('tei:seg', ns)
        if seg.get("function") == "mrw" and seg.get("type") == "met"
    )
    # Segments that are empty after cleaning are dropped.
    metaphors = [segment for segment in marked_segments if segment]

    # 1 when the sentence has at least one marked segment, otherwise 0.
    label = int(bool(metaphors))
    data.append((sentence_text, label, ';'.join(metaphors)))

# Persist the labeled sentences as CSV.
df = pd.DataFrame(data, columns=["sentence", "label", "metaphors"])
df.to_csv("VUAMC_sentences_labeled.csv", index=False, encoding="utf-8")

print(f"Sentences: {len(data)}.")

Sentences: 16202.


In [None]:
# Reload the labeled sentences written to disk.
# keep_default_na=False disables pandas' default NA parsing: empty "metaphors"
# cells stay "" (instead of NaN, which is truthy and would defeat the
# `and metaphors` guard below), and literal text such as "NA", "None" or
# "null" in a sentence is kept as-is rather than being turned into NaN.
df = pd.read_csv("VUAMC_sentences_labeled.csv", keep_default_na=False)

# Preview the first ten rows as "[label] sentence"; rows labeled 1 also show
# their ';'-separated metaphor list.
for i, row in df.head(10).iterrows():
    label = row["label"]
    sentence = row["sentence"]
    metaphors = row["metaphors"]

    print(f"[{label}] {sentence}")
    if label == 1 and metaphors:
        print(f"\tMetaphors: {metaphors}")
    print()

[1] Latest corporate unbundler reveals laid-back approach : Roland Franklin , who is leading a 697m pound break-up bid for DRG , talks to Frank Kane
	Metaphors: reveals;approach;leading;to

[0] By FRANK KANE

[1] IT SEEMS that Roland Franklin , the latest unbundler to appear in the UK , has made a fatal error in the preparation of his £697m break-up bid for stationery and packaging group DRG .
	Metaphors: made;fatal;in

[1] He has not properly investigated the target 's dining facilities .
	Metaphors: target

[1] The 63-year-old head of Pembridge Investments , through which the bid is being mounted says , ‘ rule number one in this business is : the more luxurious the luncheon rooms at headquarters , the more inefficient the business ’ .
	Metaphors: head;through;mounted;rule;in;this;headquarters

[1] If he had taken his own rule seriously , he would have found out that DRG has a very modest self-service canteen at its Bristol head office .
	Metaphors: taken;modest;head

[1] There are ot