# Imports

In [20]:


import kagglehub
import pandas as pd
import spacy
import uuid

# Setup

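Download the spaCy English pipeline (`en_core_web_sm`), which provides the tokenizer, POS tagger and dependency parser used below.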

In [2]:
# Shell escape: install the spaCy model package that is later loaded with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Load Data

In [3]:
# Download the latest version of the dataset from Kaggle Hub
path = kagglehub.dataset_download("daishinkan002/new-york-times-relation-extraction-dataset")
train_path = f"{path}/dataset/train.json"
test_path = f"{path}/dataset/test.json"
valid_path = f"{path}/dataset/valid.json"

# The files are JSON Lines (one JSON object per line), hence lines=True.
df_train = pd.read_json(train_path, lines=True)
df_valid = pd.read_json(valid_path, lines=True)
df_test = pd.read_json(test_path, lines=True)

# Create small subsets for quick experiments.
# A fixed seed makes the subsets (and every file derived from them) identical
# after "Restart Kernel -> Run All"; capping n avoids a ValueError when a
# split holds fewer rows than the requested sample size.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42
df_train = df_train.sample(n=min(SAMPLE_SIZE, len(df_train)), random_state=SAMPLE_SEED)
df_valid = df_valid.sample(n=min(SAMPLE_SIZE, len(df_valid)), random_state=SAMPLE_SEED)
df_test = df_test.sample(n=min(SAMPLE_SIZE, len(df_test)), random_state=SAMPLE_SEED)

# Preprocessing

In [4]:
def get_dep_path(doc, span1, span2):
    """Describe the dependency path between the roots of two spans as a string.

    The path climbs from ``span1.root`` to the lowest common ancestor of both
    roots (each step rendered as ``<-dep``) and then descends to ``span2.root``
    (each step rendered as ``->dep``). The common ancestor itself contributes
    no label.

    Args:
        doc: Not used by this function.
        span1: Object exposing ``.root``; start of the path.
        span2: Object exposing ``.root``; end of the path.

    Returns:
        The concatenated step labels; an empty string when both roots coincide.

    Raises:
        StopIteration: If the two roots share no ancestor.
    """

    def chain_to_root(token):
        # A token that is its own head marks the top of the tree.
        chain = [token]
        while token != token.head:
            token = token.head
            chain.append(token)
        return chain

    def labels_before(chain, stop, arrow):
        # Collect one label per node until `stop` is reached (exclusive).
        labels = []
        for node in chain:
            if node != stop:
                labels.append(f"{arrow}{node.dep_}")
            else:
                break
        return labels

    head_chain = chain_to_root(span1.root)
    tail_chain = chain_to_root(span2.root)

    # First ancestor of span1's root that also sits above span2's root.
    lca = next(node for node in head_chain if node in tail_chain)

    ascent = labels_before(head_chain, lca, "<-")
    descent = labels_before(tail_chain, lca, "->")
    descent.reverse()

    return "".join(ascent + descent)

In [5]:
def find_span(doc, entity):
    """Locate the first occurrence of ``entity`` in ``doc.text`` as a span.

    Args:
        doc: Object exposing ``.text`` and ``.char_span(...)``.
        entity: Surface string to search for.

    Returns:
        ``None`` when ``entity`` does not occur in the text; otherwise whatever
        ``doc.char_span`` returns for the matched character range with
        ``alignment_mode="expand"``.
    """
    begin = doc.text.find(entity)
    if begin < 0:  # str.find signals "not found" with -1
        return None
    return doc.char_span(begin, begin + len(entity), alignment_mode="expand")

In [23]:
def dep_path_tokens(doc, t1, t2):
    """Return the tokens on the dependency path from ``t1`` to ``t2``.

    The result starts at ``t1``, climbs to the lowest common ancestor
    (included once) and then descends to ``t2``. An empty list is returned
    when the two tokens have no ancestor in common. ``doc`` is not used.
    """

    def climb(node):
        # Follow .head links until a token that is its own head is reached.
        chain = []
        while node is not None:
            chain.append(node)
            parent = node.head
            if node == parent:
                break
            node = parent
        return chain

    def nodes_before(chain, stop):
        # Prefix of `chain` strictly before `stop`.
        prefix = []
        for node in chain:
            if node != stop:
                prefix.append(node)
            else:
                break
        return prefix

    from_t1 = climb(t1)
    from_t2 = climb(t2)

    shared = set(from_t1).intersection(from_t2)
    if not shared:
        return []  # no path found

    # The first shared ancestor seen while climbing from t1 is the lowest one.
    lca = next(node for node in from_t1 if node in shared)

    ascent = nodes_before(from_t1, lca)
    ascent.append(lca)

    descent = nodes_before(from_t2, lca)
    descent.reverse()

    return ascent + descent


def get_trigger(doc, dep_path_tokens):
    """Return the text of the first path token tagged NOUN or VERB, else ``None``.

    ``doc`` is not used; ``dep_path_tokens`` is an iterable of objects exposing
    ``.pos_`` and ``.text``.
    """
    trigger_pos = {"NOUN", "VERB"}
    candidates = (tok.text for tok in dep_path_tokens if tok.pos_ in trigger_pos)
    return next(candidates, None)


In [24]:

# TODO: some relationships appear twice in the output .txt data!

def preprocess_kaggle_df(df) -> pd.DataFrame:
    """
    Flatten the NYT relation data into one row per relation mention.

    Every sentence of the NYT data set can contain more than one relationship.
    This function

    1.) splits them up and creates a pandas.DataFrame with one row for every
        mentioned relation whose two entities both have an entity type in the
        sentence's ``entityMentions``. Relations without such types are omitted.

    2.) tokenizes the sentence (tokens joined by single spaces),

    3.) creates the POS tags (joined by single spaces),

    4.) derives the dependency path between both entities and a trigger word
        (first NOUN/VERB token on that path).

    A relation mention is also dropped when one of its entities cannot be
    located in the sentence text, when no dependency path connects the two
    entities, or when the path contains no trigger.

    Args:
        df: DataFrame with the columns ``sentText``, ``relationMentions``
            (dicts with ``em1Text``, ``em2Text``, ``label``) and
            ``entityMentions`` (dicts with ``text``, ``label``).

    Returns:
        DataFrame with the columns ``deppath``, ``entity_head``,
        ``entity_tail``, ``enttypes``, ``trigger``, ``fname``,
        ``tokenized_text``, ``POS`` and ``relationship``. ``fname`` is a
        freshly generated random UUID per row.
    """

    nlp = spacy.load("en_core_web_sm")

    columns = [
        'deppath',
        'entity_head',
        'entity_tail',
        'enttypes',
        'trigger',
        'fname',
        'tokenized_text',
        "POS",
        'relationship',
    ]

    # Rows are collected in a list and turned into a frame once at the end;
    # appending to a DataFrame row by row is quadratic.
    records = []

    # Number of relation mentions dropped because no trigger was found.
    null_trigger = 0

    for row in df.itertuples():

        # Tokenize the sentence and collect POS tags. str.join also copes with
        # an empty document, where indexing the first token would fail.
        raw_text = row.sentText
        doc = nlp(raw_text)
        tokenized_text = " ".join(t.text for t in doc)
        pos_text = " ".join(t.pos_ for t in doc)

        # Entity text -> entity type. When a text occurs more than once, the
        # last mention wins.
        entity_types = {e['text']: e['label'] for e in row.entityMentions}

        for r in row.relationMentions:
            # Entity head and tail
            entity_head = r['em1Text']
            entity_tail = r['em2Text']

            # Relationship label
            relationship = r["label"]

            # Entity types; skip relations where either type is unknown
            type_head = entity_types.get(entity_head)
            type_tail = entity_types.get(entity_tail)
            if not (type_head and type_tail):
                continue

            enttypes = type_head + "-" + type_tail

            # Locate both entities in the parsed sentence
            span1 = find_span(doc, entity_head)
            span2 = find_span(doc, entity_tail)

            # Without both spans there is neither a dependency path nor a
            # trigger; bail out before touching span.root.
            if span1 is None or span2 is None:
                print("deppath is NA")
                print(raw_text)
                continue

            # Dependency path; get_dep_path signals "no common ancestor"
            # with StopIteration.
            try:
                deppath = get_dep_path(doc, span1, span2)
            except StopIteration:
                print("deppath is NA")
                print(raw_text)
                continue

            # Trigger word on the dependency path
            path_tokens = dep_path_tokens(doc, span1.root, span2.root)
            trigger = get_trigger(doc, path_tokens)
            if not trigger:
                null_trigger += 1
                continue

            # Row for the .txt output
            records.append({
                'deppath': deppath,
                'entity_head': entity_head,
                'entity_tail': entity_tail,
                'enttypes': enttypes,
                'trigger': trigger,
                'fname': str(uuid.uuid4()),
                'tokenized_text': tokenized_text,
                "POS": pos_text,
                'relationship': relationship,
            })

    print("null triggger:", null_trigger)
    return pd.DataFrame(records, columns=columns)



# Preprocess the train, validation and test subsets, in that order.
train_prepocessed, valid_prepocessed, test_prepocessed = (
    preprocess_kaggle_df(split) for split in (df_train, df_valid, df_test)
)



def write_to_txt(df, path):
    """Write ``df`` to ``path`` as tab-separated text without header or index.

    Args:
        df: DataFrame to serialise.
        path: Target file path, or any other target accepted by
            ``DataFrame.to_csv``.

    When ``path`` is a filesystem path, missing parent directories are created
    first, so the call does not fail just because the output folder does not
    exist yet.
    """
    import os  # local import keeps this definition self-contained

    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    df.to_csv(path, sep='\t', index=False, header=False)

# Output files for the three splits; the validation split is stored as "dev".
train_txt_path = "../data/nyt/train.txt"
valid_txt_path = "../data/nyt/dev.txt"
test_txt_path = "../data/nyt/test.txt"

# Write each preprocessed split to its file (train, then validation, then test).
for split_frame, split_path in zip(
    (train_prepocessed, valid_prepocessed, test_prepocessed),
    (train_txt_path, valid_txt_path, test_txt_path),
):
    write_to_txt(split_frame, split_path)


deppath is NA
New York Authorities Plan Civil Complaint Against A.I.G. and Two Former Officers The New York attorney general and the state insurance department are expected to file a civil complaint against the American International Group , its former chairman and chief executive , Maurice R. Greenberg , and its former chief financial officer , accusing them of accounting practices that allowed it to make its financial position look stronger than it actually was .
deppath is NA
Gene Pitney , the clean-cut crooner who became a teenage idol in the early 1960 's with hits like '' Town Without Pity , '' '' -LRB- The Man Who Shot -RRB- Liberty Valance '' and '' Only Love Can Break a Heart '' -- songs that showcased his keening tenor -- was found dead yesterday in a hotel in Cardiff , Wales , while on a tour of Britain .
deppath is NA
He is also the owner of the Glen Head Racquet Club in Glen Head , N.Y. Her mother retired as an occupational therapist at the Association for Children With Do

In [10]:
# Scratch check: run the spaCy pipeline on a short comma-separated string.
# NOTE(review): this rebinds the notebook-level names `nlp`, `doc` and `tokens`.
nlp = spacy.load("en_core_web_sm")


doc = nlp("1, 2, 3, 4, 5")

# Plain-string text of every token in the parsed string.
tokens = [t.text for t in doc]

In [13]:
# Remove the first token in place; the bare `tokens` expression then displays what is left.
# NOTE(review): not idempotent - every re-run of this cell drops one more token.
tokens.pop(0)
tokens

['2', ',', '3', ',', '4', ',', '5']