In [None]:
import pandas as pd

# Read the annotation sheet, then turn embedded line breaks in string cells
# into single spaces: newlines first, carriage returns second.
dataf = (
    pd.read_csv("Extracting training data - Sheet7.csv")
    .replace(r'\n', ' ', regex=True)
    .replace(r'\r', ' ', regex=True)
)
print(dataf)

                                              Sentence  \
0    The genetic model plant Arabidopsis thaliana, ...   
1    Such heterogeneity may drive local adaptation,...   
2    Here, we describe a study in which we used gen...   
3    A weak allele of AtHKT1;1 that drives elevated...   
4    Inspection of the geographical distribution of...   
..                                                 ...   
604  However, the upregulation of these transporter...   
605  We believe these two strategies are not exclus...   
606  We conclude that the phenotypic and genomic va...   
607  Future studies with deeper sampling will surel...   
608  Besides, given the success with _A. thaliana, ...   

                                              Entities  \
0    [   {     "type": "Organism",     "text": "Ara...   
1    [   {     "type": "Concept / Abstract Entity",...   
2    [   {     "type": "Study / Analysis",     "tex...   
3    [    {      "type": "Gene",      "text": "AtHK...   
4    [    {  

In [None]:
pip install fuzzywuzzy



In [None]:
import json

def parse_json_columns(row):
    """
    Decode the JSON held in a row's 'Entities' and 'Relations' cells.

    Each cell may be a JSON string, a missing value (None/NaN), or a value
    that has already been decoded into a list/dict (for example when the
    parsing cell is run a second time). Already-decoded values are returned
    unchanged and missing values become [].

    Args:
        row: mapping-like object (e.g. a pandas Series) indexable by
            "Entities" and "Relations".

    Returns:
        (entities, relations). If either cell cannot be decoded, the error is
        printed and BOTH values are returned as empty lists.
    """
    entities_str = row["Entities"]
    relations_str = row["Relations"]

    def _decode(cell):
        # pd.notnull() on a list returns an array whose truth value is
        # ambiguous, so containers must be recognised before the null check.
        if isinstance(cell, (list, dict)):
            return cell
        return json.loads(cell) if pd.notnull(cell) else []

    try:
        entities = _decode(entities_str)
        relations = _decode(relations_str)
    except (json.JSONDecodeError, TypeError) as e:
        print(f"ERROR: Could not parse Entities or Relations for row:\n{row}\nReason: {e}")
        entities = []
        relations = []
    return entities, relations

# Decode every row, then assign the decoded values back as whole columns.
# Writing into the Series yielded by iterrows() is not reliable: pandas may
# hand back a copy, in which case the assignment never reaches `dataf`.
parsed_pairs = [parse_json_columns(row) for _, row in dataf.iterrows()]
dataf["Entities"] = [entities for entities, _ in parsed_pairs]
dataf["Relations"] = [relations for _, relations in parsed_pairs]


In [None]:
# Print the current contents of `dataf` (pandas' truncated text rendering).
print(dataf)

                                              Sentence  \
0    The genetic model plant Arabidopsis thaliana, ...   
1    Such heterogeneity may drive local adaptation,...   
2    Here, we describe a study in which we used gen...   
3    A weak allele of AtHKT1;1 that drives elevated...   
4    Inspection of the geographical distribution of...   
..                                                 ...   
604  However, the upregulation of these transporter...   
605  We believe these two strategies are not exclus...   
606  We conclude that the phenotypic and genomic va...   
607  Future studies with deeper sampling will surel...   
608  Besides, given the success with _A. thaliana, ...   

                                              Entities  \
0    [{'type': 'Organism', 'text': 'Arabidopsis tha...   
1    [{'type': 'Concept / Abstract Entity', 'text':...   
2    [{'type': 'Study / Analysis', 'text': 'a study...   
3    [{'type': 'Gene', 'text': 'AtHKT1;1'}, {'type'...   
4    [{'type'

In [None]:
import pandas as pd
import difflib
import spacy
import difflib

nlp = spacy.load("en_core_web_sm")

def find_best_substring(sentence, target_text):
    """
    Return the substring of `sentence` that best corresponds to `target_text`.

    Steps:
      1) Tokenize `sentence` and `target_text` with the module-level `nlp`.
      2) If the target tokens occur as a contiguous run of sentence tokens,
         return that exact character span of the original sentence (first
         occurrence wins).
      3) Otherwise compare `target_text` against every contiguous token span
         of the sentence with difflib and return the span with the highest
         similarity ratio (the earliest span wins ties).

    Args:
        sentence: the full sentence text.
        target_text: the entity / argument text to locate.

    Returns:
        "" when `target_text` is blank or tokenizes to nothing; the matched
        substring of `sentence` otherwise; `target_text` unchanged when the
        sentence yields no tokens to compare against.
    """
    # Nothing to look for when the target is empty or whitespace-only.
    if not target_text.strip():
        return ""

    doc = nlp(sentence)
    ent_doc = nlp(target_text)
    if not ent_doc:
        return ""

    sent_tokens = list(doc)
    ent_tokens = list(ent_doc)
    n_sent = len(sent_tokens)
    n_ent = len(ent_tokens)

    # Plain strings make the token-level equality check a simple list compare.
    ent_str_list = [t.text for t in ent_tokens]
    sent_str_list = [t.text for t in sent_tokens]

    # Exact token-level match: return the original characters covering it.
    for i in range(n_sent - n_ent + 1):
        if sent_str_list[i:i + n_ent] == ent_str_list:
            first_tok = sent_tokens[i]
            last_tok = sent_tokens[i + n_ent - 1]
            return sentence[first_tok.idx:last_tok.idx + len(last_tok.text)]

    # Fuzzy fallback over all contiguous token spans. SequenceMatcher caches
    # its analysis of the second sequence, so the target is set once and only
    # the candidate (first sequence) changes per span.
    matcher = difflib.SequenceMatcher(None)
    matcher.set_seq2(target_text)

    best_substring = None
    best_ratio = -1.0
    for start_idx in range(n_sent):
        start_char = sent_tokens[start_idx].idx
        for end_idx in range(start_idx, n_sent):
            last_tok = sent_tokens[end_idx]
            candidate_substring = sentence[start_char:last_tok.idx + len(last_tok.text)]

            matcher.set_seq1(candidate_substring)
            ratio = matcher.ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_substring = candidate_substring

    if best_substring is None:
        # Only reachable when the sentence produced no tokens at all.
        print(
            f"WARNING: no candidate span found in sentence {sentence!r}; "
            f"keeping entity text {target_text!r} unchanged"
        )
        return target_text

    return best_substring


def snap_entities_to_sentence(
    df,
    sentence_col="Sentence",
    entities_col="Entities",
    relations_col="Relations"
):
    """
    Replace annotation texts with substrings taken from each row's sentence.

    For each row:
      - every entity's "text" is replaced by find_best_substring(sentence, text);
      - every relation argument's "text" reuses the snapped text of an entity
        with the same original text when there is one, and is otherwise
        snapped with find_best_substring as well.

    Args:
        df: DataFrame whose `entities_col` / `relations_col` cells hold lists
            of dicts with "type" and "text" (entities) or "type" and "args"
            (relations).
        sentence_col: name of the column holding the sentence text.
        entities_col: name of the column holding the entity lists.
        relations_col: name of the column holding the relation lists.

    Returns:
        A copy of `df` whose `entities_col` and `relations_col` columns hold
        the corrected annotations. `df` itself is not modified.
    """
    new_df = df.copy(deep=True)

    corrected_entities_all = []
    corrected_relations_all = []

    rows = zip(new_df[sentence_col], new_df[entities_col], new_df[relations_col])
    for sentence, entities, relations in rows:
        # Map from original entity text -> snapped text, so relation
        # arguments naming the same text get the same snapped value.
        entity_text_map = {}
        corrected_entities = []

        # 1) Fix the entity text
        for entity in entities:
            original_text = entity["text"]
            snapped_text = find_best_substring(sentence, original_text)

            entity_text_map[original_text] = snapped_text
            corrected_entities.append({
                "type": entity["type"],
                "text": snapped_text
            })

        # 2) Fix the relation text
        corrected_relations = []
        for rel in relations:
            rel_type = rel["type"]
            corrected_args = []
            for arg in rel["args"]:
                original_text = arg["text"]
                if original_text in entity_text_map:
                    corrected_arg_text = entity_text_map[original_text]
                else:
                    corrected_arg_text = find_best_substring(sentence, original_text)

                corrected_args.append({
                    "type": arg["type"],
                    "text": corrected_arg_text
                })

            corrected_relations.append({
                "type": rel_type,
                "args": corrected_args
            })

        corrected_entities_all.append(corrected_entities)
        corrected_relations_all.append(corrected_relations)

    # Write back to the columns the caller named, not to hard-coded
    # 'Entities' / 'Relations', so custom column names are honoured.
    new_df[entities_col] = corrected_entities_all
    new_df[relations_col] = corrected_relations_all

    return new_df



# Wrap the current frame in a new DataFrame object to pass to the snapping step.
df = pd.DataFrame(dataf)
# Rebind `dataf` to the frame returned by snap_entities_to_sentence.
dataf = snap_entities_to_sentence(df)
# Print every row as a dict (one dict per record).
print(dataf.to_dict(orient='records'))


[{'Sentence': 'The genetic model plant Arabidopsis thaliana, like many plant species, experiences a range of edaphic conditions across its natural habitat.', 'Entities': [{'type': 'Organism', 'text': 'Arabidopsis thaliana'}, {'type': 'Population / Group', 'text': 'many plant species'}, {'type': 'Environmental Factor', 'text': 'a range of edaphic conditions'}, {'type': 'Environmental Factor', 'text': 'natural habitat'}], 'Relations': [{'type': 'ASSOCIATED_WITH', 'args': [{'type': 'Organism', 'text': 'Arabidopsis thaliana'}, {'type': 'Environmental Factor', 'text': 'a range of edaphic conditions'}]}, {'type': 'ASSOCIATED_WITH', 'args': [{'type': 'Population / Group', 'text': 'many plant species'}, {'type': 'Environmental Factor', 'text': 'a range of edaphic conditions'}]}, {'type': 'LOCATED_IN', 'args': [{'type': 'Environmental Factor', 'text': 'a range of edaphic conditions'}, {'type': 'Environmental Factor', 'text': 'natural habitat'}]}]}, {'Sentence': 'Such heterogeneity may drive loc

In [None]:
import spacy
import json

nlp = spacy.load("en_core_web_sm")

def tokenize_text(text):
    """
    Run the module-level spaCy pipeline over `text` and return the text of
    every token, in order, as a list of strings.
    """
    pieces = []
    for tok in nlp(text):
        pieces.append(tok.text)
    return pieces

def find_offsets(sentence_tokens, entity_text):
    """
    Locate `entity_text` inside an already tokenized sentence.

    `entity_text` is tokenized with spaCy and the first contiguous run of
    `sentence_tokens` equal to those tokens (exact comparison, so case and
    punctuation matter) is reported.

    Returns:
        the list of consecutive token indices covered by the match, or []
        when the entity has no tokens or no contiguous match exists.
    """
    wanted = [tok.text for tok in nlp(entity_text)]
    if not wanted:
        return []

    width = len(wanted)
    last_start = len(sentence_tokens) - width

    start = 0
    while start <= last_start:
        # all() stops at the first differing token, like an early break.
        if all(sentence_tokens[start + k] == wanted[k] for k in range(width)):
            return list(range(start, start + width))
        start += 1

    return []

def build_record_string_paper_style(entity_list, relation_list):
    """
    Serialize entities and their outgoing relations into one record string.

    Layout (pieces joined by single spaces):
        <extra_id_0>
            <extra_id_0> TYPE <extra_id_5> TEXT
                <extra_id_0> REL <extra_id_5> OBJECT_TEXT <extra_id_1>   (per relation)
            <extra_id_1>                                                 (per entity)
        <extra_id_1>

    A relation is attached to every entity whose text equals the text of the
    relation's first argument; relations without exactly two args are ignored.

    Raises:
        ValueError: if the result has fewer than four whitespace-separated
            tokens, does not begin with two <extra_id_0> tokens, or begins
            with three of them.
    """
    # subject text -> [(relation type, object text), ...] in input order
    outgoing = {}
    for rel in relation_list:
        rel_args = rel["args"]
        if len(rel_args) != 2:
            continue
        subject, target = rel_args[0], rel_args[1]
        outgoing.setdefault(subject["text"], []).append((rel["type"], target["text"]))

    pieces = ["<extra_id_0>"]
    for entity in entity_list:
        label = entity["type"]
        span = entity["text"]
        pieces.append(f"<extra_id_0> {label} <extra_id_5> {span}")
        pieces.extend(
            f"<extra_id_0> {rel_label} <extra_id_5> {target_span} <extra_id_1>"
            for rel_label, target_span in outgoing.get(span, ())
        )
        pieces.append("<extra_id_1>")
    pieces.append("<extra_id_1>")

    record_str = " ".join(pieces)
    tokens = record_str.strip().split()

    # Structural sanity checks on the opening tokens.
    if len(tokens) < 4:
        raise ValueError(
            "[build_record_string_paper_style] Not enough tokens to have two <extra_id_0> at "
            f"start and two <extra_id_1> at end. Tokens={tokens}"
        )

    opens_with_two = tokens[0] == "<extra_id_0>" and tokens[1] == "<extra_id_0>"
    if not opens_with_two:
        raise ValueError(
            "[build_record_string_paper_style] The record does not start with exactly "
            f"two <extra_id_0>. Found first tokens: {tokens[:3]}\nFull record: {record_str}"
        )
    if len(tokens) > 2 and tokens[2] == "<extra_id_0>":
        raise ValueError(
            "[build_record_string_paper_style] Found THREE consecutive <extra_id_0> at the start. "
            f"First tokens: {tokens[:4]}\nFull record: {record_str}"
        )

    return record_str







def row_to_spot_asoc_format(row):
    """
    Convert one DataFrame row into the target UIE-style dict.

    The row must provide "Sentence" (text), "Entities" (list of dicts with
    "type" and "text") and "Relations" (list of dicts with "type" and an
    "args" list of dicts with "type" and "text"). Every entity / argument
    text is located in the tokenized sentence with find_offsets.

    Returns:
        (output_dict, None, None)                  on success
        (None, fail_reason_string, offending_text) on failure, including the
        case where the record string cannot be built (ValueError from
        build_record_string_paper_style).
    """
    text = row["Sentence"]
    tokens = tokenize_text(text)

    # Build entity list with token offsets
    entity_list = []
    for e in row["Entities"]:
        e_text = e.get("text", "")
        e_type = e.get("type", "")

        # An entity without text or type cannot be converted
        if not e_text or not e_type:
            return None, "Entity missing 'text' or 'type'", str(e)

        offset = find_offsets(tokens, e_text)
        if not offset:
            return None, "Entity mismatch (cannot find text in sentence)", e_text

        entity_list.append({
            "type": e_type,
            "offset": offset,
            "text": e_text
        })

    # Build relation list with token offsets
    relation_list = []
    for r in row["Relations"]:
        rel_type = r.get("type", "")
        if not rel_type:
            return None, "Relation missing 'type'", str(r)

        # 'args' must be present and be a list
        args_iterable = r.get("args", None)
        if not isinstance(args_iterable, list):
            return None, "Missing or invalid 'args' in relation", str(r)

        rel_args = []
        for arg in args_iterable:
            arg_text = arg.get("text", "")
            arg_type = arg.get("type", "")
            if not arg_text or not arg_type:
                return None, "Relation arg missing 'text' or 'type'", str(arg)

            offset = find_offsets(tokens, arg_text)
            if not offset:
                return None, "Relation mismatch (cannot find arg text)", arg_text

            rel_args.append({
                "type": arg_type,
                "offset": offset,
                "text": arg_text
            })

        relation_list.append({
            "type": rel_type,
            "args": rel_args
        })

    # Unique entity types (spot) and relation types (asoc). dict.fromkeys
    # keeps first-seen order, so the lists come out the same on every run;
    # iterating a set of strings gives no such guarantee between sessions.
    spot = list(dict.fromkeys(e["type"] for e in entity_list))
    asoc = list(dict.fromkeys(r["type"] for r in relation_list))

    # For every entity, list the two-argument relations whose first argument
    # has the same text as the entity.
    spot_asoc_list = []
    for e in entity_list:
        out_rels = [
            [r["type"], r["args"][1]["text"]]
            for r in relation_list
            if len(r["args"]) == 2 and r["args"][0]["text"] == e["text"]
        ]
        spot_asoc_list.append({
            "span": e["text"],
            "label": e["type"],
            "asoc": out_rels
        })

    # Build the bracketed "record" string. A ValueError from the builder is
    # reported through the failure tuple so one bad row does not abort the
    # caller's whole conversion loop.
    try:
        record_str = build_record_string_paper_style(entity_list, relation_list)
    except ValueError as err:
        return None, "Record string could not be built", str(err)

    # Construct final output dict
    output = {
        "text": text,
        "tokens": tokens,
        "record": record_str,
        "entity": entity_list,
        "relation": relation_list,
        "event": [],
        "spot": spot,
        "asoc": asoc,
        "spot_asoc": spot_asoc_list,
        "task": "pair"
    }
    return output, None, None



# --------------------------------------------------------------------------
# 6. Convert every row, keeping successes and failures in separate lists
# --------------------------------------------------------------------------
all_converted = []
failures = []

for row_index, source_row in dataf.iterrows():
    record, reason, culprit = row_to_spot_asoc_format(source_row)
    if record is None:
        # Remember which row failed, why, and which text triggered it.
        failures.append({
            "row_index": row_index,
            "sentence": source_row["Sentence"],
            "failure_reason": reason,
            "offending_piece": culprit
        })
        continue
    all_converted.append(record)

print(f"Successfully converted {len(all_converted)} rows out of {len(dataf)}.")
print(f"Failed to convert {len(failures)} rows.")

Successfully converted 608 rows out of 609.
Failed to convert 1 rows.


In [None]:
# Tabulate the failure records (one row per failed conversion) and print them.
failures_df = pd.DataFrame(failures)
print(failures_df)


   row_index                                           sentence  \
0         11  While adaptation to high altitude in Peromyscu...   

                                   failure_reason offending_piece  
0  Entity mismatch (cannot find text in sentence)   -globin genes  


In [None]:
# Save the failure table as failures.csv (to_csv defaults, so the index is written too).
failures_df.to_csv("failures.csv")

In [None]:
# Rebuild the rows that failed conversion from `failures`, so this cell does
# not depend on a variable left over from an earlier kernel session.
not_converted_rows = [dataf.loc[failure["row_index"]] for failure in failures]
for entry in not_converted_rows:
    print(entry)

Sentence     While adaptation to high altitude in Peromyscu...
Entities     [{'type': 'Organism', 'text': 'Peromyscus mani...
Relations    [{'type': 'ASSOCIATED_WITH', 'args': [{'type':...
Name: 11, dtype: object
Sentence     Given the critical role Na[+] accumulation pla...
Entities     [{'type': 'Trait / Phenotype', 'text': 'Na+ ac...
Relations    [{'type': 'CONTRIBUTES_TO', 'args': [{'type': ...
Name: 26, dtype: object
Sentence     Accessions with a thymine (T) at the SNP most ...
Entities     [{'type': 'Accession / Line', 'text': 'Accessi...
Relations    [{'type': 'ASSOCIATED_WITH', 'args': [{'type':...
Name: 39, dtype: object
Sentence     The genotype at Chr4:6392276 of each accession...
Entities     [{'type': 'Accession / Line', 'text': 'each ac...
Relations    [{'type': 'HAS_MEASUREMENT', 'args': [{'type':...
Name: 53, dtype: object
Sentence     Furthermore, using 13 SNPs within a 20kb regio...
Entities     [{'type': 'Allele / Variant', 'text': '13 SNPs...
Relations    [{'type':

In [None]:
import json
import random

# `all_converted` is the list of converted entries built by the conversion loop,
# e.g. all_converted = [ { ... }, { ... }, ... ]

# Split sizes, and the seed that makes the shuffle repeatable between runs.
SPLIT_SEED = 42
TEST_SIZE = 35
VAL_SIZE = 35

# 1) Shuffle a copy with a dedicated seeded generator so there is no order
#    bias in the splits, re-running this cell gives the same splits, and
#    `all_converted` itself keeps its original order.
shuffled = list(all_converted)
random.Random(SPLIT_SEED).shuffle(shuffled)

# 2) First TEST_SIZE entries -> test, next VAL_SIZE -> val, the rest -> train.
#    The train split is empty unless there are more than TEST_SIZE + VAL_SIZE entries.
test_set = shuffled[:TEST_SIZE]
val_set = shuffled[TEST_SIZE:TEST_SIZE + VAL_SIZE]
train_set = shuffled[TEST_SIZE + VAL_SIZE:]

def write_jsonl(filename, data):
    """
    Write `data` to `filename` in JSONL form: each item is JSON-encoded
    (non-ASCII characters kept as-is) on its own newline-terminated line.
    The file is opened in write mode with UTF-8 encoding.
    """
    with open(filename, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

# Persist the three splits; each file holds one JSON object per line.
for split_path, split_records in (
    ("train.json", train_set),
    ("test.json", test_set),
    ("val.json", val_set),
):
    write_jsonl(split_path, split_records)

print("Train, test, and val JSON files have been created (one record per line)!")


Train, test, and val JSON files have been created (one record per line)!


In [None]:
import json
import pandas as pd
from collections import defaultdict

# `dataf` must already hold decoded annotations: dataf["Entities"] is a list
# of entity dicts and dataf["Relations"] is a list of relation dicts per row.

entity_types = set()
relation_types = set()

# Tracks which relation types each entity type appears with as "subject".
# For example, a relation with r["type"] = "CONTRIBUTES_TO" and
# r["args"][0]["type"] = "Trait / Phenotype" adds "CONTRIBUTES_TO" to
# subj_relations_by_entity["Trait / Phenotype"].
subj_relations_by_entity = defaultdict(set)

for idx, row in dataf.iterrows():
    # 1) Gather relation types and their subject entity types
    for rel in row["Relations"]:
        rtype = rel["type"]
        relation_types.add(rtype)
        # With exactly 2 arguments, the first one is treated as the subject
        try:
            if len(rel["args"]) == 2:
                subj = rel["args"][0]
                subj_type = subj["type"]
                subj_relations_by_entity[subj_type].add(rtype)
        except (KeyError, IndexError, TypeError):
            # Malformed relation (missing/ill-typed 'args' or subject 'type'):
            # skip it, but let unrelated errors and interrupts propagate.
            continue
    # 2) Gather entity types
    for ent in row["Entities"]:
        entity_types.add(ent["type"])

# Convert sets to sorted lists
sorted_entity_types = sorted(entity_types)
sorted_relation_types = sorted(relation_types)

print(sorted_entity_types)
print(sorted_relation_types)

['Accession / Line', 'Allele / Variant', 'Amino Acid', 'Biological Process', 'Chemical Substance', 'Concept / Abstract Entity', 'Environmental Factor', 'Gene', 'Genomic Region', 'Measurement / Statistical Result', 'Organism', 'Population / Group', 'Protein / Polypeptide', 'Study / Analysis', 'Tissue / Plant Structure', 'Tool / Method', 'Trait / Phenotype']
['ASSOCIATED_WITH', 'CAUSES', 'CONTRIBUTES_TO', 'ENCODES', 'HAS_MEASUREMENT', 'IDENTIFIED_IN', 'IS_A', 'LOCATED_IN', 'NOT_ASSOCIATED_WITH', 'PART_OF', 'REGULATES', 'USED_IN']


In [None]:
# entity.schema holds three JSON documents separated by newlines (no trailing
# newline): the sorted entity types, an empty list, and an empty dictionary.
with open("entity.schema", "w", encoding="utf-8") as schema_file:
    schema_file.write("\n".join((
        json.dumps(sorted_entity_types, ensure_ascii=False),
        json.dumps([]),
        json.dumps({}),
    )))


In [None]:
# relation.schema holds three JSON documents separated by newlines (no
# trailing newline): the sorted relation types, the sorted entity types, and
# a dict mapping every relation type to an empty list.
with open("relation.schema", "w", encoding="utf-8") as schema_file:
    relation_dict = dict((rel_name, []) for rel_name in sorted_relation_types)
    schema_file.write("\n".join((
        json.dumps(sorted_relation_types, ensure_ascii=False),
        json.dumps(sorted_entity_types, ensure_ascii=False),
        json.dumps(relation_dict, ensure_ascii=False),
    )))


In [None]:
# record.schema holds three JSON documents separated by newlines (no trailing
# newline): the sorted entity types, the sorted relation types, and a dict
# from each entity type to the sorted relation types seen with it as subject.
with open("record.schema", "w", encoding="utf-8") as schema_file:
    record_dict = {
        entity_name: sorted(subj_relations_by_entity[entity_name])
        for entity_name in sorted_entity_types
    }
    schema_file.write("\n".join((
        json.dumps(sorted_entity_types, ensure_ascii=False),
        json.dumps(sorted_relation_types, ensure_ascii=False),
        json.dumps(record_dict, ensure_ascii=False),
    )))
