In [10]:
import os
import sys

import pandas as pd
import numpy as np

from tqdm import tqdm

import spacy
from spacy.tokens.doc import Doc
from spacy.tokens import DocBin

import json
import jsonlines
from sklearn.model_selection import train_test_split

In [11]:
def convert_jsonl_to_docbin(input_file, output_train_file, output_test_file, model="en_core_web_trf", relation_labels=[]):
    """
    Convert JSONL file to Spacy DocBin file for training and testing datasets.

    Args:
        input_file (str): Path to the input JSONL file.
        output_train_file (str): Path to save the training DocBin file.
        output_test_file (str): Path to save the testing DocBin file.
        model (str, optional): Spacy model to use. Defaults to "en_core_web_trf".
        relation_labels (list, optional): List of relation labels. Defaults to an empty list.
    """

    # Load Spacy model
    nlp = spacy.load(model)

    # Read JSONL file
    annotations = pd.read_json(input_file, lines=True)

    # Set extension for Doc object
    Doc.set_extension("rel", default={}, force=True)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(annotations["text"], annotations["entities"], test_size=0.33, random_state=42)

    def create_docbin(texts, entities_list, output_file):
        """
        Convert text and entities list to Spacy DocBin format and save to file.
        
        Args:
            texts (pd.Series): Series of texts.
            entities_list (pd.Series): Series of entities.
            output_file (str): Path to save the DocBin file.
        """
        db = DocBin()
        entry_id_counter = 1  # Counter for entry IDs
        entity_id_counter = [1]  # Mutable list to maintain the global counter for entities

        # List to store IDs with issues
        ids_with_issues = []

        # Iterate over each annotation
        for text, entities in zip(texts, entities_list):
            try:
                span_starts = set()
                entities_out = []
                span_end_to_start = {}

                # Create a Doc object from the text using the Spacy model
                doc = nlp.make_doc(text)

                # Keep track of referenced entities to avoid duplicates
                ref_entities = []

                # Process each entity
                for entity in entities:
                    start = entity["start_offset"]
                    end = entity["end_offset"]

                    if (start, end) not in ref_entities:
                        ref_entities.append((start, end))
                        label = entity["label"]

                        # Create a span from the character offsets and label
                        span = doc.char_span(start, end, label=label)

                        if span is None:
                            print(f"Skipping entity for {label} ({start},{end})")
                        else:
                            entities_out.append(span)
                            span_end_to_start[entity["id"]] = entity["id"]
                            span_starts.add(entity["id"])

                # Create an empty dictionary to store relations
                rels = {}
                for x1 in span_starts:
                    for x2 in span_starts:
                        rels[(x1, x2)] = {}

                # Process relations
                relations = []  # Relations can be processed similarly if available
                for relation in relations:
                    start = span_end_to_start.get(relation["from_id"], None)
                    end = span_end_to_start.get(relation["to_id"], None)
                    label = relation["type"]

                    if start is not None and end is not None:
                        if label not in rels[(start, end)]:
                            rels[(start, end)][label] = 1.0

                # Fill in zeros where relation data is missing
                for x1 in span_starts:
                    for x2 in span_starts:
                        for label in relation_labels:
                            if label not in rels[(x1, x2)]:
                                rels[(x1, x2)][label] = 0.0

                # Assign the entities and relations to the Doc object
                doc.ents = entities_out
                doc._.rel = rels

                # Add the processed document to the DocBin object
                db.add(doc)

            except Exception as e:
                print(f"Error processing ID {entry_id_counter}: {str(e)}")
                ids_with_issues.append(entry_id_counter)
            
            entry_id_counter += 1  # Increment entry ID after each entry

        # Save the DocBin object to disk
        db.to_disk(output_file)

        return db, ids_with_issues

    # Create DocBin files for training and testing data
    create_docbin(X_train, y_train, output_train_file)
    create_docbin(X_test, y_test, output_test_file)

    print(f"Training data saved to {output_train_file}")
    print(f"Testing data saved to {output_test_file}")

In [12]:
ner_annotations = pd.read_json("data/output.jsonl", lines=True)
ner_annotations.drop("tag", axis=1, inplace=True)
ner_annotations.head()

Unnamed: 0,id,text,entities,relations,Comments
0,1,"BP spikes 1 day PTC, patient has dizziness. BO...",[],[],[]
1,2,"follow up occ cough no nasal catarrh, no dob o...","[{'id': 1, 'label': 'ILI', 'start_offset': 48,...",[],[]
2,3,general check up Patient came in for general c...,[],[],[]
3,4,eye redness Bacterial conjunctivitis,[],[],[]
4,5,hypogastric pain 3 days PTC patient had hypoga...,[],[],[]


In [13]:
convert_jsonl_to_docbin(
    input_file='data/output.jsonl',
    output_train_file='data/train_data.spacy',
    output_test_file='data/test_data.spacy',
    model = 'en_core_web_lg',
)

Skipping entity for ILI (72,100)
Skipping entity for ILI (91,100)
Skipping entity for ILI (404,408)
Skipping entity for ILI (13,18)
Skipping entity for ILI (203,208)
Error processing ID 272: [E1010] Unable to set entity information for token 39 which is included in more than one span in entities, blocked, missing or outside.
Skipping entity for ILI (214,218)
Skipping entity for ILI (13,18)
Error processing ID 505: [E1010] Unable to set entity information for token 29 which is included in more than one span in entities, blocked, missing or outside.
Skipping entity for ILI (71,76)
Skipping entity for ILI (13,18)
Error processing ID 600: [E1010] Unable to set entity information for token 18 which is included in more than one span in entities, blocked, missing or outside.
Error processing ID 631: [E1010] Unable to set entity information for token 49 which is included in more than one span in entities, blocked, missing or outside.
Skipping entity for ILI (11,16)
Error processing ID 898: [E1

In [14]:
# Define Configurations
os.system("python3 -m spacy init fill-config base_config.cfg experiment_config.cfg")

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
experiment_config.cfg
You can now add your data and train your pipeline:
python -m spacy train experiment_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


0

In [15]:
os.system("python3 -m spacy train experiment_config.cfg --output output/v0 --paths.train data/train_data.spacy --paths.dev data/train_data.spacy")

[38;5;4mℹ Saving to output directory: output/v0[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['ner'][0m
[38;5;4mℹ Initial learn rate: 0.0001[0m
E    #       LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --------  ------  ------  ------  ------
  0       0     26.86    0.00    0.00    0.00    0.00
  0     200   2422.84   19.54   74.02   11.26    0.20
  1     400    708.39   95.99   97.39   94.63    0.96
  1     600    240.69   98.61   99.58   97.65    0.99
  2     800    157.45   98.86   99.51   98.21    0.99
  3    1000    130.11   98.94   99.10   98.77    0.99
  5    1200    123.14   99.18   99.55   98.81    0.99
  6    1400    124.34   99.10   99.25   98.96    0.99
  8    1600    139.87   99.66   99.66   99.66    1.00
 11    1800    203.10   99.78   99.78   99.78    1.00
 14    2000    204.86   99.83   99.78   99.89    1.00
 18    2200     75.86   99.87   99.81   99.93    1.00
 23    2400     46.22   99.96   99.93  100.00

0

In [1]:
import spacy
model = spacy.load("output/v0/model-best/")

In [2]:
text = model(ner_annotations["text"][1])

NameError: name 'ner_annotations' is not defined

In [None]:
doc = model(text)

In [3]:
entities = []

for ent in doc.ents:
    entity = {
        "start_index": ent.start_char,
        "end_index": ent.end_char,
        "text": ent.text,
        "entity_type": ent.label_
    }
    entities.append(entity)

NameError: name 'doc' is not defined

In [4]:
for entity in entities:
    print(entity)

In [5]:
def compute_metrics(true_entities, pred_entities):
    """
    Compute True Positives (TP), False Positives (FP), and False Negatives (FN),
    as well as Precision, Recall, and F1 Score.

    Args:
        true_entities (list): List of true entities as tuples (text, label).
        pred_entities (list): List of predicted entities as tuples (text, label).

    Returns:
        dict: Dictionary with TP, FP, FN, Precision, Recall, and F1 Score.
    """
    true_entities_set = set(true_entities)
    pred_entities_set = set(pred_entities)

    # True Positives (TP): Predicted entities that are also in true entities
    tp = len(true_entities_set.intersection(pred_entities_set))

    # False Positives (FP): Predicted entities that are not in true entities
    fp = len(pred_entities_set - true_entities_set)

    # False Negatives (FN): True entities that are not in predicted entities
    fn = len(true_entities_set - pred_entities_set)

    # Calculate Precision, Recall, and F1 Score
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return {
        'True Positive': tp,
        'False Positive': fp,
        'False Negative': fn,
        'True Negative': 'Not Computed',  # Typically not used in NER evaluation
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score,
        
    }



In [6]:
def evaluate_model(test_docs, predictions):
    true_entities = []
    pred_entities = []
    
    # Collect all true and predicted entities for comparison
    for doc, (_, preds) in zip(test_docs, predictions):
        true_entities.extend([(ent.text, ent.label_) for ent in doc.ents])
        pred_entities.extend(preds)

    metrics = compute_metrics(true_entities, pred_entities)
    
    print("Metrics:")
    print(f"True Positives (TP): {metrics['True Positive']}")
    print(f"False Positives (FP): {metrics['False Positive']}")
    print(f"False Negatives (FN): {metrics['False Negative']}")
    print(f"True Negatives (TN): {metrics['True Negative']}")
    print(f"Precision: {metrics['Precision']:.2f}")
    print(f"Recall: {metrics['Recall']:.2f}")
    print(f"F1 Score: {metrics['F1 Score']:.2f}")


In [9]:
test_docs = spacy.load("/home/miniloda/Documents/GitHub/ai4pep/NER/Model/data/test_data.spacy")


OSError: [E053] Could not read meta.json from /home/miniloda/Documents/GitHub/ai4pep/NER/Model/data/test_data.spacy