In [1]:
import requests
import zipfile
import io
import os

# Download the NCBI dataset

In [2]:
# Step 1: Provide the URL to the zip file
url = "https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBI_corpus.zip"

# Step 2: Set the extraction destination (created if it does not exist yet)
extract_to = "/workspaces/chisel/examples/data"
os.makedirs(extract_to, exist_ok=True)

# Step 3: Download and extract
# An explicit timeout is passed because requests applies none by default, so a
# stalled connection would otherwise block this cell forever.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    # The archive is unpacked straight from memory; nothing is written to disk
    # except the extracted members.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted contents to: {extract_to}")
else:
    print(f"Failed to download file. Status code: {response.status_code}")

Extracted contents to: /workspaces/chisel/examples/data


In [3]:
import sys

# Make the local chisel checkout importable. The membership check keeps this
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
if "/workspaces/chisel/" not in sys.path:
    sys.path.append("/workspaces/chisel/")

In [4]:
# Build one record per non-empty line of the training file. Each line is split
# on tabs: the first field becomes the record id and the remaining fields are
# joined with single spaces to form the (still tagged) text.
annotations = []
# The encoding is explicit so decoding does not depend on the platform default.
with open("/workspaces/chisel/examples/data/NCBI_corpus_training.txt", "r", encoding="utf-8") as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in f:
        # Skip blank lines; otherwise they would yield records with an empty
        # id and empty text.
        if not line.strip():
            continue
        splits = line.split("\t")
        annotations.append({
            "id": splits[0].strip(),
            "text": " ".join(splits[1:]).strip()
        })

In [7]:
# Preview the raw (still tagged) text of the first loaded record.
first_record = annotations[0]
print(first_record["text"])

Identification of APC2, a homologue of the <category="Modifier">adenomatous polyposis coli tumour</category> suppressor . The <category="Modifier">adenomatous polyposis coli ( APC ) tumour</category>-suppressor protein controls the Wnt signalling pathway by forming a complex with glycogen synthase kinase 3beta ( GSK-3beta ) , axin / conductin and betacatenin . Complex formation induces the rapid degradation of betacatenin . In <category="Modifier">colon carcinoma</category> cells , loss of APC leads to the accumulation of betacatenin in the nucleus , where it binds to and activates the Tcf-4 transcription factor ( reviewed in [ 1 ] [ 2 ] ) . Here , we report the identification and genomic structure of APC homologues . Mammalian APC2 , which closely resembles APC in overall domain structure , was functionally analyzed and shown to contain two SAMP domains , both of which are required for binding to conductin . Like APC , APC2 regulates the formation of active betacatenin-Tcf complexes ,

In [9]:
from typing import List
from chisel.extraction.base.protocols import Parser, Tokenizer, TokenChunker, Labeler, LabelEncoder, Validator, Exporter

In [12]:
class NCBIPipeline:
    """Parse, tokenize, chunk, label, encode and validate annotation records.

    The pipeline only orchestrates: every stage is delegated to the component
    passed to the constructor, and ``run`` returns one output record per chunk.
    """

    def __init__(self,
                 annotations,
                 parser: Parser,
                 tokenizer: Tokenizer,
                 chunker: TokenChunker,
                 labeler: Labeler,
                 label_encoder: LabelEncoder,
                 validators: List[Validator],
                 exporter: Exporter):
        """Store the input records and the pipeline components.

        Args:
            annotations: Iterable of dicts; ``run`` reads the ``"id"`` and
                ``"text"`` keys of each one.
            parser: Object whose ``parse(text)`` returns ``(text, entities)``.
            tokenizer: Object whose ``tokenize(text)`` returns tokens exposing
                ``.id`` and ``.text``.
            chunker: Object whose ``chunk(tokens, entities)`` returns chunks
                indexable by ``"tokens"`` and ``"entities"``.
            labeler: Object whose ``label(tokens, entities)`` returns the
                labels for one chunk.
            label_encoder: Object providing ``fit(labels)`` and
                ``encode(label)``.
            validators: Objects whose ``validate(text, tokens, entities,
                label)`` is called once per chunk.
            exporter: Stored on the instance; ``run`` does not call it.
        """
        self.annotations = annotations
        self.parser = parser
        self.tokenizer = tokenizer
        self.chunker = chunker
        self.labeler = labeler
        self.label_encoder = label_encoder
        self.validators = validators
        self.exporter = exporter

    def run(self):
        """Process every annotation and return one record per chunk.

        Returns:
            A list of dicts with the keys ``id``, ``chunk_id``, ``text``,
            ``input_ids``, ``attention_mask``, ``tokens``, ``bio-labels`` and
            ``labels``. ``text`` is the full parsed document text, while the
            token-level fields describe only the tokens of that chunk.
        """
        processed_data = []
        for annotation in self.annotations:
            text, entities = self.parser.parse(annotation["text"])
            tokens = self.tokenizer.tokenize(text)
            chunks = self.chunker.chunk(tokens, entities)

            labels = [
                self.labeler.label(chunk["tokens"], chunk["entities"])
                for chunk in chunks
            ]

            # NOTE(review): the encoder is re-fitted for every document. If
            # fit() rebuilds its mapping from scratch, label ids may differ
            # between documents - confirm against the encoder implementation.
            self.label_encoder.fit(labels)

            encoded_labels = [self.label_encoder.encode(label) for label in labels]

            for i, (chunk, encoded_label, label) in enumerate(zip(chunks, encoded_labels, labels)):
                for validator in self.validators:
                    validator.validate(text, chunk["tokens"], chunk["entities"], label)

                # Token-level fields come from this chunk's tokens, not from
                # the whole document, so they line up with the chunk's labels.
                chunk_tokens = chunk["tokens"]
                processed_data.append({
                    "id": annotation["id"],
                    "chunk_id": i,
                    "text": text,
                    "input_ids": [token.id for token in chunk_tokens],
                    "attention_mask": [1] * len(chunk_tokens),  # Assuming all tokens are valid
                    "tokens": [token.text for token in chunk_tokens],
                    "bio-labels": label,
                    "labels": encoded_label
                })

        # Exporting is currently disabled; the records are only returned.
        # self.exporter.export(processed_data)
        return processed_data

In [13]:
from chisel.extraction.parsers.html_tag_parser import HTMLTagParser
from chisel.extraction.tokenizers.hf_tokenizer import HFTokenizer
from chisel.extraction.chunkers.fixed_length_chunker import FixedLengthChunker
from chisel.extraction.labelers.bio_labeler import BIOLabeler
from chisel.extraction.labelers.simple_label_encoder import SimpleLabelEncoder
from chisel.extraction.validators.span_in_text_validator import SpanInTextValidator
from chisel.extraction.validators.span_text_match_validator import SpanTextMatchValidator
from chisel.extraction.validators.bio_alignment_validator import BIOAlignmentValidator
from chisel.extraction.exporters.json_exporter import JSONExporter

In [10]:
# Pipeline components, one per stage, in the order the pipeline calls them.
parser = HTMLTagParser(label_strategy="attribute")
tokenizer = HFTokenizer(model_name="bert-base-cased")
chunker = FixedLengthChunker(max_tokens=100, overlap=0)
labeler = BIOLabeler()
label_encoder = SimpleLabelEncoder()

# Each validator keeps its own name (v1..v3) and is also collected, in the
# same order, into the list handed to the pipeline.
v1 = SpanInTextValidator()
v2 = SpanTextMatchValidator()
v3 = BIOAlignmentValidator()
validators = [v1, v2, v3]

exporter = JSONExporter(output_path="/workspaces/chisel/examples/data/annotations.json")

In [17]:
# Wire the configured components together and run the pipeline on the first
# ten loaded records only.
pipeline = NCBIPipeline(
    annotations=annotations[:10],
    parser=parser, tokenizer=tokenizer, chunker=chunker,
    labeler=labeler, label_encoder=label_encoder,
    validators=validators, exporter=exporter,
)
pipeline.run()