Create the IOB file for NER.

In [None]:
import os
import re

In [19]:
# Input/output locations:
#   annotation_dir   - directory scanned for "*.ann" annotation files
#   text_dir         - directory holding the matching "*.txt" documents
#   output_file_path - combined token/tag TSV that gets written
annotation_dir = r"C:\S24-25\TxM\CADEC.v2\cadec\original"
text_dir = r"C:\S24-25\TxM\CADEC.v2\cadec\text"
output_file_path = r"C:\S24-25\TxM\IOB\cadec_iob.tsv"

# Annotation label -> three-letter code used after the "B-"/"I-" prefix.
# Labels that are not listed here are dropped while parsing.
tag_mapping = dict(ADR="ADR", Drug="DRU", Disease="DIS", Symptom="SYM")

def parse_annotations(annotation_file):
    """Collect the entity annotations of one annotation file.

    Only lines starting with ``T`` are read. Each is expected to hold three
    tab-separated fields: an id, a ``"<label> <start> <end>"`` span
    description and the annotated text.

    Args:
        annotation_file: Path of the annotation file (read as UTF-8).

    Returns:
        A list of ``(start, end, mapped_tag, text)`` tuples in file order,
        restricted to labels that are keys of ``tag_mapping``.

    Note:
        When the span description lists several ``;``-separated fragments,
        only the first two offsets (i.e. the first fragment) are used.
    """
    collected = []
    with open(annotation_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.startswith("T"):
                continue

            fields = raw_line.strip().split("\t")
            # ";" glues fragment offsets together ("19;20"); swapping it for
            # a space lets every offset parse as its own integer field.
            span_spec = fields[1].replace(";", " ").split()
            label = span_spec[0]
            begin = int(span_spec[1])
            finish = int(span_spec[2])
            surface = fields[2]

            if label not in tag_mapping:
                continue
            collected.append((begin, finish, tag_mapping[label], surface))
    return collected

def generate_iob(text, annotations):
    """Tokenize ``text`` on whitespace and assign an IOB tag to every token.

    Token offsets are taken from the real position of each token in ``text``
    so they stay aligned with the character offsets of the annotations even
    when the text contains newlines, tabs, repeated spaces or leading
    whitespace. (Rebuilding offsets as "previous end + 1" drifts as soon as
    two tokens are separated by anything other than a single space.)

    A token is tagged when its character range overlaps an annotation, so
    tokens carrying attached punctuation (``"pain."``) are still labelled.
    The first overlapping token of an annotation gets ``B-<tag>`` and the
    following ones ``I-<tag>``. If annotations overlap each other, the one
    that appears later in ``annotations`` wins for the shared tokens.

    Args:
        text: Raw document text.
        annotations: Iterable of ``(start, end, tag, text)`` tuples where
            ``start``/``end`` are character offsets into ``text`` (end
            exclusive) and ``tag`` is the label placed after ``B-``/``I-``.

    Returns:
        A ``zip`` iterator of ``(token, iob_tag)`` pairs in document order.
    """
    # Each match is one whitespace-delimited token (same tokens as
    # text.split()) together with its true start/end offsets.
    token_matches = list(re.finditer(r"\S+", text))
    tokens = [match.group() for match in token_matches]
    iob_tags = ["O"] * len(tokens)

    for start, end, iob_tag, _ in annotations:
        if end <= start:
            continue  # empty span: nothing to label

        inside = False
        for idx, match in enumerate(token_matches):
            token_start, token_end = match.span()
            if token_end <= start:
                continue  # token lies entirely before the annotation
            if token_start >= end:
                break  # tokens are ordered, so nothing further can overlap
            iob_tags[idx] = f"I-{iob_tag}" if inside else f"B-{iob_tag}"
            inside = True

    return zip(tokens, iob_tags)

def process_files(annotation_dir, text_dir, output_file):
    """Write the IOB tagging of every annotated document to one file.

    For each ``*.ann`` file in ``annotation_dir`` that has a ``*.txt`` file
    with the same base name in ``text_dir``, the document is tagged and
    written as ``token<TAB>tag`` lines followed by one blank line.
    Annotation files without a matching text file are skipped.

    Files are visited in sorted name order. ``os.listdir`` gives no ordering
    guarantee, so sorting is what makes the output identical from run to run
    and from machine to machine.

    Args:
        annotation_dir: Directory containing the ``.ann`` files.
        text_dir: Directory containing the ``.txt`` files.
        output_file: Path of the TSV file to create (overwritten, UTF-8).
    """
    with open(output_file, "w", encoding="utf-8") as output:
        for annotation_file in sorted(os.listdir(annotation_dir)):
            if not annotation_file.endswith(".ann"):
                continue

            base_name = os.path.splitext(annotation_file)[0]
            annotation_path = os.path.join(annotation_dir, annotation_file)
            text_path = os.path.join(text_dir, base_name + ".txt")

            if not os.path.exists(text_path):
                continue  # no text to tag for this annotation file

            with open(text_path, "r", encoding="utf-8") as text_file:
                text = text_file.read()

            annotations = parse_annotations(annotation_path)
            for token, tag in generate_iob(text, annotations):
                output.write(f"{token}\t{tag}\n")

            output.write("\n")  # Separate files with a blank line

# Create the output directory if it doesn't exist. os.path.dirname() returns
# "" when the path has no directory component and os.makedirs("") raises, so
# fall back to the current directory in that case.
os.makedirs(os.path.dirname(output_file_path) or ".", exist_ok=True)

# Process all files
process_files(annotation_dir, text_dir, output_file_path)

print(f"IOB file created at {output_file_path}")


IOB file created at C:\S24-25\TxM\IOB\cadec_iob.tsv


Split the IOB data into training, validation, and test sets

In [13]:
import random
import csv

In [21]:
import os
import random
import csv

def read_iob_file(file_path):
    """Load an IOB file and return its blank-line-separated paragraphs.

    Args:
        file_path: Path of the IOB file (read as UTF-8).

    Returns:
        A list of paragraph strings with surrounding whitespace removed.
        Chunks that are empty or whitespace-only are left out.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    paragraphs = []
    # A blank line shows up as two consecutive newlines.
    for chunk in raw.strip().split('\n\n'):
        trimmed = chunk.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs

def save_tsv(file_path, data):
    """Write paragraphs of tab-separated lines to ``file_path``.

    Every non-blank line of every paragraph is written unchanged, and each
    paragraph is followed by one blank line.

    Lines are written verbatim instead of going through ``csv.writer``: with
    its default quoting, the CSV writer wraps any field that contains a
    double quote in quotes and doubles the quote character, which would
    change tokens such as ``"nausea"`` in the output file.

    Args:
        file_path: Destination path (overwritten, UTF-8).
        data: Iterable of paragraph strings whose lines are separated by
            ``"\\n"``.
    """
    # Same terminator csv.writer emits by default, so files stay
    # byte-identical for rows that never needed quoting.
    line_end = '\r\n'
    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        for paragraph in data:
            for line in paragraph.split('\n'):
                if line.strip():
                    f.write(line + line_end)
            f.write(line_end)  # Blank line to separate paragraphs

def create_o_tag_version(data):
    """Return a copy of ``data`` in which every tag is replaced by ``O``.

    Args:
        data: Iterable of paragraph strings made of ``token<TAB>tag`` lines
            separated by ``"\\n"``.

    Returns:
        A list with one paragraph string per input paragraph; blank lines
        inside a paragraph are dropped.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    def _strip_tags(paragraph):
        rows = []
        for row in paragraph.split('\n'):
            if not row.strip():
                continue
            word, _ = row.split('\t')  # exactly token + tag expected
            rows.append(f"{word}\tO")
        return '\n'.join(rows)

    return [_strip_tags(paragraph) for paragraph in data]

def main(input_file=r'C:\S24-25\TxM\IOB\cadec_iob.tsv',
         output_dir=r'C:\S24-25\TxM\dataset',
         train_ratio=0.8,
         val_ratio=0.1):
    """Shuffle the IOB paragraphs and write train/validation/test splits.

    Five files are written to ``output_dir``: ``train.tsv``,
    ``val_gold.tsv``, ``test_gold.tsv`` and, for the validation and test
    splits, ``val.tsv`` / ``test.tsv`` in which every tag is ``O``.

    The shuffle uses the global ``random`` state, so call ``random.seed``
    beforehand to get a repeatable split.

    Args:
        input_file: Path of the IOB file to split.
        output_dir: Directory receiving the split files (created if needed).
        train_ratio: Fraction of paragraphs used for training.
        val_ratio: Fraction of paragraphs used for validation; whatever is
            left after the training and validation slices becomes the test
            split.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    # Read and split data
    paragraphs = read_iob_file(input_file)
    if not paragraphs:
        print("No paragraphs found in input file. Please check the file format.")
        return

    random.shuffle(paragraphs)  # Shuffle paragraphs

    # Calculate split sizes (truncated; the remainder goes to the test split)
    total_size = len(paragraphs)
    train_size = int(total_size * train_ratio)
    val_size = int(total_size * val_ratio)

    # Create splits
    train_data = paragraphs[:train_size]
    val_data = paragraphs[train_size:train_size + val_size]
    test_data = paragraphs[train_size + val_size:]

    # Create O-tag versions
    val_o_tag_data = create_o_tag_version(val_data)
    test_o_tag_data = create_o_tag_version(test_data)

    # Save splits as TSV files
    os.makedirs(output_dir, exist_ok=True)
    save_tsv(os.path.join(output_dir, 'train.tsv'), train_data)
    save_tsv(os.path.join(output_dir, 'val_gold.tsv'), val_data)
    save_tsv(os.path.join(output_dir, 'val.tsv'), val_o_tag_data)
    save_tsv(os.path.join(output_dir, 'test_gold.tsv'), test_data)
    save_tsv(os.path.join(output_dir, 'test.tsv'), test_o_tag_data)

    print(f"Splits saved in {output_dir}")
    print(f"Train examples: {len(train_data)}")
    print(f"Validation examples: {len(val_data)}")
    print(f"Test examples: {len(test_data)}")

if __name__ == "__main__":
    # Seed the global RNG before calling main() so that its random.shuffle()
    # call, and therefore the resulting splits, are the same on every run
    # over the same input.
    random.seed(42)  # For reproducibility
    main()


Splits saved in C:\S24-25\TxM\dataset
Train examples: 998
Validation examples: 124
Test examples: 126
