# Task 1: CADEC Dataset Entity Extraction and Statistical Analysis

This notebook processes the CADEC dataset to enumerate distinct entities from annotation files.

## Objective
- Read all `.ann` files from the `original` subdirectory (1250 files)
- Parse each annotation file to extract entity information
- Create statistics showing distinct entities per label type (ADR, Drug, Disease, Symptom)


In [None]:
# Import required libraries
from pathlib import Path
from collections import defaultdict, Counter
import pandas as pd
import re

# Location of the CADEC annotation files, relative to the working directory
BASE_DIR = Path("cadec")
ORIGINAL_DIR = BASE_DIR / "original"

# Fail fast when the path is missing or is not a directory: a plain
# existence check would also accept a regular file named "original"
if not ORIGINAL_DIR.is_dir():
    raise FileNotFoundError(f"Directory not found: {ORIGINAL_DIR}")

print(f"Found annotation directory: {ORIGINAL_DIR}")
print("Counting annotation files...")
# Directory listing order depends on the OS/filesystem. Sorting fixes the
# processing order so repeated runs see the files in the same sequence.
ann_files = sorted(ORIGINAL_DIR.glob("*.ann"))
print(f"Total .ann files found: {len(ann_files)}")


In [None]:
"""
Annotation File Format Explanation:
====================================

Each line in a .ann file follows one of these formats:

1. Entity annotation line:
   Format: TAG\tLABEL START END[;START END ...]\tTEXT
   Example: T1	ADR 9 19	bit drowsy
   Example with multiple ranges: T6	Symptom 66 74;76 94;98 107	the heel I couldn't walk on very well
   
   Components:
   - TAG: Entity identifier (e.g., T1, T2, T3)
   - LABEL: One of ADR, Drug, Disease, Symptom
   - START END pairs: Character offset ranges in the original text (one or more pairs, separated by semicolons)
   - TEXT: The actual text segment(s) corresponding to the ranges

2. Comment line:
   Format: #NUMBER\tAnnotatorNotes TAG\tNote text
   Example: #1	AnnotatorNotes T1	Drowsy
   
   These lines start with '#' and should be ignored during entity extraction.

Parsing Logic:
==============
1. Read each line from the annotation file
2. Skip lines starting with '#' (comments)
3. For entity lines (starting with 'T' followed by a number):
   - Extract the tag, label type, character ranges, and text
   - Handle single or multiple character ranges (separated by semicolons)
   - Store the entity text in a set for the corresponding label type (ensures uniqueness)
   - Track total occurrences for frequency analysis
"""

# Labels kept by the parser; lines carrying any other label are skipped.
_ENTITY_LABELS = frozenset({'ADR', 'Drug', 'Disease', 'Symptom'})

# Entity line layout: TAG <tab> "LABEL RANGES" <tab> TEXT.
# Compiled once here rather than on every line of every file.
_ENTITY_LINE_RE = re.compile(r'^(T\d+)\t([^\t]+)\t(.+)$')


def _parse_ranges(ranges_str):
    """Turn "S E" or "S1 E1;S2 E2;..." into a list of (start, end) int tuples.

    A pair with fewer than two tokens or with non-integer tokens is dropped,
    so the returned list may be empty.
    """
    ranges = []
    for pair in ranges_str.split(';'):
        numbers = pair.split()
        if len(numbers) < 2:
            continue
        try:
            ranges.append((int(numbers[0]), int(numbers[1])))
        except ValueError:
            continue
    return ranges


def parse_annotation_file(file_path):
    """
    Parse a single .ann file to extract entity information.

    Blank lines, comment lines (starting with '#'), lines that do not match
    the entity layout, and entities whose label is not one of ADR, Drug,
    Disease or Symptom are skipped. An entity is also skipped when none of
    its character ranges can be parsed.

    Parameters:
    -----------
    file_path : Path
        Path to the .ann annotation file

    Returns:
    --------
    list of tuples
        Each tuple contains (label_type, entity_text, tag, ranges), where
        ranges is a list of (start, end) integer tuples. An empty list is
        returned (after printing a message) when the file cannot be read
        or decoded as UTF-8.
    """
    entities = []

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()

                # Skip empty lines and comment lines (starting with '#')
                if not line or line.startswith('#'):
                    continue

                match = _ENTITY_LINE_RE.match(line)
                if match is None:
                    continue
                tag, label_and_ranges, text = match.groups()

                # First word is the label; everything after it is the ranges
                parts = label_and_ranges.split(None, 1)
                if len(parts) < 2:  # Need at least: LABEL RANGES
                    continue
                label_type, ranges_str = parts

                if label_type not in _ENTITY_LABELS:
                    continue

                # Only store the entity if at least one range was parsed
                ranges = _parse_ranges(ranges_str)
                if ranges:
                    entities.append((label_type, text, tag, ranges))

    except (OSError, ValueError) as e:
        # OSError: file missing or unreadable.
        # ValueError: covers UnicodeDecodeError for content that is not UTF-8.
        print(f"Error parsing file {file_path}: {e}")
        return []

    return entities


In [None]:
# Containers that accumulate results while the annotation files are scanned

# One set of unique entity strings per label type. Set membership is an
# exact string match, so differently-cased spellings stay separate.
distinct_entities = {label: set() for label in ('ADR', 'Drug', 'Disease', 'Symptom')}

# label type -> Counter recording how often each entity text was seen
entity_occurrences = defaultdict(Counter)

# Running totals used for progress reporting
total_files_processed = total_entities_processed = 0


In [None]:
# Walk every annotation file and accumulate the per-label entity statistics
print("Processing annotation files...")
print(f"Total files to process: {len(ann_files)}")

for ann_path in ann_files:
    for label_type, raw_text, _tag, _ranges in parse_annotation_file(ann_path):
        # Surrounding whitespace is not treated as part of the entity
        entity_text = raw_text.strip()
        if not entity_text:
            continue  # nothing left after stripping, so nothing to record

        # Record the entity as distinct and bump its frequency
        distinct_entities[label_type].add(entity_text)
        entity_occurrences[label_type][entity_text] += 1
        total_entities_processed += 1

    total_files_processed += 1

    # Report progress after each batch of 100 files
    if total_files_processed % 100 == 0:
        print(f"Processed {total_files_processed}/{len(ann_files)} files...")

print("\nProcessing complete!")
print(f"Total files processed: {total_files_processed}")
print(f"Total entity occurrences processed: {total_entities_processed}")


In [None]:
# Report how many distinct entities were collected for each label type
banner = "=" * 60
print(banner)
print("SUMMARY STATISTICS: Distinct Entities per Label Type")
print(banner)

# label type -> number of distinct entity texts
distinct_counts = {
    label: len(distinct_entities[label])
    for label in ('ADR', 'Drug', 'Disease', 'Symptom')
}

for label_type, count in distinct_counts.items():
    print(f"{label_type:15s}: {count:5d} distinct entities")

print(banner)
print(f"{'TOTAL':15s}: {sum(distinct_counts.values()):5d} distinct entities")
print(banner)

# Describe the shape of the distinct_entities dictionary
print("\nData Structure Created:")
print("- Dictionary mapping label types to sets of unique entity texts")
print(f"  - Keys: {list(distinct_entities)}")
print("  - Value types: Sets containing distinct entity text strings")


In [None]:
# Tabulate the distinct-entity counts, one row per label type
label_order = ['ADR', 'Drug', 'Disease', 'Symptom']
summary_df = pd.DataFrame({
    'Label Type': label_order,
    'Distinct Entities': [distinct_counts[label] for label in label_order],
})

# Each label's share of all distinct entities, as a percentage with 2 decimals
share_of_total = summary_df['Distinct Entities'] / summary_df['Distinct Entities'].sum()
summary_df['Percentage'] = (share_of_total * 100).round(2)

print("\nSummary Statistics Table:")
print(summary_df.to_string(index=False))


In [None]:
# Optional: list the ten most frequent entity texts for each label type

rule = "=" * 60
print("\n" + rule)
print("FREQUENCY DISTRIBUTION: Top 10 Most Frequent Entities per Category")
print(rule)

for label_type in ['ADR', 'Drug', 'Disease', 'Symptom']:
    print(f"\n{label_type}:")
    print("-" * 60)

    top_entities = entity_occurrences[label_type].most_common(10)
    if not top_entities:
        print("  No entities found")
        continue

    # Entity text is cut/padded to 50 characters so the counts line up
    for rank, (entity, count) in enumerate(top_entities, start=1):
        print(f"  {rank:2d}. {entity[:50]:50s} (occurs {count:4d} times)")

print("\n" + rule)


In [None]:
# Print a ranked top-20 frequency table for every label type that has data
print("\nDetailed Frequency DataFrames:")

for label_type in ['ADR', 'Drug', 'Disease', 'Symptom']:
    label_counts = entity_occurrences[label_type]
    if not label_counts:
        continue  # no entities recorded for this label, so no table

    freq_df = pd.DataFrame(
        label_counts.most_common(20),
        columns=['Entity Text', 'Occurrences'],
    )
    # Put a 1-based rank in front of the text and count columns
    freq_df.insert(0, 'Rank', range(1, len(freq_df) + 1))

    print(f"\n{label_type} - Top 20 Entities:")
    print(freq_df.to_string(index=False))
    print()


In [None]:
# Show the first ten distinct entities (in sorted order) for each label type
divider = "=" * 60
print("\n" + divider)
print("SAMPLE ENTITIES: First 10 Distinct Entities per Category")
print(divider)

for label_type in ['ADR', 'Drug', 'Disease', 'Symptom']:
    label_entities = distinct_entities[label_type]
    entities_list = sorted(label_entities)[:10]
    # The slice already caps the list at ten items, so its length is the shown count
    shown = len(entities_list)
    print(f"\n{label_type} (showing {shown} of {len(label_entities)}):")
    for position, entity in enumerate(entities_list, start=1):
        print(f"  {position:2d}. {entity}")

print("\n" + divider)


In [None]:
# Sanity-check the collected data structures by printing a few samples
print("\nVerification of Data Structure:")
print("\n1. Dictionary of distinct entities per label type:")
print(f"   Type: {type(distinct_entities)}")
print(f"   Keys: {list(distinct_entities)}")
# Sets are unordered, so these three are an arbitrary sample
adr_sample = list(distinct_entities['ADR'])[:3]
print(f"   Sample - ADR entities (first 3): {adr_sample}")

print("\n2. Count of distinct entities per label type:")
for label_type, entity_set in distinct_entities.items():
    print(f"   {label_type}: {len(entity_set)} distinct entities")

print("\n3. Total occurrences per entity (sample for ADR):")
adr_counts = entity_occurrences['ADR']
if adr_counts:
    # First five entries in the counter's insertion order
    first_five = list(adr_counts.items())[:5]
    sample_counter = dict(first_five)
    for entity, count in sample_counter.items():
        print(f"   '{entity}': {count} occurrences")
