In [3]:
pip install spacy pandas seqeval tqdm


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=b6576914b7cdf1c58f6d5dcf8dd4d8a61cc44943e84f3c5e286144797e238816
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("cweyyy/conll03")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'conll03' dataset.
Path to dataset files: /kaggle/input/conll03


In [9]:
import kagglehub
path = kagglehub.dataset_download("cweyyy/conll03")
print(path)


Using Colab cache for faster access to the 'conll03' dataset.
/kaggle/input/conll03


In [10]:
!ls $path


eng.dev.tsv  eng.test.tsv  eng.train.tsv


In [11]:
train_path = f"{path}/eng.train.tsv"
dev_path   = f"{path}/eng.dev.tsv"
test_path  = f"{path}/eng.test.tsv"


In [12]:
# ====================================================
# Task 4: Named Entity Recognition (NER) from News Articles
# Dataset: CoNLL-2003 (Kaggle - cweyyy/conll03, TSV format)
# Approach: Rule-based + Model-based (spaCy)
# ====================================================

# Install dependencies
!pip install spacy pandas tqdm seqeval kagglehub

import random
from pathlib import Path
from collections import Counter
from tqdm import tqdm
import pandas as pd
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import minibatch, compounding

# ====================================================
# Step 1. Load dataset from KaggleHub
# ====================================================
import kagglehub

path = kagglehub.dataset_download("cweyyy/conll03")
print("Dataset path:", path)

# Your files are TSV:
train_path = Path(path) / "eng.train.tsv"
dev_path   = Path(path) / "eng.dev.tsv"
test_path  = Path(path) / "eng.test.tsv"

print("Train file:", train_path.exists())
print("Dev file:", dev_path.exists())
print("Test file:", test_path.exists())

# ====================================================
# Step 2. Helper functions (parser + converters)
# ====================================================

def parse_conll(filepath: Path):
    """Parse a CoNLL TSV file and return list of sentences (list of token columns)."""
    sentences = []
    with filepath.open('r', encoding='utf-8') as f:
        sentence = []
        for raw in f:
            line = raw.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if line.startswith('-DOCSTART-'):
                continue
            parts = line.split("\t")   # <-- tab separated
            if parts:
                sentence.append(parts)
        if sentence:
            sentences.append(sentence)
    return sentences

def tokens_to_text_and_offsets(tokens):
    """Join tokens into text and compute char offsets for each token."""
    text = ' '.join(tokens)
    offsets = []
    pos = 0
    for tok in tokens:
        start = pos
        end = start + len(tok)
        offsets.append((start, end))
        pos = end + 1
    return text, offsets

def bio_to_token_entities(tags):
    """Convert BIO tags to (start,end,label) entity spans (token indices)."""
    entities = []
    ent_label = None
    ent_start = None
    ent_end = None
    for i, tag in enumerate(tags):
        if tag == 'O':
            if ent_label is not None:
                entities.append((ent_start, ent_end, ent_label))
                ent_label = None
            continue
        if tag.startswith('B-'):
            if ent_label is not None:
                entities.append((ent_start, ent_end, ent_label))
            ent_label = tag[2:]
            ent_start = i
            ent_end = i + 1
        elif tag.startswith('I-') and ent_label is not None and tag[2:] == ent_label:
            ent_end = i + 1
        else:
            if ent_label is not None:
                entities.append((ent_start, ent_end, ent_label))
            ent_label = None
    if ent_label is not None:
        entities.append((ent_start, ent_end, ent_label))
    return entities

def token_entities_to_char(entities_tokenidx, tokens):
    """Convert token index entities into character span entities."""
    text, offsets = tokens_to_text_and_offsets(tokens)
    char_entities = []
    for s_idx, e_idx, lbl in entities_tokenidx:
        char_start = offsets[s_idx][0]
        char_end = offsets[e_idx - 1][1]
        char_entities.append((char_start, char_end, lbl))
    return text, char_entities

def conll_to_spacy_data(filepath: Path):
    """Convert a CoNLL TSV file to spaCy training data and parsed sentences."""
    parsed = parse_conll(filepath)
    data = []
    parsed_sentences = []
    for sent in tqdm(parsed, desc=f"Parsing {filepath.name}"):
        tokens = [cols[0] for cols in sent]
        tags = [cols[-1] for cols in sent]  # NER tags are last column
        token_entities = bio_to_token_entities(tags)
        text, char_entities = token_entities_to_char(token_entities, tokens)
        ann = {"entities": [(s, e, l) for s, e, l in char_entities]}
        data.append((text, ann))
        parsed_sentences.append((tokens, tags))
    return data, parsed_sentences

# ====================================================
# Step 3. Convert dataset
# ====================================================

train_data, train_parsed = conll_to_spacy_data(train_path)
dev_data, dev_parsed     = conll_to_spacy_data(dev_path)
test_data, test_parsed   = conll_to_spacy_data(test_path)

print("Train sentences:", len(train_data))
print("Dev sentences:", len(dev_data))
print("Test sentences:", len(test_data))

# Quick EDA
counts = Counter([ent[2] for _, ann in train_data for ent in ann['entities']])
print("Entity counts:", counts)

# ====================================================
# Step 4. Rule-based baseline
# ====================================================
import re
from spacy.language import Language
from spacy.pipeline import EntityRuler

nlp_rule = spacy.blank('en')
ruler = nlp_rule.add_pipe('entity_ruler')
ruler.add_patterns([
    {"label": "ORG", "pattern": "United Nations"},
    {"label": "MISC", "pattern": "COVID-19"},
])

@Language.component('regex_person')
def regex_person(doc):
    new_ents = list(doc.ents)
    for m in re.finditer(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\b", doc.text):
        span = doc.char_span(m.start(), m.end(), label='PER')
        if span is not None:
            new_ents.append(span)
    doc.ents = new_ents
    return doc

nlp_rule.add_pipe('regex_person', last=True)

print("\nRule-based sample:")
for t, _ in test_data[:3]:
    doc = nlp_rule(t)
    print(t[:80], "...")
    print([(ent.text, ent.label_) for ent in doc.ents])

# ====================================================
# Step 5. Train spaCy model
# ====================================================

def build_nlp(base_model=None):
    if base_model:
        print("Loading base model", base_model)
        nlp = spacy.load(base_model)
    else:
        nlp = spacy.blank("en")
    return nlp

def train_spacy_ner(nlp, train_data, n_iter=3, drop=0.3, dev_data=None):
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner')
    else:
        ner = nlp.get_pipe('ner')

    labels = set()
    for _, ann in train_data:
        for _, _, l in ann["entities"]:
            labels.add(l)
    for lab in labels:
        ner.add_label(lab)

    # Initialize with small examples
    init_examples = []
    for text, ann in train_data[:100]:
        doc = nlp.make_doc(text)
        init_examples.append(Example.from_dict(doc, ann))
    nlp.initialize(lambda: init_examples)

    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = []
            for text, ann in batch:
                doc = nlp.make_doc(text)
                examples.append(Example.from_dict(doc, ann))
            nlp.update(examples, drop=drop, losses=losses)
        print(f"Epoch {itn+1}/{n_iter} - Losses: {losses}")
    return nlp

# Train small model (keep epochs low on Colab CPU)
nlp_model = build_nlp()
nlp_model = train_spacy_ner(nlp_model, train_data[:2000], n_iter=3)

# ====================================================
# Step 6. Evaluate with seqeval
# ====================================================
from seqeval.metrics import classification_report

def predict_bio_for_tokens(nlp, tokens):
    text, offsets = tokens_to_text_and_offsets(tokens)
    doc = nlp(text)
    pred_spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    pred_bio = []
    for (t_start, t_end) in offsets:
        label = 'O'
        for ps, pe, pl in pred_spans:
            if t_start >= ps and t_end <= pe:
                label = 'B-' + pl if t_start == ps else 'I-' + pl
                break
        pred_bio.append(label)
    return pred_bio

def evaluate_with_seqeval(nlp, parsed_sentences):
    y_true, y_pred = [], []
    for tokens, gold_tags in tqdm(parsed_sentences, desc="Evaluating"):
        y_true.append(gold_tags)
        y_pred.append(predict_bio_for_tokens(nlp, tokens))
    print(classification_report(y_true, y_pred))

evaluate_with_seqeval(nlp_model, dev_parsed[:500])

# ====================================================
# Step 7. Visualize entities
# ====================================================
from spacy import displacy

print("\nVisualization sample:")
doc = nlp_model("Barack Obama visited Germany and met Angela Merkel at the United Nations.")
displacy.render(doc, style="ent", jupyter=True)

# ====================================================
# Step 8. Save model to Google Drive
# ====================================================
from google.colab import drive
drive.mount('/content/drive')

output_dir = "/content/drive/MyDrive/spacy_conll03_model"
nlp_model.to_disk(output_dir)
print("Model saved to:", output_dir)


Using Colab cache for faster access to the 'conll03' dataset.
Dataset path: /kaggle/input/conll03
Train file: True
Dev file: True
Test file: True


Parsing eng.train.tsv: 100%|██████████| 14041/14041 [00:00<00:00, 156469.29it/s]
Parsing eng.dev.tsv: 100%|██████████| 3250/3250 [00:00<00:00, 140844.44it/s]
Parsing eng.test.tsv: 100%|██████████| 3453/3453 [00:00<00:00, 15363.15it/s]


Train sentences: 14041
Dev sentences: 3250
Test sentences: 3453
Entity counts: Counter({'PER': 4284, 'ORG': 2485, 'LOC': 1041, 'MISC': 858})

Rule-based sample:
SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . ...
[]
Nadim Ladki ...
[('Nadim Ladki', 'PER')]
AL-AIN , United Arab Emirates 1996-12-06 ...
[('United Arab Emirates', 'PER')]
Epoch 1/3 - Losses: {'ner': np.float32(1968.1443)}
Epoch 2/3 - Losses: {'ner': np.float32(900.50055)}
Epoch 3/3 - Losses: {'ner': np.float32(743.0066)}


Evaluating: 100%|██████████| 500/500 [00:01<00:00, 387.58it/s]

              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00       268
        MISC       0.00      0.00      0.00        85
         ORG       0.00      0.00      0.00       195
         PER       0.00      0.00      0.00       332

   micro avg       0.00      0.00      0.00       880
   macro avg       0.00      0.00      0.00       880
weighted avg       0.00      0.00      0.00       880


Visualization sample:





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved to: /content/drive/MyDrive/spacy_conll03_model
