In [1]:
import torch
print(torch.cuda.is_available())

True


In [2]:
!pip install -U spacy-transformers

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from t

In [44]:
!pip install -q spacy kagglehub tqdm

In [45]:
!python -m spacy init config ner_config.cfg --lang en --pipeline ner --optimize accuracy --gpu --force

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: GPU
- Transformer: roberta-base
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
ner_config.cfg
You can now add your data and train your pipeline:
python -m spacy train ner_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [46]:
import spacy
from spacy.tokens import DocBin
import os
import random
from tqdm.notebook import tqdm
import pandas as pd
import kagglehub
import glob
import re

# Download the dataset from Kaggle

In [47]:
print("Downloading dataset from Kaggle...")
path = kagglehub.dataset_download("debasisdotcom/name-entity-recognition-ner-dataset")
print(f"Dataset downloaded to: {path}")

Downloading dataset from Kaggle...
Dataset downloaded to: /kaggle/input/name-entity-recognition-ner-dataset


# Set up directories for spaCy

In [48]:
os.makedirs("ner_data", exist_ok=True)
os.makedirs("ner_models", exist_ok=True)

# Function to process the dataset and convert to spaCy format

In [49]:
def prepare_training_data():
    print("Preparing training data...")
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    if not csv_files:
        csv_files = glob.glob(os.path.join(path, "**/*.csv"), recursive=True)
    if not csv_files:
        raise FileNotFoundError("No CSV files found in the dataset")

    print(f"Found CSV files: {csv_files}")
    df = pd.read_csv(csv_files[0], encoding='latin-1')
    print(f"Dataset shape: {df.shape}")
    print(f"Dataset columns: {df.columns}")
    print("\nFirst few rows of the dataset:")
    print(df.head())

    sentences = []
    current_sentence = []
    sentence_entities = []

    for _, group in df.groupby('Sentence #', sort=False):
        sentence_text = ' '.join(word for word in group['Word'] if isinstance(word, str))

        entities = []
        words = [w for w in group['Word'] if isinstance(w, str)]
        tags = group['Tag'].tolist()

        current_entity = None
        current_start = 0
        current_text = ""

        for word, tag in zip(words, tags):
            if tag.startswith('B-'):
                if current_entity:
                    entity_start = sentence_text.find(current_text, current_start)
                    if entity_start != -1:
                        entities.append((entity_start, entity_start + len(current_text), current_entity))

                current_entity = tag[2:]
                current_text = word
                current_start = 0 if not sentence_text else sentence_text.find(word, current_start)

            elif tag.startswith('I-') and current_entity:
                current_text += " " + word

            elif current_entity:
                entity_start = sentence_text.find(current_text, current_start)
                if entity_start != -1:
                    entities.append((entity_start, entity_start + len(current_text), current_entity))
                current_entity = None
                current_text = ""
                current_start = 0 if not sentence_text else sentence_text.find(word, current_start) + len(word)
            else:
                current_start = 0 if not sentence_text else sentence_text.find(word, current_start) + len(word)

        if current_entity:
            entity_start = sentence_text.find(current_text, current_start)
            if entity_start != -1:
                entities.append((entity_start, entity_start + len(current_text), current_entity))

        sentences.append((sentence_text, {"entities": entities}))

    data = [(text, annot) for text, annot in sentences if text.strip() and any(e[0] >= 0 for e in annot["entities"])]

    random.shuffle(data)
    split = int(len(data) * 0.8)
    train_data = data[:split]
    valid_data = data[split:]

    nlp = spacy.blank("en")

    train_db = DocBin()
    for text, annot in tqdm(train_data, desc="Processing training data"):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            try:
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    ents.append(span)
            except:
                continue
        doc.ents = ents
        train_db.add(doc)
    train_db.to_disk("./ner_data/train.spacy")

    valid_db = DocBin()
    for text, annot in tqdm(valid_data, desc="Processing validation data"):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            try:
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    ents.append(span)
            except:
                continue
        doc.ents = ents
        valid_db.add(doc)
    valid_db.to_disk("./ner_data/dev.spacy")

    entity_labels = set()
    for _, annot in data:
        for _, _, label in annot["entities"]:
            entity_labels.add(label)

    print(f"Created training data: {len(train_data)} examples")
    print(f"Created validation data: {len(valid_data)} examples")
    print(f"Entity labels found: {', '.join(sorted(entity_labels))}")

    return list(entity_labels)

# Create configuration file

In [51]:
def create_config():
    print("Creating configuration file...")
    !python -m spacy init config ner_config.cfg --lang en --pipeline ner --optimize efficiency --force
    print("Configuration file created: ner_config.cfg")


# Train the model

In [66]:
def train_model():
    print("Training model...")
    !python -m spacy train ner_config.cfg --output ner_models --paths.train ./ner_data/train.spacy --paths.dev ./ner_data/dev.spacy --gpu-id 0
    print("Model training complete")

# Test the model

In [53]:
def test_model():
    print("Testing model...")
    try:
        nlp = spacy.load("./ner_models/model-best")
        test_texts = [
            "Google is opening a new office in New York City with 50 employees",
            "Microsoft CEO Satya Nadella announced a partnership with OpenAI last year",
            "Amazon's headquarters in Seattle employs thousands of software engineers"
        ]
        for text in test_texts:
            doc = nlp(text)
            print(f"\nText: {text}")
            print("Entities:")
            for ent in doc.ents:
                print(f"  {ent.text} - {ent.label_}")
    except Exception as e:
        print(f"Error loading model: {e}")

# Function to visualize entities in a sentence

In [54]:
def visualize_entities(text):
    try:
        nlp = spacy.load("./ner_models/model-best")
        doc = nlp(text)
        from spacy import displacy
        displacy.render(doc, style="ent", jupyter=True)
    except Exception as e:
        print(f"Error: {e}")

# Run the complete pipeline

In [62]:
print("Starting NER model training pipeline...")
entity_labels = prepare_training_data()

Starting NER model training pipeline...
Preparing training data...
Found CSV files: ['/kaggle/input/name-entity-recognition-ner-dataset/NER dataset.csv']
Combined dataset shape: (1048575, 4)
Dataset columns: Index(['Sentence #', 'Word', 'POS', 'Tag'], dtype='object')

First few rows of the dataset:
    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


Processing training data:   0%|          | 0/10919 [00:00<?, ?it/s]

Processing validation data:   0%|          | 0/2730 [00:00<?, ?it/s]

Created training data: 10919 examples
Created validation data: 2730 examples
Entity labels found: art, eve, geo, gpe, nat, org, per, tim


In [64]:
create_config()

Creating configuration file...
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
ner_config.cfg
You can now add your data and train your pipeline:
python -m spacy train ner_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
Configuration file created: ner_config.cfg


In [73]:
train_model()

Training model...
[38;5;4mℹ Saving to output directory: ner_models[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     96.56   37.00   36.74   37.25    0.37
  1     200         43.60  15867.56   64.11   63.93   64.29    0.64
  4     400        237.65   9663.20   79.56   79.49   79.63    0.80
  7     600        363.89   8020.09   82.41   82.36   82.45    0.82
 11     800        441.73   8252.55   83.12   83.09   83.15    0.83
 15    1000        501.33   8768.53   83.54   83.52   83.55    0.84
 20    1200        584.95  10094.75   83.81   83.81   83.81    0.84
 27    1400        664.45  11559.88   83.81   83.81   83.81    0.84
 35    1600        750.40  13507.30   83.52   83.52   83.52    0.84
 45    1800        8

In [74]:
test_model()

Testing model...

Text: Google is opening a new office in New York City with 50 employees
Entities:
  Google - org
  is - per
  opening - per
  a - per
  new - tim
  office - org
  in - org
  New - geo
  York - per
  City - geo
  with 50 - org
  employees - gpe

Text: Microsoft CEO Satya Nadella announced a partnership with OpenAI last year
Entities:
  Microsoft - org
  CEO - geo
  Satya - geo
  Nadella - per
  a - gpe
  partnership - tim
  OpenAI - per
  last - per
  year - per

Text: Amazon's headquarters in Seattle employs thousands of software engineers
Entities:
  Amazon - per
  headquarters - tim
  in - org
  Seattle - per
  employs - geo
  thousands - org
  of - per
  software engineers - org


In [76]:
print("Example: visualize_entities('Apple is planning to open a new office in Chicago')")
visualize_entities('Apple is planning to open a new office in Chicago')

Example: visualize_entities('Apple is planning to open a new office in Chicago')
