In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re
from typing import List, Tuple, Dict, Any
import spacy
import spacy.cli
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
from tqdm import tqdm
import os
from spacy import displacy
import math
import json
import random
from collections import Counter
from spacy.util import minibatch, compounding
from spacy.training import Example
import csv
import sys
import subprocess
import importlib
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
print("Starting our Named Entity Recognition Journey!")

CONFIG = {
    'data_paths': {
        'train': 'Dataset/train.txt',
        'valid': 'Dataset/valid.txt', 
        'test': 'Dataset/test.txt'
    },
    'models_to_compare': ['en_core_web_sm', 'en_core_web_md'],
    'output_dir': 'ner_results'
}

Starting our Named Entity Recognition Journey!


In [3]:
class CoNLLDataLoader:
    
    def __init__(self):
        print("Initializing our data loader...")
        
    def load_conll_file(self, filepath: str) -> List[Dict]:
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Oops! Can't find the file: {filepath}")
            
        sentences = []
        current_tokens = []
        current_tags = []
        
        print(f"Reading {filepath}...")
        
        with open(filepath, 'r', encoding='utf-8') as file:
            for line_num, line in enumerate(file, 1):
                line = line.strip()
                
                if not line:
                    if current_tokens:
                        sentences.append({
                            'tokens': current_tokens.copy(),
                            'ner_tags': current_tags.copy(),
                            'sentence_id': len(sentences)
                        })
                        current_tokens.clear()
                        current_tags.clear()
                    continue
                
                if line.startswith('-DOCSTART-'):
                    continue
                    
                parts = line.split()
                if len(parts) >= 4:
                    token = parts[0]
                    ner_tag = parts[-1] 
                    
                    current_tokens.append(token)
                    current_tags.append(ner_tag)
        
        if current_tokens:
            sentences.append({
                'tokens': current_tokens.copy(),
                'ner_tags': current_tags.copy(),
                'sentence_id': len(sentences)
            })
        
        print(f"Successfully loaded {len(sentences)} sentences from {filepath}")
        return sentences

In [4]:
class NERDataProcessor:
    
    def __init__(self, spacy_model='en_core_web_sm'):
        print("Setting up data processor...")
        try:
            self.nlp = spacy.load(spacy_model)
            print(f"Loaded spaCy model: {spacy_model}")
        except IOError:
            print(f"Model {spacy_model} not found. Let me download it for you...")
            spacy.cli.download(spacy_model)  # Fixed: use spacy.cli directly
            self.nlp = spacy.load(spacy_model)
    
    def convert_iob_to_spans(self, tokens: List[str], tags: List[str]) -> Tuple[str, List[Tuple]]:
        text = ' '.join(tokens)
        
        char_positions = []
        current_pos = 0
        
        for token in tokens:
            start = current_pos
            end = start + len(token)
            char_positions.append((start, end))
            current_pos = end + 1  
        
        spans = []
        current_entity = None
        current_start = None
        current_label = None
        
        for i, tag in enumerate(tags):
            if tag == 'O': 
                if current_entity is not None:
                    end_pos = char_positions[i-1][1]
                    spans.append((current_start, end_pos, current_label))
                    current_entity = None
            else:
                tag_parts = tag.split('-', 1)
                prefix = tag_parts[0]
                label = tag_parts[1] if len(tag_parts) > 1 else 'MISC'
                
                if prefix == 'B': 
                    if current_entity is not None:
                        end_pos = char_positions[i-1][1]
                        spans.append((current_start, end_pos, current_label))
                    
                    current_entity = True
                    current_start = char_positions[i][0]
                    current_label = label
                    
                elif prefix == 'I': 
                    if current_entity is None:
                        current_entity = True
                        current_start = char_positions[i][0]
                        current_label = label
        
        if current_entity is not None:
            end_pos = char_positions[-1][1]
            spans.append((current_start, end_pos, current_label))
        
        return text, spans
    
    def process_sentences(self, sentences: List[Dict]) -> List[Dict]:
        print(f"Processing {len(sentences)} sentences...")
        
        processed = []
        for sentence in sentences:
            text, spans = self.convert_iob_to_spans(
                sentence['tokens'], 
                sentence['ner_tags']
            )
            
            processed.append({
                'text': text,
                'entities': spans,
                'original_tokens': sentence['tokens'],
                'original_tags': sentence['ner_tags'],
                'sentence_id': sentence['sentence_id']
            })
        
        print("Data processing complete!")
        return processed

In [5]:
class RuleBasedNER:
    
    def __init__(self, nlp_model):
        self.nlp = nlp_model
        self.patterns = []
        print("Rule-based NER system initialized!")
    
    def learn_patterns_from_data(self, training_data: List[Dict], min_frequency=2):
        print("Learning patterns from training data...")
        
        entity_counter = Counter()
        
        for example in training_data:
            text = example['text']
            for start, end, label in example['entities']:
                entity_text = text[start:end].strip()
                if entity_text:
                    entity_counter[(entity_text, label)] += 1
        
        patterns = []
        for (entity, label), count in entity_counter.most_common():
            if count >= min_frequency:
                patterns.append({
                    'label': label,
                    'pattern': entity,
                    'frequency': count
                })
        
        self.patterns = patterns[:3000]  
        print(f"Learned {len(self.patterns)} patterns from training data")
        
        if 'entity_ruler' in self.nlp.pipe_names:
            self.nlp.remove_pipe('entity_ruler')
        
        ruler = self.nlp.add_pipe('entity_ruler', before='ner')
        ruler.add_patterns(self.patterns)
        
        return self.patterns
    
    def predict(self, texts: List[str]) -> List[Dict]:
        predictions = []
        
        for text in texts:
            doc = self.nlp(text)
            entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
            predictions.append({
                'text': text,
                'entities': entities
            })
        
        return predictions

In [6]:
class ModelBasedNER:
    
    def __init__(self, base_model='en_core_web_sm'):
        print("Initializing model-based NER system...")
        try:
            self.nlp = spacy.load(base_model)
        except IOError:
            print(f"📥 Downloading {base_model}...")
            spacy.cli.download(base_model)
            self.nlp = spacy.load(base_model)
            
        print(f"Model-based NER ready with {base_model}")
    
    def prepare_training_data(self, processed_data: List[Dict]) -> List[Tuple]:
        training_data = []
        
        for example in processed_data:
            entities_dict = {"entities": example['entities']}
            training_data.append((example['text'], entities_dict))
        
        return training_data
    
    def train_model(self, training_data: List[Tuple], validation_data: List[Tuple] = None, iterations=10, early_stopping=True):
        print(f"Training model for {iterations} iterations...")
        
        ner = self.nlp.get_pipe('ner')
        for text, annotations in training_data:
            for start, end, label in annotations['entities']:
                ner.add_label(label)
        
        examples = []
        for text, annotations in training_data:
            doc = self.nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            examples.append(example)
        
        optimizer = self.nlp.resume_training()
        
        best_score = 0
        patience_counter = 0
        
        for iteration in range(iterations):
            print(f"Training iteration {iteration + 1}/{iterations}")
            
            random.shuffle(examples)
            losses = {}
            
            batches = minibatch(examples, size=compounding(4., 32., 1.001))
            for batch in batches:
                self.nlp.update(batch, drop=0.2, losses=losses, sgd=optimizer)
            
            print(f"   Loss: {losses.get('ner', 0):.4f}")
            
            if early_stopping and validation_data:
                val_score = self._evaluate_on_validation(validation_data)
                print(f"   Validation F1: {val_score:.4f}")
                
                if val_score > best_score:
                    best_score = val_score
                    patience_counter = 0
                    self._save_best_model()
                else:
                    patience_counter += 1
                    if patience_counter >= 3:
                        print("Early stopping triggered!")
                        self._load_best_model()
                        break
        
        print("Model training complete!")
        return self.nlp
    
    def _evaluate_on_validation(self, validation_data):
        correct = 0
        total = 0
        
        for text, annotations in validation_data[:100]: 
            doc = self.nlp(text)
            predicted_entities = set((ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
            true_entities = set(annotations['entities'])
            
            correct += len(predicted_entities & true_entities)
            total += len(true_entities) + len(predicted_entities - true_entities)
        
        return correct / total if total > 0 else 0
    
    def _save_best_model(self):
        self._best_model_bytes = self.nlp.to_bytes()
    
    def _load_best_model(self):
        if hasattr(self, '_best_model_bytes'):
            self.nlp.from_bytes(self._best_model_bytes)
    
    def predict(self, texts: List[str]) -> List[Dict]:
        predictions = []
        
        for text in texts:
            doc = self.nlp(text)
            entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
            predictions.append({
                'text': text,
                'entities': entities
            })
        
        return predictions

In [7]:
class NEREvaluator:
    
    def __init__(self):
        print("NER Evaluator initialized!")
        
        self.label_mapping = {
            'PERSON': 'PER', 'PER': 'PER',
            'ORG': 'ORG', 'ORGANIZATION': 'ORG',
            'GPE': 'LOC', 'LOC': 'LOC', 'LOCATION': 'LOC', 'FAC': 'LOC',
            'NORP': 'MISC', 'PRODUCT': 'MISC', 'EVENT': 'MISC',
            'WORK_OF_ART': 'MISC', 'LAW': 'MISC', 'LANGUAGE': 'MISC',
            'DATE': 'MISC', 'TIME': 'MISC', 'PERCENT': 'MISC',
            'MONEY': 'MISC', 'QUANTITY': 'MISC', 'ORDINAL': 'MISC',
            'CARDINAL': 'MISC'
        }
    
    def normalize_predictions(self, predictions: List[Dict]) -> List[Dict]:
        normalized = []
        
        for pred in predictions:
            normalized_entities = []
            for start, end, label in pred['entities']:
                normalized_label = self.label_mapping.get(label, 'MISC')
                normalized_entities.append((start, end, normalized_label))
            
            normalized.append({
                'text': pred['text'],
                'entities': normalized_entities
            })
        
        return normalized
    
    def calculate_metrics(self, true_data: List[Dict], predicted_data: List[Dict]) -> Dict:
        true_positive = 0
        false_positive = 0
        false_negative = 0
        
        for true_example, pred_example in zip(true_data, predicted_data):
            true_entities = set(true_example['entities'])
            pred_entities = set(pred_example['entities'])
            
            true_positive += len(true_entities & pred_entities)
            false_positive += len(pred_entities - true_entities)
            false_negative += len(true_entities - pred_entities)
        
        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        return {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'true_positive': true_positive,
            'false_positive': false_positive,
            'false_negative': false_negative
        }
    
    def detailed_analysis(self, true_data: List[Dict], predicted_data: List[Dict]) -> Dict:
        all_labels = set()
        for example in true_data:
            for _, _, label in example['entities']:
                all_labels.add(label)
        
        per_label_metrics = {}
        
        for label in all_labels:
            label_true = []
            label_pred = []
            
            for true_ex, pred_ex in zip(true_data, predicted_data):
                label_true.append({
                    'text': true_ex['text'],
                    'entities': [(s, e, l) for s, e, l in true_ex['entities'] if l == label]
                })
                label_pred.append({
                    'text': pred_ex['text'],
                    'entities': [(s, e, l) for s, e, l in pred_ex['entities'] if l == label]
                })
            
            per_label_metrics[label] = self.calculate_metrics(label_true, label_pred)
        
        overall_metrics = self.calculate_metrics(true_data, predicted_data)
        
        return {
            'overall': overall_metrics,
            'per_label': per_label_metrics
        }
    
    def create_confusion_matrix(self, true_data: List[Dict], predicted_data: List[Dict]) -> None:
        print("Creating Confusion Matrix...")
        
        all_labels = set()
        y_true = []
        y_pred = []
        
        for true_example, pred_example in zip(true_data, predicted_data):
            true_spans = {(s, e): l for s, e, l in true_example['entities']}
            pred_spans = {(s, e): l for s, e, l in pred_example['entities']}
            
            all_spans = set(list(true_spans.keys()) + list(pred_spans.keys()))
            
            for span in all_spans:
                true_label = true_spans.get(span, 'O')  
                pred_label = pred_spans.get(span, 'O')
                
                if true_label != 'O':  
                    y_true.append(true_label)
                    y_pred.append(pred_label)
                    all_labels.add(true_label)
                    all_labels.add(pred_label)
        
        labels = sorted(list(all_labels - {'O'}))
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=labels, yticklabels=labels)
        plt.title('NER Confusion Matrix')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        
        confusion_file = os.path.join(CONFIG['output_dir'], 'confusion_matrix.png')
        plt.savefig(confusion_file, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"Confusion matrix saved to: {confusion_file}")
        
        return cm, labels

In [None]:
class NERVisualization:
    
    def __init__(self, nlp_model):
        self.nlp = nlp_model
        print("Visualization system ready!")
    
    def show_predictions(self, text: str, show_in_jupyter=True, save_html=True):
        doc = self.nlp(text)
        
        if show_in_jupyter:
            try:
                displacy.render(doc, style='ent', jupyter=True)
            except:
                self._print_entities(doc)
        else:
            self._print_entities(doc)
        
        if save_html:
            try:
                html = displacy.render(doc, style='ent', page=True)
                if html is not None:
                    filename = f"ner_visualization_{hash(text) % 10000}.html"
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(html)
                    print(f"Visualization saved to: {filename}")
                else:
                    html_content = self._create_simple_html_visualization(doc, text)
                    filename = f"ner_visualization_{hash(text) % 10000}.html"
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(html_content)
                    print(f"Simple visualization saved to: {filename}")
            except Exception as e:
                print(f"Warning: Could not create HTML visualization: {e}")
                print("Displaying text-based visualization instead:")
                self._print_entities(doc)
    
    def _print_entities(self, doc):
        print(f"Text: {doc.text}")
        print("\nFound entities:")
        
        for ent in doc.ents:
            print(f"'{ent.text}' -> {ent.label_} (confidence: {ent._.score if hasattr(ent._, 'score') else 'N/A'})")
        
        if not doc.ents:
            print("No entities found.")
    
    def _create_simple_html_visualization(self, doc, original_text):
        """Create a simple HTML visualization when displacy fails"""
        html_template = """
        <!DOCTYPE html>
        <html>
        <head>
            <title>NER Visualization</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; }}
                .entity {{ padding: 2px 4px; margin: 2px; border-radius: 3px; }}
                .PER {{ background-color: #ffeaa7; }}
                .ORG {{ background-color: #74b9ff; color: white; }}
                .LOC {{ background-color: #fd79a8; color: white; }}
                .MISC {{ background-color: #a29bfe; color: white; }}
                .entity-label {{ font-size: 0.8em; font-weight: bold; }}
                .text-container {{ line-height: 1.6; font-size: 18px; }}
            </style>
        </head>
        <body>
            <h2>Named Entity Recognition Results</h2>
            <div class="text-container">
                {highlighted_text}
            </div>
            <h3>Entities Found:</h3>
            <ul>
                {entity_list}
            </ul>
        </body>
        </html>
        """
        
        highlighted_text = original_text
        entity_list = []
        
        entities = sorted(doc.ents, key=lambda x: x.start_char, reverse=True)
        
        for ent in entities:
            entity_html = f'<span class="entity {ent.label_}">{ent.text} <span class="entity-label">{ent.label_}</span></span>'
            highlighted_text = highlighted_text[:ent.start_char] + entity_html + highlighted_text[ent.end_char:]
            
            entity_list.append(f"<li><strong>{ent.text}</strong> - {ent.label_}</li>")
        
        entity_list_html = "\n".join(entity_list) if entity_list else "<li>No entities found</li>"
        
        return html_template.format(
            highlighted_text=highlighted_text,
            entity_list=entity_list_html
        )

In [9]:
def compare_spacy_models(test_data, models_to_compare=['en_core_web_sm', 'en_core_web_md']):
    print(f"Comparing {len(models_to_compare)} spaCy models...")
    
    results = {}
    loaded_models = {}
    
    for model_name in models_to_compare:
        try:
            loaded_models[model_name] = spacy.load(model_name)
            print(f"Loaded {model_name}")
        except OSError:
            print(f"{model_name} not available. Install with: python -m spacy download {model_name}")
            continue
    
    for model_name, nlp in loaded_models.items():
        print(f"\nTesting {model_name}...")
        
        predictions = []
        for example in test_data:
            doc = nlp(example['text'])
            entities = []
            for ent in doc.ents:
                normalized_label = _normalize_spacy_label(ent.label_)
                entities.append((ent.start_char, ent.end_char, normalized_label))
            predictions.append({'text': example['text'], 'entities': entities})
        
        evaluator = NEREvaluator()
        metrics = evaluator.calculate_metrics(test_data, predictions)
        results[model_name] = metrics
        
        print(f"   Precision: {metrics['precision']:.4f}")
        print(f"   Recall:    {metrics['recall']:.4f}")
        print(f"   F1 Score:  {metrics['f1_score']:.4f}")
    
    if len(results) > 1:
        _visualize_model_comparison(results)
    
    return results

In [10]:
def _normalize_spacy_label(spacy_label):
    mapping = {
        'PERSON': 'PER', 'PER': 'PER',
        'ORG': 'ORG', 'ORGANIZATION': 'ORG',
        'GPE': 'LOC', 'LOC': 'LOC', 'LOCATION': 'LOC', 'FAC': 'LOC',
        'NORP': 'MISC', 'PRODUCT': 'MISC', 'EVENT': 'MISC',
        'WORK_OF_ART': 'MISC', 'LAW': 'MISC', 'LANGUAGE': 'MISC',
        'DATE': 'MISC', 'TIME': 'MISC', 'PERCENT': 'MISC',
        'MONEY': 'MISC', 'QUANTITY': 'MISC', 'ORDINAL': 'MISC',
        'CARDINAL': 'MISC'
    }
    return mapping.get(spacy_label, 'MISC')

In [11]:
def _visualize_model_comparison(results):
    print("Creating model comparison visualization...")
    
    models = list(results.keys())
    metrics = ['precision', 'recall', 'f1_score']
    
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    
    f1_scores = [results[model]['f1_score'] for model in models]
    bars = axes[0].bar(models, f1_scores, color=['skyblue', 'lightcoral', 'lightgreen'][:len(models)])
    axes[0].set_title('F1 Score Comparison')
    axes[0].set_ylabel('F1 Score')
    axes[0].set_ylim(0, 1)
    
    for bar, score in zip(bars, f1_scores):
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                f'{score:.3f}', ha='center', va='bottom')
    
    x = np.arange(len(models))
    width = 0.25
    
    for i, metric in enumerate(metrics):
        values = [results[model][metric] for model in models]
        axes[1].bar(x + i*width, values, width, label=metric.capitalize(), alpha=0.8)
    
    axes[1].set_title('All Metrics Comparison')
    axes[1].set_ylabel('Score')
    axes[1].set_xlabel('Models')
    axes[1].set_xticks(x + width)
    axes[1].set_xticklabels(models)
    axes[1].legend()
    axes[1].set_ylim(0, 1)
    
    plt.tight_layout()
    
    comparison_file = os.path.join(CONFIG['output_dir'], 'model_comparison.png')
    plt.savefig(comparison_file, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Model comparison saved to: {comparison_file}")

In [12]:
def main():
    print("Starting the Complete NER Pipeline!")
    
    print("\nSTEP 1: Loading Data")
    
    data_loader = CoNLLDataLoader()
    
    try:
        train_sentences = data_loader.load_conll_file(CONFIG['data_paths']['train'])
        valid_sentences = data_loader.load_conll_file(CONFIG['data_paths']['valid'])
        test_sentences = data_loader.load_conll_file(CONFIG['data_paths']['test'])
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Please make sure your data files are in the correct location!")
        return
    
    print(f"Dataset Summary:")
    print(f"   Training sentences: {len(train_sentences)}")
    print(f"   Validation sentences: {len(valid_sentences)}")
    print(f"   Test sentences: {len(test_sentences)}")
    
    print("\nSTEP 2: Processing Data")
    
    processor = NERDataProcessor()
    
    train_data = processor.process_sentences(train_sentences)
    valid_data = processor.process_sentences(valid_sentences)
    test_data = processor.process_sentences(test_sentences)
    
    print("Data processing complete!")
    
    print("\nSTEP 3: Rule-Based NER")
    
    rule_based_ner = RuleBasedNER(spacy.load('en_core_web_sm'))
    patterns = rule_based_ner.learn_patterns_from_data(train_data, min_frequency=3)
    
    test_texts = [example['text'] for example in test_data]
    rule_predictions = rule_based_ner.predict(test_texts)
    
    print("Rule-based predictions complete!")
    
    print("\nSTEP 4: Model-Based NER")
    
    model_based_ner = ModelBasedNER()
    
    train_subset = train_data[:5000]  
    valid_subset = valid_data[:1000]
    
    training_data = model_based_ner.prepare_training_data(train_subset)
    validation_data = model_based_ner.prepare_training_data(valid_subset)
    
    trained_model = model_based_ner.train_model(
        training_data, 
        validation_data, 
        iterations=8,
        early_stopping=True
    )
    
    model_predictions = model_based_ner.predict(test_texts)
    
    print("Model-based predictions complete!")
    
    print("\nSTEP 5: Evaluation and Comparison")
    
    evaluator = NEREvaluator()
    
    rule_predictions_norm = evaluator.normalize_predictions(rule_predictions)
    model_predictions_norm = evaluator.normalize_predictions(model_predictions)
    
    rule_metrics = evaluator.detailed_analysis(test_data, rule_predictions_norm)
    model_metrics = evaluator.detailed_analysis(test_data, model_predictions_norm)
    
    print("RESULTS COMPARISON:")
    
    print(f"\nRule-Based NER:")
    print(f"   Precision: {rule_metrics['overall']['precision']:.4f}")
    print(f"   Recall:    {rule_metrics['overall']['recall']:.4f}")
    print(f"   F1 Score:  {rule_metrics['overall']['f1_score']:.4f}")
    
    print(f"\nModel-Based NER:")
    print(f"   Precision: {model_metrics['overall']['precision']:.4f}")
    print(f"   Recall:    {model_metrics['overall']['recall']:.4f}")
    print(f"   F1 Score:  {model_metrics['overall']['f1_score']:.4f}")
    
    print(f"\nPer-Label Performance (Model-Based):")
    for label, metrics in model_metrics['per_label'].items():
        print(f"{label:4s}: P={metrics['precision']:.3f} R={metrics['recall']:.3f} F1={metrics['f1_score']:.3f}")
    
    print("\nSTEP 6: Enhanced Visualization Demo")
    
    visualizer = NERVisualization(trained_model)
    
    sample_texts = [
        "Apple Inc. is planning to open a new store in New York City.",
        "Barack Obama was the 44th President of the United States.",
        "The European Union announced new regulations yesterday."
    ]
    
    print("Sample predictions with HTML visualization:")
    for i, text in enumerate(sample_texts, 1):
        print(f"\nExample {i}:")
        doc = trained_model(text)
        print(f"Text: {text}")
        entities_found = [(ent.text, ent.label_) for ent in doc.ents]
        print(f"Entities: {entities_found}")
        
        visualizer.show_predictions(text, show_in_jupyter=False, save_html=True)
    
    print("\nBONUS: Comparing Different spaCy Models")
    comparison_results = compare_spacy_models(test_data[:1000])  
    
    print(f"\nSTEP 7: Saving Results and Final Summary")
    
    os.makedirs(CONFIG['output_dir'], exist_ok=True)
    model_path = os.path.join(CONFIG['output_dir'], 'trained_ner_model')
    trained_model.to_disk(model_path)
    
    results_path = os.path.join(CONFIG['output_dir'], 'evaluation_results.json')
    complete_results = {
        'rule_based_metrics': rule_metrics,
        'model_based_metrics': model_metrics,
        'patterns_learned': len(patterns),
        'model_comparison': comparison_results if comparison_results else {},  
        'training_summary': {
            'training_sentences': len(train_data),  
            'validation_sentences': len(valid_data), 
            'test_sentences': len(test_data),
            'early_stopping_triggered': True,
            'final_f1_score': model_metrics['overall']['f1_score']
        }
    }
    
    with open(results_path, 'w') as f:
        json.dump(complete_results, f, indent=2)
    
    print(f"Model saved to: {model_path}")
    print(f"Results saved to: {results_path}")
    
    print("\nFINAL SUMMARY:")
    print(f" Best F1 Score Achieved: {model_metrics['overall']['f1_score']:.4f}")
    print(f" Rule-based F1 Score: {rule_metrics['overall']['f1_score']:.4f}")
    print(f" Model-based F1 Score: {model_metrics['overall']['f1_score']:.4f}")
    print(f"  Entity Types Recognized: {len(model_metrics['per_label'])} (PER, ORG, LOC, MISC)")
    print(f" Patterns Learned: {len(patterns)}")
    
    print("\nFiles Generated:")
    print("• trained_ner_model/ - Your custom NER model")
    print("• evaluation_results.json - Complete performance metrics") 
    print("• confusion_matrix.png - Entity prediction confusion matrix")
    print("• model_comparison.png - spaCy models comparison")
    print("• ner_visualization_*.html - Interactive entity visualizations")
    
    print("\nComplete NER Pipeline Finished Successfully!")
    print("Your NER system is ready! You can now:")
    print("• Use the trained model for new predictions")
    print("• Compare rule-based vs model-based approaches") 
    print("• Visualize entity extraction results with displaCy")
    print("• Analyze performance with confusion matrices")
    print("• Compare different spaCy model performances")
    print("• Fine-tune parameters for even better performance")

In [13]:
if __name__ == "__main__":
    main()

Starting the Complete NER Pipeline!

STEP 1: Loading Data
Initializing our data loader...
Reading Dataset/train.txt...
Successfully loaded 14041 sentences from Dataset/train.txt
Reading Dataset/valid.txt...
Successfully loaded 3250 sentences from Dataset/valid.txt
Reading Dataset/test.txt...
Successfully loaded 3453 sentences from Dataset/test.txt
Dataset Summary:
   Training sentences: 14041
   Validation sentences: 3250
   Test sentences: 3453

STEP 2: Processing Data
Setting up data processor...
Loaded spaCy model: en_core_web_sm
Processing 14041 sentences...
Data processing complete!
Processing 3250 sentences...
Data processing complete!
Processing 3453 sentences...
Data processing complete!
Data processing complete!

STEP 3: Rule-Based NER
Rule-based NER system initialized!
Learning patterns from training data...
Learned 1986 patterns from training data
Rule-based predictions complete!

STEP 4: Model-Based NER
Initializing model-based NER system...
Model-based NER ready with en_co

Simple visualization saved to: ner_visualization_5935.html

Example 2:
Text: Barack Obama was the 44th President of the United States.
Entities: [('Barack Obama', 'PER'), ('United States', 'LOC')]
Text: Barack Obama was the 44th President of the United States.

Found entities:
'Barack Obama' -> PER (confidence: N/A)
'United States' -> LOC (confidence: N/A)


Simple visualization saved to: ner_visualization_5548.html

Example 3:
Text: The European Union announced new regulations yesterday.
Entities: [('European Union', 'ORG')]
Text: The European Union announced new regulations yesterday.

Found entities:
'European Union' -> ORG (confidence: N/A)


Simple visualization saved to: ner_visualization_6083.html

BONUS: Comparing Different spaCy Models
Comparing 2 spaCy models...
Loaded en_core_web_sm
en_core_web_md not available. Install with: python -m spacy download en_core_web_md

Testing en_core_web_sm...
NER Evaluator initialized!
   Precision: 0.3536
   Recall:    0.4973
   F1 Score:  0.4133

STEP 7: Saving Results and Final Summary
Model saved to: ner_results\trained_ner_model
Results saved to: ner_results\evaluation_results.json

FINAL SUMMARY:
 Best F1 Score Achieved: 0.8035
 Rule-based F1 Score: 0.4847
 Model-based F1 Score: 0.8035
  Entity Types Recognized: 4 (PER, ORG, LOC, MISC)
 Patterns Learned: 1986

Files Generated:
• trained_ner_model/ - Your custom NER model
• evaluation_results.json - Complete performance metrics
• confusion_matrix.png - Entity prediction confusion matrix
• model_comparison.png - spaCy models comparison
• ner_visualization_*.html - Interactive entity visualizations

Complete NER Pipeline Finished S