## Named Entity Recognition (NER) Comparison
#### Comparing SpaCy vs. Hugging Face for entity extraction

In [32]:
# Import libraries 
import os
from pathlib import Path
import spacy
from spacy import displacy
from transformers import pipeline
from itertools import groupby
import tensorflow as tf
print(tf.__version__)


2.19.0


In [25]:
# Verify data path exists
data_path = Path('../data/reviews.txt')
if not data_path.exists():
    raise FileNotFoundError(f"Could not find data file at {data_path}")

# Read the text file
with open(data_path, 'r', encoding='utf-8') as f:
    texts = [line.strip() for line in f.readlines() if line.strip()]

print(f"Loaded {len(texts)} text samples")
for i, text in enumerate(texts[:3], 1):
    print(f"{i}. {text}")


Loaded 3 text samples
1. ﻿"I recently purchased a MacBook from Apple, and the experience was great. The customer service at their New York store was excellent."
2. "The Tesla Model S is an amazing car. I bought it in San Francisco last year."
3. "Microsoft is doing great things with Azure and AI."


### **Option 1: SpaCy Implementation**

In [27]:
# Load SpaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")

def spacy_ner(text):
    """Extract entities using SpaCy"""
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Process all texts
spacy_results = []
for text in texts:
    entities = spacy_ner(text)
    spacy_results.append({
        'text': text,
        'entities': entities
    })

# Display sample results
print("\nSpaCy Results (Sample):")
for result in spacy_results[:2]:
    print(f"\nText: {result['text']}")
    for entity, label in result['entities']:
        print(f"  {entity} ({label})")

# Visualize
doc = nlp(texts[0])
displacy.render(doc, style="ent", jupyter=True)



SpaCy Results (Sample):

Text: ﻿"I recently purchased a MacBook from Apple, and the experience was great. The customer service at their New York store was excellent."
  ﻿"I (ORG)
  MacBook from Apple (ORG)
  New York (GPE)

Text: "The Tesla Model S is an amazing car. I bought it in San Francisco last year."
  The Tesla Model S (WORK_OF_ART)
  San Francisco (GPE)
  last year (DATE)


In [28]:
import spacy
from spacy import displacy
from pathlib import Path

# First ensure the model is installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model...")
    !python -m spacy download en_core_web_sm
    nlp = spacy.load("en_core_web_sm")

# Load your text file
data_path = Path('../data/reviews.txt')
with open(data_path, 'r', encoding='utf-8') as f:
    texts = [line.strip().replace('﻿"', '"').strip('"') for line in f if line.strip()]

def clean_ner_results(text, entities):
    """Clean up NER results by:
    1. Removing unwanted characters
    2. Merging broken entities
    3. Filtering unwanted labels"""
    cleaned = []
    for ent in entities:
        text = ent.text.strip('"').strip()
        label = ent.label_
        
        # Skip date entities if not needed
        if label in ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY']:
            continue
            
        cleaned.append((text, label))
    return cleaned

# Process texts with improved handling
spacy_results = []
for text in texts:
    doc = nlp(text)
    entities = clean_ner_results(doc, doc.ents)
    spacy_results.append({
        'text': text,
        'entities': entities
    })

# Display cleaner results
print("Improved SpaCy NER Results:\n")
for result in spacy_results:
    print(f"Original: {result['text']}")
    if result['entities']:
        print("Entities:")
        for entity, label in result['entities']:
            print(f"  {entity:<25} ({label})")
    else:
        print("  No relevant entities found")
    print("\n" + "-"*50 + "\n")

# Visualize the first result with better formatting
if len(spacy_results) > 0:
    first_text = spacy_results[0]['text']
    doc = nlp(first_text)
    displacy.render(doc, style="ent", jupyter=True, options={'colors': {'ORG': '#ff9999', 'GPE': '#99ff99'}})

Improved SpaCy NER Results:

Original: I recently purchased a MacBook from Apple, and the experience was great. The customer service at their New York store was excellent.
Entities:
  MacBook from Apple        (ORG)
  New York                  (GPE)

--------------------------------------------------

Original: The Tesla Model S is an amazing car. I bought it in San Francisco last year.
Entities:
  The Tesla Model S         (ORG)
  San Francisco             (GPE)

--------------------------------------------------

Original: Microsoft is doing great things with Azure and AI.
Entities:
  Microsoft                 (ORG)
  AI                        (GPE)

--------------------------------------------------



In [29]:
script_content = """
SpaCy NER Extraction Script
"""

def load_model():
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        import subprocess
        import sys
        subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
        return spacy.load("en_core_web_sm")

def extract_entities(text, nlp):
    """Extract and clean entities from text"""
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        if ent.label_ in ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY']:
            continue
        entities.append((ent.text.strip(), ent.label_))
    return entities

def process_file(input_path, output_path=None):
    nlp = load_model()
    
    with open(input_path, 'r', encoding='utf-8') as f:
        texts = [line.strip().replace('﻿"', '"').strip('"') for line in f if line.strip()]
    
    results = []
    for text in texts:
        entities = extract_entities(text, nlp)
        results.append({
            'text': text,
            'entities': entities
        })
        if output_path:
            with open(output_path, 'a', encoding='utf-8') as f_out:
                f_out.write(f"Text: {text}\\n")
                for entity, label in entities:
                    f_out.write(f"  {entity:<25} ({label})\\n")
                f_out.write("\\n")
    
    return results

if __name__ == "__main__":
    input_file = Path('../data/reviews.txt')
    output_file = Path('../data/ner_results.txt')
    
    print("Running NER extraction...")
    results = process_file(input_file, output_file)
    print(f"Processed {len(results)} texts")
    print(f"Results saved to {output_file}")


# Save to file
output_path = Path('../scripts/ner_extraction.py')
output_path.parent.mkdir(exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(script_content)

print(f"Script saved to {output_path}")

Running NER extraction...
Processed 3 texts
Results saved to ..\data\ner_results.txt
Script saved to ..\scripts\ner_extraction.py
