# **NER for News Articles**

In [11]:
import os
import shutil

# Location of any checkout left behind by a previous run of this cell.
repo_path = "/content/NER-based-news-article"

# Step out to /content first so the kernel is not sitting inside the
# directory that is about to be removed.
os.chdir("/content")

stale_checkout_found = os.path.exists(repo_path)
if stale_checkout_found:
    shutil.rmtree(repo_path)
    print(f"Deleted existing folder: {repo_path}")

from getpass import getpass

# GitHub coordinates of the repository to clone. The token is read
# interactively so it is never stored in the notebook source.
username = "mahmedddd"
repo_name = "NER-based-news-article"
token = getpass("Enter your GitHub token: ")

# The token is embedded directly in the HTTPS clone URL.
# NOTE(review): git normally records the clone URL as the `origin` remote, so
# the token presumably ends up in the checkout's .git/config — confirm, and
# consider a credential helper or resetting the remote URL after cloning.
repo_url = f"https://{token}@github.com/{username}/{repo_name}.git"

# Clone a fresh copy; IPython substitutes the Python variable for $repo_url.
!git clone $repo_url


# Path of the fresh checkout; make sure it has a data/ directory.
repo_path = f"/content/{repo_name}"
os.makedirs(f"{repo_path}/data", exist_ok=True)

# Copy (not move) the CoNLL003 archive into the repo's data directory
!cp "/content/CoNLL003.zip" "{repo_path}/data/CoNLL003.zip"

# No .gitignore entry is written for the dataset: the archive is meant to be
# committed together with the README (an earlier, now-removed snippet ignored
# an unrelated data/IMDB_Dataset.csv file).

# textwrap is imported here because this cell calls textwrap.dedent; without
# the import the cell only works if the module was loaded earlier in the
# kernel session and fails with NameError after a restart.
import textwrap

# README body. The text is indented for readability inside the cell and
# dedent() strips the common leading whitespace before it is written out.
readme_content = textwrap.dedent("""

    # NER for News Articles

    This repository demonstrates Named Entity Recognition (NER) techniques on news articles using spaCy. It includes loading CoNLL formatted datasets, extracting named entities with pretrained and custom spaCy pipelines, and comparing different spaCy models.

    ## Requirements

    **install individually:**

    ```bash
    pip install pandas matplotlib seaborn spacy
    ```

    ## Libraries Used

    - **pandas** – For data loading and manipulation
    - **matplotlib** – For plotting graphs
    - **seaborn** – For advanced visualizations
    - **spaCy** – For NLP tasks (tokenization, lemmatization, stopword removal)
""")

# The README contains non-ASCII characters (en dashes), so the encoding is
# pinned rather than left to the platform default.
with open(f"{repo_path}/README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

# Add, commit, and push repo changes.
# %cd changes the kernel's working directory, so every later cell that uses a
# relative path runs from inside the checkout.
%cd {repo_path}
# Commit identity. --global writes to the user-level git config of this
# environment, not just this repository.
!git config --global user.email "ahmedunited902@gmail.com"
!git config --global user.name "mahmedddd"

# Stage everything in the checkout (README and data/), commit, and push to the
# main branch of the origin remote.
!git add .
!git commit -m "Initial commit with README and dataset included"
!git push origin main


Deleted existing folder: /content/NER-based-news-article
Enter your GitHub token: ··········
Cloning into 'NER-based-news-article'...
/content/NER-based-news-article
[main (root-commit) e51d68c] Initial commit with README and dataset included
 2 files changed, 20 insertions(+)
 create mode 100644 README.md
 create mode 100644 data/CoNLL003.zip
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 960.81 KiB | 18.84 MiB/s, done.
Total 5 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/mahmedddd/NER-based-news-article.git
 * [new branch]      main -> main


In [16]:
# Diagnostic: show the first 500 characters of the downloaded file to see what
# was actually fetched (for example an HTML page instead of an archive).
# errors="replace" keeps the peek working when the file really is a binary
# zip; strict UTF-8 decoding would raise UnicodeDecodeError on its contents
# instead of showing anything.
with open("CoNLL003.zip", "r", encoding="utf-8", errors="replace") as f:
    content = f.read(500)
print(content)








<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"
  data-a11y-animated-images="system" data-a11y-link-underlines="true"
  
  >




  <head>
    <meta charset="utf-8">
  <link rel="dns-prefetch" href="https://github.githubassets.com">
  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">
  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">
  <link rel="dns-prefetch" href="https://user-images.g


### Download and Extract CoNLL003 Dataset from GitHub

In [22]:
!wget https://raw.githubusercontent.com/mahmedddd/NER-based-news-article/main/data/CoNLL003.zip

import zipfile

with zipfile.ZipFile("CoNLL003.zip.7", "r") as zip_ref:
    zip_ref.extractall("CoNLL003")


import os
print(os.listdir("CoNLL003"))


--2025-08-09 09:01:14--  https://raw.githubusercontent.com/mahmedddd/NER-based-news-article/main/data/CoNLL003.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 982975 (960K) [application/zip]
Saving to: ‘CoNLL003.zip.8’


2025-08-09 09:01:14 (35.5 MB/s) - ‘CoNLL003.zip.8’ saved [982975/982975]

['test.txt', 'train.txt', 'metadata', 'valid.txt']


### Load CoNLL File Format into Python

In [None]:
def load_conll_file(filepath, skip_docstart=False):
    """Read a CoNLL-style token file into a list of tagged sentences.

    Each non-blank line is split on whitespace. Only lines with exactly four
    fields are kept: the first field is taken as the word and the last as its
    NER tag. Blank lines separate sentences.

    Args:
        filepath: Path of the UTF-8 encoded file to read.
        skip_docstart: When True, lines whose first field is ``-DOCSTART-``
            are ignored so document markers do not appear as one-token
            sentences. Defaults to False, which keeps them.

    Returns:
        A list of sentences, each a list of ``(word, ner_tag)`` tuples.
        Sentences that end up with no tokens are omitted.
    """
    data = []
    sentence = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.strip().split()
            if not tokens:
                # Blank line: close the sentence collected so far, if any.
                if sentence:
                    data.append(sentence)
                    sentence = []
                continue
            if len(tokens) != 4:
                # Lines without exactly four fields are skipped.
                continue
            if skip_docstart and tokens[0] == "-DOCSTART-":
                continue
            sentence.append((tokens[0], tokens[-1]))  # word, ner_tag
    # Flush the final sentence: a file that does not end with a blank line
    # would otherwise silently lose its last sentence.
    if sentence:
        data.append(sentence)
    return data

# Parse the training split and report how much was read.
train_path = "CoNLL003/train.txt"
train_data = load_conll_file(train_path)

sentence_total = len(train_data)
print("Total sentences:", sentence_total)
first_sentence = train_data[0]
print("Sample sentence:", first_sentence)


Total sentences: 14987
Sample sentence: [('-DOCSTART-', 'O')]


In [None]:
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
from spacy import displacy
import os

# Load spaCy's small English pipeline (en_core_web_sm). `nlp` is the shared
# pipeline object that the extraction and EntityRuler cells below operate on.
nlp = spacy.load("en_core_web_sm")


In [None]:
def load_conll_file(filepath, skip_docstart=False):
    """Read a CoNLL-style token file into a list of tagged sentences.

    Each non-blank line is split on whitespace. Only lines with exactly four
    fields are kept: the first field is taken as the word and the last as its
    NER tag. Blank lines separate sentences.

    Args:
        filepath: Path of the UTF-8 encoded file to read.
        skip_docstart: When True, lines whose first field is ``-DOCSTART-``
            are ignored so document markers do not appear as one-token
            sentences. Defaults to False, which keeps them.

    Returns:
        A list of sentences, each a list of ``(word, ner_tag)`` tuples.
        Sentences that end up with no tokens are omitted.
    """
    data = []
    sentence = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.strip().split()
            if not tokens:
                # Blank line: close the sentence collected so far, if any.
                if sentence:
                    data.append(sentence)
                    sentence = []
                continue
            if len(tokens) != 4:
                # Lines without exactly four fields are skipped.
                continue
            if skip_docstart and tokens[0] == "-DOCSTART-":
                continue
            sentence.append((tokens[0], tokens[-1]))  # word, ner_tag
    # Flush the final sentence: a file that does not end with a blank line
    # would otherwise silently lose its last sentence.
    if sentence:
        data.append(sentence)
    return data
# Parse the training split and report the number of sentences read.
train_file = "CoNLL003/train.txt"
train_data = load_conll_file(train_file)

print(f"Train: {len(train_data)}")


Train: 14987


In [None]:
def extract_all_ner(nlp_model, dataset, source="unknown"):
    """Run an NLP pipeline over tagged sentences and tabulate its entities.

    Each sentence is rebuilt as plain text by joining its words with single
    spaces, passed to ``nlp_model``, and every entity in the resulting
    ``doc.ents`` becomes one row of the output.

    Args:
        nlp_model: Callable taking a text string and returning an object
            with an ``ents`` iterable whose items expose ``text`` and
            ``label_``.
        dataset: Iterable of sentences, each an iterable of
            ``(word, tag)`` pairs; the tags are not used.
        source: Value stored in the ``Source`` column of every row.

    Returns:
        A pandas DataFrame with the columns ``Entity``, ``Label``,
        ``Sentence`` and ``Source``. The columns are present even when no
        entity was found, so downstream ``groupby("Label")`` calls still work
        on an empty result.
    """
    columns = ["Entity", "Label", "Sentence", "Source"]
    all_ents = []

    for sent in dataset:
        text = " ".join(word for word, _tag in sent)
        doc = nlp_model(text)
        for ent in doc.ents:
            all_ents.append({
                "Entity": ent.text,
                "Label": ent.label_,
                "Sentence": text,
                "Source": source
            })
    # Passing the column list explicitly keeps the schema stable for an empty
    # record list, which would otherwise produce a frame with no columns.
    return pd.DataFrame(all_ents, columns=columns)


### Extracting Entities Using spaCy's Pretrained Model

In [None]:
# Run the pretrained pipeline over the training sentences and gather the
# per-split frames (currently only "train") into one table.
df_train_ents = extract_all_ner(nlp, train_data, "train")
frames_by_split = [df_train_ents]
df_all = pd.concat(frames_by_split, ignore_index=True)
print("Total Entities Extracted:", len(df_all))
df_all.head()


Total Entities Extracted: 35225


Unnamed: 0,Entity,Label,Sentence,Source
0,EU,ORG,EU rejects German call to boycott British lamb .,train
1,German,NORP,EU rejects German call to boycott British lamb .,train
2,British,NORP,EU rejects German call to boycott British lamb .,train
3,Peter Blackburn,PERSON,Peter Blackburn,train
4,BRUSSELS,GPE,BRUSSELS 1996-08-22,train


### List Top Entities by NER Label

In [None]:
# Count how often each entity string occurs within its label, then keep the
# first five rows of every label group.
per_label_counts = df_all.groupby("Label")["Entity"].value_counts()
grouped = per_label_counts.groupby(level=0).head(5)
print("Top 5 Entities per Label:\n")
print(grouped)


Top 5 Entities per Label:

Label        Entity                     
CARDINAL     1                              342
             6                              318
             two                            275
             2                              273
             3                              258
                                           ... 
WORK_OF_ART  Wimbledon                        5
             the Nobel Peace Prize            5
             the Tour of the Netherlands      5
             Mission : Impossible             3
             Australian Open                  2
Name: count, Length: 89, dtype: int64


### Custom Rule-Based NER with EntityRuler

In [None]:
# Attach a rule-based EntityRuler ahead of the statistical "ner" component.
# add_pipe raises if a component with the same name is already in the
# pipeline, so reuse the existing ruler when this cell is re-run.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler", before="ner")

# Exact-phrase patterns and the labels they should receive.
patterns = [
    {"label": "ORG", "pattern": "OpenAI"},
    {"label": "GPE", "pattern": "Pakistan"},
    {"label": "PERSON", "pattern": "Elon Musk"},
    {"label": "ORG", "pattern": "United Nations"},
]
ruler.add_patterns(patterns)


### Test Custom EntityRuler with Sample Text

In [None]:
# Sentence containing every phrase registered with the EntityRuler.
sample = "Elon Musk visited the United Nations headquarters in Pakistan. OpenAI was also there."
doc = nlp(sample)

# Plain-text listing of the recognised spans, followed by the inline
# displaCy rendering of the same document.
for span in doc.ents:
    print(span.text, "->", span.label_)
displacy.render(doc, style="ent", jupyter=True)


Elon Musk -> PERSON
United Nations -> ORG
Pakistan -> GPE
OpenAI -> ORG


### Load and Compare spaCy Models (Small vs Transformer)

In [None]:
# Load both models
nlp_sm = spacy.load("en_core_web_sm")
# Download and load the transformer model
!python -m spacy download en_core_web_trf
nlp_trf = spacy.load("en_core_web_trf")



Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### Compare Entity Recognition: en_core_web_sm vs en_core_web_trf

In [None]:
from IPython.display import display, HTML

# Fresh pipelines for the comparison, independent of the `nlp` object that
# had an EntityRuler added to it earlier in the notebook.
nlp_sm = spacy.load("en_core_web_sm")
nlp_trf = spacy.load("en_core_web_trf")

# Input sample text
sample = (
    "Elon Musk visited the United Nations headquarters in Pakistan. "
    "OpenAI was also there. Emmanuel Macron spoke at the World Economic Forum 2025 in Switzerland."
)
doc_sm = nlp_sm(sample)
doc_trf = nlp_trf(sample)

# Custom color scheme: one CSS gradient per entity label to highlight.
colors = {
    "PERSON": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "ORG": "linear-gradient(90deg, #fbc2eb, #a6c1ee)",
    "GPE": "linear-gradient(90deg, #fddb92, #d1fdff)",
    "DATE": "linear-gradient(90deg, #f6d365, #fda085)",
    "LOC": "linear-gradient(90deg, #84fab0, #8fd3f4)",
    "NORP": "linear-gradient(90deg, #d4fc79, #96e6a1)",
    "EVENT": "linear-gradient(90deg, #ffecd2, #fcb69f)"
}
# Restrict the rendering to the labels that have a color assigned.
options = {"ents": list(colors.keys()), "colors": colors}

# Print plain text results
print("🔹 SMALL Model (en_core_web_sm):")
for ent in doc_sm.ents:
    print(ent.text, "->", ent.label_)
print("\n🔹 TRANSFORMER Model (en_core_web_trf):")
for ent in doc_trf.ents:
    print(ent.text, "->", ent.label_)

# Render visual output. The headings go through display(HTML(...)) so the
# markup is rendered; print() would show the raw <h3> tags as text.
display(HTML("<h3 style='color:#6a1b9a;'>🔹 en_core_web_sm</h3>"))
displacy.render(doc_sm, style="ent", options=options, jupyter=True)
display(HTML("<h3 style='color:#2e7d32;'>🔹 en_core_web_trf</h3>"))
displacy.render(doc_trf, style="ent", options=options, jupyter=True)

🔹 SMALL Model (en_core_web_sm):
Elon Musk -> PERSON
the United Nations -> ORG
Pakistan -> GPE
OpenAI -> PERSON
Emmanuel Macron -> PERSON
the World Economic Forum -> ORG
2025 -> DATE
Switzerland -> GPE

🔹 TRANSFORMER Model (en_core_web_trf):
Elon Musk -> PERSON
United Nations -> ORG
Pakistan -> GPE
OpenAI -> ORG
Emmanuel Macron -> PERSON
the World Economic Forum 2025 -> EVENT
Switzerland -> GPE
<h3 style='color:#6a1b9a;'>🔹 en_core_web_sm</h3>


<h3 style='color:#2e7d32;'>🔹 en_core_web_trf</h3>
