In [None]:
import os
from pathlib import Path
import sys
# Put the parent of the current working directory on sys.path so that the
# `src` package imported below can be resolved.
# NOTE(review): assumes the notebook is launched from a directory one level
# below the project root — confirm.
BASE_DIR = str(Path().resolve().parents[0])
if BASE_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)
import src.news_nlp.config.paths as paths

import pandas as pd
import spacy
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
# Global config
# NOTE(review): RANDOM_SEED is not referenced by any visible cell — confirm it is still needed.
RANDOM_SEED = 31415

# Parameters
# NER labels kept by extract_entities_for_df; mentions with any other label are dropped.
TARGET_LABELS = {"PERSON", "ORG", "GPE", "LOC"}
# Default `batch_size` forwarded to nlp.pipe by extract_entities_for_df.
BATCH_SIZE = 64
# Default `n_process` forwarded to nlp.pipe by extract_entities_for_df.
N_PROCESS = 4

# Load data

In [None]:
# Read the cleaned training set; later cells use its `news_id` and `text` columns.
df_train = pd.read_parquet(paths.DF_TRAIN_CLEAN)
df_train

Unnamed: 0,news_id,title,content,text
0,0,"Singer, activist Joan Baez becomes Kennedy Cen...",Kennedy Center Honoree Joan Baez reflects on 6...,"singer, activist joan baez becomes kennedy cen..."
1,1,'Nothing but problems': Shipwreck tear-down en...,Demolition of a large cargo ship along the coa...,'nothing but problems': shipwreck tear-down en...
2,2,Report: At least 13 dead in Istanbul bombings,Report : At least 13 dead in Istanbul bombings...,report: at least 13 dead in istanbul bombings....
3,3,Senate Republicans Pressure Joe Biden to Withd...,Several congressional Republicans are expressi...,senate republicans pressure joe biden to withd...
4,4,The Unwelcome Return of the Real Purveyors of ...,With the mainstream media still obsessing abou...,the unwelcome return of the real purveyors of ...
...,...,...,...,...
89790,89790,An Interview with Mark Blaxill on the Autism T...,NOTE : We 'll have an audio file to accompany ...,an interview with mark blaxill on the autism t...
89791,89791,Vietnam reconsiders methane-emitting rice amid...,Country says it can no longer be ‘ rice first ...,vietnam reconsiders methane-emitting rice amid...
89792,89792,Shaker furniture: Clean by design,The Hancock Shaker Village in western Massachu...,shaker furniture: clean by design. the hancock...
89793,89793,"53 pot shop lottery winners announced, includi...","Following a year of acrimony and delays , stat...","53 pot shop lottery winners announced, includi..."


# Download and load model

In [None]:
!python3 -m spacy download en_core_web_sm
# !python3 -m spacy download en_core_web_md

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Load the English model on CPU/GPU (uncomment as needed)
# spacy.require_gpu()
# `nlp` is the pipeline object passed to extract_entities_for_df further down.
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_md")

# Extract entities

In [None]:
def extract_entities_for_df(
    df: pd.DataFrame,
    nlp,
    text_col: str = "text",
    id_col: str = "news_id",
    target_labels: set[str] = TARGET_LABELS,
    batch_size: int = BATCH_SIZE,
    n_process: int = N_PROCESS,
) -> pd.DataFrame:
    """
    Extract named-entity mentions from every article in a DataFrame.

    The texts are streamed through ``nlp.pipe`` and every entity in ``doc.ents``
    whose label is in ``target_labels`` becomes one output row.

    Args:
        df (pd.DataFrame): DataFrame containing news articles.
        nlp: spaCy language pipeline exposing ``pipe`` and producing ``doc.ents``.
        text_col (str): Name of the column containing the article text
            (values are coerced to ``str`` before processing).
        id_col (str): Name of the column containing the unique article identifier.
        target_labels (set[str]): Set of entity labels to keep.
        batch_size (int): Number of texts per batch, forwarded to ``nlp.pipe``.
        n_process (int): Number of worker processes, forwarded to ``nlp.pipe``.

    Returns:
        pd.DataFrame: One row per kept mention, always with these columns
        (even when no mention is found):
            - news_id: Identifier of the article, taken from ``id_col``.
            - entity_text: Text of the named entity.
            - entity_label: NER label of the named entity (PERSON, ORG, GPE, LOC...).
            - start_char / end_char: Character offsets of the entity in the
              text that was passed to spaCy.
    """
    # Fixed output schema. Without it, an input that yields no matching entity
    # (empty DataFrame, or no target label found) would produce a column-less
    # frame and break callers that access e.g. the "entity_text" column.
    columns = ["news_id", "entity_text", "entity_label", "start_char", "end_char"]

    texts = df[text_col].astype(str).tolist()
    ids = df[id_col].tolist()

    # nlp.pipe yields documents in input order, so zipping with `ids` pairs
    # each parsed document with the identifier of its source row.
    docs = nlp.pipe(texts, batch_size=batch_size, n_process=n_process)

    records = []
    for news_id, doc in tqdm(zip(ids, docs), total=len(ids), desc="Extracting entities"):
        for ent in doc.ents:
            if ent.label_ in target_labels:
                records.append(
                    {
                        "news_id": news_id,
                        "entity_text": ent.text,
                        "entity_label": ent.label_,
                        "start_char": ent.start_char,
                        "end_char": ent.end_char,
                    }
                )

    return pd.DataFrame(records, columns=columns)

In [None]:
def normalize_entity(text: str) -> str:
    """Return `text` lowercased, with surrounding whitespace removed and
    every internal run of whitespace collapsed to a single space."""
    # str.split() with no argument drops leading/trailing whitespace and
    # splits on whitespace runs, so joining the tokens does all the trimming.
    tokens = text.lower().split()
    return " ".join(tokens)

In [None]:
# Materialise article identifiers and texts as plain Python lists.
# NOTE(review): neither list is referenced by the visible cells below —
# extract_entities_for_df builds its own lists from df_train. Confirm whether
# this cell is still needed.
news_ids = df_train["news_id"].to_list()
texts = df_train["text"].astype(str).to_list()

In [25]:
# Load the raw entity mentions from disk if they have already been extracted;
# otherwise extract them now (slow: the full corpus goes through spaCy).
if paths.DF_ENTITIES_RAW.exists():
    df_entities_raw = pd.read_parquet(paths.DF_ENTITIES_RAW)
else:
    # Extract entities
    df_entities_raw = extract_entities_for_df(df_train, nlp=nlp)
    # Normalize entity texts
    df_entities_raw["entity_text"] = df_entities_raw["entity_text"].astype(str).str.strip()
    df_entities_raw["entity_norm"] = df_entities_raw["entity_text"].apply(normalize_entity)
    # Persist immediately so the expensive extraction is not lost if a later
    # cell fails or the kernel is interrupted before the export section runs.
    df_entities_raw.to_parquet(paths.DF_ENTITIES_RAW, index=False)
df_entities_raw

Extracting entities:   0%|          | 0/89795 [00:00<?, ?it/s]

Extracting entities:  93%|█████████▎| 83827/89795 [41:49<18:54,  5.26it/s]  Process Process-1:
Process Process-2:
Process Process-4:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/news-topics-ner/.venv/lib/python3.10/site-packages/spacy/language.py", line 2419, in _apply_pipes
    sender.send(data)
  File "/home/ubuntu/news-topics-ner/.venv/lib/python3.10/site-packages/spacy/language.py", line 2419, in _apply_pipes
    sender.send(data)
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/proc

KeyboardInterrupt: 

# Build entities table

In [None]:
# Build one row per (entity_norm, entity_label) pair with corpus-wide counts
entity_stats = df_entities_raw.groupby(["entity_norm", "entity_label"]).agg(
    # Surface form of the first mention in the group; mentions are grouped by
    # their normalized text, so this is one representative spelling.
    entity_text=("entity_text", "first"),
    # Total number of mentions across all articles
    mention_count=("news_id", "size"),
    # Number of distinct articles mentioning the entity
    news_count=("news_id", "nunique"),
)
# Most frequent entities first, then turn the group keys back into columns
entity_ranking = entity_stats.sort_values(
    by=["mention_count", "news_count"], ascending=False
).reset_index()
# The row position after ranking becomes the unique entity ID
entity_ranking["entity_id"] = entity_ranking.index.astype(int)
# Final column order
cols_order = ["entity_id", "entity_norm", "entity_text", "entity_label", "mention_count", "news_count"]
df_entities = entity_ranking[cols_order]
df_entities

Unnamed: 0,entity_id,entity_norm,entity_text,entity_label,mention_count,news_count
0,0,u.s.,u.s.,GPE,37142,15180
1,1,senate,senate,ORG,16062,6287
2,2,china,china,GPE,13336,3921
3,3,washington,washington,GPE,11405,7361
4,4,texas,texas,GPE,11141,4252
...,...,...,...,...,...,...
143314,143314,🙄 definitely practice,🙄 definitely practice,ORG,1,1
143315,143315,🙏🙏 @ @ @ @ @ !,🙏🙏 @ @ @ @ @ !,PERSON,1,1
143316,143316,🤔🤔,🤔🤔,PERSON,1,1
143317,143317,🤝 annytee1007,🤝 annytee1007,ORG,1,1


# Build news-entity table

In [None]:
# Attach an entity_id to every raw mention by looking up its
# (entity_norm, entity_label) key in the entities table
entity_id_lookup = df_entities[["entity_id", "entity_norm", "entity_label"]]
df_entities_joined = pd.merge(
    df_entities_raw,
    entity_id_lookup,
    how="left",
    on=["entity_norm", "entity_label"],
)
df_entities_joined

Unnamed: 0,news_id,entity_text,entity_label,start_char,end_char,entity_norm,entity_id
0,0,joan baez,PERSON,17,26,joan baez,22056
1,0,kennedy,PERSON,59,66,kennedy,482
2,0,cbs,ORG,157,160,cbs,18
3,1,georgia,GPE,114,121,georgia,23
4,1,ga,ORG,265,267,ga,495
...,...,...,...,...,...,...,...
1114591,89926,chicago,GPE,1973,1980,chicago,17
1114592,89927,ani difranco,PERSON,7,19,ani difranco,28345
1114593,89927,new york,GPE,129,137,new york,10
1114594,89927,jeff glor @ @ @ @ @,PERSON,321,340,jeff glor @ @ @ @ @,84301


In [None]:
# Count the mentions of each entity inside each news article
mentions_by_article_entity = df_entities_joined.groupby(["news_id", "entity_id"])
mention_counts = mentions_by_article_entity.agg(
    count_in_news_article=("entity_id", "count")
)
# Turn the (news_id, entity_id) group keys back into regular columns
df_news_entities = mention_counts.reset_index()
df_news_entities

Unnamed: 0,news_id,entity_id,count_in_news_article
0,0,18,1
1,0,482,1
2,0,22056,1
3,1,9,1
4,1,23,2
...,...,...,...
735785,89926,126959,1
735786,89927,10,1
735787,89927,28345,1
735788,89927,32951,1


# Export

In [None]:
# Preview the mention-level table before it is written to disk.
df_entities_raw

Unnamed: 0,news_id,entity_text,entity_label,start_char,end_char,entity_norm
0,0,joan baez,PERSON,17,26,joan baez
1,0,kennedy,PERSON,59,66,kennedy
2,0,cbs,ORG,157,160,cbs
3,1,georgia,GPE,114,121,georgia
4,1,ga,ORG,265,267,ga
...,...,...,...,...,...,...
1114591,89926,chicago,GPE,1973,1980,chicago
1114592,89927,ani difranco,PERSON,7,19,ani difranco
1114593,89927,new york,GPE,129,137,new york
1114594,89927,jeff glor @ @ @ @ @,PERSON,321,340,jeff glor @ @ @ @ @


In [None]:
# Preview the aggregated entities table before it is written to disk.
df_entities

Unnamed: 0,entity_id,entity_norm,entity_text,entity_label,mention_count,news_count
0,0,u.s.,u.s.,GPE,37142,15180
1,1,senate,senate,ORG,16062,6287
2,2,china,china,GPE,13336,3921
3,3,washington,washington,GPE,11405,7361
4,4,texas,texas,GPE,11141,4252
...,...,...,...,...,...,...
143314,143314,🙄 definitely practice,🙄 definitely practice,ORG,1,1
143315,143315,🙏🙏 @ @ @ @ @ !,🙏🙏 @ @ @ @ @ !,PERSON,1,1
143316,143316,🤔🤔,🤔🤔,PERSON,1,1
143317,143317,🤝 annytee1007,🤝 annytee1007,ORG,1,1


In [None]:
# Preview the per-article entity counts before they are written to disk.
df_news_entities

Unnamed: 0,news_id,entity_id,count_in_news_article
0,0,18,1
1,0,482,1
2,0,22056,1
3,1,9,1
4,1,23,2
...,...,...,...
735785,89926,126959,1
735786,89927,10,1
735787,89927,28345,1
735788,89927,32951,1


In [None]:
# Write the three result tables to their Parquet destinations, in order:
# raw mentions, aggregated entities, per-article entity counts.
export_targets = (
    (df_entities_raw, paths.DF_ENTITIES_RAW),
    (df_entities, paths.DF_ENTITIES),
    (df_news_entities, paths.DF_NEWS_ENTITIES),
)
for frame, destination in export_targets:
    # index=False: the row index is not stored in the file
    frame.to_parquet(destination, index=False)