# Entity & Keyword Extraction

https://www.analyticsvidhya.com/blog/2022/03/keyword-extraction-methods-from-documents-in-nlp/

## Setup

### Load Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

In [25]:
import json
import random
import re
import os
import time
from collections import Counter
from datetime import datetime, timedelta, timezone
from difflib import SequenceMatcher
from pathlib import Path
from string import punctuation

import keybert
import litellm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytextrank  # noqa: F401
import spacy
import spacy.cli
import spacy.displacy
import spacy.tokens
# import spacy_transformers  # noqa: F401
from dotenv import load_dotenv
from loguru import logger
from openai import OpenAI

In [3]:
np.random.seed(42**3)
pd.options.display.max_rows = 50
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
logger.debug(f"OPENAI_API_KEY='{OPENAI_API_KEY[:3]}'...")

[32m2024-07-27 19:25:48.760[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [34m[1mOPENAI_API_KEY='sk-'...[0m


### Load Models

In [4]:
def load_spacy_model(spacy_model_name: str):
    try:
        spacy_nlp = spacy.load(spacy_model_name)
        logger.debug(f"loaded spacy model name='{spacy_model_name}' path='{spacy_nlp.path}'")
    except Exception as e:
        logger.debug(str(e))
        spacy.cli.download(spacy_model_name)
        logger.debug(f"downloaded spacy model from web name='{spacy_model_name}'")
        spacy_nlp = spacy.load(spacy_model_name)
        logger.debug(f"loaded spacy model name='{spacy_model_name}' path='{spacy_nlp.path}'")


load_spacy_model("en_core_web_sm")
load_spacy_model("en_core_web_md")
load_spacy_model("en_core_web_lg")
# load_spacy_model("en_core_web_trf")

[32m2024-07-27 19:25:54.905[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_spacy_model[0m:[36m4[0m - [34m[1mloaded spacy model name='en_core_web_sm' path='/Users/nameless/dev/proj/ml-practice-time/.venv/lib/python3.12/site-packages/en_core_web_sm/en_core_web_sm-3.7.1'[0m
[32m2024-07-27 19:25:55.523[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_spacy_model[0m:[36m4[0m - [34m[1mloaded spacy model name='en_core_web_md' path='/Users/nameless/dev/proj/ml-practice-time/.venv/lib/python3.12/site-packages/en_core_web_md/en_core_web_md-3.7.1'[0m
[32m2024-07-27 19:25:56.183[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_spacy_model[0m:[36m4[0m - [34m[1mloaded spacy model name='en_core_web_lg' path='/Users/nameless/dev/proj/ml-practice-time/.venv/lib/python3.12/site-packages/en_core_web_lg/en_core_web_lg-3.7.1'[0m
[32m2024-07-27 19:25:57.116[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_spacy_model[0m:[36m4[0m - [34m[1mloade

### Load Data

In [5]:
with open("news_articles.json", "rt") as f:
    news_articles: list[dict] = json.load(f)

logger.debug(f"loaded news articles n={len(news_articles)}")
logger.debug(f"JSON fields: {list(news_articles[0].keys())}")
print()

indices = sorted(list(np.random.permutation(len(news_articles))[:5]))
print(f"--- 5 articles: {indices} ---")
for i in indices:
    article = news_articles[i]
    print(f'Index: {i}')
    print(f'Date: {article["published_at"]}')
    print(f'Title: {article["title"]}')
    print(f'Text: {article["full_text"][:100]}...')
    print()

[32m2024-07-27 19:25:58.060[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [34m[1mloaded news articles n=33[0m
[32m2024-07-27 19:25:58.060[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [34m[1mJSON fields: ['title', 'description', 'partial_text', 'url', 'published_at', 'media_source_name', 'media_source_url', 'listing_query', 'listing_source', 'full_text', 'tags', 'nltk_summary', 'nltk_keywords'][0m



--- 5 articles: [8, 22, 29, 31, 32] ---
Index: 8
Date: 2024-07-25T14:56:21+00:00
Title: Moldova's finance minister accepts new job at central bank
Text: Moldova’s finance minister accepts new job at central bank

1 minute

(Reuters) – Moldovan Finance M...

Index: 22
Date: 2024-07-26T04:00:00+00:00
Title: Gonzales man accused of killing wife has history of domestic abuse, murder allegations dating back to 1990s
Text: Gonzales man accused of killing wife has history of domestic abuse, murder allegations dating back t...

Index: 29
Date: 2024-07-26T01:02:03+00:00
Title: Deadpool & Wolverine’s Mid- & Post-Credits Scenes, Explained
Text: Photo: Jay Maidment/20th Century Studios/MARVEL


...

Index: 31
Date: 2024-07-25T19:00:00+00:00
Title: Blake Lively reacts to Taylor Swift’s cheeky comment about Ryan Reynolds
Text: Taylor Swift also revealed that she’s godmother to Blake Lively and Ryan Reynolds’ kids

Blake Livel...

Index: 32
Date: 2024-07-25T15:00:18+00:00
Title: Chinese Shopkeeper S

## Entity Extraction

Let's extract entities from text: organizations, persons, and locations.

In [31]:
article = news_articles[32]
print(f"Title: {article['title']}")
print(f"Date: {article['published_at']}")
print(f"Source: {article['media_source_name']}")
text: str = article["full_text"]
print()
print(text)

Title: Chinese Shopkeeper Shows AMD Ryzen 9000 "Zen 5" Desktop CPU Prices, Lower Than Ryzen 7000 Series
Date: 2024-07-25T15:00:18+00:00
Source: Wccftech

A Chinese shopkeeper based in Bejing has a really weird yet funny way of showing off AMD Ryzen 9000 "Zen 5" Desktop CPU prices which appear to be lower than the Ryzen 7000 MSRP's for China.

AMD Ryzen 9000 "Zen 5" Desktop CPUs Prices For China Appear To Be Much Lower Than Ryzen 7000 MSRPs

80IT, a computer hardware store based in Bejing China, and affiliated with Taobao, has put up what seems to be their expected prices for China. The shop owner also showed off Ryzen 9000 Desktop CPU boxes which suggests that they might be the ones to receive the early batch of Zen 5 chips but we know now that they have been recalled due to quality assurance issues. (The live stream is still going on and do check it out if you want a chuckle).

Image Source: 80IT

So what makes this price exposure weird is that the shopkeeper has casually put up a boa

### No NLP

In [45]:
# Remove some punctuation and split into sentences
clean_text = re.sub(r"[^\w\s.]", "", text)
clean_text = re.sub(r"\n", ".", clean_text)
sentences = clean_text.split(".")
sentences = [sentence.strip() for sentence in sentences if sentence]

# Remove sentences with all capitalized words
sentence_words = [sentence.split() for sentence in sentences]
sentence_words = [words for words in sentence_words if any(word[0].islower() for word in words)]

# Remove first word of each sentence
sentence_words = [words[1:] for words in sentence_words]

# Flatten list
words = [word for sentence in sentence_words for word in sentence]

# Select capitalized words
entities = [word for word in words if word[0].isupper()]
entities = list(set(entities))

print("entities:", entities)

entities: ['Ryzen', 'MSRPs', 'Core', 'Taobao', 'CPU', 'AMD', 'RMB', 'Chinese', 'Intels', 'CPUs', 'Bejing', 'US', 'AMDs', 'USD', 'Desktop', 'August', 'Gen', 'PBO', 'China', 'Zen', 'Intel']


### Spacy

https://spacy.io/usage/spacy-101

In [48]:
def get_spacy_entities(text: str, spacy_model_name: str = "en_core_web_sm") -> dict[str, list[str]]:
    spacy_nlp = spacy.load(spacy_model_name)
    doc = spacy_nlp(text)
    entities = {}
    for ent_type in {"PERSON", "GPE", "ORG"}:
        entities[ent_type] = set([ent.text for ent in doc.ents if ent.label_ == ent_type])
    return entities


entities = get_spacy_entities(text, spacy_model_name="en_core_web_sm")
print("entities (en_core_web_sm):")
for k, v in entities.items():
    print(f"  {k}: {v}")

# print()
# entities = get_spacy_entities(text, spacy_model_name="en_core_web_trf")
# print("entities (en_core_web_trf):")
# for k, v in entities.items():
#     print(f"  {k}: {v}")

entities (en_core_web_sm):
  ORG: {'Core', 'Original Non-Translated Image', 'Taobao', 'Initial Ryzen 9000 Pricing Ryzen', 'CPU', '8-Core', 'AMD Ryzen 9000', 'the Ryzen 9000', 'the AMD Ryzen', 'AMD', 'the Ryzen 7', 'the AMD Ryzen 9000', 'Ryzen 7', 'the Ryzen 7 9700X', '9700X & Ryzen', 'The AMD Ryzen 9000', 'the Intel Core i7-14700K. Lastly', 'PBO', 'Ryzen 9000 Desktop', 'Intel'}
  GPE: {'Bejing', 'China'}
  PERSON: {'Ryzen'}

entities (en_core_web_trf):
  ORG: {'Taobao', 'Intel', '80IT', 'AMD'}
  GPE: {'Bejing', 'China'}
  PERSON: {'80IT'}


### OpenAI GPT

https://platform.openai.com/docs/overview

In [50]:
client = OpenAI()
query = (
    """Return a JSON of with a list of named entities from the_text.
    Output JSON: {
        organizations: ['organization_1', 'organization_2', ...],
        locations: ['location_1', 'location_2', ...],
        people: ['person_1', 'person_2', ...]
    }

    The_text:\n\n
    """
    + text
)
gpt_messages = [
    {"role": "system", "content": "named entity extractor"},
    {"role": "user", "content": query},
]
openai_response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=gpt_messages,  # type: ignore
    stream=False,
    max_tokens=256,
    n=1,
    frequency_penalty=0,
    temperature=0.5,
    response_format={"type": "json_object"},
)
response_text = openai_response.choices[0].message.content
json_obj = json.loads(response_text)

print("entities:")
for k, v in json_obj.items():
    print(f" {k}: {v}")

organizations: ['AMD', '80IT', 'Taobao']
locations: ['Bejing', 'China']
people: []


## Keyword extraction

Let's extract meaningful keywords from text.

### No NLP

In [7]:
# Remove punctuation
clean_text = re.sub(r"[^\w ]", "", text.lower())
print("------")
print(clean_text[:200])
print("------\n")
words = clean_text.split()
word_counts = Counter(words)
print("most common:", word_counts.most_common(8))

# Remove short words
words = [word for word in words if len(word) > 2]
word_counts = Counter(words)
print("most common:", word_counts.most_common(8))

# Remove pronouns, prepositions, articles, and simple verbs
exclude_words = set(
    ["a", "for", "in", "the", "and", "to", "of"]
    + ["can", "do", "does", "did", "have", "has", "had", "was", "is", "are", "were"]
    + ["it", "i", "you", "he", "she", "her", "his", "that", "this"]
)
words = [word for word in words if word not in exclude_words]
word_counts = Counter(words)
print("most common:", word_counts.most_common(8))

# Plural to singular
unique_words = set(words)
words = [word[:-1] if word.endswith("s") and word[:-1] in unique_words else word for word in words]
word_counts = Counter(words)
print("most common:", word_counts.most_common(8))

keywords = list(word_counts.most_common(8))
print("keywords:", keywords)


------
taylor swift also revealed that shes godmother to blake lively and ryan reynolds kidsblake lively is matching taylor swifts energythe 36yearold actress had a hilarious response to swifts recent instag
------

most common: [('and', 9), ('the', 8), ('reynolds', 6), ('to', 5), ('swifts', 4), ('a', 4), ('his', 4), ('lively', 3)]
most common: [('and', 9), ('the', 8), ('reynolds', 6), ('swifts', 4), ('his', 4), ('lively', 3), ('film', 3), ('taylor', 2)]
most common: [('reynolds', 6), ('swifts', 4), ('lively', 3), ('film', 3), ('taylor', 2), ('swift', 2), ('story', 2), ('tribute', 2)]
most common: [('swift', 6), ('reynolds', 6), ('lively', 5), ('film', 3), ('taylor', 2), ('story', 2), ('tribute', 2), ('work', 2)]
keywords: [('swift', 6), ('reynolds', 6), ('lively', 5), ('film', 3), ('taylor', 2), ('story', 2), ('tribute', 2), ('work', 2)]


### Spacy

CNN or Transformers

In [8]:
def get_spacy_keywords(
    text: str, n: int = 10, spacy_model_name: str = "en_core_web_sm"
) -> list[tuple[str, int]]:
    spacy_nlp = spacy.load(spacy_model_name)
    doc = spacy_nlp(text.lower())
    pos_tag = ["PROPN", "ADJ", "NOUN"]  # only include proper nouns, adjectives, and nouns
    hotwords = [token.text.lower() for token in doc if token.pos_ in pos_tag]
    keywords = list(Counter(hotwords).most_common(n))
    return keywords


keywords = get_spacy_keywords(text, n=8, spacy_model_name="en_core_web_sm")
print("keywords (en_core_web_sm):", keywords)

# keywords = get_spacy_keywords(text, n=8, spacy_model_name="en_core_web_trf")
# print("keywords (en_core_web_trf):", keywords)

keywords (en_core_web_sm): [('swift', 7), ('reynolds', 6), ('lively', 3), ('film', 3), ('taylor', 2), ('story', 2), ('tribute', 2), ('work', 2)]
keywords (en_core_web_trf): [('swift', 7), ('reynolds', 6), ('lively', 5), ('film', 3), ('taylor', 2), ('blake', 2), ('story', 2), ('tribute', 2)]


### Spacy + Pytextrank

Graph-based ranking algorithm inspired by Google's PageRank

In [24]:
def get_textrank_keywords(
    text: str, n: int = 10, spacy_model_name: str = "en_core_web_sm"
) -> list[tuple[str, float]]:
    spacy_nlp = spacy.load(spacy_model_name)
    spacy_nlp.add_pipe("textrank")
    doc = spacy_nlp(text.lower())
    keywords = [(phrase.text, round(phrase.rank, 2)) for phrase in doc._.phrases[:n]]
    return keywords

keywords = get_textrank_keywords(text, n=8, spacy_model_name="en_core_web_sm")
print("keywords (en_core_web_sm):", keywords)

# keywords = get_textrank_keywords(text, n=8, spacy_model_name="en_core_web_trf")
# print("keywords (en_core_web_trf):", keywords)

keywords (en_core_web_sm): [('hugh jackman', 0.14), ('james', 0.11), ('taylor', 0.09), ('james"', 0.05), (',\n    "instagram story', 0.04), (',\n    "wade wilson', 0.04), (',\n    "hugh jackman', 0.04), (',\n    "marvel film', 0.04)]
keywords (en_core_web_trf): [('blake lively', 0.14), ('ryan reynolds', 0.14), ('wade wilson', 0.13), ('hugh jackman', 0.13), ('marvel film', 0.13), ('world tour', 0.13), ('james', 0.1), ('kids', 0.1)]


### KeyBERT

BERT

In [10]:
kw_model = keybert.KeyBERT()
keywords = kw_model.extract_keywords(text)
print("keywords (KeyBERT):", keywords)

keywords (KeyBERT): [('deadpool', 0.505), ('swift', 0.4517), ('marvel', 0.4111), ('reynolds', 0.409), ('taylor', 0.3546)]


### OpenAI GPT

In [22]:
client = OpenAI()
query = (
    "Return a list of keywords from the_text. Output JSON: {keywords: ['keyword_1', 'keyword_2', ...]}. The_text:\n\n"
    + text
)
gpt_messages = [
    {"role": "system", "content": "keyword extractor"},
    {"role": "user", "content": query},
]
openai_response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=gpt_messages,  # type: ignore
    stream=False,
    max_tokens=256,
    n=1,
    frequency_penalty=0,
    temperature=0.5,
    response_format={"type": "json_object"},
)
response_text = openai_response.choices[0].message.content
json_obj = json.loads(response_text)
keywords = json_obj["keywords"]
keywords = [keyword.lower() for keyword in keywords]
print("keywords:", keywords)

keywords: ['taylor swift', 'blake lively', 'ryan reynolds', 'instagram story', 'deadpool', 'wolverine', 'grammys', 'world tour', 'marvel film', 'hugh jackman', 'wade wilson', 'goddaughter', 'kids', 'james', 'inez', 'betty', 'olin']
