# Entity & Keyword Extraction

https://www.analyticsvidhya.com/blog/2022/03/keyword-extraction-methods-from-documents-in-nlp/

## Setup

### Load Dependencies

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules before each
# cell runs, so changes to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import random
import re
import os
import time
from collections import Counter
from datetime import datetime, timedelta, timezone
from difflib import SequenceMatcher
from pathlib import Path
from string import punctuation

import keybert
import litellm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytextrank  # noqa: F401
import spacy
import spacy.cli
import spacy.displacy
import spacy.tokens
import spacy_transformers  # noqa: F401
from dotenv import load_dotenv
from loguru import logger
from openai import OpenAI

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


/Users/nameless/dev/proj/ml-practice-time/.venv/lib/python3.12/site-packages


In [3]:
# Notebook-wide configuration.

# Read variables from a local .env file, if one exists.
# NOTE(review): presumably this is where the OpenAI credentials come from -- confirm.
load_dotenv()

# Cap how many DataFrame rows are rendered in cell outputs.
pd.options.display.max_rows = 50

# Seed NumPy's global RNG so the random article sample further down is repeatable.
np.random.seed(42**3)

# Smoke-test that loguru output shows up in the notebook.
logger.debug("test log message")

[32m2024-07-27 12:25:15.691[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [34m[1mtest log message[0m


### Load Models

In [4]:
def load_spacy_model(spacy_model_name: str) -> "spacy.language.Language":
    """Load a spaCy pipeline by package name, downloading it first if it is missing.

    Args:
        spacy_model_name: Name of the spaCy pipeline package, e.g. "en_core_web_sm".

    Returns:
        The loaded pipeline, so callers can reuse it instead of calling
        ``spacy.load`` a second time.

    Raises:
        OSError: If the pipeline still cannot be loaded after the download attempt.
    """
    try:
        spacy_nlp = spacy.load(spacy_model_name)
    except OSError as e:
        # spaCy reports a missing pipeline package as OSError (E050). Any other
        # exception is a genuine failure that downloading would not fix, so it
        # is allowed to propagate instead of triggering a pointless download.
        logger.debug(str(e))
        spacy.cli.download(spacy_model_name)
        logger.debug(f"downloaded spacy model from web name='{spacy_model_name}'")
        spacy_nlp = spacy.load(spacy_model_name)
    logger.debug(f"loaded spacy model name='{spacy_model_name}' path='{spacy_nlp.path}'")
    return spacy_nlp


# Make sure each English pipeline (small, medium, large, transformer) is available,
# in the same order as before.
for spacy_model_name in (
    "en_core_web_sm",
    "en_core_web_md",
    "en_core_web_lg",
    "en_core_web_trf",
):
    load_spacy_model(spacy_model_name)

[32m2024-07-27 12:25:17.322[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_spacy_model[0m:[36m4[0m - [34m[1mloaded spacy model name='en_core_web_sm' path='/Users/nameless/dev/proj/ml-practice-time/.venv/lib/python3.12/site-packages/en_core_web_sm/en_core_web_sm-3.7.1'[0m
[32m2024-07-27 12:25:18.043[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_spacy_model[0m:[36m4[0m - [34m[1mloaded spacy model name='en_core_web_md' path='/Users/nameless/dev/proj/ml-practice-time/.venv/lib/python3.12/site-packages/en_core_web_md/en_core_web_md-3.7.1'[0m
[32m2024-07-27 12:25:18.743[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_spacy_model[0m:[36m4[0m - [34m[1mloaded spacy model name='en_core_web_lg' path='/Users/nameless/dev/proj/ml-practice-time/.venv/lib/python3.12/site-packages/en_core_web_lg/en_core_web_lg-3.7.1'[0m
[32m2024-07-27 12:25:19.692[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_spacy_model[0m:[36m4[0m - [34m[1mloade

### Load Data

In [17]:
# Read the article dump. The encoding is pinned to UTF-8 because the texts contain
# non-ASCII characters (e.g. curly quotes) and open() otherwise falls back to the
# platform's locale encoding, which is not UTF-8 everywhere.
with open("news_articles.json", "rt", encoding="utf-8") as f:
    news_articles: list[dict] = json.load(f)

logger.debug(f"loaded news articles n={len(news_articles)}")
logger.debug(f"JSON fields: {list(news_articles[0].keys())}")
print()

# Preview a small random sample, drawn from NumPy's global RNG.
# .tolist() converts the NumPy scalars to plain ints so the header prints as
# [1, 2, ...] rather than [np.int64(1), ...] on NumPy 2.
indices = sorted(np.random.permutation(len(news_articles))[:5].tolist())
print(f"--- 5 articles: {indices} ---")
for i in indices:
    article = news_articles[i]
    print(f'Index: {i}')
    print(f'Date: {article["published_at"]}')
    print(f'Title: {article["title"]}')
    # Only the first 100 characters, to keep the cell output short.
    print(f'Text: {article["full_text"][:100]}...')
    print()

[32m2024-07-27 12:58:18.305[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [34m[1mloaded news articles n=33[0m
[32m2024-07-27 12:58:18.305[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [34m[1mJSON fields: ['title', 'description', 'partial_text', 'url', 'published_at', 'media_source_name', 'media_source_url', 'listing_query', 'listing_source', 'full_text', 'tags', 'nltk_summary', 'nltk_keywords'][0m



--- 5 articles: [1, 2, 11, 12, 31] ---
Index: 1
Date: 2024-07-26T17:56:29+00:00
Title: U.S. Treasury, Brazil's Finance Ministry announce climate partnership
Text: Treasury Secretary Janet Yellen speaks during a Senate Appropriations Subcommittee on Financial Serv...

Index: 2
Date: 2024-07-26T16:08:22+00:00
Title: 'No State Denied Anything': Finance Minister Sitharaman Responds To Opposition's Criticism Of Budget 2024
Text: Union Finance Minister Nirmala Sitharaman on Friday responded to the Opposition’s criticism of budge...

Index: 11
Date: 2024-07-26T15:23:21+00:00
Title: Why at CM Chandrababu Naidu’s call, 160-odd MLAs stood up in Andhra Assembly
Text: AS FAR as messages go, Chief Minister N Chandrababu Naidu could not have planned one with more symbo...

Index: 12
Date: 2024-07-26T13:06:10+00:00
Title: ‘Selina was not modelled on Harris’: Iannucci on how US presidential race came to mirror Veep
Text: For years, British politics has echoed the ludicrous and sometimes concerning st

## Entity Extraction

Let's extract entities from text: organizations, persons, and locations.

In [20]:
# Pick one article as the running example; `text` is what the cells below analyse.
article = news_articles[31]
text = article["full_text"]

# Header lines, a blank separator, then the full article body.
print(
    f"Title: {article['title']}",
    f"Date: {article['published_at']}",
    f"Source: {article['media_source_name']}",
    "",
    text,
    sep="\n",
)

Title: Blake Lively reacts to Taylor Swift’s cheeky comment about Ryan Reynolds
Date: 2024-07-25T19:00:00+00:00
Source: The News International

Taylor Swift also revealed that she’s godmother to Blake Lively and Ryan Reynolds’ kids

Blake Lively is matching Taylor Swift’s energy.

The 36-year-old actress had a hilarious response to Swift’s recent Instagram Story where the pop star, 34, shared a sweet and funny tribute to Lively’s husband Reynolds, 47, and his work on the new film, Deadpool & Wolverine.

“I couldn’t have said it better myself,” Lively wrote, reposting Swift’s original Story. “Which is unsurprising given that I have 14 fewer Grammys and not a single sold-out stadium world tour.”

In Swift’s post, she praised Reynolds for pouring his heart and soul into the Marvel film. “He’s created the best work of his life, and the film feels like an actual joy portal, a wild escape from reality and an abs sandwich. I don’t know how he did it,” she wrote.

Swift then jokingly redirecte

### No NLP

In [21]:
# TODO: make a heuristic algorithm without using external libraries to extract entities

# Placeholder result until the heuristic above is written.
entities = list()

print("entities:", entities)

[]


### spaCy

https://spacy.io/usage/spacy-101

In [None]:
# Run the small English pipeline over the example article.
spacy_nlp = spacy.load("en_core_web_sm")
doc = spacy_nlp(text)

# TODO: use doc.ents to get entities (organizations, locations, persons)
# hint: [ent for ent in doc.ents]
# hint: ent.label_ in {"PERSON", "GPE", "ORG"}

# Placeholder mapping; the loop below prints one line per key once it is filled in.
entities = dict()

print("entities:")
for label, found in entities.items():
    print(f"  {label}: {found}")

# hint: visualize with spacy.displacy.render(doc, style="ent")

# TODO: try with a different spaCy model

### OpenAI GPT

https://platform.openai.com/docs/overview

In [None]:
# NOTE(review): OpenAI() is given no arguments, so it presumably picks up its API key
# from the environment populated by load_dotenv() -- confirm.
client = OpenAI()

# TODO: fix prompt to extract entities
# NOTE(review): with response_format "json_object" the API reportedly requires the
# prompt itself to ask for JSON -- confirm against the OpenAI docs when writing it.

query = ""

gpt_messages = [
    dict(role="system", content="named entity extractor"),
    dict(role="user", content=query),
]
openai_response = client.chat.completions.create(
    messages=gpt_messages,  # type: ignore
    model="gpt-4o-mini",
    response_format={"type": "json_object"},
    temperature=0.5,
    frequency_penalty=0,
    max_tokens=256,
    n=1,
    stream=False,
)

# Only one completion is requested (n=1), so read the first choice and parse it as JSON.
response_text = openai_response.choices[0].message.content
json_obj = json.loads(response_text)

print("entities:")
for label, found in json_obj.items():
    print(f" {label}: {found}")

## Keyword Extraction

Let's extract meaningful keywords from text.

### No NLP

In [22]:
# TODO: make a heuristic algorithm without using external libraries to extract keywords

# Placeholder result until the heuristic above is written.
keywords = list()
print(keywords)

[]


### spaCy

In [None]:
# Run the small English spaCy pipeline over the example article text.
spacy_nlp = spacy.load("en_core_web_sm")
doc = spacy_nlp(text)

# TODO: use the tokens of `doc` to get words
# hint: [token.text.lower() for token in doc]
# hint: token.pos_ in ["PROPN", "ADJ", "NOUN"]

# Placeholder until the TODO above is implemented.
keywords = []
print("keywords:", keywords)

# TODO: try with a different spaCy model

### spaCy + PyTextRank

In [None]:
# Load a fresh pipeline and append the "textrank" component before processing the text.
# NOTE(review): the "textrank" factory is presumably registered as a side effect of the
# otherwise-unused `import pytextrank` at the top of the notebook -- confirm.
spacy_nlp = spacy.load("en_core_web_sm")
spacy_nlp.add_pipe("textrank")
doc = spacy_nlp(text)

# TODO: use the phrases of `doc` to get the most common phrases
# hint: [phrase.text for phrase in doc._.phrases]

# Placeholder until the TODO above is implemented.
keywords = []
print("keywords:", keywords)

### KeyBERT

In [None]:
# Instantiate KeyBERT with its default settings (no arguments are passed).
kw_model = keybert.KeyBERT()

# TODO: use kw_model.extract_keywords() to get keywords

# Placeholder until the TODO above is implemented.
keywords = []
print("keywords:", keywords)

### OpenAI GPT

In [None]:
client = OpenAI()

# TODO: finish code