In [1]:
import os, sys
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Make the notebook folder on Drive the working directory, and add it to the
# import path so Python modules stored there can be imported.
os.chdir("/content/drive/My Drive/Colab Notebooks")
sys.path.append("/content/drive/My Drive/Colab Notebooks")

Mounted at /content/drive


In [4]:
import requests

def fetch_news_article(api_key, country='us', timeout=10):
    """Fetch the first top headline from News API as one block of text.

    Args:
        api_key: News API key. It is sent in the ``X-Api-Key`` header instead
            of the query string so that it never appears in the request URL
            (and therefore not in HTTP error messages or logs).
        country: Country code passed as the ``country`` query parameter.
        timeout: Seconds to wait for the server before giving up, so the cell
            cannot hang indefinitely.

    Returns:
        The first article's title, description and content joined by
        newlines (missing or null fields become empty strings), or ``None``
        when the response contains no articles.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
            (for example an invalid key).
        requests.RequestException: on connection failures or timeouts.
    """
    response = requests.get(
        'https://newsapi.org/v2/top-headlines',
        params={'country': country},
        headers={'X-Api-Key': api_key},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of hitting a KeyError further down.
    response.raise_for_status()

    # A payload without 'articles' (or with a null value) means "nothing found".
    articles = response.json().get('articles') or []
    if not articles:
        return None

    first = articles[0]
    # The API may return null for any of these fields; treat null as empty.
    title = first.get('title') or ""
    description = first.get('description') or ""
    content = first.get('content') or ""
    return title + "\n" + description + "\n" + content

# Never hardcode the News API key in the notebook: anyone who can read the
# file (or its history) can use it. Read it from the NEWS_API_KEY environment
# variable, or prompt for it without echoing. Any key that was previously
# committed in this cell should be treated as leaked and rotated.
import getpass
import os

NEWS_API_KEY = os.environ.get("NEWS_API_KEY") or getpass.getpass("News API key: ")
news_article = fetch_news_article(NEWS_API_KEY)
print(news_article)


Giants' Blake Snell throws first no-hitter of career vs. Reds - The New York Times




In [6]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Download the NLTK data packages used by this notebook, one after another.
for nltk_resource in (
    'punkt',
    'maxent_ne_chunker',
    'words',
    'averaged_perceptron_tagger',
):
    nltk.download(nltk_resource)

def nltk_ner(text):
    """Extract named entities from ``text`` using NLTK.

    The text is tokenized, part-of-speech tagged and run through
    ``ne_chunk``. Each top-level subtree of the chunker's result is reported
    as one entity; plain (non-tree) children are skipped.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in the order they
        appear in the chunked tree. ``entity_text`` is the subtree's tokens
        joined by single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    chunked = ne_chunk(tagged_tokens, binary=False)
    return [
        (" ".join(token[0] for token in node.leaves()), node.label())
        for node in chunked
        if isinstance(node, nltk.Tree)
    ]

# Example usage
# NOTE: this rebinds `news_article` to a fixed sample sentence, replacing the
# article text assigned by the earlier fetch cell. Every later use of
# `news_article` in this notebook therefore sees this sentence.
news_article = """Apple is looking at buying U.K. startup for $1 billion"""
# Kept in a module-level variable so the comparison cell can reuse the result.
nltk_entities = nltk_ner(news_article)
print("Entities extracted by NLTK:", nltk_entities)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Entities extracted by NLTK: [('Apple', 'GPE')]


In [7]:
import spacy

# Load the spaCy model once at module level; spacy_ner() below reads this
# global `nlp` object on every call.
# NOTE(review): presumably "en_core_web_sm" must already be installed in the
# runtime for this to succeed — confirm for fresh environments.
nlp = spacy.load("en_core_web_sm")

def spacy_ner(text):
    """Extract named entities from ``text`` using the global spaCy pipeline.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry in
        the processed document's ``ents``, in the same order.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

# Run spaCy on the same `news_article` text that the NLTK cell used, and keep
# the result in a module-level variable for the comparison cell.
spacy_entities = spacy_ner(news_article)
print("Entities extracted by spaCy:", spacy_entities)


Entities extracted by spaCy: [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]


In [8]:
def compare_entities(nltk_entities, spacy_entities):
    """Print a side-by-side comparison of two entity lists.

    Both inputs are printed as given, followed by three derived sections:
    entities found by both extractors, entities found only by NLTK and
    entities found only by spaCy.

    Matching is on the exact entity value, i.e. the full ``(text, label)``
    tuple. The same text tagged with different labels by the two libraries
    (e.g. ``('Apple', 'GPE')`` vs ``('Apple', 'ORG')``) is therefore NOT
    reported as common; it appears in both "only" sections.

    Args:
        nltk_entities: Iterable of hashable, mutually orderable entities
            (typically ``(text, label)`` tuples) extracted by NLTK.
        spacy_entities: Same, extracted by spaCy.

    Returns:
        None. The comparison is written to standard output.
    """
    def _print_section(header, entities):
        # One header line followed by one entity per line.
        print(header)
        for entity in entities:
            print(entity)

    nltk_set = set(nltk_entities)
    spacy_set = set(spacy_entities)

    print("Comparison of entities extracted by NLTK and spaCy:\n")
    _print_section("Entities by NLTK:", nltk_entities)
    _print_section("\nEntities by spaCy:", spacy_entities)

    # Set iteration order is not stable across interpreter runs (string hash
    # randomization), so sort the derived sections to keep the notebook
    # output reproducible.
    _print_section("\nCommon Entities:", sorted(nltk_set & spacy_set))
    _print_section("\nEntities only by NLTK:", sorted(nltk_set - spacy_set))
    _print_section("\nEntities only by spaCy:", sorted(spacy_set - nltk_set))

# Compare the entity lists produced by the NLTK and spaCy cells above.
compare_entities(nltk_entities, spacy_entities)


Comparison of entities extracted by NLTK and spaCy:

Entities by NLTK:
('Apple', 'GPE')

Entities by spaCy:
('Apple', 'ORG')
('U.K.', 'GPE')
('$1 billion', 'MONEY')

Common Entities:

Entities only by NLTK:
('Apple', 'GPE')

Entities only by spaCy:
('Apple', 'ORG')
('U.K.', 'GPE')
('$1 billion', 'MONEY')
