In [40]:
%%capture
!pip install -U newspy polars spacy
!python -m spacy download en_core_web_sm
!python -m spacy download textcat_multilabel

from dataclasses import dataclass
from datetime import datetime


@dataclass
class Entity:
    """A single named entity: its surface text plus the label assigned to it."""

    name: str
    label: str

    def __str__(self) -> str:
        """Render the entity as ``<name>:<label>``."""
        return "{}:{}".format(self.name, self.label)


@dataclass
class Article:
    """Flat record describing one news article plus the entities found in it.

    Field order matters: it defines the positional constructor order and the
    order in which the fields appear when instances are turned into a table.
    """

    title: str  # article headline
    url: str  # link to the article
    abstract: str  # short summary text of the article
    published: datetime  # publication timestamp
    source: str  # stringified source of the article
    entities: str  # entities rendered as one string (the notebook joins "name:label" items with ", ")


In [41]:
import os
from pathlib import Path

import polars as pl
import spacy

from newspy import client as newspy
from newspy.models import Language

# --- Configuration ------------------------------------------------------------
# Read the News API key from the environment so a real credential never has to
# be stored in the notebook. The original placeholder string is kept as the
# fallback, so behaviour is unchanged when the variable is not set.
newsorg_api_key = os.environ.get("NEWSAPI_API_KEY", "NEWSAPI_API_KEY")
newspy.configure(newsorg_api_key=newsorg_api_key)

nlp = spacy.load("en_core_web_sm")

# Pipeline components skipped during processing: only `doc.ents` is read below.
# NOTE(review): this relies on the model's `ner` component not depending on the
# shared `tok2vec` — confirm against the spaCy model docs before swapping models.
ner_disabled_pipes = [
    "tok2vec",
    "tagger",
    "parser",
    "senter",
    "attribute_ruler",
    "lemmatizer",
]

# Directory that receives the timestamped CSV export.
output_dir = Path("scratchpad")

# --- Fetch ----------------------------------------------------------------------
articles = newspy.get_articles(language=Language.EN)

# Articles without an abstract are skipped entirely.
articles_with_abstract = [article for article in articles if article.abstract is not None]

# Title and abstract are analysed together as one text per article.
texts = [". ".join([article.title, article.abstract]) for article in articles_with_abstract]

# --- Entity extraction ----------------------------------------------------------
# `nlp.pipe` processes the texts as a batch and yields documents in input order,
# so they can be zipped back onto the articles they came from.
articles_nlp = []
for article, doc in zip(articles_with_abstract, nlp.pipe(texts, disable=ner_disabled_pipes)):
    entities = [Entity(name=ent.text, label=ent.label_) for ent in doc.ents]
    articles_nlp.append(
        Article(
            source=str(article.source),
            url=article.url,
            title=article.title,
            abstract=article.abstract,
            published=article.published,
            entities=", ".join([str(ent) for ent in entities])
        )
    )

articles_nlp_df = pl.DataFrame(articles_nlp)

# --- Export ---------------------------------------------------------------------
# Create the target directory first; writing into a missing directory would fail.
output_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
articles_nlp_df.write_csv(output_dir / f"articles_nlp_{timestamp}.csv")