In [5]:
import asyncio
import os
from dataclasses import dataclass
from datetime import date, datetime
from pathlib import Path
from typing import Callable, Optional

import nltk
from happytransformer import HappyTextClassification
from newsapi import NewsApiClient
from newspaper import Article

# Restrict NLTK's resource search path to a single "nltk_data" directory located
# in the parent of the current working directory.
nltk.data.path = [str(Path().resolve().parent / "nltk_data")]

# One-time setup, left disabled.
# NOTE(review): this would download into ./nltk_data (the working directory),
# whereas the search path above points at ../nltk_data — confirm which is intended.
# nltk.download("punkt", download_dir="./nltk_data")

@dataclass
class Digest:
    """Plain snapshot of the fields copied from a processed ``newspaper.Article``.

    The field types mirror what ``process_article`` copies from the article
    object: ``authors`` and ``keywords`` are lists of strings and
    ``publish_date`` is a ``datetime`` (or ``None`` when no date was found),
    not plain strings.
    """

    html: str                          # raw HTML of the downloaded page
    authors: list[str]                 # author names extracted from the page
    publish_date: Optional[datetime]   # None when the date could not be extracted
    keywords: list[str]                # keywords produced by the article's nlp() step
    summary: str                       # summary produced by the article's nlp() step
    title: str
    text: str                          # extracted article body text

def process_article(article: Article) -> Callable:
    """Return a zero-argument worker that turns ``article`` into a ``Digest``.

    Calling this function does no work by itself. Only when the returned
    callable is invoked does it run the article's ``download()``, ``parse()``
    and ``nlp()`` methods, in that order, and copy the resulting attributes
    into a ``Digest``.

    Args:
        article: The article object to process when the worker is invoked.

    Returns:
        A callable taking no arguments and returning a ``Digest``.
    """
    pipeline = ("download", "parse", "nlp")
    digest_fields = (
        "html", "authors", "publish_date", "keywords",
        "summary", "title", "text",
    )

    def _process_article() -> Digest:
        # Look up and run each stage one after the other.
        for stage in pipeline:
            getattr(article, stage)()
        # Copy the attributes across under the same names, in declaration order.
        return Digest(**{name: getattr(article, name) for name in digest_fields})

    return _process_article

async def process_article_async(article: Article) -> Digest:
    """Process ``article`` in a worker thread and return its ``Digest``.

    Args:
        article: The article object to download and digest.

    Returns:
        The ``Digest`` produced by the worker built by ``process_article``.
    """
    # process_article() only *builds* the worker; the worker itself is what has
    # to run in the thread. The to_thread() coroutine must also be awaited,
    # otherwise callers receive an un-run coroutine object instead of a Digest.
    return await asyncio.to_thread(process_article(article))

async def get_digests_async(term: str, client: NewsApiClient,
                            from_param: Optional[date] = None, language="en"):
    """Search for ``term`` and digest every returned article concurrently.

    Args:
        term: Query passed as the first argument to ``client.get_everything``.
        client: The NewsAPI client used to run the search.
        from_param: Forwarded to ``get_everything`` as ``from_param``. When
            ``None`` (the default) today's date is used, computed at call time.
        language: Forwarded to ``get_everything`` as ``language``.

    Returns:
        A list with one result per entry in ``response["articles"]``, in the
        same order, as produced by ``process_article_async``.
    """
    # Resolve the default here rather than in the signature: a default of
    # date.today() is evaluated once, when the function is defined, and would
    # go stale in a kernel that stays alive past midnight.
    if from_param is None:
        from_param = date.today()

    # NOTE: this call is synchronous, so it blocks the event loop while it runs.
    # Other filters, previously left commented out in this call:
    # sources='bbc-news,the-verge,Bloomberg', category='business'.
    response = client.get_everything(term,
                                     from_param=from_param,
                                     language=language)

    return await asyncio.gather(
        *[process_article_async(Article(entry["url"])) for entry in response["articles"]]
    )

async def get_all_digests_async(*terms: str, client: NewsApiClient):
    """Run ``get_digests_async`` for every search term concurrently.

    Args:
        *terms: Search terms; each one is passed to ``get_digests_async``.
        client: Keyword-only client shared by all searches.

    Returns:
        A list with one ``get_digests_async`` result per term, in the order
        the terms were given.
    """
    searches = [get_digests_async(query, client) for query in terms]
    return await asyncio.gather(*searches)

# Text classifier built on the "ProsusAI/finbert" model, configured for three labels.
# Alternative configuration kept for reference (two labels):
#     HappyTextClassification(
#         model_type="DISTILBERT", num_labels=2,
#         model_name="distilbert-base-uncased-finetuned-sst-2-english")
classifier = HappyTextClassification(
    model_type="BERT",
    model_name="ProsusAI/finbert",
    num_labels=3,
)

06/23/2022 00:51:55 - INFO - happytransformer.happy_transformer -   Using model: cpu


In [9]:
# The NewsAPI key is read from the environment so that no credential is stored
# in the notebook.
# NOTE(review): the API key and password that were previously committed in this
# cell should be treated as compromised and rotated.
news_api_key = os.environ.get("NEWSAPI_KEY")
if not news_api_key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable before running this cell.")
api = NewsApiClient(api_key=news_api_key)

# /v2/everything
all_articles = api.get_everything(q='bitcoin',
                                  # sources='bbc-news,the-verge,Bloomberg',
                                  # category='business',
                                  from_param=date.today(),
                                  language='en')

# /v2/top-headlines/sources
# sources = api.get_sources()

# Report how many articles the search matched (message text is user-facing).
print(f"Cantidad de articulos: {all_articles['totalResults']}")

Cantidad de articulos: 22


In [13]:
# Keep only the title and URL of each article returned by the search.
articles = [
    {field: entry[field] for field in ("title", "url")}
    for entry in all_articles["articles"]
]

# Work with the second search result (index 1).
article = Article(articles[1]["url"])

# process_article() returns a worker; calling it immediately yields the Digest.
digest: Digest = process_article(article)()

# Classify the article's summary with the classifier.
result = classifier.classify_text(digest.summary)

result

TextClassificationResult(label='negative', score=0.9736753106117249)

In [14]:
# Display the summary text of the digest built in the previous cell.
digest.summary

