In [1]:
!pip install requests
!pip install bs4
!pip install transformers
!pip install sentence-transformers

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudn

In [15]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize the summarization pipeline
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def get_google_news_links():
    url = "https://news.google.com/home?hl=en-US&gl=US&ceid=US:en"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = []

    # Find all article tags that contain anchor tags
    for article in soup.find_all('article'):
        a_tag = article.find('a', href=True)
        if a_tag and 'article' in a_tag['href']:
            links.append('https://news.google.com' + a_tag['href'][1:])

    return links

def extract_article_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')
    article_text = ' '.join([p.get_text() for p in paragraphs])

    # Tokenize and limit to 512 tokens
    tokens = tokenizer.tokenize(article_text)
    if len(tokens) > 512:
        tokens = tokens[:512]

    truncated_text = tokenizer.convert_tokens_to_string(tokens)
    return truncated_text

def summarize_text(text):
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False)
    return summary[0]['summary_text']

def main():
    all_summaries = []
    news_links = get_google_news_links()
    for link in news_links:
        try:
            article_text = extract_article_text(link)
            print(f"article_text lenght - {len(article_text)}")
            summary = summarize_text(article_text)
            all_summaries.append((link, summary))
        except Exception as e:
            print(f"Failed to process {link}: {e}")

    for link, summary in all_summaries:
        print(f"Link: {link}")
        print(f"Summary: {summary}\n")

if __name__ == "__main__":
    main()

article_text lenght - 2527


Your max_length is set to 130, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


article_text lenght - 43
article_text lenght - 2460


Your max_length is set to 130, but your input_length is only 3. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)


article_text lenght - 0
article_text lenght - 2458
article_text lenght - 2321


Your max_length is set to 130, but your input_length is only 3. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)


article_text lenght - 0
article_text lenght - 1927
article_text lenght - 1979


Your max_length is set to 130, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


article_text lenght - 43
Link: https://news.google.com/articles/CBMigwFodHRwczovL3d3dy5ub3J0aGplcnNleS5jb20vc3RvcnkvbmV3cy9wb2xpdGljcy8yMDI0LzA3LzA5L21pa2llLXNoZXJyaWxsLW5qLWpvZS1iaWRlbi1lbmQtcHJlc2lkZW50LWVsZWN0aW9uLTIwMjQtYWdlLzc0MzQ0MDc5MDA3L9IBAA?hl=en-US&gl=US&ceid=US%3Aen
Summary: Mikie Sherrill is the seventh congressional Democrat to call on Biden to drop out of the presidential race. She's the first elected Democrat from New Jersey to do so. Biden said Monday that he has no plans to leave the race.

Link: https://news.google.com/articles/CBMiTGh0dHBzOi8vd3d3Lm55dGltZXMuY29tLzIwMjQvMDcvMDkvdXMvcG9saXRpY3MvYmlkZW4tZGVtb2NyYXRzLWNvbmdyZXNzLmh0bWzSAQA?hl=en-US&gl=US&ceid=US%3Aen
Summary: CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots for next week. Visit CNN.com/Travel next Wednesday for a new gallery of snapshots.

Link: https://news.google.com/articles/CBMiZ2h0dHBzOi8vYXBuZXdzLmNvbS9hcnRpY2xlL2JpZGVuLWVsZWN0aW9uLWhvdXNlL