In [51]:
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import os
import re
import requests

# Populate the process environment (via load_dotenv) before any setting is read
load_dotenv()

# Google Custom Search credentials; each is None when its variable is unset
google_api_key = os.getenv("GOOGLE_API_KEY")
google_cse_id = os.getenv("GOOGLE_CSE_ID")

# Shared LLM instance used by the cleaning and summarization steps.
# The repeat penalty is there because the cleaning step echoes back large parts
# of its input, which slightly raises the chance of the model repeating itself.
llm = OllamaLLM(
    model="llama3.2",
    temperature=0.2,
    top_p=0.9,
    repeat_penalty=1.2,
)


def remove_space(text):
    """Collapse every run of two or more spaces in ``text`` into one space.

    Only the space character is affected; tabs, newlines and other
    whitespace are returned untouched.
    """
    return re.sub(r" {2,}", " ", text)


_CSE_ENDPOINT = "https://www.googleapis.com/customsearch/v1"
_CSE_PAGE_SIZE = 10      # largest 'num' the Custom Search API accepts per request
_CSE_MAX_RESULTS = 100   # the API does not serve results beyond the first 100
_REQUEST_TIMEOUT = 10    # seconds, applied to every outgoing HTTP request


def _iter_search_results(site, topic, api_key, cse_id, weeks, wanted):
    """Yield up to ``wanted`` Custom Search result items for one site/topic pair, one page at a time."""
    wanted = min(wanted, _CSE_MAX_RESULTS)
    fetched = 0

    while fetched < wanted:
        page_size = min(_CSE_PAGE_SIZE, wanted - fetched)
        params = {
            'q': topic,
            'key': api_key,
            'cx': cse_id,
            'num': page_size,
            'start': fetched + 1,  # 1-based index of the first result on this page
            'siteSearch': site,
            'dateRestrict': 'w' + str(weeks),
            'lr': 'lang_en'
        }

        response = requests.get(_CSE_ENDPOINT, params=params, timeout=_REQUEST_TIMEOUT)
        # Surface quota / credential / parameter errors instead of treating the
        # error payload as an empty result set.
        response.raise_for_status()

        batch = response.json().get('items', [])
        yield from batch
        fetched += len(batch)

        # A short (or empty) page means the API has nothing more to offer.
        if len(batch) < page_size:
            return


def _extract_article_text(url):
    """Download ``url`` and return the text of its main content area."""
    page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Do not scrape 4xx/5xx error pages as if they were the article.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Priority: <article> → <main> → <div class="content">
    main_content = soup.find('article') or soup.find('main') or soup.find('div', class_='content')

    if main_content is not None:
        for elem in main_content(['script', 'style', 'nav', 'footer']):
            elem.decompose()
        return ' '.join(main_content.stripped_strings)

    # No recognizable container: fall back to the whole page, reduced to
    # printable ASCII with newlines and repeated spaces flattened.
    for elem in soup(['script', 'style', 'nav', 'footer']):
        elem.decompose()
    return remove_space(re.sub(r'[^\x20-\x7E]', ' ', soup.get_text().replace("\n", " ")))


def get_recent_articles(sites, topics, api_key, cse_id, weeks=1, articlesPerWeek=5):
    """
    Collect recent news articles for every site/topic pair via the Google Custom Search API.

    For each search hit the article page is downloaded and its main text is
    extracted with BeautifulSoup.

    Args:
        sites: Iterable of site domains to restrict the search to (e.g. 'cnn.com').
        topics: Iterable of search queries.
        api_key: Google API key.
        cse_id: Custom Search Engine id.
        weeks: How many weeks back to search; also scales the result count.
        articlesPerWeek: Results requested per week, per site/topic pair.
            The total (``articlesPerWeek * weeks``) is fetched in pages of at
            most 10 and capped at 100; a non-positive total issues no request.

    Returns:
        A list of dicts with the keys 'title', 'url', 'snippet', 'source' and
        'content'. When a page cannot be fetched or parsed, 'content' holds
        'Content extraction failed: <reason>'. Search failures are printed and
        skipped, so the function itself does not raise for network errors.
    """
    articles = []
    wanted = articlesPerWeek * weeks

    for site in sites:
        for topic in topics:
            try:
                for item in _iter_search_results(site, topic, api_key, cse_id, weeks, wanted):
                    article = {
                        'title': item.get('title'),
                        'url': item.get('link'),
                        'snippet': item.get('snippet'),
                        'source': site,
                        'content': ''
                    }

                    # Extraction is best-effort: one bad page must not abort
                    # the rest of the run, so the failure is recorded in place.
                    try:
                        article['content'] = _extract_article_text(article['url'])
                    except Exception as e:
                        article['content'] = f'Content extraction failed: {str(e)}'

                    articles.append(article)

            # Per-query boundary: report the failure and carry on with the
            # next site/topic pair. Articles already collected are kept.
            except Exception as e:
                print(f"Error searching {site} for {topic}: {str(e)}")

    return articles


def clean_article_text(subject, text):
    """
    Strip boilerplate from scraped article text with the LLM.

    A system + human chat prompt is built around ``subject`` and ``text``,
    sent to the module-level ``llm``, and the model's reply is returned as is.
    """
    system_instruction = (
        "You are a text cleaner. Your task is to extract only the meaningful, article-related content based on the subject provided. "
        "Remove anything unrelated like ads, cookie notices, navigation bars, buttons, website slogans, subscription prompts, or any boilerplate text. "
        "Make sure no information from the article related content is missed. Do not rephrase the text, give back the same text which is relevant. Reply with the cleaned text ONLY"
    )
    human_request = (
        "Here is an article about '{subject}'. Extract only the main content and ignore any irrelevant site text:\n\n{content}"
    )

    template = ChatPromptTemplate.from_messages([
        ("system", system_instruction),
        ("human", human_request),
    ])

    # Fill the {subject} and {content} placeholders, then query the model.
    filled_prompt = template.invoke({"subject": subject, "content": text})
    return llm.invoke(filled_prompt)


def summarize_article_text(subject, text):
    """
    Produce a subject-focused summary of article text with the LLM.

    A system + human chat prompt is built around ``subject`` and ``text``,
    sent to the module-level ``llm``, and the model's reply is returned as is.
    """
    system_instruction = (
        "You are a professional summarizer. Your job is to generate a concise, clear summary of the provided article content, focused on the subject. "
        "Summarize only the essential information without adding new details. Reply with the summary ONLY"
    )
    human_request = "Summarize the following article about '{subject}':\n\n{cleaned_content}"

    template = ChatPromptTemplate.from_messages([
        ("system", system_instruction),
        ("human", human_request),
    ])

    # Fill the {subject} and {cleaned_content} placeholders, then query the model.
    filled_prompt = template.invoke({"subject": subject, "cleaned_content": text})
    return llm.invoke(filled_prompt)

# Fetch and process articles
parent_sites = ['cnn.com']
topics = ['Donald Trump']

# One site and one topic, restricted to the last week, one result requested.
recent_articles = get_recent_articles(
    parent_sites, topics, google_api_key, google_cse_id, weeks=1, articlesPerWeek=1
)

# Enrich every article dict in place: the scraped text is cleaned first, and
# the summary is then built from the cleaned text (not the raw scrape).
for article in recent_articles:
    article["clean_content"] = clean_article_text(article["title"], article["content"])
    article["summary"] = summarize_article_text(article["title"], article["clean_content"])

# Output final result
recent_articles


[{'title': 'Donald J. Trump news - breaking news, video, headlines and ...',
  'url': 'https://www.cnn.com/politics/president-donald-trump-47',
  'snippet': "4 hours ago ... Keeping track of Trump. Pres. Trump is set to sign a bipartisan immigration bill called the Laken Riley Act. POOL. The status of Trump's picks for his Cabinet\xa0...",
  'source': 'cnn.com',
  'content': " Donald J. Trump news - breaking news, video, headlines and analysis | CNN Politics CNN values your feedback 1. How relevant is this ad to you? 2. Did you encounter any technical issues? Video player was slow to load content Video content never loaded Ad froze or did not finish loading Video content did not start after ad Audio on ad was too loud Other issues Ad never loaded Ad prevented/slowed the page from loading Content moved around while ad loaded Ad was repetitive to ads I've seen previously Other issues Cancel Submit Thank You! Your effort and contribution in providing this feedback is much appreciated. Clo