In [None]:
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import json
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import os
import re
import requests
   
# Run python-dotenv's loader before the configuration section reads
# GOOGLE_API_KEY / GOOGLE_CSE_ID from os.environ
# (presumably the values live in a local .env file - confirm).
load_dotenv()

def remove_space(text):
    """Return *text* with every run of consecutive spaces squeezed to one.

    Only the plain space character is affected; tabs, newlines and other
    whitespace are left exactly as they are.
    """
    return re.sub(r" {2,}", " ", text)


def _extract_page_text(url, timeout=10):
    """Download *url* and return the text of its main content area.

    Looks for an ``<article>`` element, then ``<main>``, then a
    ``<div class="content">``. If none is present the whole page text is
    used, with characters outside printable ASCII replaced by spaces and
    runs of spaces collapsed.

    Raises whatever ``requests`` or BeautifulSoup raise on failure; the
    caller decides how to record that.
    """
    page = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx so an error page is not stored as article text.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    unwanted_tags = ['script', 'style', 'nav', 'footer']
    main_content = soup.find('article') or soup.find('main') or soup.find('div', class_='content')
    if main_content:
        for elem in main_content(unwanted_tags):
            elem.decompose()
        return ' '.join(main_content.stripped_strings)

    # No recognizable content container: fall back to the full page text.
    for elem in soup(unwanted_tags):
        elem.decompose()
    return remove_space(re.sub(r'[^\x20-\x7E]', ' ', soup.get_text().replace("\n", " ")))


def get_recent_articles(sites, topics, api_key, cse_id, weeks=1, articlesPerWeek=5):
    """Search every site for every topic and scrape the text of each hit.

    One Google Custom Search request is made per (site, topic) pair. Each
    returned item is then downloaded and its text extracted.

    Args:
        sites: Iterable of site domains, sent as ``siteSearch``.
        topics: Iterable of query strings, sent as ``q``.
        api_key: Google API key, sent as ``key``.
        cse_id: Custom Search Engine id, sent as ``cx``.
        weeks: Look-back window; sent as ``dateRestrict='w<weeks>'``.
        articlesPerWeek: Desired results per week. The request asks for
            ``articlesPerWeek * weeks`` results, clamped to the 1-10 range
            the API accepts per request.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``snippet``,
        ``source`` and ``content``. When scraping a page fails,
        ``content`` holds ``'Content extraction failed: <error>'``.
        A failed search is printed and contributes no articles.
    """
    search_url = "https://www.googleapis.com/customsearch/v1"
    request_timeout = 10
    # The Custom Search JSON API rejects `num` values outside 1..10.
    num_results = max(1, min(10, articlesPerWeek * weeks))

    articles = []

    # Iterate through each site and topic to find the requested number of articles for the period
    for site in sites:
        for topic in topics:
            params = {
                'q': f'{topic}',
                'key': api_key,
                'cx': cse_id,
                'num': num_results,
                'siteSearch': site,
                'dateRestrict': 'w' + str(weeks),
                'lr': 'lang_en',
            }

            try:
                # A timeout keeps one stalled request from hanging the whole run.
                response = requests.get(search_url, params=params, timeout=request_timeout)
                # Surface API errors (bad key, quota, bad params) instead of
                # silently treating the error payload as "no results".
                response.raise_for_status()
                results = response.json()
            except (requests.RequestException, ValueError) as e:
                print(f"Error searching {site} for {topic}: {str(e)}")
                continue

            for item in results.get('items', []):
                article = {
                    'title': item.get('title'),
                    'url': item.get('link'),
                    'snippet': item.get('snippet'),
                    'source': site,
                    'content': '',
                }

                try:
                    article['content'] = _extract_page_text(article['url'], timeout=request_timeout)
                except Exception as e:
                    # Scraping arbitrary pages is best-effort: record the
                    # failure on the article rather than aborting the run.
                    article['content'] = f'Content extraction failed: {str(e)}'

                articles.append(article)

    return articles

# Configuration
# Search scope: the sites to query and the topics to look for.
parent_sites = ['cnn.com']
topics = ['Bitcoin']

# Google Custom Search credentials read from the environment (None when unset).
google_api_key = os.getenv("GOOGLE_API_KEY")
google_cse_id = os.getenv("GOOGLE_CSE_ID")

# Model handle shared by clean_article_text and summarize_article_text.
llm = OllamaLLM(model="llama3.2")

def clean_article_text(subject, text):
    """Ask the module-level ``llm`` to strip non-article text from *text*.

    Args:
        subject: Inserted into the prompt as the article's subject.
        text: Raw scraped page text to be cleaned.

    Returns:
        The value returned by ``llm.invoke`` for the filled-in prompt.
    """
    system_instructions = (
        "You are a text cleaner. Your task is to extract only the meaningful, article-related content based on the subject provided. "
        "Remove anything unrelated like ads, cookie notices, navigation bars, buttons, website slogans, subscription prompts, or any boilerplate text. Reply with the cleaned text ONLY"
    )
    human_request = (
        "Here is an article about '{subject}'. Extract only the main content and ignore any irrelevant site text:\n\n{content}"
    )

    template = ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        ("human", human_request),
    ])

    # Fill the {subject} and {content} placeholders, then send the prompt to the model.
    filled_prompt = template.invoke({"subject": subject, "content": text})
    return llm.invoke(filled_prompt)

def summarize_article_text(subject, text):
    """Ask the module-level ``llm`` for a concise summary of *text*.

    Args:
        subject: Inserted into the prompt as the article's subject.
        text: Article content to summarize (the caller passes the cleaned
            content).

    Returns:
        The value returned by ``llm.invoke`` for the filled-in prompt.
    """
    system_instructions = (
        "You are a professional summarizer. Your job is to generate a concise, clear summary of the provided article content, focused on the subject. "
        "Summarize only the essential information without adding new details. Reply with the summary ONLY"
    )
    human_request = (
        "Summarize the following article about '{subject}':\n\n{cleaned_content}"
    )

    template = ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        ("human", human_request),
    ])

    # Fill the {subject} and {cleaned_content} placeholders, then send the prompt to the model.
    filled_prompt = template.invoke({"subject": subject, "cleaned_content": text})
    return llm.invoke(filled_prompt)

# Fetch the articles, then attach an LLM-cleaned body and a summary to each one.
recent_articles = get_recent_articles(parent_sites, topics, google_api_key, google_cse_id)

for article in recent_articles:
    # The summary is built from the cleaned text, so cleaning must come first.
    article["clean_content"] = clean_article_text(article["title"], article["content"])
    article["summary"] = summarize_article_text(article["title"], article["clean_content"])

TypeError: clean_article_text() missing 1 required positional argument: 'text'

In [43]:
# Show the current contents of recent_articles as the cell's output.
recent_articles

[{'title': 'Stock Market Data - US Markets, World Markets, and Stock Quotes ...',
  'url': 'https://www.cnn.com/markets',
  'snippet': '3 hours ago ... Up-to-date stock market data coverage from CNN. Get the latest updates on US markets, world markets, stock quotes, crypto, commodities and currencies.',
  'source': 'cnn.com',
  'content': " Stock Market Data - US Markets, World Markets, and Stock Quotes | CNN CNN values your feedback 1. How relevant is this ad to you? 2. Did you encounter any technical issues? Video player was slow to load content Video content never loaded Ad froze or did not finish loading Video content did not start after ad Audio on ad was too loud Other issues Ad never loaded Ad prevented/slowed the page from loading Content moved around while ad loaded Ad was repetitive to ads I've seen previously Other issues Cancel Submit Thank You! Your effort and contribution in providing this feedback is much appreciated. Close Ad Feedback Markets Up-to-date stock market dat

In [14]:
# Download a single CNN article page (10-second timeout) and parse its HTML.
# `soup` is reused by the cells below.
page = requests.get("https://edition.cnn.com/2025/04/25/asia/south-korea-deepfake-crimes-intl-hnk-dst/index.html", timeout=10)
soup = BeautifulSoup(page.content, 'html.parser')

In [35]:
import re

# Turn newlines and every character outside printable ASCII into spaces,
# then collapse the resulting runs of spaces. The helper defined in this
# file is `remove_space`; the previous `removeSpace` spelling is not defined
# anywhere here and raises NameError on a fresh kernel.
remove_space(re.sub(r'[^\x20-\x7E]', ' ', soup.get_text().replace("\n"," ")))

" Deepfake porn is destroying real lives in South Korea | CNN CNN values your feedback 1. How relevant is this ad to you? 2. Did you encounter any technical issues? Video player was slow to load content Video content never loaded Ad froze or did not finish loading Video content did not start after ad Audio on ad was too loud Other issues Ad never loaded Ad prevented/slowed the page from loading Content moved around while ad loaded Ad was repetitive to ads I've seen previously Other issues Cancel Submit Thank You! Your effort and contribution in providing this feedback is much appreciated. Close Ad Feedback Close icon World Africa Americas Asia Australia China Europe India Middle East United Kingdom More Africa Americas Asia Australia China Europe India Middle East United Kingdom Watch Listen Live TV Subscribe Sign in My Account Settings Newsletters Topics you follow Sign out Your CNN account Sign in to your CNN account Sign in My Account Settings Newsletters Topics you follow Sign out 

In [32]:
# Page text with newline characters deleted outright (no space inserted);
# other whitespace and non-ASCII characters are left untouched.
soup.get_text().replace("\n","")

" Deepfake\u202fporn is destroying real lives in South Korea | CNNCNN values your feedback                                                        1. How relevant is this ad to you?                                                                                                2. Did you encounter any technical issues?                                                                                                                Video player was slow to load content                                                                                                                                                Video content never loaded                                                                                                                                                Ad froze or did not finish loading                                                                                                                                                Video content did not start after ad     