# Web Scraping (BeautifulSoup)


In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from transformers import pipeline
import re

# Define headers to mimic a real browser and avoid blocking
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0 Safari/537.36"
    )
}

In [2]:
def get_article_content(url):
    """Fetches and returns the full content of an article given its URL."""
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to access {url}. Status Code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        content_div = soup.find('div', class_='entry-content wp-block-post-content is-layout-constrained wp-block-post-content-is-layout-constrained')

        if not content_div:
            print(f"No content found for {url}. The structure may have changed.")
            return None

        paragraphs = content_div.find_all('p')
        article_text = '\n'.join(p.get_text() for p in paragraphs)

        return article_text.strip() if article_text.strip() else None
    except Exception as e:
        print(f"Error fetching article content from {url}: {e}")
        return None


In [3]:
def get_latest_news_articles(base_url, num_articles=3):
    
    """Scrapes TechCrunch’s Latest News section for a given number of articles."""
    articles = []
    page = 1  # Start at page 1 for pagination

    while len(articles) < num_articles:
        url = f"{base_url}/page/{page}" if page > 1 else base_url
        try:
            response = requests.get(url, headers=HEADERS)
        except requests.exceptions.MissingSchema:
            print("Invalid URL")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve {url} (Status Code: {response.status_code}).")
            break
        
        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the query container for latest articles
        query_container = soup.find("div", class_="wp-block-query")
        if not query_container:
            print("Query container not found.")
            break

        # Find all article cards
        article_cards = query_container.find_all("div", class_="wp-block-techcrunch-card")

        for card in article_cards:
            if len(articles) >= num_articles:
                break

            content_section = card.find("div", class_="loop-card__content")
            if not content_section:
                continue

            # Extract title and URL
            title_tag = content_section.find("h3", class_="loop-card__title")
            link_tag = title_tag.find("a", class_="loop-card__title-link") if title_tag else None
            title = link_tag.get_text(strip=True) if link_tag else "No Title Found"
            article_url = urljoin(base_url, link_tag['href']) if link_tag and link_tag.has_attr('href') else None

            # Extract author
            author_tag = content_section.find("a", class_="loop-card__author")
            author = author_tag.get_text(strip=True) if author_tag else "No Author Found"

            if article_url:
                full_content = get_article_content(article_url)
                if full_content:  # Only add if content exists
                    articles.append({
                        "title": title,
                        "url": article_url,
                        "author": author,
                        "full_content": full_content
                    })

        # Pagination: Go to the next page if we haven't found enough articles
        page += 1

    return articles

In [4]:
if __name__ == "__main__":
    techcrunch_url = "https://techcrunch.com"
    latest_articles = get_latest_news_articles(techcrunch_url, num_articles=3)

    for idx, article in enumerate(latest_articles, 1):
        print(f"\n{'='*80}\nArticle {idx}:")
        print(f"Title: {article['title']}")
        print(f"URL: {article['url']}")
        print(f"Author: {article['author']}")
        print("\nFull Content:\n", article["full_content"])


Article 1:
Title: The AI leaders bringing the AGI debate down to Earth
URL: https://techcrunch.com/2025/03/19/the-ai-leaders-bringing-the-agi-debate-down-to-earth/
Author: Maxwell Zeff

Full Content:
 During a recent dinner with business leaders in San Francisco, a comment I made cast a chill over the room. I hadn’t asked my dining companions anything I considered to be extremely faux pas: simply whether they thought today’s AI could someday achieve human-like intelligence (i.e. AGI) or beyond.
It’s a more controversial topic than you might think.
In 2025, there’s no shortage of tech CEOs offering the bull case for how large language models (LLMs), which power chatbots like ChatGPT and Gemini, could attain human-level or even super-human intelligence over the near term. These executives argue that highly capable AI will bring about widespread — and widely distributed — societal benefits.
For example, Dario Amodei, Anthropic’s CEO, wrote in an essay that exceptionally powerful AI could

# Text Abstractive Summarization (BART)

In [5]:
# Import necessary libraries
from transformers import pipeline  # HuggingFace's transformers library for NLP tasks
import re  # Regular expressions for text cleaning
def clean_text(text):
    """
    Cleans the input text by removing extra spaces, newlines, and non-ASCII characters.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with normalized spaces and no special characters.
    """
    # Replace one or more whitespace characters (spaces, newlines, tabs) with a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove non-ASCII characters (anything outside the 0x00-0x7F range)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Strip leading and trailing spaces and return the cleaned text
    return text.strip()

def chunk_text(text, max_length=1024):
    """
    Splits the input text into smaller chunks to handle long texts that exceed the model's token limit.

    Args:
        text (str): The input text to be chunked.
        max_length (int): The maximum number of words per chunk. Default is 1024.

    Returns:
        list: A list of text chunks, each containing up to `max_length` words.
    """
    # Split the text into individual words
    words = text.split()
    # Create chunks by joining words into sublists of size `max_length`
    chunks = []
    for i in range(0, len(words), max_length):
        chunks.append(' '.join(words[i:i + max_length]))
    return chunks

def summarize_text(text, summarizer, max_length=130, min_length=30):
    """"
    Summarizes the input text using the BART model. Handles long texts by chunking them first.

    Args:
        text (str): The input text to be summarized.
        summarizer (transformers.Pipeline): The summarization pipeline (e.g., BART model).
        max_length (int): The maximum length of the summary. Default is 130 tokens.
        min_length (int): The minimum length of the summary. Default is 30 tokens.

    Returns:
        str: The concatenated summary of all chunks.
    """
    # Split the text into manageable chunks
    chunks = chunk_text(text)
    summaries = []
    # Summarize each chunk individually
    for chunk in chunks:
        # Generate a summary for the current chunk
        summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
        # Append the summary text to the list of summaries
        summaries.append(summary[0]['summary_text'])
    # Join all summaries into a single string and return
    return ' '.join(summaries)

In [6]:
import torch
torch.cuda.empty_cache()  # Clears unused memory
torch.cuda.memory_allocated()  # Shows allocated memory
torch.cuda.memory_reserved()  # Shows reserved memory

0

In [8]:
import gc
import torch
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Debug CUDA issues

if __name__ == "__main__":
    # Load the summarization pipeline using the BART-large-CNN model
    # The model is pre-trained for summarization tasks and is available on HuggingFace's model hub
    device = 0 if torch.cuda.is_available() else -1  # 0 for CUDA, -1 for CPU
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

    # Iterate through the list of latest articles fetched from TechCrunch
    for idx, article in enumerate(latest_articles, 1):
        torch.cuda.init()
        
        # Print a separator line for better readability
        print(f"\n{'='*80}\nArticle {idx}:")

        # Print the article's metadata: title, URL, and author
        print(f"Title: {article['title']}")
        print(f"URL: {article['url']}")
        print(f"Author: {article['author']}")

        # Clean the article's full content to remove unnecessary spaces, newlines, and special characters
        cleaned_text = clean_text(article["full_content"])

        # Summarize the cleaned text using the BART model
        # The `summarize_text` function handles long texts by chunking them and summarizing each chunk
        summary = summarize_text(cleaned_text, summarizer)

        # Print the generated summary
        print("\nSummary:\n", summary)

Device set to use cuda:0


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [11]:
%%writefile app.py

# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text.strip()

# Function to chunk text
def chunk_text(text, max_length=1024):
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks

# Function to summarize text
def summarize_text(text, summarizer, max_length=130, min_length=30):
    chunks = chunk_text(text)
    summaries = []
    for chunk in chunks:
        summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
        summaries.append(summary[0]['summary_text'])
    return ' '.join(summaries)

Overwriting app.py


In [16]:
import streamlit as st

# Streamlit app
def main(): 
    st.title("TechCrunch Article Scraper & Summarizer")

    # Sidebar for navigation and controls
    st.sidebar.title("Navigation & Controls")
    techcrunch_url = "https://techcrunch.com"
    num_articles = st.sidebar.number_input("Number of Articles to Scrape", min_value=1, max_value=10, value=3)
    summary_length = st.sidebar.slider("Summary Length", min_value=50, max_value=200, value=130)
    custom_url = st.sidebar.text_input("Enter a custom TechCrunch article URL")

    # Load the summarization pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    # Scrape articles
    if st.sidebar.button("Scrape Latest Articles"):
        with st.spinner("Scraping articles..."):
            articles, error = get_latest_news_articles(techcrunch_url, num_articles)
            if error:
                st.error(error)
            else:
                st.session_state.articles = articles

    # Display articles in a dropdown
    if 'articles' in st.session_state:
        article_titles = [article['title'] for article in st.session_state.articles]
        selected_article_title = st.sidebar.selectbox("Select an Article", article_titles)
        selected_article = next(article for article in st.session_state.articles if article['title'] == selected_article_title)

        # Display original content and summary
        st.header("Original Article")
        st.write(selected_article['full_content'])

        with st.spinner("Generating summary..."):
            cleaned_text = clean_text(selected_article['full_content'])
            summary = summarize_text(cleaned_text, summarizer, max_length=summary_length)
            st.header("AI Summary")
            st.write(summary)

        # Export options
        st.sidebar.header("Export Options")
        if st.sidebar.button("Export Summary as Text File"):
            with open("summary.txt", "w") as file:
                file.write(summary)
            st.sidebar.success("Summary exported as summary.txt")

    # Handle custom URL input
    if custom_url:
        with st.spinner("Fetching custom article..."):
            custom_article_content, error = get_article_content(custom_url)
            if error:
                st.error(error)
            elif custom_article_content:
                st.header("Custom Article Content")
                st.write(custom_article_content)

                with st.spinner("Generating summary..."):
                    cleaned_text = clean_text(custom_article_content)
                    summary = summarize_text(cleaned_text, summarizer, max_length=summary_length)
                    st.header("AI Summary")
                    st.write(summary)

                # Export options for custom article
                st.sidebar.header("Export Options")
                if st.sidebar.button("Export Custom Summary as Text File"):
                    with open("custom_summary.txt", "w") as file:
                        file.write(summary)
                    st.sidebar.success("Custom summary exported as custom_summary.txt")

if __name__ == "__main__":
    main()

Device set to use cuda:0


# Testing


In [19]:
# Test get_latest_news_articles
articeles = get_latest_news_articles("https://techcrunch.com", num_articles=10)

# Test get_article_content
for article in articeles:
    print(get_article_content(article["url"]))

Google has officially unveiled the Pixel 9a smartphone, its new midrange phone that will retail for $499. The A-series smartphone’s biggest change is its appearance, as the latest model ditches the camera bar on the back of the phone.
The smartphone is getting a chip upgrade, as the new model features Google’s Tensor G4 processor, while its predecessor had a G3 chip. The 9a also promises over 30 hours of battery life, features 8GB of RAM, and comes with either 128GB or 256GB of storage.
It will be available beginning in April and comes in four colorways: Peony, Iris, Porcelain, and Obsidian.
In addition, the smartphone features an upgraded 6.3-inch Actua display, which Google says is the brightest display on an A-series ever. For comparison, the new display is 35% brighter than the Pixel 8a at 2700 nits. The 9a also features a 120 Hz adaptive refresh rate for a smoother experience.
The smartphone’s upgraded dual rear camera system has both a 13MP ultrawide camera and a 48MP main camera

In [20]:
# Testin error handling
get_latest_news_articles("1", num_articles=10)

Invalid URL


[]

In [21]:
# Test clean_text
import pandas as pd
texts = [
    "Café au lait ☕ is tasty!",      # Removes "é" and "☕"
    "你好, 世界! (Hello, World!)",    # Removes "你好, 世界!"
    "Jalapeño 🌶️ is spicy!",         # Removes "ñ" and "🌶️"
    "I ♥ Python!",                   # Removes "♥"
    "El Niño affects weather.",       # Removes "ñ"
    "Привет! Как дела?",              # Removes "Привет! Как дела?"
    "München is in Germany.",         # Removes "ü"
    "São Paulo is a big city.",       # Removes "ã"
    "Résumé or CV?",                  # Removes "é"
    "Tokyo 東京 is amazing!",         # Removes "東京"
]

cleaned_texts = [clean_text(text) for text in texts]

df = pd.DataFrame({"texts":texts,
                   "cleaned":cleaned_texts})
df

Unnamed: 0,texts,cleaned
0,Café au lait ☕ is tasty!,Caf au lait is tasty!
1,"你好, 世界! (Hello, World!)",", ! (Hello, World!)"
2,Jalapeño 🌶️ is spicy!,Jalape o is spicy!
3,I ♥ Python!,I Python!
4,El Niño affects weather.,El Ni o affects weather.
5,Привет! Как дела?,! ?
6,München is in Germany.,M nchen is in Germany.
7,São Paulo is a big city.,S o Paulo is a big city.
8,Résumé or CV?,R sum or CV?
9,Tokyo 東京 is amazing!,Tokyo is amazing!


In [30]:
# Test chunk_text
# Outputs a list of chunks each one up to the max_length
for article in articeles:
    print(len(chunk_text(article["full_content"])), end = "\t")
    for chunk in chunk_text(article["full_content"], max_length=1024):
        print(len(chunk.split()), end = "\t")
    print()

1	263	
1	244	
1	916	
1	495	
1	294	
1	533	
2	1024	30	
1	183	
1	538	
1	496	


In [31]:
# Test summarize_text
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
for article in articeles:
    print(summarize_text(article["full_content"], summarizer))

Device set to use cuda:0


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [1]:
import subprocess

subprocess.run(["streamlit", "run", "StreamlitUI.py"])

: 

In [4]:
from exceptions import PendingDeprecationWarning

ModuleNotFoundError: No module named 'exceptions'