In [82]:
# Install feedparser and BeautifulSoup for RSS parsing
%pip install feedparser beautifulsoup4

# Install Google Translator library
# %pip install googletrans==4.0.0rc1
%pip install deep-translator

# Install CKIP Transformers for NLP tasks
%pip install ckip-transformers

# Install KeyBERT with USE (Universal Sentence Encoder) backend
%pip install keybert[use]

# Downgrade httpx to a compatible version
%pip install httpx==0.23.0



# Verify Packages

In [83]:
from keybert import KeyBERT

# Initialize KeyBERT model
kw_model = KeyBERT()

# Example text
article = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals.
"""

# Extract keywords
keywords1 = kw_model.extract_keywords(article, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=5)
print("Keywords top 5:", keywords1)

keywords2 = [tag for tag, score in kw_model.extract_keywords(article, keyphrase_ngram_range=(1, 2), stop_words='english') if score > 0.6]
print("Keywords with threshold > 0.6:", keywords2)


Keywords top 5: [('ai intelligence', 0.6922), ('intelligence ai', 0.6761), ('ai', 0.6491), ('artificial intelligence', 0.605), ('natural intelligence', 0.5574)]
Keywords with threshold > 0.6: ['ai intelligence', 'intelligence ai', 'ai', 'artificial intelligence']


In [84]:
# from googletrans import Translator
from deep_translator import GoogleTranslator

def translate_text(text, src='en', dest='zh-tw'):
    # Option1: from googletrans import Translator
    # return Translator().translate(text, src=src.lower(), dest=dest.lower()).text

    # Option2: from deep_translator import GoogleTranslator
    # return GoogleTranslator(source=src, target=dest).translate(text)
    if dest == 'zh-tw':
        dest = 'zh-TW'
    translator = GoogleTranslator(source=src, target=dest)
    return translator.translate(text)

result = translate_text("Empty.", src='en', dest='zh-tw')
print(result)

空的。


In [85]:
from pprint import pprint
import feedparser

rss_url = "http://rss.cnn.com/rss/cnn_topstories.rss"
feed = feedparser.parse(rss_url)
pprint(feed.entries[0])  # verify

{'guidislink': False,
 'id': 'https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html',
 'link': 'https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html',
 'links': [{'href': 'https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html',
            'rel': 'alternate',
            'type': 'text/html'}],
 'media_content': [{'height': '619',
                    'medium': 'image',
                    'type': 'image/jpeg',
                    'url': 'https://cdn.cnn.com/cnnnext/dam/assets/230418164538-02-dominion-fox-trial-settlement-0418-super-169.jpg',
                    'width': '1100'},
                   {'height': '300',
                    'medium': 'image',
                    'type': 'image/jpeg',
                    'url': 'https://cdn.cnn.com/cnnnext/dam/assets/230418164538-02-dominion-fox-trial-settlement-0418-large-11.jpg',
                    'width': '300'},
                   {'height': '552',
  

In [86]:
import feedparser
from bs4 import BeautifulSoup

# Example RSS feed URL (The Guardian)
rss_url = "https://www.theguardian.com/world/rss"

# Parse the RSS feed
feed = feedparser.parse(rss_url)

# Function to clean up HTML content
def clean_html(html_content):
    # Use BeautifulSoup to parse the HTML
    soup = BeautifulSoup(html_content, 'lxml')

    # Extract text and return it
    return soup.get_text(separator=' ', strip=True)

# Iterate through each entry in the feed
for entry in feed.entries:
    # Get the description which contains HTML
    description_html = entry.description

    # Clean up the HTML to extract plain text
    cleaned_description = clean_html(description_html)

    # Print title and cleaned description
    print(f"Title: {entry.title}")
    print(f"Cleaned Description: {cleaned_description}\n")


Title: Rwandan-backed M23 rebels launch new offensive in DRC
Cleaned Description: Clashes break ceasefire days before Rwandan and Congolese presidents attend crisis summit Rebels of the M23 armed group and allied Rwandan forces have launched a new offensive in the eastern Democratic Republic of Congo (DRC), days before the Rwandan and Congolese presidents are due to attend a crisis summit. The UN said the battle for the key city of Goma, which M23 and Rwandan troops seized last week, had left at least 2,900 people dead – far higher than the previous death toll of 900. Continue reading...

Title: Flies in hospital wards may be spreading drug-resistant bacteria to patients
Cleaned Description: Scientists in Nigeria found the insects carry infections resistant to last-resort antibiotics, adding to fears about superbugs Flies buzzing between beds may be spreading drug-resistant bugs among patients in hospitals, according to new research. Researchers from the Ineos Oxford Institute for anti

## generate_hashtags

In [87]:
import warnings
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from keybert import KeyBERT


# Load the pre-trained model
kw_model = KeyBERT()
model_name = "textattack/roberta-base-ag-news" # "ckiplab/bert-base-chinese"
warnings.filterwarnings("ignore", message="Some weights of the model checkpoint.*")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


# Function to generate hashtags
def generate_hashtags(article, prop='summary', top_n=5, threshold=0.7):
    # Tokenize and encode the article content
    inputs = tokenizer(article[prop], return_tensors="pt", truncation=True, padding=True)

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Assuming the model outputs logits for classification
    predicted_labels = torch.argmax(outputs.logits, dim=-1).tolist()
    predicted_class = outputs.logits.argmax().item()
    # print(f"{predicted_class=}")

    # Map predicted labels to hashtags (this part will depend on your specific implementation)
    # For demonstration, let's assume we have a predefined mapping
    label_to_hashtag = {
        0: "world",       # World news
        1: "sports",      # Sports news
        2: "business",    # Business news
        3: "technology",  # Sci/Tech news
    }

    # Future fine-tuning for more classification
    # label_to_hashtag = {
    #     0: "news",          # 一般新聞
    #     1: "technology",    # 科技新聞
    #     2: "health",        # 健康新聞
    #     3: "sports",        # 體育新聞
    #     4: "business",      # 商業新聞
    #     5: "entertainment", # 娛樂新聞
    #     6: "politics",      # 政治新聞
    #     7: "environment",   # 環境新聞
    #     8: "education",     # 教育新聞
    #     9: "world",         # 國際新聞
    #     10: "lifestyle",    # 生活風格新聞
    #     11: "science",      # 科學新聞
    #     12: "culture",      # 文化新聞
    #     13: "travel",       # 旅遊新聞
    #     14: "food",         # 美食新聞
    #     15: "finance",      # 財經新聞
    #     16: "crime",        # 犯罪新聞
    #     17: "weather",      # 天氣新聞
    #     18: "opinion",      # 評論或觀點
    #     19: "breakingnews", # 突發新聞
    #     # Add more mappings as needed
    # }

    tags = [label_to_hashtag[label] for label in predicted_labels if label in label_to_hashtag]

    keywords = [tag for tag, score in kw_model.extract_keywords(
                        article[prop],
                        keyphrase_ngram_range=(1, 2),  # Extract unigrams and bigrams
                        stop_words='english',          # Remove common English stop words
                        top_n=top_n,                   # Get top N keywords
                    ) if score > threshold] or []

    article.update({
        'tags': tags + keywords
    })

    return article['tags']

Some weights of the model checkpoint at textattack/roberta-base-ag-news were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [88]:
# Testing example articles
dummy_articles = [
    {
        'title': 'Breaking News: Example Title',
        'summary': 'This is an example summary of the news that discusses various topics.'
    },
    {
        'title': 'Another News Title',
        'summary': 'This article covers advancements in technology and health.'
    }
]

# Generate hashtags for articles
for article in dummy_articles:
    tags = generate_hashtags(article, prop='summary', threshold=0.6)
    # Output results
    print(f"Title: {article['title']}, Tags: {', '.join(tags)}")


Title: Breaking News: Example Title, Tags: technology, summary news, news discusses, example summary
Title: Another News Title, Tags: technology, advancements technology, technology health


## get_news_from_rss

In [89]:
import re
import feedparser
from datetime import datetime
# from googletrans import Translator
from deep_translator import GoogleTranslator


def translate_text(text, src='en', dest='zh-tw'):
    # Option1: from googletrans import Translator
    # return Translator().translate(text, src=src.lower(), dest=dest.lower()).text

    # Option2: from deep_translator import GoogleTranslator
    # return GoogleTranslator(source=src, target=dest).translate(text)
    if dest == 'zh-tw':
        dest = 'zh-TW'
    translator = GoogleTranslator(source=src, target=dest)
    return translator.translate(text)


def parse_news_feed(url, num_articles=5):
    feed = feedparser.parse(url)
    return feed.entries[:num_articles]


def get_news_from_rss(rss_url, rss_name, num_articles=5):
    entries = parse_news_feed(rss_url)
    articles = []
    for entry in entries:
        article = {
            'source': rss_name,
            'title': entry.title,
            'pubdate': entry.published if 'published' in entry else "No published date",
            'link': entry.link,
            'summary': entry.summary if 'summary' in entry else '',
            'tags': [rss_name]
        }

        pub_date = article['pubdate']
        # Define the format
        date_format_rfc822 = "%a, %d %b %Y %H:%M:%S %Z"
        date_format_rfc822_tz = "%a, %d %b %Y %H:%M:%S %z"
        date_format_iso8601 = "%Y-%m-%dT%H:%M:%SZ"  # New format

        # Convert pub_date to a standard format if it's available
        if pub_date != "No published date":
            try:
                # Try parsing with ISO 8601 first (most common)
                parsed_date = datetime.strptime(pub_date, date_format_iso8601)
                print(f"[{rss_name}] Parsed Date (ISO8601):", parsed_date)
            except ValueError:
                try:
                    # If ISO 8601 fails, try parsing with RFC 822 with timezone
                    if re.search(r'[-+]', pub_date):
                        parsed_date = datetime.strptime(pub_date, date_format_rfc822_tz)
                        print(f"[{rss_name}] Parsed Date (RFC822 with TZ):", parsed_date)
                    else:
                        # If no timezone info, try parsing with RFC 822 without timezone
                        parsed_date = datetime.strptime(pub_date, date_format_rfc822)
                        print(f"[{rss_name}] Parsed Date (RFC822):", parsed_date)

                except ValueError as e:
                    print(f"[{rss_name}] Failed to parse date:", e)
                    parsed_date = None  # Ensure parsed_date is None if parsing fails
                    formatted_date = "Invalid date format" # handle this later
            if parsed_date:  # Only format if parsing was successful
                formatted_date = parsed_date.strftime('%Y-%m-%d %H:%M:%S')
            else:
                formatted_date = "Invalid date format" # keep the "Invalid date format" message
        else:
            formatted_date = "No published date"

        article.update({
            'published_date': formatted_date
        })

        articles.append(article)
    return articles

# RSS Sources

In [90]:
import os
import json


dummy_sources = [
    {'name': 'BBC', 'url': 'https://feeds.bbci.co.uk/news/rss.xml'},
    # {'name': 'CNN', 'url': 'https://rss.cnn.com/rss/cnn_topstories.rss'},
    {'name': 'CNN', 'url': 'http://rss.cnn.com/rss/cnn_world.rss'},
]

world_sources = [
    {'name': 'BBC', 'url': 'https://feeds.bbci.co.uk/news/rss.xml'},  # BBC News
    {'name': 'CNN', 'url': 'http://rss.cnn.com/rss/cnn_world.rss'},  # CNN World News
    {'name': 'Reuters', 'url': 'http://feeds.reuters.com/reuters/topNews'},  # Reuters Top News
    {'name': 'Al Jazeera', 'url': 'http://www.aljazeera.com/xml/rss/all.xml'},  # Al Jazeera News
    {'name': 'The Guardian', 'url': 'https://www.theguardian.com/world/rss'},  # The Guardian World News
    {'name': 'New York Times', 'url': 'https://rss.nytimes.com/services/xml/rss/nyt/World.xml'},  # NYT World News
    {'name': 'France 24', 'url': 'https://www.france24.com/en/rss'},  # France 24 News
    {'name': 'Bloomberg', 'url': 'https://www.bloomberg.com/feed/podcast/etf-report.xml'},  # Bloomberg News
    {'name': 'NPR', 'url': 'https://www.npr.org/rss/rss.php?id=1001'},  # NPR News
]

business_sources = [
    {'name': 'BBC Business', 'url': 'http://feeds.bbci.co.uk/news/business/rss.xml'},  # BBC Business News
    {'name': 'CNN Business', 'url': 'http://rss.cnn.com/rss/cnn_business.rss'},  # CNN Business News
    {'name': 'Reuters Business', 'url': 'http://feeds.reuters.com/reuters/businessNews'},  # Reuters Business News
    {'name': 'Al Jazeera Business', 'url': 'https://www.aljazeera.com/xml/rss/business.xml'}, # Al Jazeera Business
    {'name': 'The Guardian Business', 'url': 'https://www.theguardian.com/business/rss'},  # The Guardian Business News
    {'name': 'New York Times Business', 'url': 'https://rss.nytimes.com/services/xml/rss/nyt/Business.xml'},  # NYT Business News
    {'name': 'France 24 Business', 'url': 'https://www.france24.com/en/rss/economy'},  # France 24 Economy (Business)
    {'name': 'Bloomberg Markets', 'url': 'https://www.bloomberg.com/rss/markets'},  # Bloomberg Markets News
    {'name': 'NPR Business', 'url': 'https://www.npr.org/rss/rss.php?id=1006'},  # NPR Business News
]

news_sources = [
    {'category': 'world', 'sources': world_sources},
    {'category': 'business', 'sources': business_sources},
]


# Replace 'rss_sources.json' with the actual file path
file_path = './data/rss_sources.json'
if os.path.exists(file_path):
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        if data and len(data) > 0:
            news_sources = data
    except Exception as e:
        print(f"Error reading JSON file: {e}")
else:
    print(f"File '{file_path}' not found.")

len(news_sources), [item['category'] for item in news_sources]

(2, ['world', 'business'])

# Start retrieving RSS news

In [91]:
news_data = news_sources.copy()

for group in news_data:
    group['data'] = []
    for source in group['sources']:
        # print(source)
        name = source['name']
        url = source['url']
        # entries = parse_news_feed(url)
        entries = get_news_from_rss(url, name)
        print(f"Source {name} {len(entries)=}")
        group['data'].extend(entries)

len(news_data), news_data[0]['data'][0]

[BBC] Parsed Date (RFC822): 2025-02-06 03:26:47
[BBC] Parsed Date (RFC822): 2025-02-05 18:08:06
[BBC] Parsed Date (RFC822): 2025-02-06 07:12:40
[BBC] Parsed Date (RFC822): 2025-02-06 00:20:38
[BBC] Parsed Date (RFC822): 2025-02-06 00:08:55
Source BBC len(entries)=5
[CNN] Parsed Date (RFC822): 2023-04-14 20:00:28
[CNN] Parsed Date (RFC822): 2023-04-14 20:07:40
[CNN] Parsed Date (RFC822): 2023-04-14 14:29:52
[CNN] Parsed Date (RFC822): 2023-04-14 06:41:00
[CNN] Parsed Date (RFC822): 2023-04-13 21:13:04
Source CNN len(entries)=5
Source Reuters len(entries)=0
[Al Jazeera] Parsed Date (RFC822 with TZ): 2025-02-06 07:01:54+00:00
[Al Jazeera] Parsed Date (RFC822 with TZ): 2025-02-06 06:37:12+00:00
[Al Jazeera] Parsed Date (RFC822 with TZ): 2025-02-06 06:10:51+00:00
[Al Jazeera] Parsed Date (RFC822 with TZ): 2025-02-06 03:42:23+00:00
[Al Jazeera] Parsed Date (RFC822 with TZ): 2025-02-06 01:29:04+00:00
Source Al Jazeera len(entries)=5
[The Guardian] Parsed Date (RFC822): 2025-02-05 20:09:00
[Th

(2,
 {'source': 'BBC',
  'title': "UN chief warns against 'ethnic cleansing' in Gaza",
  'pubdate': 'Thu, 06 Feb 2025 03:26:47 GMT',
  'link': 'https://www.bbc.com/news/articles/c4gw89x8x11o',
  'summary': 'The Palestinian president, Hamas and neighbouring countries condemn the idea of resettling Gazans abroad.',
  'tags': ['BBC'],
  'published_date': '2025-02-06 03:26:47'})

In [92]:
# Preview data
data = [data for group in news_data for data in group['data']]
top_n_news = []
top_n_news.append(data[0])
top_n_news.append(data[-1])

pprint(top_n_news)

[{'link': 'https://www.bbc.com/news/articles/c4gw89x8x11o',
  'pubdate': 'Thu, 06 Feb 2025 03:26:47 GMT',
  'published_date': '2025-02-06 03:26:47',
  'source': 'BBC',
  'summary': 'The Palestinian president, Hamas and neighbouring countries '
             'condemn the idea of resettling Gazans abroad.',
  'tags': ['BBC'],
  'title': "UN chief warns against 'ethnic cleansing' in Gaza"},
 {'link': 'https://www.npr.org/2025/02/05/nx-s1-5286733/stradivarius-violin-record-auction-history-joshua-bell',
  'pubdate': 'Wed, 05 Feb 2025 05:00:00 -0500',
  'published_date': '2025-02-05 05:00:00',
  'source': 'NPR Business',
  'summary': "Stradivarius violins often sell for millions. There's a long "
             'history behind them, and violinists who swear they sound better '
             'than modern ones.',
  'tags': ['NPR Business'],
  'title': 'A Stradivarius violin could sell for a record sum at auction. Is '
           'it worth the hype?'}]


# Start translating news and generating tags

In [93]:
translated_news = news_data.copy()

for group in translated_news:
    for article in group['data']:
        print(article)
        translated_title = translate_text(article['title'])
        summary = ''.join(article['summary']) if article['summary'] else 'Empty.'

        translated_summary = translate_text(summary)

        article.update({
            'summary': summary,
            'translated_title': translated_title,
            'translated_summary': translated_summary,
        })

for group in translated_news:
    for article in group['data']:
        tags = generate_hashtags(article, prop='summary')
        # Output results
        print(f"Title: {article['title']}, Tags: {', '.join(tags)}")

len(translated_news), translated_news[0]['data'][0]

{'source': 'BBC', 'title': "UN chief warns against 'ethnic cleansing' in Gaza", 'pubdate': 'Thu, 06 Feb 2025 03:26:47 GMT', 'link': 'https://www.bbc.com/news/articles/c4gw89x8x11o', 'summary': 'The Palestinian president, Hamas and neighbouring countries condemn the idea of resettling Gazans abroad.', 'tags': ['BBC'], 'published_date': '2025-02-06 03:26:47'}
{'source': 'BBC', 'title': "Jeremy Bowen: Trump's Gaza plan won't happen, but it will have consequences", 'pubdate': 'Wed, 05 Feb 2025 18:08:06 GMT', 'link': 'https://www.bbc.com/news/articles/cx2pwjgp59do', 'summary': 'The US president creates uncertainty and injects more instability, our international editor writes.', 'tags': ['BBC'], 'published_date': '2025-02-05 18:08:06'}
{'source': 'BBC', 'title': 'Bereaved families angry at plan to dismantle Grenfell Tower', 'pubdate': 'Thu, 06 Feb 2025 07:12:40 GMT', 'link': 'https://www.bbc.com/news/articles/cq5g99xy979o', 'summary': 'The government plans to take the tower, where a fire kil

(2,
 {'source': 'BBC',
  'title': "UN chief warns against 'ethnic cleansing' in Gaza",
  'pubdate': 'Thu, 06 Feb 2025 03:26:47 GMT',
  'link': 'https://www.bbc.com/news/articles/c4gw89x8x11o',
  'summary': 'The Palestinian president, Hamas and neighbouring countries condemn the idea of resettling Gazans abroad.',
  'tags': ['world'],
  'published_date': '2025-02-06 03:26:47',
  'translated_title': '聯合國秘書長警告不要在加薩進行“種族清洗”',
  'translated_summary': '巴勒斯坦總統、哈馬斯和鄰國譴責將加薩人移居海外的想法。'})

# HTML rendering

generate_html_layout

In [94]:
from bs4 import BeautifulSoup


def generate_html_layout1(header, articles):
    html = """
    <html>
    <body>
    <h1>{}</h1>
    {}
    </body>
    </html>
    """
    news_html = []
    for i, article in enumerate(articles, 1):
        print(f'Processing article {i}...')
        news_html.append(f"""
        <h2>{article['title']}<br/>{article['translated_title']}</h2>
        <p>{article['summary']}<br/>{article['translated_summary']}</p>
        """)
    return html.format(header, '<hr/>'.join(news_html))


def generate_html_layout2(header, articles):
    # Create a new BeautifulSoup object
    soup = BeautifulSoup(features="html.parser")

    # Create HTML structure
    html = soup.new_tag("html")
    head = soup.new_tag("head")
    title = soup.new_tag("title")
    title.string = header
    head.append(title)
    html.append(head)

    body = soup.new_tag("body")

    # Check if articles are provided
    if not articles:
        print("No articles to display.")
        return soup.prettify()  # Return an empty body if no articles

    for i, article in enumerate(articles, 1):
        print(f'Processing article {i}...')

        # ------
        # Header
        # ------
        # Create a div for each article
        div = soup.new_tag("div")

        # -----
        # Title
        # -----
        # Original Title
        h2 = soup.new_tag("h2")
        h2.string = article.get('title', 'No Title')  # Use .get() for safety
        div.append(h2)

        # Translated Title
        h3 = soup.new_tag("h3")
        h3.string = article.get('translated_title', 'No Translated Title')
        div.append(h3)

        # --------
        # Metadata
        # --------
        small = soup.new_tag("small")
        date = article.get('published_date', 'No Date')
        source_name = article.get('source', 'No Source')

        # Bold the source name
        bold_source = soup.new_tag("strong")
        bold_source.string = source_name

        # Create hashtags as blue links
        tags = article.get('tags', [])
        tag_links = []
        for tag in tags:
            a = soup.new_tag("a", href=f"https://example.com/tags/{tag}", style="color: blue; text-decoration: none;")
            a.string = f"#{tag}"  # Format as hashtag
            tag_links.append(a)

        # Initialize small.string with the date and append other elements
        small.string = f"Date: {date}, Source: "
        small.append(bold_source)  # Append bold source name

        # Append tags to small
        if tag_links:
            small.append(", Tags: ")
            for i, tag_link in enumerate(tag_links):
                if i > 0:
                    small.append(", ")  # Add a comma between tags
                small.append(tag_link)

        div.append(small)  # Append small to div

        # Separator
        hr = soup.new_tag("hr")
        div.append(hr)

        # -------
        # Summary
        # -------
        # Original Summary
        summary = article.get('summary', 'No Summary')
        if '<p>' in summary or '<em>' in summary:
            parsed_content = BeautifulSoup(summary, 'html.parser')
            div.append(parsed_content)
        else:
            p1 = soup.new_tag("p")
            p1.string = summary
            div.append(p1)

        # Translated Summary
        translated_summary = article.get('translated_summary', 'No Translated Summary')
        if '<p>' in translated_summary or '<em>' in summary:
            parsed_content = BeautifulSoup(translated_summary, 'html.parser')
            div.append(parsed_content)
        else:
            p2 = soup.new_tag("p")
            p2.string = translated_summary
            div.append(p2)

        # Link
        a = soup.new_tag("a", href=article.get('link', '#'))  # Default to '#' if no link
        a.string = "Read more"
        a['target'] = "_blank"  # Set target attribute
        a['rel'] = "noopener noreferrer"  # Set rel attribute
        div.append(a)

        # ------
        # Footer
        # ------
        br = soup.new_tag("br")
        body.append(div)
        body.append(br)
        body.append(br)

    html.append(body)  # Add <body> to <html>
    soup.append(html)  # Attach <html> to the BeautifulSoup object

    return soup.prettify()  # Return the prettified HTML structure

In [95]:
dummy_articles = [
    {
        'source': 'BBC',
        'title': 'Breaking News: Example Title',
        'translated_title': '翻譯標題',
        'summary': 'This is an example summary of the news.',
        'translated_summary': '這是新聞的例子摘要。',
        'link': 'http://example.com/news1',
        'tags': ['#world'],
    },
    {
        'source': 'CNN',
        'title': 'Another News Title',
        'translated_title': '另一個翻譯標題',
        'summary': 'This is another summary.',
        'translated_summary': '這是另一個摘要。',
        'link': 'http://example.com/news2',
        'tags': ['#world'],
    }
]

# html_content1 = generate_html_layout1('Top News', dummy_articles)
# html_content2 = generate_html_layout2('Top News', dummy_articles)
for group in translated_news:
    group['html_content_simple'] = generate_html_layout1(group['category'].upper(), group['data'])
    print(f"[{group['category']}] html layout1", group['html_content_simple'][:100])

for group in translated_news:
    group['html_content_default'] = generate_html_layout2(group['category'].upper(), group['data'])
    print(f"[{group['category']}] html layout2", group['html_content_default'][:100])

Processing article 1...
Processing article 2...
Processing article 3...
Processing article 4...
Processing article 5...
Processing article 6...
Processing article 7...
Processing article 8...
Processing article 9...
Processing article 10...
Processing article 11...
Processing article 12...
Processing article 13...
Processing article 14...
Processing article 15...
Processing article 16...
Processing article 17...
Processing article 18...
Processing article 19...
Processing article 20...
Processing article 21...
Processing article 22...
Processing article 23...
Processing article 24...
Processing article 25...
Processing article 26...
Processing article 27...
Processing article 28...
Processing article 29...
Processing article 30...
Processing article 31...
Processing article 32...
Processing article 33...
Processing article 34...
Processing article 35...
[world] html layout1 
    <html>
    <body>
    <h1>WORLD</h1>
    
        <h2>UN chief warns against 'ethnic cleansing'
Processing a

# Save news and preview HTML

In [96]:
from google.colab import drive
from datetime import datetime

drive.mount('/content/gdrive')

today_date = datetime.now().strftime('%Y%m%d')

# Save to google drive
for group in translated_news:
    group['filename'] = f"news-{today_date}-{group['category']}.html"
    html_content = group['html_content_default']

    filename = group['filename']
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html_content)

    with open(f'/content/gdrive/MyDrive/Colab Notebooks/crawler/{filename}', 'w', encoding='utf-8') as f:
        f.write(html_content)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [97]:
from IPython.display import display, HTML


for group in translated_news:
    html_content = f"""
    <h1>{group['category'].upper()}</h1>
    <div style="border: 1px solid #ddd; padding: 10px; margin-bottom: 20px;">
        {group['html_content_default']}
    </div>
    """
    display(HTML(html_content))


# Download news

Download to desktop from browser or comment out if not needed

In [100]:
import time
from google.colab import files


print([group['filename'] for group in translated_news])

# files.download(translated_news[0]['filename'])
# files.download(translated_news[-1]['filename'])
for group in reversed(translated_news):
    print(f"[{group['category']}] Downloading {group['filename']}...")
    try:
        files.download(group['filename'])
        print(f"[{group['category']}] Downloaded {group['filename']}")
        # time.sleep(5)
        break
    except:
        print(f"[{group['category']}] Error downloading {group['filename']}")

['news-20250206-world.html', 'news-20250206-business.html']
[business] Downloading news-20250206-business.html...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[business] Downloaded news-20250206-business.html
