Installing Libraries

In [1]:
# Install necessary libraries if not already installed
!pip install feedparser requests beautifulsoup4

# Import required libraries
import feedparser
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=ce8c01ddc757862d078477123d7b76cb82594086686ce197fd9cbae9343aaaca
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


Defining RSS feed

In [2]:
# Define RSS feed sources for each category
rss_feeds = {
    "General News": [
        "http://feeds.bbci.co.uk/news/world/rss.xml",                     # BBC World News
        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",         # The New York Times
        "https://www.reuters.com/rssFeed/topNews",                        # Reuters
    ],
    "Technology": [
        "http://feeds.feedburner.com/TechCrunch/",                        # TechCrunch
        "https://www.wired.com/feed/rss",                                 # Wired
        "https://www.technologyreview.com/feed/",                         # MIT Technology Review
        "https://feeds.arstechnica.com/arstechnica/technology-lab",       # Ars Technica
    ],
    "Finance": [
        "https://www.bloomberg.com/feed/podcast.xml",                     # Bloomberg
        "https://www.ft.com/?format=rss",                                 # Financial Times
        "https://www.cnbc.com/id/10000664/device/rss/rss.html",           # CNBC
        "https://www.coindesk.com/arc/outboundfeeds/rss/",                # CoinDesk
        "https://www.forbes.com/investing/feed/",                         # Forbes
    ],
    "Sports": [
        "https://www.espn.com/espn/rss/news",                             # ESPN
        "http://feeds.bbci.co.uk/sport/rss.xml",                          # BBC Sport
        "https://www.skysports.com/rss/12040",                            # Sky Sports
        "https://theathletic.com/feed/",                                  # The Athletic
        "https://www.skysports.com/f1/rss/12040",                         # Sky Sports F1
    ],
    "Entertainment": [
        "https://variety.com/feed/",                                      # Variety
        "https://www.hollywoodreporter.com/t/feed/",                      # Hollywood Reporter
        "https://www.billboard.com/feed/",                                # Billboard
        "https://www.rollingstone.com/feed/",                             # Rolling Stone
    ],
    "Science": [
        "https://www.nasa.gov/rss/dyn/breaking_news.rss",                 # NASA Breaking News
        "https://www.sciencedaily.com/rss/all.xml",                       # Science Daily
        "https://www.nature.com/subjects/rss.xml",                        # Nature
        "https://feeds.arstechnica.com/arstechnica/science",              # Ars Technica Science
    ]
}


RSS Feed Parser for Category-Wise News Extraction

In [3]:
import feedparser

# Function to parse RSS feeds for each category
def parse_feeds(rss_feeds):
    news_data = {}
    for category, urls in rss_feeds.items():
        news_data[category] = []  # Initialize empty list for each category
        print(f"\n🔍 Fetching news for {category}...")

        # Loop through each URL in the category
        for url in urls:
            feed = feedparser.parse(url)

            # Check if the feed is valid
            if feed.bozo:
                print(f"⚠️ Could not parse {url}")
                continue

            # Extract article details
            for entry in feed.entries:
                article = {
                    "title": entry.get("title", "No Title"),
                    "link": entry.get("link", "No Link"),
                    "published": entry.get("published", "No Date"),
                    "summary": entry.get("summary", "No Summary"),
                }
                news_data[category].append(article)

            print(f"✅ {len(feed.entries)} articles found from {url}")

    return news_data

# Parse the RSS feeds
news_articles = parse_feeds(rss_feeds)



🔍 Fetching news for General News...
✅ 26 articles found from http://feeds.bbci.co.uk/news/world/rss.xml
✅ 58 articles found from https://rss.nytimes.com/services/xml/rss/nyt/World.xml
⚠️ Could not parse https://www.reuters.com/rssFeed/topNews

🔍 Fetching news for Technology...
✅ 20 articles found from http://feeds.feedburner.com/TechCrunch/
✅ 50 articles found from https://www.wired.com/feed/rss
✅ 10 articles found from https://www.technologyreview.com/feed/
✅ 20 articles found from https://feeds.arstechnica.com/arstechnica/technology-lab

🔍 Fetching news for Finance...
⚠️ Could not parse https://www.bloomberg.com/feed/podcast.xml
✅ 9 articles found from https://www.ft.com/?format=rss
✅ 30 articles found from https://www.cnbc.com/id/10000664/device/rss/rss.html
✅ 24 articles found from https://www.coindesk.com/arc/outboundfeeds/rss/
⚠️ Could not parse https://www.forbes.com/investing/feed/

🔍 Fetching news for Sports...
✅ 30 articles found from https://www.espn.com/espn/rss/news
✅ 68 

Formatted Display of Category-Wise News Articles

In [4]:
def display_news(news_data, category=None):
    """
    Displays news articles in a clean format.

    Args:
    news_data (dict): The fetched news articles organized by category.
    category (str): The specific category to display (optional).
    """
    # Display a specific category or all categories
    categories = [category] if category else news_data.keys()

    for cat in categories:
        if cat not in news_data or not news_data[cat]:
            print(f"\n⚠️ No articles found for {cat}")
            continue

        print(f"\n📢 --- {cat.upper()} NEWS ---\n")
        for i, article in enumerate(news_data[cat][:5], 1):  # Show top 5 articles
            print(f"{i}. 📰 {article['title']}")
            print(f"   🔗 Link: {article['link']}")
            print(f"   🕰️ Published: {article['published']}")
            print(f"   📝 Summary: {article['summary'][:200]}...\n")  # Show first 200 chars
            print("-" * 80)

# Test to display all categories
display_news(news_articles)



📢 --- GENERAL NEWS NEWS ---

1. 📰 Air strike kills senior Hamas official in Gaza
   🔗 Link: https://www.bbc.com/news/articles/cq5zxe5l58go
   🕰️ Published: Sun, 23 Mar 2025 05:49:01 GMT
   📝 Summary: Salah al-Bardaweel and his wife died in the southern city of Khan Younis, a Hamas source told the BBC....

--------------------------------------------------------------------------------
2. 📰 Pope Francis to be discharged from hospital
   🔗 Link: https://www.bbc.com/news/articles/crrdv84rg4do
   🕰️ Published: Sat, 22 Mar 2025 18:42:22 GMT
   📝 Summary: The Pope's doctor said his life "was in danger" during his five weeks in hospital....

--------------------------------------------------------------------------------
3. 📰 Trump envoy dismisses Starmer plan for Ukraine
   🔗 Link: https://www.bbc.com/news/articles/c62zm4eqvp7o
   🕰️ Published: Sun, 23 Mar 2025 00:01:17 GMT
   📝 Summary: Steve Witkoff says the UK plans for an international force to support a ceasefire are a "posture"....

-

Combining All Articles

In [5]:
# Combine all articles into a single list
def combine_articles(news_data):
    combined_news = []
    for category, articles in news_data.items():
        for article in articles:
            # Add category information for context
            article['category'] = category
            combined_news.append(article)
    return combined_news

# Combine the articles
combined_news_list = combine_articles(news_articles)
print(f"Total Articles Combined: {len(combined_news_list)}")


Total Articles Combined: 485


Keyword-Based Article Filtering Using TF-IDF and Cosine Similarity

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Function to filter articles by keyword similarity
def filter_by_keyword_similarity(articles, keywords, top_n=5):
    # Combine title and summary for better context
    documents = [article['title'] + " " + article['summary'] for article in articles]

    # Create TF-IDF matrix
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Vectorize the keyword query
    query_vec = vectorizer.transform([keywords])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Add similarity scores to articles
    for i, score in enumerate(similarity_scores):
        articles[i]['similarity'] = score

    # Sort articles by similarity score
    sorted_articles = sorted(articles, key=lambda x: x['similarity'], reverse=True)

    # Return top N relevant articles
    return sorted_articles[:top_n]

# Example usage: Define keywords
user_keywords = "artificial intelligence technology innovation"

# Filter top 5 most relevant articles
top_relevant_articles = filter_by_keyword_similarity(combined_news_list, user_keywords, top_n=5)

# Display results
print(f"\nTop {len(top_relevant_articles)} Relevant Articles:\n")
for i, article in enumerate(top_relevant_articles, 1):
    print(f"{i}. {article['title']} (Category: {article['category']})")
    print(f"Link: {article['link']}")
    print(f"Similarity Score: {article['similarity']:.2f}\n")



Top 5 Relevant Articles:

1. DeepSeek AI cranks open the spigots on Chinese venture capital (Category: Finance)
Link: https://www.cnbc.com/2025/03/12/deepseek-ai-cranks-open-the-spigots-on-chinese-venture-capital.html
Similarity Score: 0.19

2. AI-powered mammograms: A new window into heart health (Category: Science)
Link: https://www.sciencedaily.com/releases/2025/03/250320145457.htm
Similarity Score: 0.16

3. 4 technologies that could power the future of energy (Category: Technology)
Link: https://www.technologyreview.com/2025/03/19/1113381/energy-technology-lasers-steel-batteries/
Similarity Score: 0.16

4. The elephant in the room for energy tech? Uncertainty. (Category: Technology)
Link: https://www.technologyreview.com/2025/03/20/1113392/energy-uncertainty/
Similarity Score: 0.12

5. New CRISPR tool enables more seamless gene editing -- and improved disease modeling (Category: Science)
Link: https://www.sciencedaily.com/releases/2025/03/250320145239.htm
Similarity Score: 0.08



The Keyword-Based Article Filtering using TF-IDF and Cosine Similarity is not yielding satisfactory accuracy. Therefore, we will now attempt to improve the results by leveraging the BERT model.

In [33]:
!pip install -q sentence-transformers


In [8]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load pre-trained BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ BERT model loaded successfully.")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ BERT model loaded successfully.


Article Embedding and Query Encoding Using BERT

In [9]:
# Combine all articles from different categories
articles = []
for category, article_list in news_articles.items():
    articles.extend(article_list)  # Merging articles from all categories

# Verify if articles are combined
print(f"✅ Total Articles Combined: {len(articles)}")

# Combine title + summary for each article
documents = [article['title'] + " " + article['summary'] for article in articles]

# Encode all articles into dense vectors
print("\n🔍 Encoding articles with BERT...")
article_embeddings = model.encode(documents, convert_to_tensor=True)

# Encode the user's query
user_query = "artificial intelligence technology innovation"  # Example query
query_embedding = model.encode(user_query, convert_to_tensor=True)

# Check dimensions
print(f"🔢 Article Embedding Shape: {article_embeddings.shape}")
print(f"🔢 Query Embedding Shape: {query_embedding.shape}")


✅ Total Articles Combined: 485

🔍 Encoding articles with BERT...
🔢 Article Embedding Shape: torch.Size([485, 384])
🔢 Query Embedding Shape: torch.Size([384])


Timezone-Aware Date Parsing Using dateutil in Python

In [10]:
from dateutil import parser
from dateutil.tz import gettz

# Example input timestamp with EST
timestamp = "Fri, 21 Mar 2025 18:42:25 EST"

# Parse with timezone awareness
dt = parser.parse(timestamp, tzinfos={"EST": gettz("America/New_York")})
print(dt)


2025-03-21 18:42:25-04:00


In [11]:
tzinfos = {"EST": gettz("America/New_York"), "EDT": gettz("America/New_York")}
dt = parser.parse(timestamp, tzinfos=tzinfos)


In [12]:
from dateutil import parser
from dateutil.tz import gettz

def parse_date(date_str):
    tzinfos = {
        "EST": gettz("America/New_York"),
        "EDT": gettz("America/New_York"),
        "PST": gettz("America/Los_Angeles"),
        "UTC": gettz("UTC")
    }
    try:
        return parser.parse(date_str, tzinfos=tzinfos)
    except Exception as e:
        print(f"⚠️ Date parsing failed for: {date_str} | Error: {e}")
        return None


BERT-Based Keyword Filtering for Relevant News Article

In [13]:
!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

# Initialize BERT model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and efficient

# Define keywords for filtering
keywords = ["AI", "Machine Learning", "Finance", "Technology", "Sports", "Entertainment"]

# Function to combine all articles into a single list
def combine_articles(news_data):

    combined_news = []
    for category, articles in news_data.items():
        for article in articles:
            # Add category information for context
            article['category'] = category
            combined_news.append(article)
    return combined_news

# Combine articles from news_articles
combined_news_list = combine_articles(news_articles)
print(f"✅ Total Articles Combined: {len(combined_news_list)}")

# Convert to DataFrame for better handling
df = pd.DataFrame(combined_news_list)

# Check if DataFrame has data
if df.empty:
    print("⚠️ No articles found. Please check your data sources.")
else:
    print(f"Total Articles in DataFrame: {len(df)}")
    display(df.head())

    # Concatenate title and summary for richer embeddings
    df['content'] = df['title'] + " " + df['summary']

    # Encode article content and keywords using BERT
    print("\n🚀 Encoding articles and keywords using BERT...")
    article_embeddings = model.encode(df['content'].tolist(), convert_to_tensor=True)
    keyword_embeddings = model.encode(keywords, convert_to_tensor=True)

    # Compute cosine similarity between articles and keywords
    cosine_scores = util.cos_sim(article_embeddings, keyword_embeddings)

    # Get the highest similarity score for each article
    max_similarities = torch.max(cosine_scores, dim=1).values

    # Add similarity scores to the DataFrame
    df['similarity_score'] = max_similarities.cpu().numpy()

    # Set threshold for filtering relevant articles
    threshold = 0.3  # Adjust for stricter or looser filtering
    filtered_df = df[df['similarity_score'] > threshold].sort_values(by='similarity_score', ascending=False)

    # Show filtered results
    print(f"\n🔍 Filtered Articles (Score > {threshold}): {len(filtered_df)}")
    display(filtered_df.head(10))


✅ Total Articles Combined: 485
Total Articles in DataFrame: 485


Unnamed: 0,title,link,published,summary,category,similarity
0,Air strike kills senior Hamas official in Gaza,https://www.bbc.com/news/articles/cq5zxe5l58go,"Sun, 23 Mar 2025 05:49:01 GMT",Salah al-Bardaweel and his wife died in the so...,General News,0.0
1,Pope Francis to be discharged from hospital,https://www.bbc.com/news/articles/crrdv84rg4do,"Sat, 22 Mar 2025 18:42:22 GMT","The Pope's doctor said his life ""was in danger...",General News,0.0
2,Trump envoy dismisses Starmer plan for Ukraine,https://www.bbc.com/news/articles/c62zm4eqvp7o,"Sun, 23 Mar 2025 00:01:17 GMT",Steve Witkoff says the UK plans for an interna...,General News,0.0
3,"Earrings worth $769,500 recovered by Florida p...",https://www.bbc.com/news/articles/crrdvy4pvpeo,"Sat, 22 Mar 2025 23:09:25 GMT",The man allegedly swallowed two pairs of earri...,General News,0.0
4,Israel strikes Lebanon after first rocket atta...,https://www.bbc.com/news/articles/cn4ynpzk8d8o,"Sat, 22 Mar 2025 21:20:42 GMT","Lebanon's health ministry says seven people, i...",General News,0.0



🚀 Encoding articles and keywords using BERT...

🔍 Filtered Articles (Score > 0.3): 47


Unnamed: 0,title,link,published,summary,category,similarity,content,similarity_score
171,Researchers astonished by tool’s apparent succ...,https://arstechnica.com/ai/2025/03/researchers...,"Fri, 14 Mar 2025 20:03:41 +0000","Anthropic trains custom AI to hide objectives,...",Technology,0.0,Researchers astonished by tool’s apparent succ...,0.512849
141,Satellite Internet Will Enable AI in Everything,https://www.wired.com/story/satellite-internet...,"Thu, 20 Mar 2025 09:30:00 +0000",AI-powered agents need to be connected all the...,Technology,0.0,Satellite Internet Will Enable AI in Everythin...,0.511288
164,Cloudflare turns AI against itself with endles...,https://arstechnica.com/ai/2025/03/cloudflare-...,"Fri, 21 Mar 2025 21:14:35 +0000",New approach punishes AI companies that ignore...,Technology,0.0,Cloudflare turns AI against itself with endles...,0.504136
181,What does “PhD-level” AI mean? OpenAI’s rumore...,https://arstechnica.com/ai/2025/03/what-does-p...,"Fri, 07 Mar 2025 22:54:09 +0000",Silicon Valley may value imperfect virtual PhD...,Technology,0.0,What does “PhD-level” AI mean? OpenAI’s rumore...,0.482615
144,OpenAI’s Deep Research Agent Is Coming for Whi...,https://www.wired.com/story/openais-deep-resea...,"Wed, 19 Mar 2025 16:00:00 +0000",The research-focused agent shows how a new gen...,Technology,0.0,OpenAI’s Deep Research Agent Is Coming for Whi...,0.481795
441,How family background can help lead to athleti...,https://www.sciencedaily.com/releases/2025/03/...,"Thu, 20 Mar 2025 14:48:19 EDT",Americans have long believed that sports are o...,Science,0.0,How family background can help lead to athleti...,0.477758
85,"LimeWire AI Studio Review 2023: Details, Prici...",https://techncruncher.blogspot.com/2023/12/lim...,"Tue, 12 Dec 2023 16:10:00 +0000","<p style=""text-align: left;""><span style=""font...",Technology,0.009042,"LimeWire AI Studio Review 2023: Details, Prici...",0.472307
183,CMU research shows compression alone may unloc...,https://arstechnica.com/ai/2025/03/compression...,"Thu, 06 Mar 2025 23:22:05 +0000",New research challenges prevailing idea that A...,Technology,0.0,CMU research shows compression alone may unloc...,0.463306
152,Synchron’s Brain-Computer Interface Now Has Nv...,https://www.wired.com/story/synchrons-brain-co...,"Wed, 19 Mar 2025 12:00:00 +0000",The company has partnered with Nvidia to devel...,Technology,0.0,Synchron’s Brain-Computer Interface Now Has Nv...,0.433733
178,OpenAI pushes AI agent capabilities with new d...,https://arstechnica.com/ai/2025/03/openai-push...,"Tue, 11 Mar 2025 20:42:17 +0000",New tools may help fulfill CEO's claim that ag...,Technology,0.0,OpenAI pushes AI agent capabilities with new d...,0.428945


Filter articles with similarity score > 0.6

In [14]:
# Filter articles with similarity score > 0.6
filtered_df_high = filtered_df[filtered_df['similarity_score'] > 0.6]

# Display the number of articles after filtering
print(f"🔍 Filtered Articles (Score > 0.6): {len(filtered_df_high)}")
filtered_df_high.head()


🔍 Filtered Articles (Score > 0.6): 0


Unnamed: 0,title,link,published,summary,category,similarity,content,similarity_score


Retrieving Top Relevant Articles Using BERT and Cosine Similarity

In [16]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')  # Lighter but effective
embeddings = model.encode(filtered_df['content'].tolist(), convert_to_tensor=True)
query_embedding = model.encode("Latest AI technology news", convert_to_tensor=True)

# Find top 5 most relevant articles
cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)
top_results = torch.topk(cosine_scores, k=5)
print(top_results)


torch.return_types.topk(
values=tensor([[0.5532, 0.5526, 0.5359, 0.5261, 0.5217]]),
indices=tensor([[26, 15, 17,  5,  1]]))


In [17]:
# Define news_sources for multiple categories
news_sources = {}
for category, feeds in rss_feeds.items():
    news_sources[category] = feeds


Fetching News Articles from the Past 7 Days Using RSS Feeds

In [18]:
import datetime
import feedparser

# Function to fetch news for the past 7 days
def fetch_news_past_7_days():
    news_articles = {}
    today = datetime.datetime.now()

    # Loop through the past 7 days
    for i in range(7):
        date = (today - datetime.timedelta(days=i)).strftime("%Y-%m-%d")
        print(f"Fetching news for {date}...")

        daily_news = []

        # Loop through each category and its feeds
        for category, feeds in rss_feeds.items():
            for feed_url in feeds:
                try:
                    feed = feedparser.parse(feed_url)
                    for entry in feed.entries:
                        published = entry.get("published", "No Date")
                        title = entry.get("title", "No Title")
                        link = entry.get("link", "No Link")
                        summary = entry.get("summary", "No Summary")

                        # Add to daily news
                        daily_news.append({
                            "title": title,
                            "link": link,
                            "published": published,
                            "summary": summary,
                            "category": category
                        })
                except Exception as e:
                    print(f"❌ Error fetching {feed_url}: {e}")

        news_articles[date] = daily_news
        print(f"✅ Fetched {len(daily_news)} articles for {date}\n")

    return news_articles

# Execute the fetching process
news_articles = fetch_news_past_7_days()
print(f"✅ Total Dates Fetched: {len(news_articles)}")


Fetching news for 2025-03-23...
✅ Fetched 485 articles for 2025-03-23

Fetching news for 2025-03-22...
✅ Fetched 485 articles for 2025-03-22

Fetching news for 2025-03-21...
✅ Fetched 485 articles for 2025-03-21

Fetching news for 2025-03-20...
✅ Fetched 485 articles for 2025-03-20

Fetching news for 2025-03-19...
✅ Fetched 485 articles for 2025-03-19

Fetching news for 2025-03-18...
✅ Fetched 485 articles for 2025-03-18

Fetching news for 2025-03-17...
✅ Fetched 485 articles for 2025-03-17

✅ Total Dates Fetched: 7


In [19]:
# Combine articles into a single list
def combine_articles(news_articles):
    combined_news = []
    for date, articles in news_articles.items():
        for article in articles:
            combined_news.append(article)
    return combined_news

combined_news_list = combine_articles(news_articles)
print(f"✅ Total Articles Combined: {len(combined_news_list)}")


✅ Total Articles Combined: 3395


Data Preprocessing

In [20]:
import pandas as pd

# Convert combined news to DataFrame
df = pd.DataFrame(combined_news_list)

# Check for missing values and handle them
df.fillna("Missing", inplace=True)

# Combine 'title' and 'summary' for meaningful context
df['content'] = df['title'] + " " + df['summary']

# Drop duplicates if any
df.drop_duplicates(subset=['content'], inplace=True)

print(f"✅ Total Articles After Preprocessing: {len(df)}")
df.head(5)


✅ Total Articles After Preprocessing: 483


Unnamed: 0,title,link,published,summary,category,content
0,Air strike kills senior Hamas official in Gaza,https://www.bbc.com/news/articles/cq5zxe5l58go,"Sun, 23 Mar 2025 05:49:01 GMT",Salah al-Bardaweel and his wife died in the so...,General News,Air strike kills senior Hamas official in Gaza...
1,Pope Francis to be discharged from hospital,https://www.bbc.com/news/articles/crrdv84rg4do,"Sat, 22 Mar 2025 18:42:22 GMT","The Pope's doctor said his life ""was in danger...",General News,Pope Francis to be discharged from hospital Th...
2,Trump envoy dismisses Starmer plan for Ukraine,https://www.bbc.com/news/articles/c62zm4eqvp7o,"Sun, 23 Mar 2025 00:01:17 GMT",Steve Witkoff says the UK plans for an interna...,General News,Trump envoy dismisses Starmer plan for Ukraine...
3,"Earrings worth $769,500 recovered by Florida p...",https://www.bbc.com/news/articles/crrdvy4pvpeo,"Sat, 22 Mar 2025 23:09:25 GMT",The man allegedly swallowed two pairs of earri...,General News,"Earrings worth $769,500 recovered by Florida p..."
4,Israel strikes Lebanon after first rocket atta...,https://www.bbc.com/news/articles/cn4ynpzk8d8o,"Sat, 22 Mar 2025 21:20:42 GMT","Lebanon's health ministry says seven people, i...",General News,Israel strikes Lebanon after first rocket atta...


Encoding the Data Using BERT

In [21]:
from sentence_transformers import SentenceTransformer
import torch

# Initialize BERT model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode all articles (content column)
print("🚀 Encoding articles using BERT (This may take a few minutes)...")
article_embeddings = model.encode(df['content'].tolist(), batch_size=32, show_progress_bar=True)

# Convert embeddings to Tensor for easier computation
article_embeddings = torch.tensor(article_embeddings)
print(f"✅ Encoding complete! Shape: {article_embeddings.shape}")


🚀 Encoding articles using BERT (This may take a few minutes)...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

✅ Encoding complete! Shape: torch.Size([483, 384])


Retrieving Top 10 Relevant Articles Using BERT and Cosine Similarity

In [22]:
# Define your search query
user_query = "Artificial Intelligence advancements"

# Encode the query
query_embedding = model.encode([user_query], convert_to_tensor=True)

# Compute cosine similarity between query and articles
cos_sim = torch.nn.functional.cosine_similarity(query_embedding, article_embeddings)

# Get top 10 most similar articles
top_k = 10
top_results = torch.topk(cos_sim, k=top_k)

print(f"🔍 Top {top_k} most relevant articles for '{user_query}':\n")
for idx in top_results.indices:
    # Convert tensor to integer
    idx = idx.item()
    print(f"Title: {df.iloc[idx]['title']}")
    print(f"Link: {df.iloc[idx]['link']}")
    print(f"Similarity Score: {cos_sim[idx].item():.4f}")
    print("-" * 80)



🔍 Top 10 most relevant articles for 'Artificial Intelligence advancements':

Title: Satellite Internet Will Enable AI in Everything
Link: https://www.wired.com/story/satellite-internet-will-let-us-put-ai-in-everything/
Similarity Score: 0.4951
--------------------------------------------------------------------------------
Title: Anthropic’s new AI search feature digs through the web for answers
Link: https://arstechnica.com/ai/2025/03/anthropics-new-ai-search-feature-digs-through-the-web-for-answers/
Similarity Score: 0.4793
--------------------------------------------------------------------------------
Title: CMU research shows compression alone may unlock AI puzzle-solving abilities
Link: https://arstechnica.com/ai/2025/03/compression-conjures-apparent-intelligence-in-new-puzzle-solving-ai-approach/
Similarity Score: 0.4663
--------------------------------------------------------------------------------
Title: OpenAI pushes AI agent capabilities with new developer API
Link: https:/

In [23]:
# Define your search query
user_query = "Advances in large language models"

# Encode the query
query_embedding = model.encode([user_query], convert_to_tensor=True)

# Compute cosine similarity between query and articles
cos_sim = torch.nn.functional.cosine_similarity(query_embedding, article_embeddings)

# Get top 10 most similar articles
top_k = 10
top_results = torch.topk(cos_sim, k=top_k)

print(f"🔍 Top {top_k} most relevant articles for '{user_query}':\n")
for idx in top_results.indices:
    # Convert tensor to integer
    idx = idx.item()
    print(f"Title: {df.iloc[idx]['title']}")
    print(f"Link: {df.iloc[idx]['link']}")
    print(f"Similarity Score: {cos_sim[idx].item():.4f}")
    print("-" * 80)

🔍 Top 10 most relevant articles for 'Advances in large language models':

Title: My date used AI to psychologically profile me. Is that OK?
Link: https://www.ft.com/content/b21eaff7-7189-49a2-b791-209e8de98494
Similarity Score: 0.3292
--------------------------------------------------------------------------------
Title: Study finds AI-generated meme captions funnier than human ones on average
Link: https://arstechnica.com/ai/2025/03/ai-beats-humans-at-meme-humor-but-the-best-joke-is-still-human-made/
Similarity Score: 0.2972
--------------------------------------------------------------------------------
Title: OpenAI’s Deep Research Agent Is Coming for White-Collar Work
Link: https://www.wired.com/story/openais-deep-research-agent-is-coming-for-white-collar-work/
Similarity Score: 0.2728
--------------------------------------------------------------------------------
Title: Anthropic’s new AI search feature digs through the web for answers
Link: https://arstechnica.com/ai/2025/03/ant

In [24]:
!pip install sentence_transformers




Top 10 Relevant Articles Using 'all-mpnet-base-v2' and Cosine Similarity

all-mpnet-base-v2 is a pre-trained sentence transformer model from the SentenceTransformers library, designed for creating high-quality sentence embeddings. It leverages MPNet architecture, offering superior performance in semantic search, clustering, and sentence similarity tasks.

In [25]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load the new encoding model
print("🚀 Loading the 'all-mpnet-base-v2' model...")
model = SentenceTransformer('all-mpnet-base-v2')  # Switched from BERT

# Define the user query
user_query = "Advances in large language models"

# Encode the articles and the query
print("🚀 Encoding articles using the new model (This may take a few minutes)...")
article_embeddings = model.encode(df['title'].tolist(), convert_to_tensor=True)
query_embedding = model.encode(user_query, convert_to_tensor=True)

# Calculate similarity scores (Cosine Similarity)
print("🔍 Calculating cosine similarity...")
cos_sim = util.cos_sim(query_embedding, article_embeddings)

# Get top 10 most relevant results
top_k = 10  # You can adjust this to get more results if needed
top_results = torch.topk(cos_sim, k=top_k)

# Convert tensor to a list of integers
top_indices = top_results.indices[0].tolist()

print(f"\n🔍 Top {top_k} most relevant articles for '{user_query}':\n")
for idx in top_indices:
    print(f"Title: {df.iloc[idx]['title']}")
    print(f"Link: {df.iloc[idx]['link']}")
    print(f"Similarity Score: {cos_sim[0][idx].item():.4f}")
    print("-" * 80)


🚀 Loading the 'all-mpnet-base-v2' model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🚀 Encoding articles using the new model (This may take a few minutes)...
🔍 Calculating cosine similarity...

🔍 Top 10 most relevant articles for 'Advances in large language models':

Title: Inside Google’s Two-Year Frenzy to Catch Up With OpenAI
Link: https://www.wired.com/story/google-openai-gemini-chatgpt-artificial-intelligence/
Similarity Score: 0.3472
--------------------------------------------------------------------------------
Title: ProWritingAid VS Grammarly: Which Grammar Checker is Better in (2022) ?
Link: https://techncruncher.blogspot.com/2022/03/prowritingaid-vs-grammarly-which.html
Similarity Score: 0.2972
--------------------------------------------------------------------------------
Title: Roundtables: AI Chatbots Have Joined the Chat
Link: https://www.technologyreview.com/2025/03/20/1113501/roundtables-ai-chatbots-have-joined-the-chat/
Similarity Score: 0.2671
--------------------------------------------------------------------------------
Title: Ginger VS Grammarl

Sentence Embedding with 'multi-qa-MiniLM-L6-cos-v1

multi-qa-MiniLM-L6-cos-v1 is a lightweight SentenceTransformer model optimized for semantic search and question-answering tasks, offering efficient performance with minimal resource usage.

In [26]:
      # Alternative model if MiniLM is inaccessible
      model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
      print("🚀 Loading 'multi-qa-MiniLM-L6-cos-v1' model...")

      # Encoding the articles
      article_embeddings = model.encode(df['content'].tolist(), show_progress_bar=True)
      print(f"✅ Encoding complete! Shape: {article_embeddings.shape}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🚀 Loading 'multi-qa-MiniLM-L6-cos-v1' model...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

✅ Encoding complete! Shape: (483, 384)


Using Cosine Similarity to measure Top Relevant Articles

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Define your query
user_query = "Advances in large language models"  # You can modify this query
top_k = 10  # Number of top results to retrieve

# Encode the query
query_embedding = model.encode(user_query, convert_to_tensor=True)
print("✅ Query encoding complete!")

# Calculate cosine similarity
cos_sim = cosine_similarity(article_embeddings, query_embedding.unsqueeze(0))
cos_sim = torch.tensor(cos_sim)

# Get top-k results
top_results = torch.topk(cos_sim.flatten(), k=top_k)

# ✅ Convert indices to numpy integers for proper DataFrame indexing
top_indices = top_results.indices.cpu().numpy()

# Display the results
print(f"\n🔍 Top {top_k} most relevant articles for '{user_query}':\n")
for idx in top_indices:
    print(f"Title: {df.iloc[int(idx)]['title']}")
    print(f"Link: {df.iloc[int(idx)]['link']}")
    print(f"Similarity Score: {cos_sim[idx].item():.4f}")
    print("-" * 80)


✅ Query encoding complete!

🔍 Top 10 most relevant articles for 'Advances in large language models':

Title: Groundbreaking study unveils new complexities in synchronization phenomena
Link: https://www.sciencedaily.com/releases/2025/03/250321163525.htm
Similarity Score: 0.3446
--------------------------------------------------------------------------------
Title: New DESI results strengthen hints that dark energy may evolve
Link: https://www.sciencedaily.com/releases/2025/03/250320214311.htm
Similarity Score: 0.2965
--------------------------------------------------------------------------------
Title: Evidence Grows That Dark Energy Changes Over Time
Link: https://www.wired.com/story/hints-grow-stronger-that-dark-energy-changes-over-time/
Similarity Score: 0.2950
--------------------------------------------------------------------------------
Title: Hints grow stronger that dark energy changes over time
Link: https://arstechnica.com/science/2025/03/hints-grow-stronger-that-dark-energy

Using Dot product similarity to measure Top Relevant Articles

In [28]:
import torch

# Define your query
user_query = "Advances in large language models"
top_k = 10

# Encode the query
query_embedding = model.encode(user_query, convert_to_tensor=True)
print("✅ Query encoding complete with Dot Product Similarity!")

# Dot product similarity
dot_product = torch.matmul(torch.tensor(article_embeddings), query_embedding.unsqueeze(-1)).squeeze()

# Get top-k results
top_results = torch.topk(dot_product, k=top_k)

# Display the results
top_indices = top_results.indices.cpu().numpy()

print(f"\n🔍 Top {top_k} most relevant articles for '{user_query}' (Dot Product):\n")
for idx in top_indices:
    print(f"Title: {df.iloc[int(idx)]['title']}")
    print(f"Link: {df.iloc[int(idx)]['link']}")
    print(f"Similarity Score: {dot_product[idx].item():.4f}")
    print("-" * 80)


✅ Query encoding complete with Dot Product Similarity!

🔍 Top 10 most relevant articles for 'Advances in large language models' (Dot Product):

Title: Groundbreaking study unveils new complexities in synchronization phenomena
Link: https://www.sciencedaily.com/releases/2025/03/250321163525.htm
Similarity Score: 0.3446
--------------------------------------------------------------------------------
Title: New DESI results strengthen hints that dark energy may evolve
Link: https://www.sciencedaily.com/releases/2025/03/250320214311.htm
Similarity Score: 0.2965
--------------------------------------------------------------------------------
Title: Evidence Grows That Dark Energy Changes Over Time
Link: https://www.wired.com/story/hints-grow-stronger-that-dark-energy-changes-over-time/
Similarity Score: 0.2950
--------------------------------------------------------------------------------
Title: Hints grow stronger that dark energy changes over time
Link: https://arstechnica.com/science/20

Using Euclidean distance to measure Top Relevant Articles

In [29]:
from scipy.spatial.distance import cdist
import numpy as np

# Reshape embeddings for Euclidean distance
query_embedding_np = query_embedding.cpu().numpy().reshape(1, -1)
article_embeddings_np = np.array(article_embeddings)

# Calculate Euclidean distance
euclidean_distances = cdist(article_embeddings_np, query_embedding_np, metric='euclidean').flatten()

# Get top-k closest results (smallest distances)
top_indices = np.argsort(euclidean_distances)[:top_k]

print(f"\n🔍 Top {top_k} most relevant articles for '{user_query}' (Euclidean Distance):\n")
for idx in top_indices:
    print(f"Title: {df.iloc[int(idx)]['title']}")
    print(f"Link: {df.iloc[int(idx)]['link']}")
    print(f"Distance Score: {euclidean_distances[idx]:.4f}")  # Lower is better
    print("-" * 80)



🔍 Top 10 most relevant articles for 'Advances in large language models' (Euclidean Distance):

Title: Groundbreaking study unveils new complexities in synchronization phenomena
Link: https://www.sciencedaily.com/releases/2025/03/250321163525.htm
Distance Score: 1.1449
--------------------------------------------------------------------------------
Title: New DESI results strengthen hints that dark energy may evolve
Link: https://www.sciencedaily.com/releases/2025/03/250320214311.htm
Distance Score: 1.1862
--------------------------------------------------------------------------------
Title: Evidence Grows That Dark Energy Changes Over Time
Link: https://www.wired.com/story/hints-grow-stronger-that-dark-energy-changes-over-time/
Distance Score: 1.1874
--------------------------------------------------------------------------------
Title: Hints grow stronger that dark energy changes over time
Link: https://arstechnica.com/science/2025/03/hints-grow-stronger-that-dark-energy-changes-ove

We have been trying to achieve the highest accuracy, and so far, we have obtained the best results using the BERT(all-MiniLM-L6-v2) model. We have experimented with various models, such as multi-qa-MiniLM-L6-cos-v1, all-mpnet-base-v2, and Keyword-Based Article Filtering Using TF-IDF. Additionally, we have used different metrics to measure the similarity between the user query and relevant articles, including cosine similarity, Euclidean distance, and dot product similarity.

AI-Powered Personalized News Recommender with Persona-Based Querying

In [36]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

# Load the model with highest accuracy
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
print(f"🚀 Model Loaded: {model_name}")

# Define user personas
personas = {
    "alex parker": {"name": "Alex Parker (Tech Enthusiast)", "query": "AI, Cybersecurity, Blockchain, Startups, Programming"},
    "priya sharma": {"name": "Priya Sharma (Finance & Business Guru)", "query": "Global Markets, Startups, Fintech, Cryptocurrency, Economics"},
    "marco rossi": {"name": "Marco Rossi (Sports Journalist)", "query": "Football, F1, NBA, Olympic Sports, Esports"},
    "lisa thompson": {"name": "Lisa Thompson (Entertainment Buff)", "query": "Movies, Celebrity News, TV Shows, Music, Books"},
    "david martinez": {"name": "David Martinez (Science & Space Nerd)", "query": "Space Exploration, AI, Biotech, Physics, Renewable Energy"},
}

# Convert combined news to DataFrame
df = pd.DataFrame(combined_news_list)

# Check for missing values and handle them
df.fillna("Missing", inplace=True)

# Combine 'title' and 'summary' for meaningful context
df['content'] = df['title'] + " " + df['summary']

# Drop duplicates if any
df.drop_duplicates(subset=['content'], inplace=True)

print(f"✅ Total Articles After Preprocessing: {len(df)}")

# Function to display personas
def display_personas():
    print("\n🎭 Available Personas (Type the name or create your own query):\n")
    for key, value in personas.items():
        print(f"- {value['name']}")

# Function to get user query input
def get_user_query():
    display_personas()
    user_input = input("\n🔍 Enter a Persona Name or Your Own Query (Type 'exit' to quit): ").strip().lower()
    if user_input == "exit":
        return None
    if user_input in personas:
        print(f"\n💬 Selected Persona: {personas[user_input]['name']}")
        return personas[user_input]['query']
    else:
        print(f"\n💬 Custom Query Entered: {user_input}")
        return user_input

# Function to fetch relevant articles
def fetch_relevant_articles(user_query):
    # Encode the user query
    print(f"\n💻 Encoding user query: '{user_query}'...")
    query_embedding = model.encode(user_query, convert_to_tensor=True)

    # Encode articles
    print("📰 Encoding articles...")
    article_embeddings = model.encode(df['content'].tolist(), convert_to_tensor=True)

    # Compute similarity scores
    print("⚙️ Computing similarity scores...")
    similarity_scores = util.pytorch_cos_sim(query_embedding, article_embeddings)[0]

    # Add similarity scores to DataFrame
    df['similarity_score'] = similarity_scores.cpu().numpy()

    # Sort by similarity score
    df_sorted = df.sort_values(by='similarity_score', ascending=False).head(5)

    # Display top 5 relevant articles
    print("\n📣 Top 5 Relevant Articles:\n")
    for idx, row in df_sorted.iterrows():
        print(f"🔗 {row['title']}")
        print(f"🌍 Link: {row['link']}")
        print(f"🗞️ Summary: {row['summary']}\n")

# Loop to keep asking for queries until the user quits
while True:
    user_query = get_user_query()
    if user_query is None:
        print("\n🚪 Exiting the program. Goodbye!")
        break
    fetch_relevant_articles(user_query)


🚀 Model Loaded: all-MiniLM-L6-v2
✅ Total Articles After Preprocessing: 483

🎭 Available Personas (Type the name or create your own query):

- Alex Parker (Tech Enthusiast)
- Priya Sharma (Finance & Business Guru)
- Marco Rossi (Sports Journalist)
- Lisa Thompson (Entertainment Buff)
- David Martinez (Science & Space Nerd)

🔍 Enter a Persona Name or Your Own Query (Type 'exit' to quit): alex parker

💬 Selected Persona: Alex Parker (Tech Enthusiast)

💻 Encoding user query: 'AI, Cybersecurity, Blockchain, Startups, Programming'...
📰 Encoding articles...
⚙️ Computing similarity scores...

📣 Top 5 Relevant Articles:

🔗 Cloudflare turns AI against itself with endless maze of irrelevant facts
🌍 Link: https://arstechnica.com/ai/2025/03/cloudflare-turns-ai-against-itself-with-endless-maze-of-irrelevant-facts/
🗞️ Summary: New approach punishes AI companies that ignore "no crawl" directives.

🔗 Satellite Internet Will Enable AI in Everything
🌍 Link: https://www.wired.com/story/satellite-internet-