# Business News Summarizer

## Importing dependencies

In [23]:
from dotenv import load_dotenv
import os
import requests
import feedparser
from bs4 import BeautifulSoup
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from bs4 import BeautifulSoup

## Model parameters

In [4]:
model_path = 'microsoft/codebert-base'
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

In [5]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

In [6]:
model = model.to(device)

## Creating Dataframe

In [27]:
rss_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
feed = feedparser.parse(rss_url)

In [8]:
df_news = pd.read_csv('sample_news.csv')

## Embedding

In [9]:
titles = df_news['title'].tolist()[:50]  # 50 najnowszych tytułów
titles = df_news['title'].tolist()[:50]  # 50 najnowszych tytułów

In [11]:
with torch.no_grad():
    inputs = tokenizer(titles, padding=True, truncation=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model(**inputs)
    # Najczęściej używa się wektora CLS jako embeddingu
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

print("Embedding shape:", embeddings.shape) 

Embedding shape: (25, 768)


## Embedding prompt

In [12]:
prompt = "How the stocks of Tesla will change"


In [13]:
with torch.no_grad():
    prompt_inputs = tokenizer([prompt], padding=True, truncation=True, return_tensors="pt")
    prompt_inputs = {k: v.to(device) for k, v in prompt_inputs.items()}
    prompt_outputs = model(**prompt_inputs)
    prompt_embedding = prompt_outputs.last_hidden_state[:, 0, :].cpu().numpy()

In [18]:
similarities = cosine_similarity(prompt_embedding, embeddings)[0]
top5_idx = np.argsort(similarities)[-5:][::-1]

In [19]:
for idx in top5_idx:
    print(f"Title: {df_news['title'].iloc[idx]}")
    print(f"Similarity: {similarities[idx]:.4f}")
    print("---")

Title: Starbucks opens 1000th store in China
Similarity: 0.9982
---
Title: Intel to build new factory in US
Similarity: 0.9980
---
Title: Tesla shares surge after record deliveries
Similarity: 0.9979
---
Title: Disney+ reaches 200M subscribers
Similarity: 0.9977
---
Title: Microsoft invests in quantum computing startup
Similarity: 0.9976
---


## Parsing data from chosed articles

In [24]:
def get_article_text(url):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([p.get_text() for p in paragraphs])
        return text.strip()
    except Exception as e:
        return f"Error: {e}"

In [26]:
top5_df = df_news.iloc[top5_idx].reset_index(drop=True)
top5_df['content'] = top5_df['link'].apply(get_article_text)
top5_df


Unnamed: 0,title,link,summary,date,content
0,Starbucks opens 1000th store in China,https://www.bbc.com/news/business-21,Starbucks celebrates rapid expansion in Asia.,2025-12-29,"Sorry, we're unable to bring you the page you'..."
1,Intel to build new factory in US,https://www.bbc.com/news/business-10,Intel plans a $10B investment in a new US faci...,2026-01-03,"Sorry, we're unable to bring you the page you'..."
2,Tesla shares surge after record deliveries,https://www.bbc.com/news/business-1,Tesla delivered a record number of vehicles in...,2026-01-08,"Sorry, we're unable to bring you the page you'..."
3,Disney+ reaches 200M subscribers,https://www.bbc.com/news/business-20,Disney+ hits a major milestone in streaming wars.,2025-12-29,"Sorry, we're unable to bring you the page you'..."
4,Microsoft invests in quantum computing startup,https://www.bbc.com/news/business-4,Microsoft's investment aims to accelerate quan...,2026-01-06,"Sorry, we're unable to bring you the page you'..."
