In [1]:
# --- Import Libraries ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# --- Load Dataset (tab-separated file) ---
df = pd.read_csv("bbc-news-data.csv", sep="\t")

# Combine title + content as input text
df["text"] = df["title"] + " " + df["content"]

# Features and labels
X = df["text"]
y = df["category"]

# --- Split dataset ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Convert text to TF-IDF ---
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# --- Train Naive Bayes Classifier ---
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# --- Predictions ---
y_pred = clf.predict(X_test_tfidf)

# --- Evaluation ---
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9865168539325843

Classification Report:
                precision    recall  f1-score   support

     business       0.99      0.96      0.98       102
entertainment       1.00      1.00      1.00        77
     politics       0.98      0.99      0.98        84
        sport       1.00      1.00      1.00       102
         tech       0.96      0.99      0.98        80

     accuracy                           0.99       445
    macro avg       0.99      0.99      0.99       445
 weighted avg       0.99      0.99      0.99       445



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load Kaggle News JSON dataset
data = pd.read_json("News_Category_Dataset_v3.json", lines=True)

# Keep relevant fields
df = data[['short_description', 'category']].rename(columns={'short_description':'text'})

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category'], test_size=0.2, random_state=42
)

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

          ARTS       0.50      0.01      0.01       293
ARTS & CULTURE       1.00      0.00      0.01       275
  BLACK VOICES       0.52      0.03      0.06       889
      BUSINESS       0.46      0.18      0.26      1216
       COLLEGE       0.50      0.00      0.01       202
        COMEDY       0.44      0.04      0.07      1022
         CRIME       0.41      0.15      0.22       713
CULTURE & ARTS       1.00      0.01      0.03       202
       DIVORCE       0.82      0.30      0.44       664
     EDUCATION       0.00      0.00      0.00       209
 ENTERTAINMENT       0.28      0.47      0.35      3419
   ENVIRONMENT       1.00      0.03      0.06       313
         FIFTY       0.00      0.00      0.00       263
  FOOD & DRINK       0.50      0.51      0.51      1270
     GOOD NEWS       0.00      0.00      0.00       270
         GREEN       0.38      0.03      0.05       532
HEALTHY LIVING       0.49      0.01      0.03  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
from sklearn.cluster import KMeans
import numpy as np

# Suppose 'articles' is a DataFrame of scraped VOA/BBC news
articles = pd.DataFrame({"text": [
    "Government investigates corruption scandal involving ministers",
    "Tech company launches new AI-powered smartphone",
    "Stock markets fall amid global financial crisis",
    "Pakistan wins cricket series against India",
    "Cybercrime rates increase worldwide"
]})

# TF-IDF
X_tfidf = vectorizer.fit_transform(articles['text'])

# KMeans clustering (5 topics)
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_tfidf)

# Assign clusters
articles['cluster'] = kmeans.labels_

# Show top words per cluster
terms = vectorizer.get_feature_names_out()
for i in range(5):
    top_words_idx = np.argsort(kmeans.cluster_centers_[i])[-10:]
    top_words = [terms[j] for j in top_words_idx]
    print(f"Cluster {i}: {', '.join(top_words)}")

print(articles)


Cluster 0: increase, cricket, corruption, tech, smartphone, powered, new, launches, company, ai
Cluster 1: crisis, cricket, corruption, company, amid, investigates, increase, cybercrime, rates, worldwide
Cluster 2: cricket, company, amid, india, investigates, government, ministers, scandal, corruption, involving
Cluster 3: crisis, corruption, company, amid, worldwide, wins, pakistan, cricket, series, india
Cluster 4: company, india, worldwide, markets, global, financial, fall, crisis, stock, amid
                                                text  cluster
0  Government investigates corruption scandal inv...        2
1    Tech company launches new AI-powered smartphone        0
2    Stock markets fall amid global financial crisis        4
3         Pakistan wins cricket series against India        3
4                Cybercrime rates increase worldwide        1


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# -------------------------------
# Step 1: Scrape BBC RSS feed
# -------------------------------
url = "https://feeds.bbci.co.uk/news/world/rss.xml"
response = requests.get(url)
soup = BeautifulSoup(response.content, "xml")

titles = [item.title.text for item in soup.find_all("item")]
descriptions = [item.description.text for item in soup.find_all("item")]

articles = pd.DataFrame({"title": titles, "text": descriptions})

# -------------------------------
# Step 2: TF-IDF
# -------------------------------
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(articles['text'])

# -------------------------------
# Step 3: KMeans Clustering
# -------------------------------
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_tfidf)

articles['cluster'] = kmeans.labels_

# Show clusters with top words
terms = vectorizer.get_feature_names_out()
import numpy as np
for i in range(5):
    top_words_idx = np.argsort(kmeans.cluster_centers_[i])[-10:]
    top_words = [terms[j] for j in top_words_idx]
    print(f"Cluster {i}: {', '.join(top_words)}")

# -------------------------------
# Step 4: User enters a news article
# -------------------------------
user_input = input("Enter a news article: ")

# Vectorize user input
user_vec = vectorizer.transform([user_input])

# Predict cluster
pred_cluster = kmeans.predict(user_vec)[0]
print(f"\nThis article belongs to Cluster {pred_cluster}")


Cluster 0: conspiracy, tried, treason, women, similar, rape, stories, thought, arrest, police
Cluster 1: unpack, lisa, means, takes, official, bank, central, aim, president, trump
Cluster 2: attendance, meet, letter, lent, names, open, staff, fema, recently, said
Cluster 3: travis, album, announcing, relationship, swift, people, video, drone, 30, new
Cluster 4: opened, june, country, 000, come, killed, says, bbc, people, year


Enter a news article:  "The prime minister announced new election reforms after corruption allegations."



This article belongs to Cluster 4


In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# ----------------------------
# 1. Supervised Model (Kaggle News Dataset)
# ----------------------------
# Load Kaggle labeled dataset (after downloading JSON)
kaggle_data = pd.read_json("News_Category_Dataset_v3.json", lines=True)
df = kaggle_data[['short_description', 'category']].rename(columns={'short_description':'text'})

X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category'], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

print("Supervised Model Performance:")
print(classification_report(y_test, clf.predict(X_test_tfidf)))

# ----------------------------
# 2. Unsupervised Model (BBC RSS Feed)
# ----------------------------
url = "https://feeds.bbci.co.uk/news/world/rss.xml"
response = requests.get(url)
soup = BeautifulSoup(response.content, "xml")

titles = [item.title.text for item in soup.find_all("item")]
descriptions = [item.description.text for item in soup.find_all("item")]
articles = pd.DataFrame({"text": descriptions})

X_tfidf_unsup = vectorizer.transform(articles['text'])
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_tfidf_unsup)
articles['cluster'] = kmeans.labels_

# Show clusters with top words
terms = vectorizer.get_feature_names_out()
for i in range(5):
    top_words_idx = np.argsort(kmeans.cluster_centers_[i])[-10:]
    top_words = [terms[j] for j in top_words_idx]
    print(f"Cluster {i}: {', '.join(top_words)}")

# ----------------------------
# 3. Hybrid Prediction
# ----------------------------
user_input = input("\nEnter a news article: ")

# Supervised prediction
user_vec = vectorizer.transform([user_input])
category = clf.predict(user_vec)[0]

# Unsupervised cluster assignment
cluster = kmeans.predict(user_vec)[0]

print(f"\nArticle: {user_input}")
print(f"Supervised Category: {category}")
print(f"Unsupervised Topic Cluster: {cluster}")


Supervised Model Performance:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

          ARTS       0.50      0.01      0.01       293
ARTS & CULTURE       1.00      0.00      0.01       275
  BLACK VOICES       0.52      0.03      0.06       889
      BUSINESS       0.46      0.18      0.26      1216
       COLLEGE       0.50      0.00      0.01       202
        COMEDY       0.44      0.04      0.07      1022
         CRIME       0.41      0.15      0.22       713
CULTURE & ARTS       1.00      0.01      0.03       202
       DIVORCE       0.82      0.30      0.44       664
     EDUCATION       0.00      0.00      0.00       209
 ENTERTAINMENT       0.28      0.47      0.35      3419
   ENVIRONMENT       1.00      0.03      0.06       313
         FIFTY       0.00      0.00      0.00       263
  FOOD & DRINK       0.50      0.51      0.51      1270
     GOOD NEWS       0.00      0.00      0.00       270
         GREEN       0.38      0.03      0.05       532
HEALTHY LIVING       0.49      0.01      0.03  


Enter a news article:  "Prime Minister faces corruption allegations during election campaign."



Article: "Prime Minister faces corruption allegations during election campaign."
Supervised Category: POLITICS
Unsupervised Topic Cluster: 4
