# Imports

In [1]:
import re
from pathlib import Path

import feedparser
import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Step 1 - Data set extraction

In [2]:
# Mapping of source label -> RSS feed URL; the label is what ends up in the
# "source" column of the collected data.
# NOTE: in the run captured in this notebook's output, MilitaryTimes,
# TheNationalInterest and TheEconomist returned no entries.
rss_dict = dict(
    WaronTheRocks='https://warontherocks.com/feed/',
    BreakingDefense='https://breakingdefense.com/full-rss-feed/?v=2',
    DefenseOne='https://www.defenseone.com/rss/all/',
    TheDrive='https://www.thedrive.com/rss',
    MilitaryTimes='https://www.militarytimes.com/rss/',
    TheNationalInterest='https://nationalinterest.org/rss.xml',
    TheNewYorkTimes='https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
    DefencePost='https://thedefensepost.com/feed/',
    TheEconomist='https://www.economist.com/sections/international/rss',
    BBC='http://feeds.bbci.co.uk/news/world/rss.xml',
    CNN='http://rss.cnn.com/rss/edition_world.rss',
    TheGuardian='https://www.theguardian.com/world/rss',
)


In [3]:
def parse_feed(feed: "feedparser.FeedParserDict") -> pd.DataFrame:
    """Convert the entries of a parsed feed into a DataFrame, one row per entry.

    Args:
        feed: Parsed feed object exposing an ``entries`` iterable of
            dict-like items (each must support ``.get``).

    Returns:
        DataFrame with the columns ``title``, ``link``, ``published`` and
        ``summary``. A field missing from an entry is stored as ``''``.
        The four columns are present even when the feed has no entries.
    """
    columns = ['title', 'link', 'published', 'summary']
    rows = [
        {column: entry.get(column, '') for column in columns}
        for entry in feed.entries
    ]
    # Passing the column list explicitly keeps the schema stable for an empty
    # feed; pd.DataFrame([]) alone would have no columns, so callers indexing
    # df["title"] would hit a KeyError.
    return pd.DataFrame(rows, columns=columns)

def fetch_all_rss(urls: dict) -> pd.DataFrame:
    """Download every feed in ``urls`` and stack the results into one DataFrame.

    Args:
        urls: Mapping of source name -> RSS feed URL.

    Returns:
        The per-feed frames concatenated with a fresh index. Each row also
        carries ``source`` (the mapping key) and ``text`` (title and summary
        joined by a single space). Feeds without entries are skipped; if no
        feed yields entries, an empty DataFrame is returned.
    """
    frames = []

    for name, url in urls.items():
        print(f"\nParsing feed from {name}...")
        parsed = feedparser.parse(url)

        # Guard clause: skip feeds that produced nothing usable.
        if 'entries' not in parsed or len(parsed.entries) == 0:
            print("⚠️ No entries found or feed is malformed.")
            continue

        print(f"✅ Found {len(parsed.entries)} entries.")
        frame = parse_feed(parsed).assign(
            source=name,
            text=lambda d: d["title"].fillna("") + " " + d["summary"].fillna(""),
        )
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Run parsing: fetch every feed listed in rss_dict and combine them into one DataFrame.
df_all = fetch_all_rss(rss_dict)
print(f"\n🔎 Total articles collected: {len(df_all)}")
# Bare last expression so the notebook renders a preview of the first rows.
df_all.head()





Parsing feed from WaronTheRocks...
✅ Found 100 entries.

Parsing feed from BreakingDefense...
✅ Found 30 entries.

Parsing feed from DefenseOne...
✅ Found 24 entries.

Parsing feed from TheDrive...
✅ Found 61 entries.

Parsing feed from MilitaryTimes...
⚠️ No entries found or feed is malformed.

Parsing feed from TheNationalInterest...
⚠️ No entries found or feed is malformed.

Parsing feed from TheNewYorkTimes...
✅ Found 26 entries.

Parsing feed from DefencePost...
✅ Found 10 entries.

Parsing feed from TheEconomist...
⚠️ No entries found or feed is malformed.

Parsing feed from BBC...
✅ Found 23 entries.

Parsing feed from CNN...
✅ Found 29 entries.

Parsing feed from TheGuardian...
✅ Found 45 entries.

🔎 Total articles collected: 348


Unnamed: 0,title,link,published,summary,source,text
0,How Iran Overplayed its Hand,https://warontherocks.com/2025/07/how-iran-ove...,"Mon, 14 Jul 2025 07:30:27 +0000",<p>After Israel struck Iran&#8217;s nuclear fa...,WaronTheRocks,How Iran Overplayed its Hand <p>After Israel s...
1,"Small State, Big Stakes: The Maldives and Stra...",https://warontherocks.com/2025/07/small-state-...,"Mon, 14 Jul 2025 07:15:02 +0000",<p>Located at the crossroads of vital Indian O...,WaronTheRocks,"Small State, Big Stakes: The Maldives and Stra..."
2,Toward a New Grand Design? Reviving Sully’s Le...,https://warontherocks.com/2025/07/toward-a-new...,"Fri, 11 Jul 2025 07:30:53 +0000",<p>Editor&#8217;s Note: This is part of a runn...,WaronTheRocks,Toward a New Grand Design? Reviving Sully’s Le...
3,Consequences Unfold,https://warontherocks.com/2025/07/consequences...,"Thu, 10 Jul 2025 17:30:57 +0000",<p>Welcome to The Adversarial.&#160;Every othe...,WaronTheRocks,Consequences Unfold <p>Welcome to The Adversar...
4,China’s Mineral Monopoly Demands Renewing the ...,https://warontherocks.com/2025/07/chinas-miner...,"Thu, 10 Jul 2025 07:30:26 +0000",<p>American auto manufacturers are preparing t...,WaronTheRocks,China’s Mineral Monopoly Demands Renewing the ...


In [4]:
# Persist the raw, uncleaned scrape to a CSV file.
raw_csv_path = Path("../data/raw_rss.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (a fresh checkout may not have ../data yet).
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_all.to_csv(raw_csv_path, index=False)