In [None]:
#RSS Feed Parsing

In [1]:
import feedparser

# Feed URLs to ingest; each one is passed to feedparser.parse() by the loop below.
rss_feeds = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "http://qz.com/feed",
    "http://feeds.foxnews.com/foxnews/politics",
    "http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.feedburner.com/NewshourWorld",
    "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml"
]

# Flat list of article dicts collected from every feed in rss_feeds.
news_articles = []

for feed_url in rss_feeds:
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        # Feed elements are optional: attribute access (entry.summary, ...)
        # raises AttributeError when an element is absent, which would abort
        # the whole ingest on the first incomplete entry. Use .get() instead.
        news_articles.append({
            # Text fields default to '' so they can be concatenated safely.
            'title': entry.get('title', ''),
            'link': entry.get('link', ''),
            'summary': entry.get('summary', ''),
            # None when the feed gives no publication date for this entry.
            'published': entry.get('published'),
            'category': None  # This will be filled later
        })


ModuleNotFoundError: No module named 'feedparser'

In [None]:
#Text Categorization with NLTK:

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn import metrics

# Resources needed by word_tokenize() and stopwords.words().
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' package
# rather than 'punkt'; without it word_tokenize() raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample training data (you should replace this with a larger labeled dataset).
# Each item is a (label, text) pair: the category name comes first, the
# example text second. There is currently exactly one example per category.
training_data = [
    ('Terrorism', 'Content related to terrorism and political unrest'),
    ('Positive', 'Uplifting and positive news content'),
    ('NaturalDisasters', 'News related to natural disasters'),
    ('Others', 'Miscellaneous news content')
    # Add more labeled examples as needed
]

# Shared preprocessing resources: English stop-word set and a Porter stemmer.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def tokenize(text):
    """Split ``text`` into stemmed, lower-cased word tokens.

    Tokens that are not purely alphanumeric, and tokens whose lower-cased
    form is an English stop word, are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    stems = []
    for raw_token in word_tokenize(text):
        # Skip punctuation and anything containing non-alphanumeric characters.
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    return stems

# Text-classification pipeline: TF-IDF features built with the custom
# tokenizer, fed into a multinomial Naive Bayes classifier.
tfidf_step = TfidfVectorizer(tokenizer=tokenize)
naive_bayes_step = MultinomialNB()
model = make_pipeline(tfidf_step, naive_bayes_step)

# Split the training data.
# training_data items are (label, text), so the labels unpack first. The
# texts are the features (X) and the category names are the targets (y);
# unpacking as `X, y` would train the model on the labels instead.
y, X = zip(*training_data)
# NOTE: with the 4-example sample set, test_size=0.2 holds out one example,
# so that example's category is absent from training until more data is added.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict categories for news articles.
# All texts are classified in a single predict() call instead of one call per
# article; an empty article list is skipped rather than handed to scikit-learn.
if news_articles:
    article_texts = [item['title'] + ' ' + item['summary'] for item in news_articles]
    predicted_categories = model.predict(article_texts)
    for article, category in zip(news_articles, predicted_categories):
        # Store a plain Python str rather than the NumPy string scalar.
        article['category'] = str(category)


In [None]:
 #Database Storage

In [None]:
from sqlalchemy import create_engine, Column, Integer, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime

# Declarative base class that the ORM model below inherits from.
Base = declarative_base()

class NewsArticle(Base):
    """ORM model for one stored news article (table ``news_articles``)."""
    __tablename__ = 'news_articles'

    id = Column(Integer, primary_key=True)  # integer primary key
    title = Column(String)
    link = Column(String)
    summary = Column(String)
    published = Column(DateTime)  # publication time, stored as a datetime
    category = Column(String)  # category label assigned to the article

# SQLite engine backed by the file news_database.db.
engine = create_engine('sqlite:///news_database.db')
# Create the tables for every model registered on Base.
Base.metadata.create_all(engine)

# Session factory bound to the engine, and the session used for the inserts below.
Session = sessionmaker(bind=engine)
session = Session()

def _parse_published(raw):
    """Convert a feed's publication-date string into a ``datetime``.

    A single strptime format with ``%z`` rejects common feed dates such as
    ``'Mon, 01 Jan 2024 12:00:00 GMT'`` and every ISO 8601 timestamp, so this
    tries RFC 822/2822 parsing first and ISO 8601 second.

    Args:
        raw: The date string from the feed, or None/'' when it is missing.

    Returns:
        The parsed ``datetime``, or None if ``raw`` is empty or unparseable.
    """
    from email.utils import parsedate_to_datetime

    if not raw:
        return None
    try:
        return parsedate_to_datetime(raw)
    except (TypeError, ValueError):
        # Not RFC 822 (older Pythons raise TypeError here); try ISO 8601 next.
        pass
    iso_text = raw.strip()
    if iso_text.endswith('Z'):
        # fromisoformat() only accepts a trailing 'Z' on newer Python versions.
        iso_text = iso_text[:-1] + '+00:00'
    try:
        return datetime.fromisoformat(iso_text)
    except ValueError:
        return None


# Insert all articles in one transaction; roll back on failure so the
# session is left in a usable state, then re-raise for the caller to see.
try:
    session.add_all([
        NewsArticle(
            title=article['title'],
            link=article['link'],
            summary=article['summary'],
            published=_parse_published(article['published']),
            category=article['category']
        )
        for article in news_articles
    ])
    session.commit()
except Exception:
    session.rollback()
    raise


In [None]:

# Further enhancements:
# - Add error handling and logging for robustness.
# - Implement a web-based or command-line interface for users to interact with the application.
# - Schedule periodic updates to fetch new articles and update the database.
# - Train the categorization model on a more extensive labeled dataset: the sample training data
#   is only a placeholder and should be replaced with a comprehensive, representative dataset
#   for better categorization accuracy.
# - Fine-tune the NLTK-based categorization model, or explore other machine learning frameworks
#   for more advanced text classification models.