In [1]:
pip install transformers




In [21]:
# Importing necessary libraries
import feedparser
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from datetime import datetime
import logging
from celery import Celery

# Defining SQLAlchemy Base
Base = declarative_base()

# Defining Article ORM model
class Article(Base):
    __tablename__ = 'articles'
    id = Column(Integer, primary_key=True)
    title = Column(String)
    content = Column(String)
    publication_date = Column(DateTime)
    source_url = Column(String, unique=True)
    category = Column(String)

# Initializing Celery
celery = Celery('news_processor', broker='redis://localhost:6379/0')

# Configuring logging
logging.basicConfig(filename='news_processing.log', level=logging.INFO)

# Function to parse RSS feeds
@celery.task
def parse_rss(feed_urls):
    articles = []
    for url in feed_urls:
        feed = feedparser.parse(url)
        
        for entry in feed.entries:
            
#           
            summary = entry.get('summary', '')
            category = entry.get('category', '')

            
            publication_date_str = entry.get('published', '')
            try:
                publication_date = datetime.strptime(publication_date_str, '%a, %d %b %Y %H:%M:%S %Z')
            except ValueError:
                publication_date = None
            article = {
                'title': entry.get('title', ''),
                'content': summary,
                'publication_date': publication_date,
                'source_url': entry.get('link', ''),
                'category': ''
            }
            articles.append(article)
            
#     for article in articles:
#         if article['category'] != 'others':
#             print(article)
            
    return articles

# Function to store articles in the database
@celery.task
def store_articles(articles):
    print("hai from ", "store_articles")
    engine = create_engine('sqlite:///news_db.sqlite')
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    for article_data in articles:
        try:
            article = Article(
                title=article_data['title'],
                content=article_data['content'],
                publication_date=article_data['publication_date'],
                source_url=article_data['source_url'],
                category=article_data['category']
            )
            session.add(article)
            session.commit()
        except Exception as e:
            logging.error(f"Error storing article: {str(e)}")
            session.rollback()

    logging.info("store_articles function executed successfully")  


# Function to classify articles
def classify_article(content):
    # For demonstration, let's assume a simple rule-based classification
    if "terrorism" in content.lower() or "protest" in content.lower() or "political unrest" in content.lower() or "riot" in content.lower():
        return "Terrorism / Protest / Political Unrest / Riot"
    elif "positive" in content.lower() or "uplifting" in content.lower():
        return "Positive/Uplifting"
    elif "natural disaster" in content.lower():
        return "Natural Disasters"
    else:
        return "Others modified"

# from transformers import TFAutoTokenizer, TFAutoModelForSequenceClassification

# # Load the tokenizer and model using TensorFlow weights
# tokenizer = TFAutoTokenizer.from_pretrained("textattack/bert-base-uncased-imdb")
# model = TFAutoModelForSequenceClassification.from_pretrained("textattack/bert-base-uncased-imdb")

# def classify_article(content):
#     if not content:
#         return "Others"  # If the content is empty, classify as "Others"

#     try:
#         # Tokenize the content
#         inputs = tokenizer(content, return_tensors="tf", padding=True, truncation=True)

#         # Forward pass through the model
#         outputs = model(inputs)

#         # Get the predicted label
#         predicted_label_id = tf.argmax(outputs.logits, axis=1).numpy()[0]
#         predicted_label = model.config.id2label[predicted_label_id]
        
#         print("Predicted label:", predicted_label)
#         return predicted_label
#     except Exception as e:
#         print(f"Error during classification: {str(e)}")
#         return "Others"


# Function to categorize articles
@celery.task
def categorize_articles():
    print("hai from categorize_articles")
    engine = create_engine('sqlite:///news_db.sqlite')
    Session = sessionmaker(bind=engine)
    session = Session()
    
    uncategorized_articles = session.query(Article).all()

    # Fetch uncategorized articles
#     uncategorized_articles = session.query(Article).filter(Article.category == '').all()
    
#     print("uncategorized_articles knri", uncategorized_articles)
#     return

    # Assuming you have a trained text classification model (model)
    for article in uncategorized_articles:
#         print("article is ", article.content)
        # Perform text classification on article content to determine its category
        category = classify_article(article.content)

        # Update the category in the database
        article.category = category
        session.add(article)

    # Commit changes to the database
    session.commit()

if __name__ == "__main__":
    # Run tasks
    feed_urls = [
        'http://rss.cnn.com/rss/cnn_topstories.rss',
        'http://qz.com/feed',
        'http://feeds.foxnews.com/foxnews/politics',
        'http://feeds.reuters.com/reuters/businessNews',
        'http://feeds.feedburner.com/NewshourWorld',
        'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml'
    ]
#     extracted_articles = parse_rss(feed_urls)
#     print("extracted_articles", extracted_articles)
#     store_articles(extracted_articles)


#     my_text = "The quick brown fox jumps over the riot dog"
#     result = classify_article(my_text)
#     print("result is ", result)
    categorize_articles()

#    # Sample text data for testing
#     sample_text_1 = "This article discusses the recent protests in the city."
#     sample_text_2 = "A heartwarming story about a community coming together after a natural disaster."
#     sample_text_3 = "The stock market saw a significant rise today, bringing positivity to investors."
#     sample_text_4 = "A report on the impact of climate change and the increasing frequency of natural disasters."

#     # Call the classification function for each sample text
#     category_1 = classify_article(sample_text_1)
#     category_2 = classify_article(sample_text_2)
#     category_3 = classify_article(sample_text_3)
#     category_4 = classify_article(sample_text_4)

#     # Print the predicted categories
#     print("Predicted Category for Text 1:", category_1)
#     print("Predicted Category for Text 2:", category_2)
#     print("Predicted Category for Text 3:", category_3)
#     print("Predicted Category for Text 4:", category_4)


Predicted Category for Text 1: Terrorism / Protest / Political Unrest / Riot
Predicted Category for Text 2: Natural Disasters
Predicted Category for Text 3: Others modified
Predicted Category for Text 4: Natural Disasters


In [24]:
import pandas as pd
from sqlalchemy import create_engine

# Connect to the SQLite database
engine = create_engine('sqlite:///news_db.sqlite')

# Query all articles from the database
query = "SELECT * FROM articles"
articles_df = pd.read_sql(query, engine)

# Display the DataFrame
print(articles_df)


     id                                              title  \
0   161  India calls Pakistan's claim of targeted killi...   
1   162  Tamil Nadu: Chennai’s 'last Jew' fights for pl...   
2   163  India v England: 'Hyderabad Heist shows Englan...   
3   164              Rohan Bopanna: 'Age is just a number'   
4   165  Adani Ports: The Tamil Nadu villagers taking o...   
5   166  Umar Khalid: Indian activist languishes in jai...   
6   167         Women take the lead in Republic Day parade   
7   168  Watch: Crowds at India's new Ram temple in Ayo...   
8   169       Indian PM Modi inaugurates temple in Ayodhya   
9   170            Plane gets jammed under bridge in India   
10  171       Intruder jumps on table in Indian parliament   
11  172      Moment worker is pulled from collapsed tunnel   
12  173     First video of trapped tunnel workers in India   

                                              content  \
0   Islamabad on Thursday accused India of killing...   
1   Davvid Levi h

In [25]:
# Export the DataFrame to a CSV file
articles_df.to_csv('articles.csv', index=False)
