importing necessary libray

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re

function to import  the dataset from google drive 

In [5]:
def download_file(file_id, output):
    """Downloads a file from Google Drive using its file ID."""
    import gdown
    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, output, quiet=False)

function to load the dataset 

In [6]:
def load_data(filepath):
    """Loads data from a CSV file."""
    return pd.read_csv(filepath)

Preprocesses the dataset by cleaning and extracting useful information.

In [7]:
def preprocess_data(df):
    
    df['headline_length'] = df['headline'].str.len()
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.day_name()
    df['headline_clean'] = df['headline'].str.lower().str.replace('[^\w\s]', '', regex=True)
    return df

  df['headline_clean'] = df['headline'].str.lower().str.replace('[^\w\s]', '', regex=True)


Function to Prints basic statistics about the dataset.

In [9]:
def analyze_statistics(df):

    print("Headline Length Statistics:")
    print(df['headline_length'].describe())
    print("\nPublisher Counts:")
    print(df['publisher'].value_counts())

function to  Plots trends in article counts by year, month, and day of the week

In [10]:
def plot_trends(df):
    
    articles_per_year = df['year'].value_counts().sort_index()
    articles_per_month = df['month'].value_counts().sort_index()
    articles_by_day = df['day_of_week'].value_counts()

    plt.figure(figsize=(14, 8))
    plt.subplot(3, 1, 1)
    articles_per_year.plot(kind='bar', color='skyblue')
    plt.title('Articles Per Year')
    plt.subplot(3, 1, 2)
    articles_per_month.plot(kind='bar', color='orange')
    plt.title('Articles Per Month')
    plt.subplot(3, 1, 3)
    articles_by_day.plot(kind='bar', color='green')
    plt.title('Articles By Day of the Week')
    plt.tight_layout()
    plt.show()

function to Analyzes and categorizes the sentiment of headlines

In [None]:
def analyze_sentiments(df):
    
    def get_sentiment(text):
        analysis = TextBlob(text)
        return 'Positive' if analysis.sentiment.polarity > 0 else 'Negative' if analysis.sentiment.polarity < 0 else 'Neutral'

    df['sentiment'] = df['headline'].apply(get_sentiment)
    print("\nSentiment Counts:")
    print(df['sentiment'].value_counts())


function to Performs topic modeling on the dataset headlines.

In [12]:
def topic_modeling(df, n_topics=5):
    
    vectorizer = CountVectorizer(stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(df['headline'].dropna())
    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda_model.fit(doc_term_matrix)

    words = vectorizer.get_feature_names_out()
    print("\nTopics:")
    for idx, topic in enumerate(lda_model.components_):
        print(f"Topic {idx + 1}:")
        print([words[i] for i in topic.argsort()[-10:]])

Function to Plots publication trends over time.

In [13]:
def plot_publication_trends(df):
    
    df['date_only'] = df['date'].dt.date
    daily_publications = df.groupby('date_only').size()

    plt.figure(figsize=(12, 6))
    daily_publications.plot()
    plt.title("Article Publication Frequency Over Time")
    plt.xlabel("Date")
    plt.ylabel("Number of Articles")
    plt.xticks(rotation=45)
    plt.show()


function to Analyzes publishers and plots the top 10 publishers by article count.

In [15]:
def analyze_publishers(df):
    
    publisher_counts = df['publisher'].value_counts()

    plt.figure(figsize=(12, 6))
    publisher_counts.head(10).plot(kind='bar', color='skyblue')
    plt.title("Top 10 Publishers by Number of Articles")
    plt.xlabel("Publisher")
    plt.ylabel("Number of Articles")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
