## **Data Understanding**

In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tldextract
from mlflow.models.signature import infer_signature
from scipy.stats import describe
from tabulate import tabulate

# Preprocessing and model selection
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Text processing and feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Topic modeling and clustering
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Classification and evaluation
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# NLTK resources used further down:
#   stopwords / wordnet -> stop-word filtering and lemmatisation
#   punkt / punkt_tab   -> tokenizer models needed by nltk.word_tokenize
# ('punkt_tab' is the name recent NLTK releases look up; on older releases that
#  download simply reports the package as unknown and 'punkt' is used instead.)
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

**Import Data**

In [None]:
# Locations of the three input CSV files (relative to the notebook)
rating_path, domains_location_path, traffic_path = (
    '../data/rating.csv',
    '../data/domains_location.csv',
    '../data/traffic.csv',
)

# Read every file into its own DataFrame
rating_df, domains_location_df, traffic_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (rating_path, domains_location_path, traffic_path)
)

In [None]:
# Show the head of each dataset as a psql-style table
dataset_previews = (
    ("Rating.csv:\n", rating_df),
    ("\nDomains Location.csv:\n", domains_location_df),
    ("\nTraffic Data.csv:\n", traffic_data_df),
)
for heading, frame in dataset_previews:
    print(heading, tabulate(frame.head(), headers='keys', tablefmt='psql'))

In [None]:
# Helper that reports a DataFrame's column names and dimensions
def print_info(df, df_name):
    """Print the column list and the row/column counts of ``df``.

    Args:
        df: DataFrame to describe.
        df_name: Label used as the prefix of both printed lines.
    """
    n_rows, n_cols = df.shape
    column_names = df.columns.tolist()
    print(f"{df_name} columns: {column_names}")
    print(f"{df_name} shape: The df has {n_rows} rows and {n_cols} columns.\n")

# Report columns and shape for each of the three raw datasets
for frame, frame_label in (
    (rating_df, 'rating_df'),
    (domains_location_df, 'domains_location_df'),
    (traffic_data_df, 'traffic_data_df'),
):
    print_info(frame, frame_label)

**Feature Engineering**

In [None]:
# Function to extract the registrable domain from a URL
def extract_domain(url):
    """Return ``domain.suffix`` for ``url``, or ``None`` when it cannot be derived.

    Args:
        url: URL (or bare host name) as a string. Anything that is not a
            non-blank string (e.g. NaN from a missing CSV cell) yields ``None``.

    Returns:
        The domain and public suffix joined by a dot. If tldextract finds no
        suffix, only the domain part is returned (no trailing dot); if it finds
        neither part, ``None`` is returned.
    """
    if not isinstance(url, str) or not url.strip():
        return None
    ext = tldextract.extract(url)
    # Skip empty components so we never emit "host." or "."
    parts = [part for part in (ext.domain, ext.suffix) if part]
    return ".".join(parts) if parts else None

# Derive a 'domain' column from every article URL
rating_df['domain'] = rating_df['url'].map(extract_domain)

# Spot-check the extraction on the first rows
print(rating_df.loc[:, ['url', 'domain']].head())

In [None]:
# Number of distinct domains in each dataset's domain column
unique_rating_domains = rating_df['domain'].nunique()
unique_location_domains = domains_location_df['SourceCommonName'].nunique()
unique_traffic_domains = traffic_data_df['Domain'].nunique()

for frame_label, n_unique in (
    ('rating_df', unique_rating_domains),
    ('domains_location_df', unique_location_domains),
    ('traffic_data_df', unique_traffic_domains),
):
    print(f"Unique domains in {frame_label}: {n_unique}")

In [None]:
# Numeric score per sentiment label. Keys are lower-case and labels are
# normalised (trimmed, lower-cased) before the lookup, so 'Positive',
# 'positive' and ' POSITIVE ' all resolve; anything else becomes NaN.
sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}

# Apply the mapping to the title_sentiment column
rating_df['title_sentiment_score'] = (
    rating_df['title_sentiment']
    .astype(str)  # missing labels become 'nan', which is not a key -> NaN score
    .str.strip()
    .str.lower()
    .map(sentiment_mapping)
)

# Verify the changes
print(rating_df[['title_sentiment', 'title_sentiment_score']].head())

In [None]:
# Keep one row per domain on the right-hand side of each join. A left join
# against repeated keys would duplicate article rows and inflate every
# per-site count computed later.
location_lookup = domains_location_df.drop_duplicates(subset='SourceCommonName')
traffic_lookup = traffic_data_df.drop_duplicates(subset='Domain')

# Merge rating_df with the location lookup (validate guards the m:1 assumption)
merged_df = pd.merge(
    rating_df, location_lookup,
    left_on='domain', right_on='SourceCommonName',
    how='left', validate='many_to_one',
)

# Merge the resulting DataFrame with the traffic lookup
final_df = pd.merge(
    merged_df, traffic_lookup,
    left_on='domain', right_on='Domain',
    how='left', validate='many_to_one',
)

# Display the first few rows to verify
print(final_df.head())

In [None]:
# Print column names and shape of the merged data set.
# print_info is defined once, in the Data Understanding section; it is not
# redefined here so there is a single definition to maintain.
print_info(final_df, 'final_df')

## **Data Preparation**

**Missing Values**

In [None]:
# Helper that reports columns containing null values
def check_missing_values(df):
    """Print one line per column of ``df`` that has at least one null.

    Each line states the null count and that count as a percentage of the
    number of rows. Columns without nulls produce no output. Returns ``None``.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    for column, missing_count in columns_with_nulls.items():
        missing_percentage = (missing_count / n_rows) * 100
        print(f"The {column} column has {missing_count} missing values which is {missing_percentage:.2f}% of the column.")

# Report which columns of the merged frame contain nulls (prints nothing if none do)
check_missing_values(final_df)

In [None]:
# Remove source_id when the merged frame has it
final_df = final_df.drop(columns=['source_id'], errors='ignore')

# Placeholder values for the text columns that may be missing
fill_values = dict(
    author='Unknown',
    description='No description available',
    url_to_image='No image available',
    category='Uncategorized',
    SourceCommonName='Unknown',
    location='Unknown',
    Country='Unknown',
)

final_df = final_df.fillna(value=fill_values)

# Re-run the null report; any column still listed was not covered above
check_missing_values(final_df)

In [None]:
# Descriptive statistics for every column, followed by the column dtypes
summary_sections = (
    ("\nfinal_df summary:", final_df.describe(include='all')),
    ("\nfinal_df data types:", final_df.dtypes),
)
for heading, payload in summary_sections:
    print(heading)
    print(payload)

In [None]:
# Convert 'published_at' to datetime; unparseable values become NaT
final_df['published_at'] = pd.to_datetime(final_df['published_at'], errors='coerce')

# Impute missing timestamps: carry the previous row's value forward, then the
# next row's value backward for any leading gaps. Series.ffill()/bfill() are
# used because fillna(method=...) is deprecated in recent pandas releases.
final_df['published_at'] = final_df['published_at'].ffill().bfill()

# If still missing (the whole column was NaT), fall back to a fixed default date
final_df['published_at'] = final_df['published_at'].fillna(pd.Timestamp('2000-01-01'))

# Extract minimal set of date-related features
final_df['published_year'] = final_df['published_at'].dt.year
final_df['published_month'] = final_df['published_at'].dt.month
final_df['published_day'] = final_df['published_at'].dt.day

# Check the first few rows to ensure imputation and feature extraction
print(final_df[['published_at', 'published_year', 'published_month', 'published_day']].head())


In [None]:
# Check for missing values. check_missing_values prints its own report and
# returns None, so it is called directly rather than wrapped in print()
# (which would add a stray "None" line to the output).
check_missing_values(final_df)

In [None]:
# Calculate content length (NaN where content is missing)
final_df['content_length'] = final_df['content'].str.len()

# Calculate title word count. .str.len() on the split lists yields NaN for a
# missing title, where .apply(len) would raise TypeError on the NaN float.
final_df['title_word_count'] = final_df['title'].str.split().str.len()

# Convert title_sentiment to numerical scores. Labels are trimmed and
# lower-cased before the lookup so the mapping works whatever the casing in
# the source data; unrecognised or missing labels become NaN.
sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
final_df['title_sentiment_score'] = (
    final_df['title_sentiment']
    .astype(str)
    .str.strip()
    .str.lower()
    .map(sentiment_mapping)
)

# Display the first few rows to verify
print(final_df.head())

**Outliers**

In [None]:
# Box plots for the numerical columns of a DataFrame
def plot_boxplots(df):
    """Draw one horizontal box plot per float64/int64 column of ``df``.

    A column is skipped (with a printed notice) when it has at most one
    distinct value or no non-null values.
    """
    sns.set(style="whitegrid")

    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        series = df[column]
        has_enough_data = series.nunique() > 1 and series.notna().sum() > 0
        if not has_enough_data:
            print(f"Skipping {column} due to insufficient data for plotting.")
            continue

        plt.figure(figsize=(12, 6))
        sns.boxplot(x=series, color='skyblue')
        plt.title(f'Box Plot of {column}', fontsize=16)
        plt.xlabel(column, fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.show()

# Draw the box plots for every float64/int64 column of final_df
plot_boxplots(final_df)

In [None]:
# Z-score based outlier detection
def find_outliers_zscore(df, threshold=3):
    """Collect, per numeric column, the values whose |z-score| exceeds ``threshold``.

    Args:
        df: DataFrame to scan; only float64/int64 columns are considered.
        threshold: Absolute z-score above which a value is flagged.

    Returns:
        dict mapping column name -> Series of flagged values (original index
        preserved). Columns without flagged values are omitted.
    """
    flagged = {}
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        values = df[column]
        abs_z = np.abs((values - values.mean()) / values.std())
        extreme = values[abs_z > threshold]
        if len(extreme) > 0:
            flagged[column] = extreme
    return flagged

# Scan final_df for z-score outliers
outliers = find_outliers_zscore(final_df)

# Report the flagged values column by column
if not outliers:
    print("No outliers detected using the Z-score method.")
else:
    for column, outlier_values in outliers.items():
        print(f"Outliers detected in column '{column}':")
        print(outlier_values)
        print("\n")


In [None]:
# Histograms plus basic statistics for the numerical columns
def plot_histograms(df):
    """Plot a KDE-overlaid histogram for each float64/int64 column of ``df``.

    After each plot, the column's mean, median and standard deviation are printed.
    """
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        series = df[column]

        plt.figure(figsize=(10, 5))
        sns.histplot(series, kde=True, color='blue')
        plt.title(f'Distribution of {column}')
        plt.show()

        # Location and spread of the column
        print(f"{column} - Mean: {series.mean()}, Median: {series.median()}, Std Dev: {series.std()}\n")

# Plot the distribution of every float64/int64 column of final_df
plot_histograms(final_df)


In [None]:
# Function to plot bar charts for categorical data and display value counts
#def plot_bar_charts(df):
    #categorical_columns = df.select_dtypes(include=['object']).columns
    #for column in categorical_columns:
        #plt.figure(figsize=(10, 5))
        #df[column].value_counts().plot(kind='bar', color='green')
        #plt.title(f'Bar Chart of {column}')
        #plt.show()

        # Display value counts
        #print(f"Value Counts for {column}:")
        #print(df[column].value_counts(), "\n")

#plot_bar_charts(final_df)


In [None]:
# Sources with fewer than `threshold` articles are pooled into 'Other'
threshold = 500
source_counts = final_df['source_name'].value_counts()
is_major_source = source_counts >= threshold
grouped_sources = source_counts[is_major_source]
grouped_sources['Other'] = source_counts[~is_major_source].sum()

# Bar chart of the grouped counts
fig, ax = plt.subplots(figsize=(12, 6))
grouped_sources.plot(kind='bar', color='blue', ax=ax)
ax.set_title('Article Count by Source (Grouped)')
ax.set_xlabel('Source Name')
ax.set_ylabel('Count')
plt.show()

# Same data as a pie chart
fig, ax = plt.subplots(figsize=(10, 10))
grouped_sources.plot(kind='pie', autopct='%1.1f%%', startangle=140, ax=ax)
ax.set_title('Article Distribution by Source (Grouped)')
ax.set_ylabel('')  # no y-label on the pie
plt.show()


In [None]:
# Share of all article rows contributed by each source, in percent
percentage_contribution = source_counts.div(len(final_df)).mul(100)
print(percentage_contribution)

## **Exploratory Data Analysis**

**Top and Bottom 10 Websites by News Articles Count**

In [None]:
# Article count per website, largest first
articles_per_website = final_df['source_name'].value_counts()

# Ten websites with the most articles
top_10_websites = articles_per_website.head(10)
print("Top 10 Websites by Article Count:\n", top_10_websites)

# Ten websites with the fewest articles
bottom_10_websites = articles_per_website.tail(10)
print("\nBottom 10 Websites by Article Count:\n", bottom_10_websites)


**Top and Bottom 10 Websites by Visitor Traffic**

In [None]:
# Heuristic visitor estimate for one row
def estimate_visitors(row):
    """Estimate visitors from a row's GlobalRank, RefSubNets and RefIPs.

    The estimate is ``(1 / GlobalRank) * (RefSubNets + RefIPs) * 10000``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the three keys.

    Returns:
        The estimate, or ``None`` when GlobalRank is <= 0 or when RefSubNets
        or RefIPs is missing.
    """
    global_rank, ref_subnets, ref_ips = row['GlobalRank'], row['RefSubNets'], row['RefIPs']

    # Guard clauses: non-positive rank, or missing referral figures
    if global_rank <= 0 or pd.isna(ref_subnets) or pd.isna(ref_ips):
        return None

    scaling_factor = 10000  # example scaling factor
    return (1 / global_rank) * (ref_subnets + ref_ips) * scaling_factor

# Row-wise visitor estimate for every article row
final_df['EstimatedVisitors'] = final_df.apply(estimate_visitors, axis=1)

# Inspect the inputs next to the resulting estimate
traffic_columns = ['Domain', 'GlobalRank', 'RefSubNets', 'RefIPs', 'EstimatedVisitors']
print(final_df[traffic_columns].head())


In [None]:
# Mean estimated visitors per website, computed once and sorted both ways
mean_visitors_by_site = final_df[['source_name', 'EstimatedVisitors']].groupby('source_name').mean()

# Ten websites with the highest mean estimate
top_10_traffic_websites = mean_visitors_by_site.sort_values(by='EstimatedVisitors', ascending=False).head(10)
print("Top 10 Websites by Visitor Traffic:\n", top_10_traffic_websites)

# Ten websites with the lowest mean estimate
bottom_10_traffic_websites = mean_visitors_by_site.sort_values(by='EstimatedVisitors').head(10)
print("\nBottom 10 Websites by Visitor Traffic:\n", bottom_10_traffic_websites)

**Countries with the Highest Number of News Media Organizations**

In [None]:
# Count of news media organizations (distinct domains) by location.
# value_counts() on 'location' would count article rows, so a single prolific
# site would dominate; nunique() on 'domain' counts each organization once.
country_media_count = (
    final_df.groupby('location')['domain']
    .nunique()
    .sort_values(ascending=False)
)
top_countries_media = country_media_count.head(10)
print("Countries with the Highest Number of News Media Organizations:\n", top_countries_media)


**Countries with Many Articles Written About Them**

In [None]:
# Count article rows per value of the 'location' column and show the ten largest.
# NOTE(review): 'location' presumably describes where the publishing site is
# based rather than the country an article is about — confirm that this matches
# the question in the heading.
articles_about_countries = final_df['location'].value_counts()
print("Countries with Many Articles Written About Them:\n", articles_about_countries.head(10))


**Websites Reporting About Specific Regions**

In [None]:
# Distinct non-null values of the Country column
unique_countries = final_df['Country'].dropna().unique()

# Wrap them in a one-column DataFrame for display
unique_countries_df = pd.DataFrame({'Country': unique_countries})

print(unique_countries_df)

In [None]:
# Region name -> country names that belong to it
regions = {
    'North America': ['United States'],
    'Europe': ['United Kingdom', 'Russia'],  # Russia is grouped with Europe here
    'Asia': ['India'],
    'Africa': ['Nigeria'],
    'Middle East': ['Saudi Arabia', 'UAE', 'Iran', 'Israel', 'Turkey'],
    'South America': ['Brazil', 'Argentina', 'Chile'],
    'Oceania': ['Australia', 'New Zealand'],
    'Other': []  # catch-all; nothing is listed explicitly
}

# Look up the region of a country name
def categorize_region(location):
    """Return the first region in ``regions`` whose country list contains ``location``.

    Falls back to ``'Other'`` when no region lists it.
    """
    matching_regions = (
        region for region, countries in regions.items() if location in countries
    )
    return next(matching_regions, 'Other')

# Tag every row with the region of its Country value
final_df['region'] = final_df['Country'].map(categorize_region)

# Number of rows per region
region_report_counts = final_df['region'].value_counts()
print("Websites Reporting About Specific Regions:\n", region_report_counts)


**Websites with Highest Count of Positive, Neutral, and Negative Sentiment**

In [None]:
# Per-site count of each title_sentiment label (sites as rows, labels as columns)
per_site_label_counts = final_df.groupby('source_name')['title_sentiment'].value_counts()
sentiment_summary = per_site_label_counts.unstack().fillna(0)

# Distribution of those per-site counts across sites
sentiment_stats = sentiment_summary.describe()
print("Descriptive Statistics for Sentiments:\n", sentiment_stats)


**Compare Impact of Mean and Median Sentiment**

In [None]:
# Count sentiment labels per site. Labels are trimmed and lower-cased first so
# the column names below do not depend on the casing used in the data.
normalized_sentiment = final_df['title_sentiment'].astype(str).str.strip().str.lower()
sentiment_counts = pd.crosstab(final_df['source_name'], normalized_sentiment)

# Select the three expected labels by name (instead of renaming columns by
# position, which mislabels or fails when the label set differs), adding a
# zero column for any label that never occurs, then suffix with '_count'.
sentiment_counts = (
    sentiment_counts
    .reindex(columns=['negative', 'neutral', 'positive'], fill_value=0)
    .add_suffix('_count')
)

# Get the top websites for each sentiment type
top_positive_websites = sentiment_counts[['positive_count']].nlargest(10, 'positive_count')
top_neutral_websites = sentiment_counts[['neutral_count']].nlargest(10, 'neutral_count')
top_negative_websites = sentiment_counts[['negative_count']].nlargest(10, 'negative_count')

print("Top Websites by Positive Sentiment Count:\n", top_positive_websites)
print("\nTop Websites by Neutral Sentiment Count:\n", top_neutral_websites)
print("\nTop Websites by Negative Sentiment Count:\n", top_negative_websites)


In [None]:
# Select top 10 sites by visitor traffic. EstimatedVisitors is derived from
# per-domain traffic figures that the merge copies onto every article row, so
# the per-site mean is used (as in the traffic ranking above); a sum would
# multiply a site's traffic by its number of articles.
top_10_domains = final_df.groupby('source_name')['EstimatedVisitors'].mean().nlargest(10).index

# Filter data for top 10 domains
top_10_df = final_df[final_df['source_name'].isin(top_10_domains)]

# Calculate sentiment counts for top 10 domains
sentiment_counts_top_10 = top_10_df.groupby('source_name')['title_sentiment'].value_counts().unstack().fillna(0)

# Mean, median and variance of the per-site counts for each sentiment label
sentiment_summary = sentiment_counts_top_10.agg(['mean', 'median', 'var'])

print("Sentiment Summary for Top 10 Domains:\n", sentiment_summary)

In [None]:
# Plot comparison of mean and median.
# The values are read from sentiment_summary (rows 'mean'/'median', one column
# per sentiment label) so the chart always reflects the current data instead of
# numbers copied from an earlier run.
sentiment_types = [str(label).capitalize() for label in sentiment_summary.columns]
mean_values = sentiment_summary.loc['mean'].tolist()
median_values = sentiment_summary.loc['median'].tolist()

# Create a grouped bar plot for mean and median sentiment counts
x = range(len(sentiment_types))  # Position of bars

plt.figure(figsize=(12, 6))
plt.bar(x, mean_values, width=0.4, label='Mean', color='blue', align='center')
plt.bar([p + 0.4 for p in x], median_values, width=0.4, label='Median', color='orange', align='center')

plt.xlabel('Sentiment Type')
plt.ylabel('Count')
plt.title('Comparison of Mean and Median Sentiment Counts for Top 10 Domains')
plt.xticks([p + 0.2 for p in x], sentiment_types)  # centre each label under its pair of bars
plt.legend()
plt.show()

**Distribution of Sentiments for Top 10 Domains**

In [None]:
# Get the top websites for each sentiment type
top_positive_websites = sentiment_counts[['positive_count']].nlargest(10, 'positive_count').index
top_neutral_websites = sentiment_counts[['neutral_count']].nlargest(10, 'neutral_count').index
top_negative_websites = sentiment_counts[['negative_count']].nlargest(10, 'negative_count').index

# Union of the three top-10 lists (so this can hold more than ten sites)
top_10_domains = set(top_positive_websites) | set(top_neutral_websites) | set(top_negative_websites)

# Filter data for those sites
top_10_sentiments = final_df[final_df['source_name'].isin(top_10_domains)]

# Ensure that 'title_sentiment' is present before plotting
if 'title_sentiment' not in top_10_sentiments.columns:
    raise ValueError("Column 'title_sentiment' not found in the data.")

# Plot sentiment distribution. No `bins` argument: title_sentiment is a
# categorical label, so each category gets its own bar.
plt.figure(figsize=(14, 8))
ax = sns.histplot(data=top_10_sentiments, x='title_sentiment', hue='source_name', multiple='stack', palette='tab10')

ax.set_title('Sentiment Distribution for Top 10 Domains by Sentiment Counts')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Frequency')
# Reposition seaborn's own hue legend. plt.legend() would build a new legend
# from labelled artists, find none, and drop the per-site colour key.
sns.move_legend(ax, 'upper left', bbox_to_anchor=(1.05, 1), title='Source Name')

plt.tight_layout()
plt.show()

**Comparing Metadata Across Sites**

**Distribution of Raw Message Lengths Across Sites**

In [None]:
# Per-site box plot of article content length
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=final_df, x='source_name', y='content_length', ax=ax)
ax.tick_params(axis='x', labelrotation=90)  # vertical site names
ax.set_title('Distribution of Raw Message Lengths Across Sites')
plt.show()


**Distribution of Title Word Counts Across Sites**

In [None]:
# Per-site box plot of the number of words in each title
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=final_df, x='source_name', y='title_word_count', ax=ax)
ax.tick_params(axis='x', labelrotation=90)  # vertical site names
ax.set_title('Distribution of Title Word Counts Across Sites')
plt.show()


**Impact of Frequent Reporting and Sentiment on Global Ranking**

In [None]:
# Relationship between how much a site publishes and its average title sentiment.
# StandardScaler is already imported in the notebook's import cell.
if 'title_sentiment_score' not in final_df.columns or final_df.empty:
    print("Column 'title_sentiment_score' is missing or contains no data.")
else:
    # Work on a separate copy of the scored rows. Re-assigning final_df here
    # would silently remove (possibly all) articles from every later cell.
    scored_df = final_df.dropna(subset=['title_sentiment_score']).copy()

    if scored_df.empty:
        print("No data available after dropping missing values.")
    else:
        # Standardize the sentiment score (zero mean, unit variance)
        scaler = StandardScaler()
        scored_df['standardized_sentiment_score'] = (
            scaler.fit_transform(scored_df[['title_sentiment_score']].values).ravel()
        )

        # Total number of articles and average standardized score per domain
        reporting_sentiment_df = scored_df.groupby('domain').agg(
            total_articles=('article_id', 'count'),
            average_sentiment=('standardized_sentiment_score', 'mean')
        ).reset_index()

        # Perform correlation analysis
        correlation = reporting_sentiment_df['total_articles'].corr(reporting_sentiment_df['average_sentiment'])
        print(f"Correlation between total articles and average standardized sentiment score: {correlation:.2f}")

        # Visualize the relationship using a scatter plot
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='total_articles', y='average_sentiment', data=reporting_sentiment_df)
        plt.title('Impact of Reporting Frequency on Standardized Sentiment Score')
        plt.xlabel('Total Number of Articles')
        plt.ylabel('Average Standardized Sentiment Score')
        plt.show()


## **ML OPS**

**Splitting the Data**

In [None]:
# Quick look at the classification inputs: sample rows, frame shape, null counts
print(final_df[['title', 'category']].head())
print(final_df.shape)
for text_column in ('title', 'category'):
    print(final_df[text_column].isnull().sum())


In [None]:
# 'category' is the target variable; the headline text is the only feature.
# Missing titles are replaced by an empty string because the TF-IDF vectorizer
# used for classification rejects NaN documents.
X = final_df['title'].fillna('')
y = final_df['category']

# Split the data for classification tasks (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


**Keyword Extraction Using TF-IDF**

In [None]:
# Print the first ten titles to see what the raw headline text looks like
print(final_df['title'].head(10))  # Display the first 10 titles to inspect their content


In [None]:
# Smoke-test TF-IDF on three hand-picked headlines
titles = [
    "superstar chef yannick alléno brings refined french cuisine to dubai",
    "nice claim top spot in ligue 1 with late win against lyon",
    "amphibians are the world’s most vulnerable species"
]

# Keep at most ten features and drop English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
try:
    tfidf_matrix = tfidf_vectorizer.fit_transform(titles)
    feature_names = tfidf_vectorizer.get_feature_names_out()
except ValueError as err:
    print(f"Error: {err}")
else:
    print("TF-IDF Features:", feature_names)


In [None]:
# TF-IDF keyword extraction helpers
def extract_keywords_tfidf(texts, top_n=5):
    """Return the ``top_n`` terms with the highest summed TF-IDF weight in ``texts``.

    NOTE(review): the notebook called this helper without defining it anywhere
    in the visible cells; this implementation (and the default of five terms)
    is a reconstruction — adjust if a different definition was intended.

    Args:
        texts: Iterable of documents (strings).
        top_n: Maximum number of keywords to return.

    Returns:
        list[str] of terms, highest weight first.

    Raises:
        ValueError: propagated from scikit-learn when the documents contain
            no usable terms (e.g. only stop words).
    """
    tfidf = TfidfVectorizer(stop_words='english')
    weights = np.asarray(tfidf.fit_transform(texts).sum(axis=0)).ravel()
    terms = tfidf.get_feature_names_out()
    ranked = weights.argsort()[::-1][:top_n]
    return [str(terms[i]) for i in ranked]


def extract_keywords_safe(text):
    """Extract keywords from one document, returning ``[]`` when that is not possible.

    Non-string input (e.g. NaN for a missing article body) and strings with at
    most one non-whitespace-trimmed character yield an empty list, as does a
    document for which TF-IDF finds no vocabulary (a warning is printed).
    """
    if not isinstance(text, str) or len(text.strip()) <= 1:
        return []
    try:
        return extract_keywords_tfidf([text])
    except ValueError as e:
        # TF-IDF fails with an empty vocabulary, e.g. text made only of stop words
        print(f"Warning: {e} for text: {text}")
        return []

# Keyword lists for the headline and for the article body
final_df['title_keywords'] = final_df['title'].map(extract_keywords_safe)
final_df['content_keywords'] = final_df['content'].map(extract_keywords_safe)

# Show the first ten results
keyword_columns = ['title', 'title_keywords', 'content_keywords']
print(final_df[keyword_columns].head(10))


**Compare Keywords Between Title and Content**

In [None]:
# Define a function to compare keywords
def calculate_similarity(keywords1, keywords2):
    """Return the Jaccard similarity of two keyword collections.

    The similarity is ``|A ∩ B| / |A ∪ B|`` over the distinct keywords.

    Args:
        keywords1: Iterable of hashable keywords.
        keywords2: Iterable of hashable keywords.

    Returns:
        float in [0, 1]; ``0.0`` when both collections are empty (the ratio is
        undefined there and would otherwise raise ZeroDivisionError).
    """
    first, second = set(keywords1), set(keywords2)
    union = first | second
    if not union:
        return 0.0
    return len(first & second) / len(union)

# Keyword overlap between each article's title and its content
def _row_similarity(row):
    return calculate_similarity(row['title_keywords'], row['content_keywords'])

final_df['title_content_similarity'] = final_df.apply(_row_similarity, axis=1)

# First few scores next to the source name
sample_similarity = final_df[['source_name', 'title_content_similarity']].head()
print(tabulate(sample_similarity, headers='keys', tablefmt='psql'))

# pandas summary of the similarity column, shown as a single-row table
similarity_stats = final_df['title_content_similarity'].describe()
print("Summary Statistics for Title-Content Similarity:")
print(tabulate(similarity_stats.to_frame().T, headers='keys', tablefmt='psql'))

# Moments reported by scipy.stats.describe
similarity_scipy_stats = describe(final_df['title_content_similarity'])
print("\nScipy Summary Statistics:")
for stat_label, stat_attr in (
    ("Mean", "mean"),
    ("Variance", "variance"),
    ("Skewness", "skewness"),
    ("Kurtosis", "kurtosis"),
):
    print(f"{stat_label}: {getattr(similarity_scipy_stats, stat_attr)}")


**Topic Modeling Using Latent Dirichlet Allocation (LDA)**

In [None]:
# Prepare data for LDA: bag-of-words counts of the article content.
# Missing content is treated as an empty document because CountVectorizer
# rejects NaN values.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
count_data = count_vectorizer.fit_transform(final_df['content'].fillna(''))

# Fit LDA and get the per-document topic distribution in one step
lda = LatentDirichletAllocation(n_components=10, random_state=42)
topic_distribution = lda.fit_transform(count_data)

# Assign the dominant (highest-weight) topic to each article
final_df['dominant_topic'] = topic_distribution.argmax(axis=1)

# Display some sample topics
sample_topics = final_df[['source_name', 'dominant_topic', 'title']].head()
print(tabulate(sample_topics, headers='keys', tablefmt='psql'))


**Classify Headlines into Predefined Tags**

In [None]:
# Fail early when the columns needed for classification are absent
required_columns = ['title', 'content']
missing_columns = list(filter(lambda col: col not in final_df.columns, required_columns))

if len(missing_columns) > 0:
    raise KeyError(f"Missing columns: {', '.join(missing_columns)}")

# Otherwise show a few rows as a sanity check
print(final_df.head())


In [None]:
# TF-IDF features: vocabulary learned on the training titles only
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the TF-IDF features
classifier = MultinomialNB().fit(X_train_vec, y_train)

# Evaluate on the held-out titles
y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


**Trend Analysis and Visualization**

In [None]:
# Daily article count per dominant topic (dates as rows, topics as columns)
publication_day = final_df['published_at'].dt.date
topic_trend = final_df.groupby([publication_day, 'dominant_topic']).size().unstack(fill_value=0)

# Stacked line chart of the daily counts
fig, ax = plt.subplots(figsize=(12, 8))
topic_trend.plot(kind='line', stacked=True, cmap='tab20', ax=ax)
ax.set_title('Topic Trends Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Topic Count')
ax.xaxis.set_major_locator(mdates.MonthLocator())  # one tick per month
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Topics')
ax.grid(True)
fig.tight_layout()  # keep rotated labels inside the figure
plt.show()

**Event Modeling and Clustering**

In [None]:
# Cluster articles into 10 groups with KMeans. The input is count_data, the
# term-count matrix produced by the CountVectorizer in the LDA cell, i.e. raw
# bag-of-words counts rather than TF-IDF weights.
kmeans = KMeans(n_clusters=10, random_state=42)
final_df['event_cluster'] = kmeans.fit_predict(count_data)

# Display some clustered events
sample_clusters = final_df[['source_name', 'event_cluster', 'title']].head()
print(tabulate(sample_clusters, headers='keys', tablefmt='psql'))


**Correlation Between News Sites Reporting Events**

In [None]:
# Article counts per (site, event cluster): sites as rows, clusters as columns
event_site_correlation = final_df.pivot_table(index='source_name', columns='event_cluster', aggfunc='size', fill_value=0)

# DataFrame.corr() correlates columns pairwise. Transpose first so the sites
# become the columns and the result is a site-by-site matrix (without the
# transpose it would be a cluster-by-cluster matrix).
correlation_matrix = event_site_correlation.T.corr()

# Display the correlation matrix
print(correlation_matrix)


In [None]:
# Preprocess the text data
# Built once and reused: constructing these for every article is wasted work.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Tokenize, filter and lemmatize ``text``; return the tokens joined by spaces.

    Tokens are lower-cased, kept only if purely alphabetic and not an English
    stop word, then lemmatized. Lower-casing happens before the stop-word test
    because NLTK's stop-word list is lower-case ("The" would otherwise pass).

    Args:
        text: Document string. Non-string input (e.g. NaN) yields ``''``.

    Returns:
        str: the cleaned document.
    """
    if not isinstance(text, str):
        return ''
    tokens = (token.lower() for token in nltk.word_tokenize(text))
    kept = [
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in _STOP_WORDS
    ]
    return ' '.join(kept)

final_df['processed_text'] = final_df['content'].apply(preprocess_text)


In [None]:
# Vectorize text data.
# A separately named vectorizer is used here: `vectorizer` is the TF-IDF model
# fitted on the training headlines for the Naive Bayes classifier and is needed
# again when that model is logged, so it must not be overwritten.
# X is rebound to the TF-IDF matrix that the clustering cell below consumes.
cluster_vectorizer = TfidfVectorizer(stop_words='english')
X = cluster_vectorizer.fit_transform(final_df['processed_text'])


In [None]:
# Group the articles into a fixed number of clusters
num_clusters = 15
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X)
final_df['cluster'] = cluster_labels

# Silhouette score of the resulting assignment
silhouette_avg = silhouette_score(X, cluster_labels)
print(f'Silhouette Score: {silhouette_avg}')


In [None]:
# Number of events covered (each cluster is treated as one event)
num_events = final_df['cluster'].nunique()
print(f'Number of events: {num_events}')

# News sites reporting events the earliest: for every event, the article with
# the earliest timestamp and the site that published it. (A per-site minimum
# would only give each site's first article overall, not who was first on an event.)
final_df['published_at'] = pd.to_datetime(final_df['published_at'])
earliest_reports = (
    final_df.sort_values('published_at', kind='stable')
    .drop_duplicates(subset='cluster')  # keeps the earliest row of each cluster
    .loc[:, ['cluster', 'source_name', 'published_at']]
    .sort_values('cluster')
    .reset_index(drop=True)
)
print(earliest_reports)

# Events with the highest reporting
event_counts = final_df['cluster'].value_counts().reset_index()
event_counts.columns = ['cluster', 'count']
print(event_counts)

# Correlation between news sites reporting events: rows are events, columns are
# sites, values are article counts. Indexing by raw timestamp would leave almost
# every cell empty, because two sites rarely publish at the exact same instant.
correlation_matrix = final_df.pivot_table(
    index='cluster', columns='source_name', aggfunc='size', fill_value=0
).corr()
print(correlation_matrix)


**Versioning ML Models with MLFlow**

In [None]:
# All runs below are recorded under this experiment
mlflow.set_experiment('News Article Clustering')

# One run for the clustering model: its setting, its score, then the model itself
with mlflow.start_run():
    mlflow.log_param('num_clusters', num_clusters)
    mlflow.log_metric('silhouette_score', silhouette_avg)
    mlflow.sklearn.log_model(kmeans, 'kmeans_model')


In [None]:
# Example headlines used to infer the model's input/output signature
example_text = [
    "Breaking news: Major political shift in the United Kingdom.",
    "Tech companies report quarterly earnings exceeding expectations.",
    "Health experts discuss the impact of new diet trends on overall well-being."
]
example_input = vectorizer.transform(example_text)

# Fail with a clear message if `vectorizer` is not the one the classifier was
# trained with (e.g. it was re-fitted on other text in between); otherwise
# predict() fails with a less obvious shape error.
if example_input.shape[1] != classifier.n_features_in_:
    raise ValueError(
        "`vectorizer` produces "
        f"{example_input.shape[1]} features but the classifier expects "
        f"{classifier.n_features_in_}; use the vectorizer fitted on X_train."
    )

# Get the model signature from example input
signature = infer_signature(example_input, classifier.predict(example_input))

# Start MLFlow run
with mlflow.start_run():
    # Log the model with signature
    mlflow.sklearn.log_model(
        classifier,
        "naive_bayes_classifier",
        signature=signature
    )
    # Log parameters and metrics. Accuracy is a measured result, so it is
    # recorded as a metric rather than as a parameter.
    mlflow.log_param("model", "Naive Bayes")
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))