In [1]:
import pandas as pd

In [2]:
# Load the headlines dataset; the path is relative to the notebook's working directory.
headlines = pd.read_csv('Data/headlines.csv')

In [3]:
# (rows, columns) of the raw frame as loaded.
headlines.shape

In [4]:
# Preview the first five rows of the raw frame.
headlines.head()

In [5]:
# Count missing values per column.
headlines.isnull().sum()

In [6]:
# Summary statistics for the frame's columns (numeric columns by default).
headlines.describe()

In [7]:
# Drop the 'Unnamed: 0', 'url' and 'index' columns. Reassigning the result
# (rather than inplace=True) with errors='ignore' keeps this cell idempotent:
# re-running it after the columns are gone is a no-op instead of a KeyError.
headlines = headlines.drop(columns=['Unnamed: 0', 'url', 'index'], errors='ignore')
headlines.head()

In [8]:
# Keep only the sites that contribute at least this many headlines.
min_headlines_threshold = 5000

# Headline count per site, then the index of the sites that meet the threshold.
top_sites = (
    headlines['site']
    .value_counts()
    .loc[lambda counts: counts >= min_headlines_threshold]
    .index
)

# Independent copy restricted to those sites, so later column assignments
# do not touch (or warn about) the original frame.
is_top_site = headlines['site'].isin(top_sites)
headlines_filtered = headlines[is_top_site].copy()

# Headline count per remaining site.
headlines_filtered['site'].value_counts()


In [9]:
# Preview the first five rows of the site-filtered frame.
headlines_filtered.head()

In [10]:
# (rows, columns) remaining after the site filter.
headlines_filtered.shape

Feature engineering

In [11]:
# Number of whitespace-separated tokens in each headline of headlines_filtered.
# The vectorised .str accessor propagates NaN for a missing headline, whereas
# len(x.split()) raises AttributeError on a float NaN.
headlines_filtered['Word_Count'] = headlines_filtered['headline_no_site'].str.split().str.len()

In [None]:
from textblob import TextBlob

# Sentiment polarity of each headline, as reported by TextBlob.
headline_text = headlines_filtered['headline_no_site']
headlines_filtered['Sentiment_Polarity'] = headline_text.map(
    lambda text: TextBlob(text).sentiment.polarity
)

In [None]:
# Character count of each headline (spaces included). .str.len() is the
# vectorised equivalent of apply(len) and yields NaN for a missing headline
# instead of raising TypeError.
headlines_filtered['Text_Length'] = headlines_filtered['headline_no_site'].str.len()

In [None]:
# Average number of characters per word.
# Text_Length counts every character, including the spaces between words, so
# Text_Length / Word_Count overstates the average. Count only non-whitespace
# characters instead, and mask zero word counts to NaN so an empty or
# whitespace-only headline gives NaN rather than inf.
non_space_chars = (
    headlines_filtered['headline_no_site']
    .str.replace(r'\s+', '', regex=True)
    .str.len()
)
positive_word_counts = headlines_filtered['Word_Count'].where(headlines_filtered['Word_Count'] > 0)
headlines_filtered['Avg_Word_Length'] = non_space_chars / positive_word_counts

In [None]:
import datetime

In [None]:
# Date features.
# Parse 'time' once and reuse the result for both columns. errors='coerce'
# matches the later 'time' conversion in this notebook: unparseable values
# become NaT (and NaN features) instead of aborting the cell.
parsed_time = pd.to_datetime(headlines_filtered['time'], errors='coerce')
headlines_filtered['Day_of_Week'] = parsed_time.dt.day_name()
headlines_filtered['Month'] = parsed_time.dt.month

In [None]:
# Hour-of-day feature.
# errors='coerce' matches the later 'time' conversion in this notebook:
# unparseable timestamps become NaT (NaN hour) rather than raising.
headlines_filtered['Hour_of_Day'] = pd.to_datetime(headlines_filtered['time'], errors='coerce').dt.hour

In [None]:
# Preview the first five rows, including the feature columns assigned so far.
headlines_filtered.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# The ten sites with the most headlines in the filtered frame.
top_10_sites = headlines_filtered['site'].value_counts().nlargest(10)

# Pie chart of each site's share.
# color_palette('viridis') returns only 6 colours by default, so with up to
# 10 wedges matplotlib would cycle and repeat colours; request one colour per
# wedge so every site is distinguishable.
plt.figure(figsize=(8, 8))
plt.pie(
    top_10_sites,
    labels=top_10_sites.index,
    autopct='%1.1f%%',
    colors=sns.color_palette('viridis', n_colors=len(top_10_sites)),
    startangle=90,
)
plt.title('Top 10 News Sources Distribution')
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Bar chart of the number of headlines per country.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=headlines_filtered, x='country', palette='viridis', ax=ax)
ax.set_title('Distribution of Countries')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Headlines')
fig.tight_layout()

# Render the figure.
plt.show()

In [None]:
# Convert 'time' to datetimes; values that cannot be parsed become NaT.
timestamps = pd.to_datetime(headlines_filtered['time'], errors='coerce')
headlines_filtered['time'] = timestamps

# Calendar year of each timestamp, stored as 'Publication_Year'.
headlines_filtered['Publication_Year'] = timestamps.dt.year

In [None]:
# Number of headlines per publication year.
headlines_filtered['Publication_Year'].value_counts()

In [None]:
# Drop the 'time' column. errors='ignore' makes re-running this cell a no-op
# instead of a KeyError once the column is already gone.
headlines_filtered = headlines_filtered.drop(columns=['time'], errors='ignore')

In [None]:
# Average bias score per publication year.
#
# 'Publication_Year' comes from a 'time' column parsed with errors='coerce',
# so it can contain NaN and therefore be stored as floats; a strict '%Y'
# parse would then fail on text such as '2019.0'. Converting through a
# nullable integer and then a string keeps the years clean, and missing
# years become NaT. The dtype guard skips the conversion when the column is
# already datetime (for example when this cell is re-run).
year_column = headlines_filtered['Publication_Year']
if not pd.api.types.is_datetime64_any_dtype(year_column):
    headlines_filtered['Publication_Year'] = pd.to_datetime(
        year_column.astype('Int64').astype('string'), format='%Y', errors='coerce'
    )

# Group by calendar year and average the bias score; groupby drops rows whose
# year is missing.
average_bias_by_year = (
    headlines_filtered
    .groupby(headlines_filtered['Publication_Year'].dt.year)['bias']
    .mean()
    .reset_index()
)
# .dt.year yields floats when any date is missing; cast back to int so the
# x-axis reads 2019 rather than 2019.0.
average_bias_by_year['Publication_Year'] = average_bias_by_year['Publication_Year'].astype(int)

# Set a Seaborn style
sns.set(style="whitegrid")

# Plot the average bias score for each year
plt.figure(figsize=(12, 6))
sns.lineplot(x='Publication_Year', y='bias', data=average_bias_by_year, marker='o', color='purple', linewidth=2)

# Style the plot
plt.title('Average Bias Score Over the Years', fontsize=16)
plt.xlabel('Publication Year', fontsize=12)
plt.ylabel('Average Bias Score', fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
# Use seaborn's white-grid theme for this figure.
sns.set(style="whitegrid")

# One violin of bias scores per distinct headline word count, with quartile lines.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(
    data=headlines_filtered,
    x='Word_Count',
    y='bias',
    inner='quartile',
    palette='Blues',
    ax=ax,
)

# Titles, axis labels, tick sizes and a dashed grid on both axes.
ax.set_title('Distribution of Bias Scores by Word Count', fontsize=16)
ax.set_xlabel('Word Count', fontsize=12)
ax.set_ylabel('Bias Score', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.grid(axis='both', linestyle='--', alpha=0.7)

# Render the figure.
fig.tight_layout()
plt.show()


In [None]:
# Preview the first five rows of headlines_filtered.
headlines_filtered.head()

In [None]:
# Set a Seaborn style
sns.set(style="whitegrid")

# Stacked histogram of sentiment polarity, one colour per bias score.
plt.figure(figsize=(12, 6))
ax = sns.histplot(
    x='Sentiment_Polarity',
    hue='bias',
    data=headlines_filtered,
    bins=30,
    multiple="stack",
    palette='viridis',
    edgecolor='w',
)

# Style the plot
plt.title('Distribution of Sentiment Polarity for Different Bias Scores', fontsize=16)
plt.xlabel('Sentiment Polarity', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# seaborn has already drawn the hue legend. Calling plt.legend() here would
# replace it with a new legend built only from labelled artists, which
# typically comes out empty. sns.move_legend re-creates the existing legend
# with the requested title and location instead. The guard covers the case
# where no legend was drawn.
if ax.get_legend() is not None:
    sns.move_legend(ax, 'upper right', title='Bias Score', title_fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()
