In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import datetime
from nltk.corpus import stopwords
import gensim
import gensim.downloader
from gensim.utils import simple_preprocess

In [2]:
# Loading 'headlines.csv' into a Pandas DataFrame
# (the path is relative, so it is resolved against the notebook's working directory)
headlines = pd.read_csv('Data/headlines.csv')

# Showing first 5 rows of the DataFrame to see which columns the file provides
headlines.head()

In [3]:
# Dropping irrelevant columns 'url', 'Unnamed: 0', and 'index'
# Note: this cell is not idempotent - running it a second time without reloading the CSV
# raises a KeyError because the columns are already gone.
headlines = headlines.drop(columns=['url', 'Unnamed: 0', 'index'])

In [4]:
# (number of rows, number of columns) of the DataFrame
headlines.shape

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
headlines.describe()

In [6]:
# Getting value counts for bias feature: number of rows per distinct 'bias' value,
# most frequent value first
headlines['bias'].value_counts()

In [7]:
# Histogram (with KDE overlay) of the raw 'bias' column

# Seaborn theme with a white background and grid lines
sns.set(style="whitegrid")

# Explicit figure/axes pair; the histogram is drawn onto `ax`
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(headlines['bias'], bins=20, kde=True, color='blue', ax=ax)

# Title, axis labels and tick-label sizes
ax.set_title('Distribution of Bias Scores', fontsize=16)
ax.set_xlabel('Bias Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.tick_params(axis='both', labelsize=10)

# Dashed, semi-transparent horizontal grid lines
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure
fig.tight_layout()
plt.show()

In [8]:
# Feature 'sentiment_polarity': the analyzer's compound score for each headline

# One analyzer instance, reused for every row
sid = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for one headline."""
    return sid.polarity_scores(text)['compound']


# Scores are computed row by row, so this takes a while to run
headlines['sentiment_polarity'] = headlines['headline_no_site'].apply(_compound_score)

In [9]:
# Number of headlines per distinct compound score, most frequent score first
headlines['sentiment_polarity'].value_counts()

In [10]:
# Report the observed range of the sentiment scores: minimum first, then maximum
polarity = headlines['sentiment_polarity']
for extreme in (polarity.min(), polarity.max()):
    print(extreme)

In [11]:
# Histogram (with KDE overlay) of the 'sentiment_polarity' feature

# Seaborn theme with a white background and grid lines
sns.set(style="whitegrid")

# Explicit figure/axes pair; the histogram is drawn onto `ax`
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(headlines['sentiment_polarity'], bins=20, kde=True, color='green', ax=ax)

# Title, axis labels and tick-label sizes
ax.set_title('Distribution of Sentiment Polarity Scores', fontsize=16)
ax.set_xlabel('Sentiment Polarity Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.tick_params(axis='both', labelsize=10)

# Dashed, semi-transparent horizontal grid lines
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure
fig.tight_layout()
plt.show()

In [12]:
# Converting 'time' column to datetime format once, up front.
# errors='coerce' turns unparseable values into NaT instead of raising, and every feature
# below is derived from this single parsed column rather than re-parsing the strings.
headlines['time'] = pd.to_datetime(headlines['time'], errors='coerce')

# Engineering 'Day of the Week' and 'Month' features
headlines['Day_of_Week'] = headlines['time'].dt.day_name()
headlines['Month'] = headlines['time'].dt.month

# Engineering 'Hour of Day' feature
headlines['Hour_of_Day'] = headlines['time'].dt.hour

# Extracting the year and creating a new 'Publication Year' feature
headlines['Publication_Year'] = headlines['time'].dt.year

In [13]:
# Dropping time column now that the date features have been derived from it
# (not idempotent: re-running this cell alone raises a KeyError)
headlines = headlines.drop(columns=['time'])

In [14]:
# Both features use the vectorised .str accessor instead of a Python-level .apply;
# a missing headline yields NaN rather than raising.
headline_text = headlines['headline_no_site']

# Creating word count feature: number of whitespace-separated tokens
headlines['Word_Count'] = headline_text.str.split().str.len()

# Creating text length feature: number of characters
headlines['Text_Length'] = headline_text.str.len()

In [15]:
# Preview the first 5 rows, including the newly engineered columns
headlines.head()

In [16]:
# Number of headlines per news site, largest first
headlines['site'].value_counts()

In [17]:
# Number of headlines per country, largest first
headlines['country'].value_counts()

In [18]:
# Keep only the news sites that contribute at least `min_headlines_threshold` headlines
min_headlines_threshold = 5000

# Headline count per site, then the names of the sites that reach the threshold
site_counts = headlines['site'].value_counts()
top_sites = site_counts.loc[site_counts >= min_headlines_threshold].index

# Independent copy restricted to those sites, so later column assignments do not touch `headlines`
keep_row = headlines['site'].isin(top_sites)
headlines_filtered = headlines.loc[keep_row].copy()

In [19]:
# (rows, columns) left after keeping only the high-volume sites
headlines_filtered.shape

In [20]:
# Getting top 10 sites with most headlines
top_10_sites = headlines_filtered['site'].value_counts().nlargest(10)

# One distinct colour per slice. Without n_colors, color_palette('viridis') returns only
# 6 colours, which matplotlib cycles through, so several slices would share a colour.
slice_colors = sns.color_palette('viridis', n_colors=len(top_10_sites))

# Creating a pie chart
plt.figure(figsize=(8, 8))
plt.pie(top_10_sites, labels=top_10_sites.index, autopct='%1.1f%%', colors=slice_colors, startangle=90)
plt.title('Top 10 News Sources Distribution')
plt.tight_layout()

# Showing the plot
plt.show()

In [21]:
# Bar chart of the number of headlines per country in the filtered data
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='country', data=headlines_filtered, palette='viridis', ax=ax)

# Title and axis labels
ax.set_title('Distribution of Countries')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Headlines')
fig.tight_layout()

# Render the figure
plt.show()

In [22]:
# Seaborn theme with a white background and grid lines
sns.set(style="whitegrid")

# Histogram of the number of words per headline (30 bins, no KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(headlines_filtered['Word_Count'], bins=30, color='skyblue', kde=False, ax=ax)
ax.set_title('Distribution of Word Count in Headlines')
ax.set_xlabel('Word_Count')
ax.set_ylabel('Frequency')

# Render the figure
plt.show()

In [23]:
# Set the style for seaborn
sns.set(style="whitegrid")

# Plotting the distribution of headline length in characters
plt.figure(figsize=(10, 6))
sns.histplot(headlines_filtered['Text_Length'], bins=30, color='skyblue', kde=False)
plt.title('Distribution of Text Length in Headlines')
# The x axis shows 'Text_Length', not 'Word_Count' (label was copied from the previous plot)
plt.xlabel('Text_Length')
plt.ylabel('Frequency')

# Show the plot
plt.show()

In [24]:
# Number of rows per distinct 'bias' value in the filtered data
headlines_filtered['bias'].value_counts()

In [25]:
# Setting values to drop
values_to_drop = [0.666667, 0.833333]

# The scores are floats and the literals above are rounded to six decimals, so an exact
# `isin` match silently misses values stored with more precision (e.g. 0.6666666...).
# Compare with a small absolute tolerance instead; NaN never matches and is therefore kept.
bias_values = headlines_filtered['bias'].to_numpy(dtype=float)
matches_dropped_value = np.isclose(bias_values[:, None], values_to_drop, rtol=0, atol=1e-5).any(axis=1)

# Keep the rows that match none of the values; .copy() makes the result an independent
# frame so that adding columns later does not trigger SettingWithCopyWarning
headlines_filtered = headlines_filtered.loc[~matches_dropped_value].copy()

In [26]:
# Bucket the bias scores into three labelled categories (range bounds are inclusive);
# rows whose score falls in none of the ranges get None
bias_scores = headlines_filtered['bias']

category_rules = {
    'No Bias': bias_scores.between(0.000000, 0.000000, inclusive='both'),
    'Low Bias': bias_scores.between(0.1, 0.2, inclusive='both'),
    'High Bias': bias_scores.between(0.3, 0.5, inclusive='both'),
}

# Parallel lists in rule order, as np.select expects
conditions = list(category_rules.values())
labels = list(category_rules.keys())

headlines_filtered['bias_category'] = np.select(conditions, labels, default=None)

In [27]:
# Distinct category labels produced above; None marks rows that matched no range
headlines_filtered['bias_category'].unique()

In [28]:
# Preview the first 5 rows with the new 'bias_category' column
headlines_filtered.head()

In [29]:
# Dropping original bias column; 'bias_category' replaces it from here on
# (not idempotent: re-running this cell alone raises a KeyError)
headlines_filtered = headlines_filtered.drop(columns=['bias'])

In [30]:
# Number of missing values per column (uncategorised rows show up under 'bias_category')
headlines_filtered.isnull().sum()

In [31]:
# Seaborn theme with a white background and grid lines
sns.set(style="whitegrid")

# Headline counts per publication year, with one bar per bias category
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="Publication_Year", hue="bias_category", data=headlines_filtered, ax=ax)
ax.set_title('Distribution of Bias Category by Publication_Year')
ax.set_xlabel('Publication Year')
ax.set_ylabel('Count')

# Render the figure
plt.show()

In [32]:
# Set a Seaborn style
sns.set(style="whitegrid")

# Scatter plot of sentiment polarity (x) against bias category (y), coloured by category.
# The marker size is set through matplotlib's `s`; seaborn's `size` parameter is a semantic
# mapping to a data variable, not a marker-size setting.
plt.figure(figsize=(12, 6))
scatter = sns.scatterplot(x='sentiment_polarity', y='bias_category', data=headlines_filtered, hue='bias_category', palette='viridis', s=3)

# Style the plot
plt.title('Distribution of Sentiment Polarity for Different Bias Categories', fontsize=16)
plt.xlabel('Sentiment Polarity', fontsize=12)
plt.ylabel('Bias Category', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='both', linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()


In [33]:
# Select the categorical columns to one-hot encode
categorical_columns = ['site', 'country', 'Day_of_Week', 'Month', 'Hour_of_Day', 'Publication_Year']

# Create one-hot encoded columns with 1s and 0s.
# `columns=` is required here: without it get_dummies only encodes object/category columns,
# so the numeric 'Month', 'Hour_of_Day' and 'Publication_Year' columns passed through
# unchanged and were then removed by the drop below without ever being encoded.
# drop_first=True omits the first level of every encoded column.
one_hot_encoded = pd.get_dummies(
    headlines_filtered[categorical_columns],
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

# Replace the original categorical columns with their indicator columns
# (built as a new frame, so `headlines_filtered` itself is left untouched)
headlines_filtered_encoded = pd.concat(
    [headlines_filtered.drop(columns=categorical_columns), one_hot_encoded],
    axis=1,
)

# Display the first row of the resulting DataFrame
headlines_filtered_encoded.head(1)

In [34]:
# Rename the 'headline_no_site' column to 'headlines'
headlines_filtered_encoded = headlines_filtered_encoded.rename(columns={'headline_no_site': 'headlines'})

# Preview the renamed column
headlines_filtered_encoded['headlines'].head()

In [35]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
import gensim
import gensim.downloader
from gensim.utils import simple_preprocess

In [None]:
# Tokenize the headline text
headlines_filtered_encoded['tokenized_text'] = headlines_filtered_encoded['headlines'].apply(word_tokenize)

# Matches every character that is NOT an ASCII letter or digit (digits are kept)
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9]')


def _clean_tokens(tokens):
    """Strip non-alphanumeric characters from each token, drop tokens left empty, lowercase the rest.

    The substitution runs once per token (the previous version ran it twice), and no
    .strip() is needed because whitespace is itself removed by the pattern.
    """
    stripped = (_NON_ALNUM.sub('', token) for token in tokens)
    return [token.lower() for token in stripped if token]


# Remove non-alphanumeric characters, drop empty tokens, and convert to lowercase
headlines_filtered_encoded['cleaned_text'] = headlines_filtered_encoded['tokenized_text'].apply(_clean_tokens)

# Lemmatization - this takes a minute or two to run
lemmatizer = WordNetLemmatizer()
headlines_filtered_encoded['lemmatized_text'] = headlines_filtered_encoded['cleaned_text'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [None]:
# Work on a copy so the following drops do not modify headlines_filtered_encoded
lemmatized_df = headlines_filtered_encoded.copy()

In [None]:
# Remove the raw 'headlines' text column from lemmatized_df
lemmatized_df = lemmatized_df.drop(columns='headlines')

# Display the first row of lemmatized_df after dropping the column
lemmatized_df.head(1)

In [None]:
# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens that are not in `stop_words`, preserving their order."""
    return [token for token in tokens if token not in stop_words]


# New column: the lemmatized tokens with stop words filtered out
lemmatized_df['lemmatized_text_no_stopwords'] = lemmatized_df['lemmatized_text'].apply(_without_stop_words)

In [None]:
# Preview the stop-word-free token lists for the first 5 rows
lemmatized_df['lemmatized_text_no_stopwords'].head()

In [None]:
# Work on a copy so the following drops do not modify lemmatized_df
df_to_vectorize = lemmatized_df.copy()

In [None]:
# Intermediate token columns to remove; 'lemmatized_text_no_stopwords' is kept
columns_to_drop = ['tokenized_text', 'cleaned_text', 'lemmatized_text']

# Rebind the name to the frame without those columns
df_to_vectorize = df_to_vectorize.drop(columns=columns_to_drop)

In [None]:
# Display the first row to confirm which columns remain
df_to_vectorize.head(1)

In [None]:
# Loading the 'fasttext-wiki-news-subwords-300' embeddings through gensim's downloader and
# assigning them to 'model' (read as a global by text2vec) - this will take a while to run
model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [None]:
# Print the names of all models listed in the gensim-data catalogue
available_models = gensim.downloader.info()['models']
print(list(available_models.keys()))

In [None]:
def text2vec(text, embeddings=None, dim=300):
    """
    Average the word embeddings of a tokenized text into a single fixed-size vector.

    Each token found in `embeddings` contributes its vector; every out-of-vocabulary
    token contributes a zero vector. The result is the mean over the tokens only
    (no extra zero vector is mixed in, which previously shrank every embedding by a
    factor of n / (n + 1)). No stop-word removal happens here; filter the tokens
    before calling.
    -----
    Input:
        text (iterable of str, or str): Tokens to embed. A plain string is split on
            whitespace first so that it is not iterated character by character.
        embeddings (mapping, optional): Object supporting `word in embeddings` and
            `embeddings[word]`. Defaults to the notebook-level `model`.
        dim (int, optional): Length of the embedding vectors; used for the
            zero-vector replacement and for the empty-input result. Defaults to 300.
    -----
    Output: embedding_vector (np.array): Averaged float64 embedding vector of length
        `dim`; all zeros when `text` contains no tokens.
    """
    if embeddings is None:
        # Fall back to the embeddings loaded earlier in the notebook
        embeddings = model

    if isinstance(text, str):
        text = text.split()

    word_embeddings = [
        embeddings[word] if word in embeddings else np.zeros(dim)
        for word in text
    ]

    # np.stack cannot handle an empty list, so an empty text maps to the zero vector
    if not word_embeddings:
        return np.zeros(dim)

    # Accumulate in float64 regardless of the dtype of the stored vectors
    return np.mean(np.stack(word_embeddings), axis=0, dtype=np.float64)

In [None]:
# Embed every headline: each stop-word-free token list becomes one averaged vector,
# stored in the new 'headline_vectors' column
df_to_vectorize['headline_vectors'] = df_to_vectorize['lemmatized_text_no_stopwords'].apply(text2vec)

In [None]:
# Preview the embedding vectors of the first 5 rows
df_to_vectorize['headline_vectors'].head()

In [None]:
# Making copy of df_to_vectorize so later changes to final_df leave it untouched
final_df = df_to_vectorize.copy()

In [None]:
# Drop the 'lemmatized_text_no_stopwords' column; 'headline_vectors' now carries the text information
# (not idempotent: re-running this cell alone raises a KeyError)
final_df = final_df.drop('lemmatized_text_no_stopwords', axis=1)

In [None]:
# Checking headlines vectors: inspect the first row's embedding by position.
# A hard-coded index label raises a KeyError as soon as that row is filtered out upstream.
final_df['headline_vectors'].iloc[0]

In [None]:
# Number of missing values per column before cleaning
final_df.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column
# (this includes rows whose 'bias_category' is None because the score matched no range)
final_df = final_df.dropna()

In [None]:
# Ensuring there are no NaN values: every per-column count should now be 0
final_df.isnull().sum()

In [None]:
# Class weights computed by hand as n_samples / (n_classes * samples_in_class)

# Distinct labels in 'bias_category' together with how often each occurs
classes, counts = np.unique(final_df['bias_category'], return_counts=True)

# One weight per class: the rarer the class, the larger its weight
total_samples = len(final_df['bias_category'])
class_weights = total_samples / (len(classes) * counts)

# Label -> weight lookup, keyed by the original string labels
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

# Show the resulting weights
print('Class Weights:', class_weight_dict)

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Setting X and y variables
X = final_df.drop('bias_category', axis=1)
y = final_df['bias_category']

# Split the data (80% train / 20% test). stratify=y keeps the proportion of each
# 'bias_category' the same in both sets, so every class is represented in the test set
# even when the classes are unevenly sized.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Listing the distinct labels in y_train; a None/NaN entry in this output would
# indicate missing labels
y_train.unique()

In [None]:
# Listing the distinct labels in y_test; a None/NaN entry in this output would
# indicate missing labels
y_test.unique()

In [None]:
def _vectors_to_array(frame):
    """Convert the per-row 'headline_vectors' entries of `frame` into a single NumPy array."""
    return np.array(frame['headline_vectors'].tolist())


# Feature arrays for the models: only the embedding vectors are used as inputs
X_train_array = _vectors_to_array(X_train)
X_test_array = _vectors_to_array(X_test)

## Simple Models

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Create an instance of LogisticRegression with class weights.
# class_weight_dict must be the string-label-keyed dict built from final_df['bias_category'];
# the neural-network cell further down rebinds that name to integer keys, so re-run the
# class-weight cell before re-running this one.
logreg_model = LogisticRegression(max_iter=1000, random_state=42, class_weight=class_weight_dict)

# Fit the model on the training data (embedding vectors only)
logreg_model.fit(X_train_array, y_train)

# Make predictions on the test data
y_pred_lr = logreg_model.predict(X_test_array)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the logistic regression predictions
print(classification_report(y_test, y_pred_lr))

finding best params

In [None]:
from sklearn.metrics import confusion_matrix

# Distinct labels present in y_test (np.unique returns them sorted)
unique_classes = np.unique(y_test)

# Confusion matrix with rows (true) and columns (predicted) ordered as unique_classes
logreg_cm = confusion_matrix(y_test, y_pred_lr, labels=unique_classes)

# Annotated heatmap of the matrix, with the class names on both axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    logreg_cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=unique_classes,
    yticklabels=unique_classes,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

In [None]:
# Raw confusion-matrix counts (rows: true labels, columns: predicted labels)
logreg_cm

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create an instance of RandomForestClassifier with class weights.
# class_weight_dict must be the string-label-keyed dict built from final_df['bias_category'];
# the neural-network cell further down rebinds that name to integer keys, so re-run the
# class-weight cell before re-running this one.
rf_model = RandomForestClassifier(random_state=42, class_weight=class_weight_dict)

# Train the model on the training data (embedding vectors only)
rf_model.fit(X_train_array, y_train)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test_array)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support for the random forest predictions
print(classification_report(y_test, y_pred_rf))

In [None]:
# Create the confusion matrix with an explicit class order (the classes the model was
# trained on), so rows and columns can be labelled with the class names
rf_classes = rf_model.classes_
rf_cm = confusion_matrix(y_test, y_pred_rf, labels=rf_classes)

# Plot the confusion matrix with class names on both axes, matching the logistic
# regression plot instead of showing bare positional indices
plt.figure(figsize=(8, 6))
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=rf_classes, yticklabels=rf_classes)
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

## Complex Models

### Neural Network

#### Simple Architecture  

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Seed TensorFlow's global random generator so repeated runs start from the same state,
# in line with the random_state=42 used for the split and the scikit-learn models
tf.random.set_seed(42)

# Map the string labels to integer codes; the encoder is fitted on y_train only
label_encoder = LabelEncoder()

# Encoding y_test and y_train
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# One-hot encoding y_train_encoded and y_test_encoded with an explicit class count, so both
# matrices have the same number of columns even if a split lacks the highest class code
num_classes = len(label_encoder.classes_)
y_train_one_hot = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=num_classes)

# Integer class codes present in the training labels
class_labels = np.unique(y_train_encoded)

# Compute balanced class weights for the neural network, keyed by integer class code.
# NOTE: this rebinds `class_weights` / `class_weight_dict`, which earlier cells define with
# string-label keys for the scikit-learn models; re-run those cells before refitting them.
class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train_encoded)
class_weight_dict = dict(enumerate(class_weights))

# Define the neural network model: two hidden ReLU layers and a softmax output per class
snn_model = Sequential()
snn_model.add(Dense(64, input_dim=X_train_array.shape[1], activation='relu'))
snn_model.add(Dense(32, activation='relu'))
snn_model.add(Dense(len(class_labels), activation='softmax'))

# Compile the model (class weights are applied at fit time, not here)
snn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with class weights; 20% of the training rows are held out for validation
snn_model.fit(X_train_array, y_train_one_hot, epochs=10, batch_size=32, validation_split=0.2, class_weight=class_weight_dict)

In [None]:
# Show which integer code the encoder assigned to each original class label
original_labels = label_encoder.classes_
encoded_ids = label_encoder.transform(original_labels)
class_labels_mapping = dict(zip(original_labels, encoded_ids))
print("Class Labels Mapping:", class_labels_mapping)

In [None]:
from sklearn.metrics import classification_report

# Make predictions on the test data; despite its name, y_pred_one_hot holds the
# softmax outputs (one probability per class and row), not one-hot vectors
y_pred_one_hot = snn_model.predict(X_test_array)

# Convert the predicted probabilities to class labels (index of the most probable class)
y_pred_classes = np.argmax(y_pred_one_hot, axis=1)

# Print classification report, naming the integer codes with the encoder's original labels
print(classification_report(y_test_encoded, y_pred_classes, target_names=label_encoder.classes_))