In [None]:
import pandas as pd

In [None]:
# Load the raw headlines dataset from the CSV under Data/.
headlines = pd.read_csv('Data/headlines.csv')

# Preview the first rows to check what was read.
headlines.head()

In [None]:
# Remove the columns that are not needed downstream: url, 'Unnamed: 0' and index
headlines = headlines.drop(['url', 'Unnamed: 0', 'index'], axis=1)

In [None]:
# Preview the frame after the column drop.
headlines.head()

In [None]:
# (rows, columns) of the frame after the column drop.
headlines.shape

In [None]:
# Count how many headlines carry each distinct value of the 'bias' column
headlines['bias'].value_counts()

In [None]:
# Histogram (with KDE overlay) of the 'bias' column

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's whitegrid theme
sns.set(style="whitegrid")

# Draw the histogram on a fresh 10x6 figure and keep the Axes for styling
plt.figure(figsize=(10, 6))
ax = sns.histplot(headlines['bias'], bins=20, kde=True, color='blue')

# Titles, axis labels, tick sizes and a dashed horizontal grid
ax.set_title('Distribution of Bias Scores', fontsize=16)
ax.set_xlabel('Bias Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure
plt.tight_layout()
plt.show()


In [None]:
# Add the 'sentiment_polarity' feature from NLTK's SentimentIntensityAnalyzer
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# One analyzer instance, reused for every headline
sid = SentimentIntensityAnalyzer()

# Score each headline and keep only the 'compound' entry of the result
headlines['sentiment_polarity'] = [
    sid.polarity_scores(text)['compound']
    for text in headlines['headline_no_site']
]

In [None]:
# Preview the frame with the new 'sentiment_polarity' column.
headlines.head()

In [None]:
# Peek at the first few values of the 'bias' column.
headlines['bias'].head()

In [None]:
# Frequency of each distinct sentiment polarity score.
headlines['sentiment_polarity'].value_counts()

In [None]:
# Report the smallest and the largest sentiment polarity, one per line
print(
    headlines['sentiment_polarity'].min(),
    headlines['sentiment_polarity'].max(),
    sep='\n',
)

In [None]:
# Histogram (with KDE overlay) of the 'sentiment_polarity' column

# Use seaborn's whitegrid theme
sns.set(style="whitegrid")

# Draw the histogram on a fresh 10x6 figure and keep the Axes for styling
plt.figure(figsize=(10, 6))
ax = sns.histplot(headlines['sentiment_polarity'], bins=20, kde=True, color='green')

# Titles, axis labels, tick sizes and a dashed horizontal grid
ax.set_title('Distribution of Sentiment Polarity Scores', fontsize=16)
ax.set_xlabel('Sentiment Polarity Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure
plt.tight_layout()
plt.show()


In [None]:
import datetime

# Parse 'time' once, before deriving anything from it. errors='coerce' turns
# unparseable values into NaT instead of raising, so the derived features below
# come out as NaN for those rows. Previously the column was re-parsed for every
# feature and the first three parses did not coerce, so a single bad timestamp
# raised before the coercing call was ever reached.
headlines['time'] = pd.to_datetime(headlines['time'], errors='coerce')

# Date features
headlines['Day_of_Week'] = headlines['time'].dt.day_name()
headlines['Month'] = headlines['time'].dt.month
# Time feature
headlines['Hour_of_Day'] = headlines['time'].dt.hour
# Year of publication
headlines['Publication_Year'] = headlines['time'].dt.year

In [None]:
# The date/time features have been extracted, so discard the 'time' column
headlines = headlines.drop('time', axis=1)

In [None]:
# Preview the frame with the date/time features and without 'time'.
headlines.head()

In [None]:
# Word_Count: number of whitespace-separated words in each headline
headlines['Word_Count'] = [
    len(text.split()) for text in headlines['headline_no_site']
]
# Text_Length: number of characters in each headline
headlines['Text_Length'] = headlines['headline_no_site'].map(len)

In [None]:
# Preview the frame with the Word_Count and Text_Length columns.
headlines.head()

In [None]:
# Number of headlines per value of the 'country' column.
headlines['country'].value_counts()

In [None]:
# Number of headlines per value of the 'site' column.
headlines['site'].value_counts()

In [None]:
# Minimum number of headlines a site needs in order to be kept
min_headlines_threshold = 5000

# Names of the sites whose headline count reaches the threshold
top_sites = (
    headlines['site']
    .value_counts()
    .loc[lambda counts: counts >= min_headlines_threshold]
    .index
)

# Independent copy restricted to headlines from those sites
headlines_filtered = headlines.loc[headlines['site'].isin(top_sites)].copy()

In [None]:
# (rows, columns) remaining after the site filter.
headlines_filtered.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# The (up to) ten sites with the most headlines in the filtered data
top_10_sites = headlines_filtered['site'].value_counts().nlargest(10)

# Ask for one colour per wedge. A named seaborn palette yields 6 colours by
# default, which made matplotlib cycle and reuse colours across the slices.
slice_colors = sns.color_palette('viridis', n_colors=len(top_10_sites))

# Pie chart of each site's share, labelled with the site name and percentage
plt.figure(figsize=(8, 8))
plt.pie(top_10_sites, labels=top_10_sites.index, autopct='%1.1f%%', colors=slice_colors, startangle=90)
plt.title('Top 10 News Sources Distribution')
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Bar chart of how many filtered headlines fall under each country
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=headlines_filtered, x='country', palette='viridis')
ax.set_title('Distribution of Countries')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Headlines')
plt.tight_layout()

# Render the figure
plt.show()

In [None]:
# Preview the filtered frame.
headlines_filtered.head()

In [None]:
# Distinct 'bias' values (and their frequencies) in the filtered frame.
headlines_filtered['bias'].value_counts()

In [None]:
import numpy as np

# One closed interval [low, high] per category. Neighbouring intervals share an
# edge; np.select takes the first matching condition, so a value sitting exactly
# on an edge gets the lower category.
conditions = [
    headlines_filtered['bias'].between(low, high, inclusive='both')
    for low, high in [(0.000000, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.0)]
]

# Category names, in the same order as the intervals above
labels = ['No Bias', 'Low Bias', 'Moderate Bias', 'Medium Bias', 'High Bias']

# Rows matching no interval are left as None
headlines_filtered['bias_category'] = np.select(conditions, labels, default=None)


In [None]:
# Preview the frame with the new 'bias_category' column.
headlines_filtered.head()

In [None]:
# Number of headlines in each bias category.
headlines_filtered['bias_category'].value_counts()

In [None]:
# 'bias_category' replaces the numeric score, so discard the 'bias' column
headlines_filtered = headlines_filtered.drop('bias', axis=1)

In [None]:
# Preview the frame after removing 'bias'.
headlines_filtered.head()

In [None]:
# Number of missing values in each column.
headlines_filtered.isnull().sum()

In [None]:
# Use seaborn's whitegrid theme
sns.set(style="whitegrid")

# Histogram of the Word_Count column, 30 bins, no KDE overlay
plt.figure(figsize=(10, 6))
ax = sns.histplot(headlines_filtered['Word_Count'], bins=30, color='skyblue', kde=False)
ax.set_title('Distribution of Word Count in Headlines')
ax.set_xlabel('Word_Count')
ax.set_ylabel('Frequency')

# Render the figure
plt.show()

In [None]:
# Set the style for seaborn
sns.set(style="whitegrid")

# Histogram of the Text_Length column, 30 bins, no KDE overlay
plt.figure(figsize=(10, 6))
sns.histplot(headlines_filtered['Text_Length'], bins=30, color='skyblue', kde=False)
plt.title('Distribution of Text Length in Headlines')
# The x-axis shows Text_Length; it was previously labelled 'Word_Count'
# (carried over from the word-count plot).
plt.xlabel('Text_Length')
plt.ylabel('Frequency')

# Show the plot
plt.show()

In [None]:
# Use seaborn's whitegrid theme
sns.set(style="whitegrid")

# Headline counts per Publication_Year, split into one bar per bias_category
plt.figure(figsize=(12, 8))
ax = sns.countplot(data=headlines_filtered, x="Publication_Year", hue="bias_category")
ax.set_title('Distribution of Bias Category by Publication_Year')
ax.set_xlabel('Publication Year')
ax.set_ylabel('Count')

# Render the figure
plt.show()

In [None]:
# Set a Seaborn style
sns.set(style="whitegrid")

# Scatter plot of sentiment polarity (x) against bias category (y), coloured by
# bias category with the viridis palette.
# NOTE(review): `size` is seaborn's semantic size mapping, not the marker size.
# If a fixed marker size was intended, matplotlib's `s=` is the keyword - confirm.
plt.figure(figsize=(12, 6))
scatter = sns.scatterplot(x='sentiment_polarity', y='bias_category', data=headlines_filtered, hue='bias_category', palette='viridis', size=3)

# Style the plot
plt.title('Distribution of Sentiment Polarity for Different Bias Categories', fontsize=16)
plt.xlabel('Sentiment Polarity', fontsize=12)
plt.ylabel('Bias Category', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='both', linestyle='--', alpha=0.7)

# The ScalarMappable that used to be built here was never attached to a
# colorbar, so it had no effect on the figure and has been removed.

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Columns to one-hot encode
categorical_columns = ['site', 'country', 'Day_of_Week', 'Month', 'Hour_of_Day', 'Publication_Year']

# Create one-hot encoded columns with 1s and 0s.
# `columns=` is passed explicitly: without it get_dummies only encodes
# object/string/category columns and passes numeric ones through untouched.
# Month, Hour_of_Day and Publication_Year come from the .dt accessor and are
# numeric, so they were never encoded - and the drop of the original columns
# then removed them from the result altogether.
one_hot_encoded = pd.get_dummies(
    headlines_filtered[categorical_columns],
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

# Replace the original categorical columns with their encoded counterparts
headlines_filtered_encoded = pd.concat(
    [headlines_filtered.drop(columns=categorical_columns), one_hot_encoded],
    axis=1,
)

# Display the resulting DataFrame
headlines_filtered_encoded.head()


In [None]:
# Rename the 'headline_no_site' column to 'headlines'
headlines_filtered_encoded = headlines_filtered_encoded.rename(
    columns={'headline_no_site': 'headlines'}
)

headlines_filtered_encoded.head()

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re


def _clean_tokens(tokens):
    """Return the tokens lowercased with every non-alphanumeric character removed.

    Tokens that end up empty after the removal (e.g. pure punctuation) are dropped.
    """
    # Run the substitution once per token. The previous lambda evaluated the
    # same regex twice (once to filter, once for the value) and called .strip()
    # on text that could no longer contain whitespace.
    stripped = (re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens)
    return [token.lower() for token in stripped if token]


# Tokenize the headline text
headlines_filtered_encoded['tokenized_text'] = headlines_filtered_encoded['headlines'].apply(word_tokenize)

# Strip non-alphanumeric characters, drop empty tokens and convert to lowercase
headlines_filtered_encoded['cleaned_text'] = headlines_filtered_encoded['tokenized_text'].apply(_clean_tokens)

# Lemmatization
lemmatizer = WordNetLemmatizer()
headlines_filtered_encoded['lemmatized_text'] = headlines_filtered_encoded['cleaned_text'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])


In [None]:
# Preview the frame with the tokenized, cleaned and lemmatized columns.
headlines_filtered_encoded.head()

In [None]:
# Work on a copy so later edits do not touch headlines_filtered_encoded.
lemmatized_df = headlines_filtered_encoded.copy()

In [None]:
# Discard the 'headlines' text column from lemmatized_df
lemmatized_df = lemmatized_df.drop(columns='headlines')

# Preview lemmatized_df without that column
lemmatized_df.head()


Remove stopwords

In [None]:
from nltk.corpus import stopwords

# English stop-word list, held as a set for membership checks
stop_words = set(stopwords.words('english'))

# Keep only the lemmatized tokens that are not stop words
lemmatized_df['lemmatized_text_no_stopwords'] = lemmatized_df['lemmatized_text'].map(
    lambda tokens: list(filter(lambda token: token not in stop_words, tokens))
)


In [None]:
# Peek at the first token lists after stop-word removal.
lemmatized_df['lemmatized_text_no_stopwords'].head()

In [None]:
# Preview the whole frame with the new column.
lemmatized_df.head()

In [None]:
# Work on a copy so the vectorization steps leave lemmatized_df untouched.
df_to_vectorize = lemmatized_df.copy()

In [None]:
# Intermediate text columns to discard
columns_to_drop = ['tokenized_text', 'cleaned_text', 'lemmatized_text']

# Remove them from df_to_vectorize
df_to_vectorize = df_to_vectorize.drop(columns=columns_to_drop)

In [None]:
# Preview the frame after dropping the intermediate text columns.
df_to_vectorize.head()

In [None]:
# Peek at the token lists that will be embedded.
df_to_vectorize['lemmatized_text_no_stopwords'].head()

In [None]:
#Import
import gensim
import gensim.downloader

In [None]:
# Load the pretrained 'fasttext-wiki-news-subwords-300' vectors through gensim's
# downloader and bind them to `model`, which text2vec reads as a global.
model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

# Print the names of all models available in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

In [None]:
from gensim.utils import simple_preprocess

In [None]:
def text2vec(text, embeddings=None, dim=300):
    """
    Embed a tokenized text as the average of its word vectors.

    Out-of-vocabulary tokens contribute a zero vector, so they pull the
    average towards zero. No tokenization, lemmatization or stop-word
    removal happens here; the caller is expected to have done it already.
    -----

    Input:
        text: iterable of tokens (e.g. a list of strings)
        embeddings: object supporting `token in embeddings` and
            `embeddings[token]`; defaults to the notebook-level `model`
        dim: dimensionality of the word vectors (300 by default)
    Output: embedding vector (np.array of shape (dim,)); all zeros when
        `text` contains no tokens
    """
    vectors = model if embeddings is None else embeddings
    tokens = list(text)

    # Nothing to average: return the zero vector instead of taking the mean
    # of an empty stack.
    if not tokens:
        return np.zeros(dim)

    # One row per token; rows of out-of-vocabulary tokens stay at zero.
    word_embeddings = np.zeros((len(tokens), dim))
    for row, word in enumerate(tokens):
        if word in vectors:
            word_embeddings[row] = vectors[word]

    # Average over the tokens only. The earlier version seeded the list with an
    # extra zero vector, so it divided by len(tokens) + 1 and shrank every
    # embedding (a one-word text got half of its word vector).
    return word_embeddings.mean(axis=0)


In [None]:
# Embed every token list with text2vec and store the vectors in 'title_vectors'
df_to_vectorize['title_vectors'] = df_to_vectorize['lemmatized_text_no_stopwords'].apply(text2vec)

In [None]:
# Peek at the first embedding vectors.
df_to_vectorize['title_vectors'].head()

In [None]:
# Preview the frame with the 'title_vectors' column.
df_to_vectorize.head()

In [None]:
# List every column currently in the frame.
df_to_vectorize.columns

In [None]:
# Work on a copy so later steps leave df_to_vectorize untouched.
final_df = df_to_vectorize.copy()

In [None]:
# Preview final_df.
final_df.head()

In [None]:
# The embeddings are stored in 'title_vectors'; discard the token-list column
final_df = final_df.drop(columns='lemmatized_text_no_stopwords')

Scale data