# Data import

In this section we import the five chosen data sets (finance, news, movies, twitter and reddit). Each data frame will have the same format, retaining only three columns:
- text: text input
- ground_truth: sentiment label (positive, negative, neutral)
- topic: indicates which dataset the observation comes from (finance, news, movie, twitter, reddit)

In [None]:
#! pip install kagglehub

In [None]:
import kagglehub
import pandas as pd
import os

In [None]:
# finance data

path = kagglehub.dataset_download("sbhatti/financial-sentiment-analysis")

print("Path to dataset files:", path)

In [None]:
csv_file = os.path.join(path, "data.csv")

df_finance = pd.read_csv(csv_file, encoding="ISO-8859-1")

In [None]:
df_finance.rename(columns={df_finance.columns[0]: "text"}, inplace=True)
df_finance.rename(columns={df_finance.columns[1]: "ground_truth"}, inplace=True)
df_finance["topic"] = "finance"

df_finance.head()

In [None]:
# news data

path = kagglehub.dataset_download("cashbowman/sentiment-labeled-headlines")

print("Path to dataset files:", path)

In [None]:
# Build the CSV location in its own variable instead of overwriting `path`.
# Overwriting `path` made this cell non-idempotent: running it a second time
# appended 'Sentiments/guardian_sentiment.csv' to an already complete file path.
csv_file = os.path.join(path, 'Sentiments', 'guardian_sentiment.csv')

df_news = pd.read_csv(csv_file, encoding="UTF-8")

In [None]:
df_news.head()

In [None]:
df_news.rename(columns={df_news.columns[2]: "text"}, inplace=True)
df_news.rename(columns={df_news.columns[3]: "ground_truth"}, inplace=True)
df_news["topic"] = "news"
df_news = df_news[["text", "ground_truth", "topic"]]

df_news.head()


In this data set, the default sentiment classification is a five-step scale. We map very negative (1) to negative, neutral (3) to neutral and very positive (5) to positive. Observations rated 2 (negative) or 4 (positive) are dropped, so that only clear cases with stronger sentiment intensity remain in our observations.

In [None]:
mapping = {1: "negative", 3: "neutral", 5: "positive"}

# Apply mapping to the "ground_truth" column
df_news["ground_truth"] = df_news["ground_truth"].map(mapping)


In [None]:
# NOTE(review): the result is not assigned, so this only displays the filtered
# frame. df_news itself still contains the rows whose label mapped to NaN;
# they are removed later by df_all.dropna(subset=['ground_truth']).
df_news.dropna()

In [None]:
# reddit data

path = kagglehub.dataset_download("cosmos98/twitter-and-reddit-sentimental-analysis-dataset")

print("Path to dataset files:", path)

In [None]:
csv_file = os.path.join(path, "Reddit_Data.csv")

df_reddit = pd.read_csv(csv_file, encoding="ISO-8859-1")

In [None]:
df_reddit["topic"] = "reddit"
df_reddit.rename(columns={df_reddit.columns[0]: "text"}, inplace=True)
df_reddit.rename(columns={df_reddit.columns[1]: "ground_truth"}, inplace=True)

The dataset labels the sentiment as -1 (negative), 0 (neutral) and 1 (positive). For conformity with other data sets, this is mapped to text labels.

In [None]:
# Define the mapping
mapping = {-1: "negative", 0: "neutral", 1: "positive"}

# Apply mapping to the "ground_truth" column
df_reddit["ground_truth"] = df_reddit["ground_truth"].map(mapping)


In [None]:
df_reddit.head()

In [None]:
# twitter data

csv_file = os.path.join(path, "Twitter_Data.csv")

#df = pd.read_csv(csv_file)
df_twitter = pd.read_csv(csv_file, encoding="UTF-8")

In [None]:
df_twitter["topic"] = "twitter"
df_twitter.rename(columns={df_twitter.columns[0]: "text"}, inplace=True)
df_twitter.rename(columns={df_twitter.columns[1]: "ground_truth"}, inplace=True)

In [None]:
# The -1/0/1 label mapping used for the reddit file is applied to the twitter
# labels as well. It is defined here explicitly rather than read from the
# `mapping` variable left behind by an earlier cell, so that this cell gives
# the same result when it is run on its own or after other mappings were set.
mapping = {-1: "negative", 0: "neutral", 1: "positive"}
df_twitter["ground_truth"] = df_twitter["ground_truth"].map(mapping)

In [None]:
df_twitter.head()

In [None]:
# movie reviews
path = kagglehub.dataset_download("yacharki/movie-review-sentiment-analysis")

print("Path to dataset files:", path)

In [None]:
path = os.path.join(path, 'Movie Reviews Sentences for Sentiment Analysis NLP')
os.listdir(path)


In [None]:
csv_file = os.path.join(path, "test.csv")


df_movie_1 = pd.read_csv(csv_file, encoding="ISO-8859-1")

In [None]:
csv_file = os.path.join(path, "train.csv")


df_movie_2 = pd.read_csv(csv_file, encoding="ISO-8859-1")

In [None]:
df_movie = pd.concat([df_movie_1, df_movie_2], ignore_index=True)

In [None]:
df_movie["topic"] = "movie"
df_movie.rename(columns={df_movie.columns[0]: "ground_truth"}, inplace=True)
df_movie.rename(columns={df_movie.columns[1]: "text"}, inplace=True)

In [None]:
df_movie["ground_truth"].unique()

This data set also uses a 5-step scale: 0 (very negative) is converted to negative, 2 (neutral) to neutral and 4 (very positive) to positive. Observations rated 1 (negative) or 3 (positive) are dropped to retain only texts with stronger sentiment intensity. This is the same logic as for the news data set.

In [None]:
# Define the mapping
mapping = {0: "negative", 2: "neutral", 4: "positive"}

# Apply mapping to the "ground_truth" column
df_movie["ground_truth"] = df_movie["ground_truth"].map(mapping)


In [None]:
# NOTE(review): the result is not assigned, so this only displays the filtered
# frame. df_movie itself still contains the rows whose label mapped to NaN;
# they are removed later by df_all.dropna(subset=['ground_truth']).
df_movie.dropna()

# Dataset comparison

We imported the datasets that will be used. They are:
- df_finance
- df_news
- df_reddit
- df_twitter
- df_movie

In [None]:
# creating one big dataset

df_all = pd.concat([df_finance, df_news, df_reddit, df_twitter, df_movie], ignore_index=True)

In [None]:
df_all.shape

Before proceeding we will drop duplicates (if any) and very short text inputs of one or two words. Such short observations, especially for the Naive Bayes model, are hard to classify if the words are unseen.

In [None]:
df_all = df_all.drop_duplicates(subset='text', keep='first')

In [None]:
df_all.shape # 1,323 duplicates have been removed

In [None]:
df_all = df_all[df_all['text'].str.split().str.len() > 2]

In [None]:
df_all.shape  # 45,060 single-word and double-word inputs have been removed

In [None]:
df_all = df_all.dropna(subset=['ground_truth'])
df_all = df_all[df_all['ground_truth'] != '']


In [None]:
df_all.shape # 37,638 observations dropped with missing sentiment label

The length of the text inputs is also investigated. It is preferable to stay within a defined range of word counts to obtain homogeneous text inputs.

In [None]:
def input_lengths(data):
    """Summarise the whitespace-token length of the texts per topic.

    Side effect: a 'word_count' column (number of whitespace-separated tokens
    in 'text') is added to ``data`` itself.

    Parameters
    ----------
    data : DataFrame with at least the columns 'text' and 'topic'.

    Returns
    -------
    DataFrame with one row per topic (in order of first appearance) and the
    columns 'topic', 'average_words', 'median_words', 'shortest', 'longest'.
    """
    data['word_count'] = data['text'].str.split().str.len()

    summary = pd.DataFrame(
        columns=['topic', 'average_words', 'median_words', 'shortest', 'longest']
    )
    for name in data['topic'].unique():
        lengths = data.loc[data['topic'] == name, 'word_count']
        # Append the statistics of this topic as the next row
        summary.loc[len(summary)] = [
            name,
            lengths.mean(),
            lengths.median(),
            lengths.min(),
            lengths.max(),
        ]
    return summary

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# input_lengths() also adds a 'word_count' column to df_all as a side effect;
# the box plots and the 50-word length filter further down read that column.
word_counts = input_lengths(df_all)
# Store the per-topic length summary in the analysis folder on Drive
word_counts.to_csv("/content/drive/My Drive/BA THESIS/analysis/raw_word_counts.csv")
word_counts

In [None]:
# Boxplot of word count

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

sns.boxplot(x = 'topic', y = 'word_count', data = df_all)
plt.xticks(rotation=45)
plt.title('Word Count by Topic')
plt.tight_layout()
plt.show()

The median input lengths are between 4 and 20 words for each topic. To obtain more comparable distributions, all observations longer than 50 words will be removed.

In [None]:
df_all = df_all[df_all['word_count'] <= 50] # 5,312 observations were removed
df_all.shape

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

sns.boxplot(x = 'topic', y = 'word_count', data = df_all)
plt.xticks(rotation=45)
plt.title('Word Count by Topic')
plt.tight_layout()
plt.savefig("/content/drive/My Drive/BA THESIS/analysis/raw_word_count_distribution_box.png")
plt.show()

In [None]:

plt.figure(figsize=(10, 6))

# One line per topic: x = word count, y = share of the topic's texts that have
# exactly that many words.
for topic in df_all['topic'].unique():
    lengths = df_all.loc[df_all['topic'] == topic, 'word_count']
    frequency = lengths.value_counts().sort_index()
    occurrences = frequency.values
    share = occurrences / occurrences.sum()
    plt.plot(frequency.index, share, label=topic, marker=None)

plt.xlabel("Word Count")
plt.ylabel("Relative Frequency")
plt.title("Word Count Distribution by Topic (Relative Frequency)")
plt.legend()
plt.grid(True)
plt.savefig("/content/drive/My Drive/BA THESIS/analysis/raw_word_count_distribution_line.png")
plt.show()


In [None]:
df_all.shape

In [None]:
table = pd.crosstab(df_all["ground_truth"], df_all["topic"], margins=True)
table.columns.name = None
table

In [None]:
table.to_csv("/content/drive/My Drive/BA THESIS/analysis/raw_table_sentiment_topic_distribution.csv")

Across all topics except for news, the "negative" class is the least represented. Within each of these topics we will downsample the "neutral" and "positive" classes to the size of the "negative" class, so that the classes are balanced within topics. In the case of the news data, the classes are downsampled to the size of the neutral class.

In [None]:
from sklearn.utils import resample

In [None]:
def balance_ground_truth(group):
    """Downsample every sentiment class in ``group`` to the rarest class size.

    Parameters
    ----------
    group : DataFrame with a 'ground_truth' column (one topic's observations).

    Returns
    -------
    DataFrame in which each 'ground_truth' value occurs equally often, with a
    fresh 0..n-1 index. Classes already at the minimum size are kept as they
    are; larger ones are sampled without replacement (random_state=42).
    """
    label_sizes = group["ground_truth"].value_counts()
    target = label_sizes.min()

    parts = []
    for sentiment in label_sizes.index:
        subset = group[group["ground_truth"] == sentiment]
        if len(subset) <= target:
            # Already at the minimum size: nothing to sample
            parts.append(subset)
            continue
        parts.append(
            resample(subset, replace=False, n_samples=target, random_state=42)
        )

    return pd.concat(parts).reset_index(drop=True)

# Balance the sentiment classes independently inside every topic
per_topic = df_all.groupby("topic", group_keys=False)
df_resampled = per_topic.apply(balance_ground_truth)

# Show the resulting class sizes per topic
print(df_resampled.groupby(["topic", "ground_truth"]).size())

Now the sentiments are balanced, but the topics still need to be balanced. To not lose too many observations, we will sample down each topic that has more than 2,000 observations per sentiment to 2,000. This concerns the reddit, twitter and movie topics.

In [None]:
# Cap movie, reddit and twitter at 2,000 rows per sentiment label

sample_topics = ['movie', 'reddit', 'twitter']
max_per_label = 2000

# Split the data into two parts:
# 1. Rows where the topic is one of the sample_topics (to be capped)
# 2. Rows of all other topics (to be retained as-is)
in_sample_topics = df_resampled['topic'].isin(sample_topics)
df_to_sample = df_resampled[in_sample_topics]
df_remaining = df_resampled[~in_sample_topics]

# Randomly draw at most `max_per_label` rows for every (sentiment, topic) pair.
# min(...) keeps a group that is smaller than the cap in full; without it
# DataFrame.sample raises ValueError when asked for more rows than exist.
sampled_df = df_to_sample.groupby(['ground_truth', 'topic'], group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), max_per_label), random_state=222)
)

# Combine the sampled subset with the rest of the data.
df_resampled = pd.concat([sampled_df, df_remaining]).reset_index(drop=True)

# Verify the resulting counts
print(df_resampled.groupby(['topic', 'ground_truth']).size())

In [None]:
# Shuffle the dataset
df_balanced = df_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
df_balanced = df_balanced.drop(columns=['word_count'])


In [None]:
df_balanced.head()

In [None]:
df_balanced.to_csv("/content/drive/My Drive/BA THESIS/data/df_balanced.csv", index=False)

# Text preprocessing

Naive Bayes and BERT require different processing. For Naive Bayes, negations are treated first, then stop words are removed, and finally all special characters and numbers are removed. For BERT, only special characters and numbers are removed; stop words and negations do not need to be treated differently.

In [None]:
import pandas as pd

In [None]:
df_nb = pd.read_csv("/content/drive/My Drive/BA THESIS/data/df_balanced.csv")
df_bert = pd.read_csv("/content/drive/My Drive/BA THESIS/data/df_balanced.csv")

In [None]:
df_nb.loc[109:113]

In [None]:

import pandas as pd
pd.set_option("display.max_colwidth", None)
df_nb.loc[109:113]


In [None]:
import re
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


First negations are processed. In a sentence "not good" will become NOT_good or "I wasn't happy" will be NOT_happy.

In [None]:
def preprocess_negation(text):
    """Lower-case ``text``, fuse negations with the next word and drop stop words.

    The text is split on whitespace only. A negation word (e.g. "not",
    "wasn't") together with the word that follows it is replaced by a single
    "NOT_<word>" token; a negation that ends the text is kept unchanged. Every
    other token is kept unless it is contained in the module-level
    ``stop_words`` set.

    Returns the remaining tokens joined by single spaces.
    """
    negations = {
        "no", "not", "nor", "wasn't", "weren't", "wouldn't", "won't", "can't", "couldn't",
        "hasn't", "haven't", "hadn't", "isn't", "aren't", "shouldn't", "shan't", "needn't",
        "mightn't", "don't", "doesn't",
    }

    kept = []
    tokens = iter(text.lower().split())
    for token in tokens:
        if token in negations:
            # Consume the following word as well, so it is not processed twice
            following = next(tokens, None)
            if following is None:
                kept.append(token)  # negation is the last word of the text
            else:
                kept.append("NOT_" + following)
        elif token not in stop_words:
            kept.append(token)
    return " ".join(kept)

In [None]:
df_nb['text'] = df_nb['text'].apply(preprocess_negation)


Then special characters and numbers are removed.

In [None]:
import re

# Matches every character that is not an ASCII letter, whitespace or an
# underscore. The previous class r'[^a-zA-Z\sNOT_]' matched exactly the same
# characters: N, O and T are already covered by A-Z, so only '_' was added.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z\s_]')


def remove_special_characters_and_numbers(text):
    """Strip digits, punctuation and other non-letter characters from ``text``.

    ASCII letters, whitespace and underscores are kept; the underscore is what
    keeps "NOT_<word>" negation tokens intact. Everything else, including
    accented letters, is removed. Whitespace is left as it is, so removed
    characters can leave consecutive spaces behind.
    """
    return _DISALLOWED_CHARS.sub('', text)

In [None]:
df_nb['text'] = df_nb['text'].apply(remove_special_characters_and_numbers)


In [None]:
df_bert['text'] = df_bert['text'].apply(remove_special_characters_and_numbers)

In [None]:
df_nb.head()

In [None]:
df_bert.head()

Stop words, numbers and special characters have been removed. Now we inspect if any text input remains empty after this processing.

In [None]:
df_nb.shape == df_bert.shape

In [None]:
import numpy as np

In [None]:
df_nb["text"] = df_nb["text"].replace(r'^\s*$', np.nan, regex=True)
df_bert["text"] = df_bert["text"].replace(r'^\s*$', np.nan, regex=True)

print(df_nb.isna().sum(), df_bert.isna().sum())  # See NaN counts

In [None]:
pattern = re.compile(r'^\s*nan\s*$', flags=re.IGNORECASE)

df_nb["text"] = df_nb["text"].replace(pattern, np.nan, regex=True)
df_bert["text"] = df_bert["text"].replace(pattern, np.nan, regex=True)

print(df_nb.isna().sum(), df_bert.isna().sum())  # See NaN counts

In [None]:

# Drop every df_nb row that contains a missing value. No `subset` is given, so
# NaN in any column removes the row, not only NaN in 'text'.
df_nb.dropna(inplace=True)

# Index labels still present in df_bert but no longer in df_nb, i.e. the rows
# that were just dropped from df_nb. This relies on both frames sharing the
# same index labels up to this point.
index_to_drop = df_bert.index.difference(df_nb.index)

# Drop the same observations from df_bert so both frames keep the same rows
df_bert.drop(index_to_drop, inplace=True)


In [None]:

# Reset index for df_nb and df_bert
df_nb = df_nb.reset_index(drop=True)
df_bert = df_bert.reset_index(drop=True)


In [None]:
df_nb.shape == df_bert.shape

After processing, some text inputs were left empty. These were also removed.

In [None]:
table = pd.crosstab(df_bert["ground_truth"], df_bert["topic"], margins=True)
table.columns.name = None
table

In [None]:
table.to_csv("/content/drive/My Drive/BA THESIS/analysis/raw_table_sentiment_topic_distribution_after_processing.csv")

In [None]:
df_bert.head(10)

# Lemmatizing

Additional processing is required for the Naive Bayes model. Both stemming and lemmatizing will be tried.

In [None]:
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with NLTK's ``pos_tag``. The upper-cased
    first letter of the tag selects adjective (J), noun (N), verb (V) or
    adverb (R); any other tag falls back to noun.
    """
    from nltk import pos_tag
    from nltk.corpus import wordnet

    _, tag = pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun
    return wordnet.NOUN

In [None]:
df_NB_lem = df_nb.copy()

def lemm_text(text):
    """Tokenize ``text``, skip stop words and lemmatize the remaining tokens.

    Each kept token is lemmatized with the part of speech returned by
    ``get_wordnet_pos``. The lemmas are returned joined by single spaces.
    """
    lemmas = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return ' '.join(lemmas)

# Apply preprocessing to the text column
df_NB_lem["text"] = df_NB_lem["text"].astype(str).apply(lemm_text)


# Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

df_NB_stem = df_nb.copy()

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def stem_text(text):
    """Tokenize ``text``, skip stop words and stem the remaining tokens.

    The stems are returned joined by single spaces.
    """
    content_tokens = (tok for tok in word_tokenize(text) if tok not in stop_words)
    return ' '.join(stemmer.stem(tok) for tok in content_tokens)

# Apply stemming to the text column
df_NB_stem["text"] = df_NB_stem["text"].astype(str).apply(stem_text)


In [None]:
df_NB_lem.head(10)

In [None]:
df_NB_stem.head(10)

In [None]:
table = pd.crosstab(df_balanced["ground_truth"], df_balanced["topic"], margins=True)
table.columns.name = None
table

# Data Split
We split the data into 70% train, 10% validation and 20% test data.

In [None]:
from sklearn.model_selection import train_test_split

# First, split into 80% train+val and 20% test (stratifying on both 'topic' and 'ground_truth')
train_val_NB_lem, test_NB_lem = train_test_split(df_NB_lem, test_size=0.2, stratify=df_NB_lem[['topic', 'ground_truth']], random_state=222)

# Now, split train_val into 70% train and 10% validation
train_NB_lem, val_NB_lem = train_test_split(train_val_NB_lem, test_size=0.125, stratify=train_val_NB_lem[['topic', 'ground_truth']], random_state=222)
# (0.125 * 80% = 10% of the total data)

In [None]:
train_NB_lem.head()

In [None]:

# First, split into 80% train+val and 20% test (stratifying on both 'topic' and 'ground_truth')
train_val_NB_stem, test_NB_stem = train_test_split(df_NB_stem, test_size=0.2, stratify=df_NB_stem[['topic', 'ground_truth']], random_state=222)

# Now, split train_val into 70% train and 10% validation
train_NB_stem, val_NB_stem = train_test_split(train_val_NB_stem, test_size=0.125, stratify=train_val_NB_stem[['topic', 'ground_truth']], random_state=222)
# (0.125 * 80% = 10% of the total data)

In [None]:
train_NB_stem.head()


In [None]:
table = pd.crosstab(train_NB_stem["ground_truth"], train_NB_stem["topic"], margins=True)
table.columns.name = None
table

In [None]:
#table = pd.crosstab(test_NB["ground_truth"], test_NB["topic"], margins=True)
#table.columns.name = None
#table

In [None]:
# First, split into 80% train+val and 20% test (stratifying on both 'topic' and 'ground_truth')
train_val_BERT, test_BERT = train_test_split(df_bert, test_size=0.2, stratify=df_bert[['topic', 'ground_truth']], random_state=222)

# Now, split train_val into 70% train and 10% validation
train_BERT, val_BERT = train_test_split(train_val_BERT, test_size=0.125, stratify=train_val_BERT[['topic', 'ground_truth']], random_state=222)
# (0.125 * 80% = 10% of the total data)

In [None]:
train_BERT.head()

In [None]:
table = pd.crosstab(train_BERT["ground_truth"], train_BERT["topic"], margins=True)
table.columns.name = None
table

# Save the data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
path = "/content/drive/My Drive/BA THESIS/data"

In [None]:
# Save the data
train_NB_lem.to_csv(os.path.join(path, 'train_NB_lem.csv'), index=False)
val_NB_lem.to_csv(os.path.join(path, 'val_NB_lem.csv'), index=False)
test_NB_lem.to_csv(os.path.join(path, 'test_NB_lem.csv'), index=False)

In [None]:
train_NB_stem.to_csv(os.path.join(path, 'train_NB_stem.csv'), index=False)
val_NB_stem.to_csv(os.path.join(path, 'val_NB_stem.csv'), index=False)
test_NB_stem.to_csv(os.path.join(path, 'test_NB_stem.csv'), index=False)

In [None]:
train_BERT.to_csv(os.path.join(path, 'train_BERT.csv'), index=False)
val_BERT.to_csv(os.path.join(path, 'val_BERT.csv'), index=False)
test_BERT.to_csv(os.path.join(path, 'test_BERT.csv'), index=False)