## Data Loading


Load the preprocessed data

In [None]:
# Mount Google Drive (Colab only) so the /content/drive paths used below resolve
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Drive locations of the lemmatized Naive Bayes train/validation splits.
# Note: file_path_train / file_path_val are rebound by each of the loading cells.
file_path_train = "/content/drive/My Drive/BA THESIS/data/train_NB_lem.csv"
file_path_val = "/content/drive/My Drive/BA THESIS/data/val_NB_lem.csv"
# The test split is not loaded in this notebook (lines kept for reference)
#file_path_test = "/content/drive/My Drive/BA THESIS/data/test_NB_lem.csv"


# Read the lemmatized splits into DataFrames
train_NB_lem = pd.read_csv(file_path_train)
val_NB_lem = pd.read_csv(file_path_val)
#test_NB_lem = pd.read_csv(file_path_test)


In [None]:
# Drive locations of the stemmed Naive Bayes train/validation splits
# (rebinds the path variables used by the previous loading cell).
file_path_train = "/content/drive/My Drive/BA THESIS/data/train_NB_stem.csv"
file_path_val = "/content/drive/My Drive/BA THESIS/data/val_NB_stem.csv"
# The test split is not loaded in this notebook (lines kept for reference)
#file_path_test = "/content/drive/My Drive/BA THESIS/data/test_NB_stem.csv"


# Read the stemmed splits into DataFrames
train_NB_stem = pd.read_csv(file_path_train)
val_NB_stem = pd.read_csv(file_path_val)
#test_NB_stem = pd.read_csv(file_path_test)


In [None]:
# Drive locations of the train/validation splits prepared for BERT
# (rebinds the path variables used by the previous loading cells).
file_path_train = "/content/drive/My Drive/BA THESIS/data/train_BERT.csv"
file_path_val = "/content/drive/My Drive/BA THESIS/data/val_BERT.csv"
# The test split is not loaded in this notebook (lines kept for reference)
#file_path_test = "/content/drive/My Drive/BA THESIS/data/test_BERT.csv"


# Read the BERT splits into DataFrames
train_BERT = pd.read_csv(file_path_train)
val_BERT = pd.read_csv(file_path_val)
#test_BERT = pd.read_csv(file_path_test)


In [None]:
# Per-column missing-value counts for the three training frames (lemmatized, stemmed, BERT)
print(train_NB_lem.isna().sum(), train_NB_stem.isna().sum(), train_BERT.isna().sum())

In [None]:
# Per-column missing-value counts for the three validation frames (lemmatized, stemmed, BERT)
print(val_NB_lem.isna().sum(), val_NB_stem.isna().sum(), val_BERT.isna().sum())

In [None]:
# Two additional NAs need to be removed from the stemmed and lemmatized sets:
# drop every row that contains a missing value in any column.
# NOTE(review): only the training frames are cleaned here; the validation frames
# are used as loaded — confirm their isna() counts above are zero.

train_NB_lem = train_NB_lem.dropna()
train_NB_stem = train_NB_stem.dropna()

# Naive Bayes model training


Both the lemmatized and the stemmed datasets are used separately, and the highest validation accuracy determines which method is carried forward. The intuition behind this is that with stemming the occurrences of individual tokens (stems) will be higher, as it strips word endings by brute force and therefore yields fewer distinct stems and a stronger reduction of the vocabulary compared to lemmatizing.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
def best_parameter_search(vectorizer, X_train, y_train, X_val, y_val, param_grid=None, cv=5):
    """Grid-search a vectorizer + MultinomialNB pipeline and print validation metrics.

    The hyperparameter search is cross-validated on the training data only.
    The best pipeline found is then used to predict the validation split, and
    accuracy, the classification report, the best parameters and the best
    cross-validation score are printed.

    Parameters
    ----------
    vectorizer : estimator
        Unfitted text vectorizer used as the first pipeline step
        (step name ``'vectorizer'``).
    X_train, y_train : array-like
        Training texts and their labels.
    X_val, y_val : array-like
        Validation texts and their labels.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``; keys must be prefixed with
        ``'vectorizer__'`` or ``'classifier__'``. When omitted, the n-gram
        range / ``max_features`` / ``alpha`` grid below is used. Pass a custom
        grid for vectorizers that lack ``max_features``.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    None
        All results are printed.
    """
    if param_grid is None:
        # Default grid of hyperparameters
        param_grid = {
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'vectorizer__max_features': [5000, 10000, 15000, 20000, 25000],
            'classifier__alpha': [0.1, 0.5, 1.0]
        }

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', MultinomialNB())
    ])

    # Grid search only on training data
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best model from grid search
    best_model = grid_search.best_estimator_

    # Evaluate on validation set
    y_pred = best_model.predict(X_val)

    # Print evaluation metrics
    print("Validation Accuracy:", accuracy_score(y_val, y_pred))
    print("\nClassification Report:\n", classification_report(y_val, y_pred))
    print("Best Parameters:", grid_search.best_params_)
    print("Best Cross-Validation Score: {:.4f}".format(grid_search.best_score_))

## Lemmatizing vs Stemming (with different vectorizers)

In [None]:
# Lemmatized training split: text as model input, ground-truth label as target
x_lem, y_lem = train_NB_lem['text'], train_NB_lem['ground_truth']

In [None]:
# Lemmatized validation split: text as model input, ground-truth label as target
x_lem_val, y_lem_val = val_NB_lem['text'], val_NB_lem['ground_truth']

In [None]:
# Stemmed training split: text as model input, ground-truth label as target
x_stem, y_stem = train_NB_stem['text'], train_NB_stem['ground_truth']

In [None]:
# Stemmed validation split: text as model input, ground-truth label as target
x_stem_val, y_stem_val = val_NB_stem['text'], val_NB_stem['ground_truth']

### TF-IDF vectorizer

Parameters:
- ngram_range: (1,1) or (1,2)
- max_features: 5000, 10 000, 15 000, 20 000, 25 000
- alpha: 0.1, 0.5 and 1

The best parameters are ngram_range (1,1), max features 10,000 for lemmatized and 25,000 for stemmed set and alpha 1. On the lemmatized set the model performs marginally better.

In [None]:
# TF-IDF features, lemmatized text
best_parameter_search(
    TfidfVectorizer(),
    X_train=x_lem, y_train=y_lem,
    X_val=x_lem_val, y_val=y_lem_val,
)

In [None]:
# TF-IDF features, stemmed text
best_parameter_search(
    TfidfVectorizer(),
    X_train=x_stem, y_train=y_stem,
    X_val=x_stem_val, y_val=y_stem_val,
)

### Count Vectorizer

Parameters:
- ngram_range: (1,1) or (1,2)
- max_features: 5000, 10 000, 15 000, 20 000, 25 000
- alpha: 0.1, 0.5 and 1

The best parameters are ngram_range (1,1), max features 10,000 and alpha 1. On the stemmed set the model performs marginally better.

In [None]:
# Raw term counts, lemmatized text
best_parameter_search(
    CountVectorizer(),
    X_train=x_lem, y_train=y_lem,
    X_val=x_lem_val, y_val=y_lem_val,
)

In [None]:
# Raw term counts, stemmed text
best_parameter_search(
    CountVectorizer(),
    X_train=x_stem, y_train=y_stem,
    X_val=x_stem_val, y_val=y_stem_val,
)

### Hash Vectorizer




Parameters:
- ngram_range: (1,1) or (1,2)
- n_features: 2^10, 2^12, 2^14, 2^15, 2^16
- alpha: 0.1, 0.5 and 1

The best parameters are ngram_range (1,1), n_features 2^15 for the lemmatized and 2^16 for the stemmed set, and alpha 1 for both sets. On the lemmatized set the model performs marginally better.

In [None]:
def best_parameter_search_hash(vectorizer, X_train, y_train, X_val, y_val):
    """Tune a hashing-vectorizer + MultinomialNB pipeline and print validation metrics.

    Runs a 5-fold, accuracy-scored grid search over the n-gram range, the
    number of hash features and the Naive Bayes smoothing ``alpha`` using the
    training data only, then predicts the validation split with the best
    pipeline and prints accuracy, the classification report, the best
    parameters and the best cross-validation score. Returns nothing.
    """
    # Search space: n-gram span, hash-table width, smoothing strength
    search_space = {
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__n_features': [2**10, 2**12, 2**14, 2**15, 2**16],
        'classifier__alpha': [0.1, 0.5, 1.0]
    }

    steps = [('vectorizer', vectorizer), ('classifier', MultinomialNB())]
    search = GridSearchCV(
        Pipeline(steps),
        search_space,
        cv=5,
        scoring='accuracy',
        verbose=2,
        n_jobs=-1,
    )

    # The cross-validated search never sees the validation split
    search.fit(X_train, y_train)

    # Score the winning pipeline once on the validation split
    val_predictions = search.best_estimator_.predict(X_val)

    print("Validation Accuracy:", accuracy_score(y_val, val_predictions))
    print("\nClassification Report:\n", classification_report(y_val, val_predictions))
    print("Best Parameters:", search.best_params_)
    print("Best Cross-Validation Score: {:.4f}".format(search.best_score_))

In [None]:
# Hashed features, lemmatized text; alternate_sign=False keeps the hashed features
# non-negative, which MultinomialNB requires
best_parameter_search_hash(
    HashingVectorizer(alternate_sign=False),
    X_train=x_lem, y_train=y_lem,
    X_val=x_lem_val, y_val=y_lem_val,
)

In [None]:
# Hashed features, stemmed text; alternate_sign=False keeps the hashed features
# non-negative, which MultinomialNB requires
best_parameter_search_hash(
    HashingVectorizer(alternate_sign=False),
    X_train=x_stem, y_train=y_stem,
    X_val=x_stem_val, y_val=y_stem_val,
)

In [None]:
# Train and save the best model.
# Fit a TF-IDF vectorizer (unigrams only, vocabulary capped at 10,000 terms)
# on the lemmatized training text and transform that text in the same step.
# NOTE: despite its name, X_train_count holds TF-IDF weights, not raw counts;
# the name is kept because the next cell refers to it.

vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=10000)
X_train_count = vectorizer.fit_transform(x_lem)

In [None]:
# Fit the final Naive Bayes classifier (smoothing alpha = 1) on the TF-IDF
# features of the lemmatized training set
nb_classifier = MultinomialNB(alpha = 1)
nb_classifier.fit(X_train_count, y_lem)

In [None]:
import joblib

# Save the fitted TF-IDF vectorizer; it must be loaded together with the
# classifier so new text is mapped to the same feature columns
joblib.dump(vectorizer, "/content/drive/My Drive/BA THESIS/trained_nb_model/tfidf_vectorizer.pkl")

# Save the trained Naive Bayes model
joblib.dump(nb_classifier, "/content/drive/My Drive/BA THESIS/trained_nb_model/naive_bayes_model.pkl")

The differences between vectorizers and between lemmatizing and stemming are marginal. The highest validation accuracy was the determining factor for the chosen model. This is the **TF-IDF vectorizer with ngram_range (1,1) and 10 000 features** (alpha 1) on the **lemmatized** set, i.e. the configuration trained and saved in the cells above.

# BERT training

In [None]:
#!pip3 install torch torchaudio torchvision torchtext torchdata

In [None]:
#!pip install --upgrade tensorflow transformers


In [None]:
#!pip install tf-keras

In [None]:
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
from sklearn.metrics import f1_score


Labels are encoded as 0, 1 and 2. Then the tokenizer function is defined: it pads every text to the longest input in the set and truncates anything beyond 200 tokens.

In [None]:
# Map the ground-truth classes to integer ids 0, 1, 2.
# The mapping is learned from the training labels only and then applied
# unchanged to both splits.

label_encoder = LabelEncoder().fit(train_BERT["ground_truth"])
train_BERT["label"] = label_encoder.transform(train_BERT["ground_truth"])
val_BERT["label"] = label_encoder.transform(val_BERT["ground_truth"])

In [None]:
# Load the pre-trained tokenizer for the "bert-base-uncased" checkpoint
# (the same checkpoint name the models below are loaded from)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def tokenize_function(text, max_length=200):
    """Tokenize text with the module-level BERT ``tokenizer``.

    Parameters
    ----------
    text : str or list of str
        The text(s) to encode in a single call.
    max_length : int, default 200
        Upper bound on the number of tokens per text; longer inputs are
        truncated. The default was chosen because the inputs are headlines.

    Returns
    -------
    The tokenizer output as TensorFlow tensors (``return_tensors="tf"``);
    the cells below read its ``"input_ids"`` and ``"attention_mask"`` entries.
    """
    return tokenizer(
        text,
        truncation=True,          # cut inputs that exceed max_length
        padding="longest",        # pad to the longest text passed in this call
        max_length=max_length,
        return_tensors="tf"
    )

# Tokenize each split in one call. Because padding is "longest" per call,
# the train and validation tensors may end up with different sequence lengths.
tokenized_train = tokenize_function(train_BERT["text"].tolist())
tokenized_val = tokenize_function(val_BERT["text"].tolist())

In [None]:


def _to_tf_dataset(encodings, label_tensor):
    """Pair the BERT model inputs of one split with its labels, one example per element."""
    model_inputs = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"]
    }
    return tf.data.Dataset.from_tensor_slices((model_inputs, label_tensor))


# Integer class ids as TensorFlow tensors
labels_train = tf.convert_to_tensor(train_BERT["label"].tolist())
labels_val = tf.convert_to_tensor(val_BERT["label"].tolist())

# Unbatched (features, label) datasets; batching is applied later per experiment
dataset_train = _to_tf_dataset(tokenized_train, labels_train)
dataset_val = _to_tf_dataset(tokenized_val, labels_val)




The hyperparameter optimization is split into two parts instead of a traditional grid search. This is due to the computing power that would be required if all combinations of the three hyperparameters were searched.

In the first round the best combination of learning rate and batch size is searched for. The considered batch sizes are 16 and 32; the considered learning rates are 2e-5, 3e-5 and 5e-5.

In [None]:
# Round-one search space: every learning rate is combined with every batch size
learning_rates = [2e-5, 3e-5, 5e-5]
batch_sizes = [16, 32]


In [None]:
import gc

# One fresh BERT model is fine-tuned for a single epoch per
# (learning rate, batch size) pair; validation accuracy and macro F1 are recorded.
results = []

for lr in learning_rates:
    for batch_size in batch_sizes:
        # `batch_sizes` is rebound to a NumPy array by the plotting cell further
        # down; normalise so tf.data and the saved CSV always get plain ints.
        batch_size = int(batch_size)

        print(f"Training with lr: {lr}, batch_size: {batch_size}")

        # Create batched datasets for the current batch size
        train_dataset = dataset_train.shuffle(buffer_size=len(train_BERT)).batch(batch_size)
        val_dataset = dataset_val.batch(batch_size)

        # Load and compile a fresh pre-trained model. The optimizer, loss and
        # the History returned by fit() are deliberately not bound to notebook
        # globals: they reference the model and would keep it alive while the
        # next configuration trains.
        model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=["accuracy"],
        )

        # Train the model
        model.fit(train_dataset, validation_data=val_dataset, epochs=1)

        # Evaluate the model on the validation dataset ([loss, accuracy])
        eval_results = model.evaluate(val_dataset, verbose=0)
        val_accuracy = eval_results[1]

        # Compute the F1 score manually: collect predictions and true labels
        all_preds = []
        all_labels = []
        for inputs, labels in val_dataset:
            logits = model(inputs, training=False).logits
            all_preds.extend(tf.argmax(logits, axis=-1).numpy())
            all_labels.extend(labels.numpy())

        # Calculate F1 score (macro-average)
        val_f1 = f1_score(all_labels, all_preds, average='macro')

        # Save the hyperparameters and metrics
        results.append({
            "learning_rate": lr,
            "batch_size": batch_size,
            "val_accuracy": val_accuracy,
            "val_f1": val_f1
        })
        print(f"Finished: Acc: {val_accuracy:.4f}, F1: {val_f1:.4f}\n")

        # Release this model before the next configuration is loaded
        del model
        tf.keras.backend.clear_session()
        gc.collect()

# Print all results
for res in results:
    print(res)

In [None]:
# Persist the learning-rate / batch-size results (one row per configuration)
# so the expensive search does not have to be re-run for plotting
df = pd.DataFrame(results)
df.to_csv("/content/drive/My Drive/BA THESIS/graphs/training/grid_search_results.csv", index=False)

In [None]:
# Load the saved learning-rate / batch-size search results

file_path_results = "/content/drive/My Drive/BA THESIS/graphs/training/grid_search_results.csv"


# Load CSV. Note: this rebinds `results` from the list built by the search
# loop to a DataFrame.
results = pd.read_csv(file_path_results)
results

In [None]:
import matplotlib.pyplot as plt

In [None]:
batch_sizes = results['batch_size'].unique()

# One validation-accuracy curve per batch size, learning rate on the x-axis
fig, ax = plt.subplots(figsize=(8, 6))
for bs in batch_sizes:
    # Rows of this batch size, ordered by learning rate so the line runs left to right
    curve = results.loc[results['batch_size'] == bs].sort_values('learning_rate')
    ax.plot(curve['learning_rate'], curve['val_accuracy'], marker='o', label=f'Batch Size {bs}')

ax.set_xlabel('Learning Rate')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Validation Accuracy vs Learning Rate for Different Batch Sizes')
ax.legend()
ax.grid(True)
fig.savefig("/content/drive/My Drive/BA THESIS/graphs/training/validation_accuracy_vs_learning_rate.png")
plt.show()

In [None]:
# One macro-F1 curve per batch size, learning rate on the x-axis
fig, ax = plt.subplots(figsize=(8, 6))
for bs in batch_sizes:
    # Rows of this batch size, ordered by learning rate so the line runs left to right
    curve = results.loc[results['batch_size'] == bs].sort_values('learning_rate')
    ax.plot(curve['learning_rate'], curve['val_f1'], marker='o', label=f'Batch Size {bs}')

ax.set_xlabel('Learning Rate')
ax.set_ylabel('F1 score')
ax.set_title('F1 score vs Learning Rate for Different Batch Sizes')
ax.legend()
ax.grid(True)
fig.savefig('/content/drive/My Drive/BA THESIS/graphs/training/f1_vs_learning_rate.png')
plt.show()

We can see that overall smaller batch size yields better results, and the learning rate of 3e-5 yields the best results. These parameters we will keep fixed and train over multiple epochs.

Now, we fix the batch size at 16 and the learning rate at 3e-5 and optimize for the number of epochs. The search range is 1 to 5 epochs.

In [None]:

import gc

# Fixed hyperparameters (best batch size / learning rate from the first round)
fixed_batch_size = 16
fixed_learning_rate = 3e-5
epoch_list = [1, 2, 3, 4, 5]

results = []

train_dataset = dataset_train.shuffle(buffer_size=len(train_BERT)).batch(fixed_batch_size)
val_dataset = dataset_val.batch(fixed_batch_size)

# NOTE(review): every entry of epoch_list trains a new model from the
# pre-trained checkpoint, so the sweep costs 1+2+3+4+5 = 15 training epochs.
for num_epochs in epoch_list:
    print(f"Training for {num_epochs} epochs with batch size {fixed_batch_size} and learning rate {fixed_learning_rate}")

    # Load and compile a fresh pre-trained model
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=fixed_learning_rate),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=["accuracy"])

    # Train the model. The returned History is deliberately not kept in a
    # notebook global: it references the model and would keep it alive while
    # the next run trains.
    model.fit(train_dataset, validation_data=val_dataset, epochs=num_epochs, verbose=1)

    # Evaluate the model on the validation dataset ([loss, accuracy])
    eval_results = model.evaluate(val_dataset, verbose=0)
    val_accuracy = eval_results[1]

    # Compute the macro-averaged F1 score manually from the predicted classes
    all_preds = []
    all_labels = []
    for inputs, labels in val_dataset:
        logits = model(inputs, training=False).logits
        all_preds.extend(tf.argmax(logits, axis=-1).numpy())
        all_labels.extend(labels.numpy())
    val_f1 = f1_score(all_labels, all_preds, average='macro')

    # Save the metrics and hyperparameter info in the results list
    results.append({
        "epochs": num_epochs,
        "batch_size": fixed_batch_size,
        "learning_rate": fixed_learning_rate,
        "val_accuracy": val_accuracy,
        "val_f1": val_f1
    })

    print(f"Finished {num_epochs} epochs: Accuracy = {val_accuracy:.4f}, F1 = {val_f1:.4f}\n")

    # Release the model and Keras' global state before the next run
    del model
    tf.keras.backend.clear_session()
    gc.collect()

# Print results
for res in results:
    print(res)


In [None]:
# Persist the epoch-sweep results (one row per epoch count) for plotting
df = pd.DataFrame(results)
df.to_csv("/content/drive/My Drive/BA THESIS/graphs/training/epochs_results.csv", index=False)

In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the saved epoch-sweep results

file_path_epochs = "/content/drive/My Drive/BA THESIS/graphs/training/epochs_results.csv"


# Load CSV and display it as the cell output
epochs = pd.read_csv(file_path_epochs)
epochs

In [None]:
# Validation accuracy as a function of the number of training epochs
fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(epochs['epochs'], epochs['val_accuracy'], marker='o')

ax.set_xlabel('Number of Epochs')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Validation Accuracy for Different Number of Epochs')
ax.grid(True)
fig.savefig('/content/drive/My Drive/BA THESIS/graphs/training/validation_accuracy_vs_epochs.png')
plt.show()

In [None]:
# Macro F1 as a function of the number of training epochs
fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(epochs['epochs'], epochs['val_f1'], marker='o')

ax.set_xlabel('Number of Epochs')
ax.set_ylabel('F1 score')
ax.set_title('F1 score for Different Number of Epochs')
ax.grid(True)
fig.savefig('/content/drive/My Drive/BA THESIS/graphs/training/f1_vs_epochs.png')
plt.show()

From the graphs we can see that the last significant improvement happens when we increase the number of epochs from 1 to 2. As the computational cost grows linearly with the number of epochs, an additional epoch is only worthwhile when it yields a significant increase in accuracy or F1 score. Based on the graphs and the table, we choose the model that has been trained for only 2 epochs.

### Train and save final model

Now we train and save the model with the final hyperparameters that are:
- batch size: 16
- learning rate: 3e-5
- epochs: 2

In [None]:
# Shuffle and batch the dataset
batch_size = 16
dataset_train = dataset_train.shuffle(buffer_size=len(train_BERT)).batch(batch_size)
dataset_val = dataset_val.batch(batch_size)

In [None]:
# Positive, negative, neutral classification, num_labels=3
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Compile the model with an optimizer, loss, and metrics
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
num_epochs = 2
history = model.fit(
    dataset_train,
    validation_data=dataset_val,
    epochs=num_epochs
)

In [None]:
eval_results = model.evaluate(dataset_val)
print(f"Validation Loss: {eval_results[0]:.4f}, Validation Accuracy: {eval_results[1]:.4f}")


In [None]:
# Save the fine-tuned model to Drive
model.save_pretrained("/content/drive/My Drive/BA THESIS/trained_bert_model/bert_model")

# Save the tokenizer (important for preprocessing when using the model later)
tokenizer.save_pretrained("/content/drive/My Drive/BA THESIS/trained_bert_model/bert_tokenizer")