## Imports

Here import all crucial packages etc.

In [1]:
# Code here
import json
import os
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.metrics import f1_score
import torch
from transformers import EvalPrediction, pipeline
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
os.environ["WANDB_DISABLED"] = "true"

## Utils

Helper functions that you will use

In [3]:
#Code here
class DisinformationDataset(torch.utils.data.Dataset):
    """
    This class wraps our tokenized data and labels so PyTorch can easily loop through them during training. It converts each input into tensors and returns them with the label — all in the format the model expects.
    """
    # When we create an instance of dataset, we pass in encodings and labels
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    # This method tells PyTorch how to get one item (input + label).
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
    
    # Returns how many examples are in the dataset (needed by DataLoader).
    def __len__(self):
        return len(self.labels)


def load_and_process_data(file_path: str, label_column: str = "label") -> pd.DataFrame:
    """
    Loads the data from a CSV file and processes the labels.
    Args:
        file_path (str): Path to the CSV file.
        label_column (str): The column name containing the labels.
        text_column (str): The column name containing the text content.
    Returns:
        pd.DataFrame: Processed dataframe with labels and text content.
    """
    try:
        data = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')  
        data[label_column] = data[label_column].apply(lambda x: 1 if "fake" in str(x).lower() else 0)
        return data
    except Exception as e:
        print(f"Error loading file {file_path}: {e}")
        raise


def save_metrics_to_json(metrics: dict, output_file_path: str):
    """
    Saves the metrics to a JSON file.
    Args:
        metrics (dict): The evaluation metrics.
        output_file_path (str): The file path to save the metrics.
    """
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    with open(output_file_path, 'w') as output_file:
        json.dump(metrics, output_file, indent=4)

In [4]:
def compute_metrics(pred=None, y_true=None, y_pred=None):
    if pred is not None:
        labels = pred.label_ids
        y_pred = pred.predictions.argmax(-1)
    elif y_true is not None and y_pred is not None:
        labels = y_true
    else:
        raise ValueError("Either `pred` or both `y_true` and `y_pred` must be provided.")

    return {
        'f1': f1_score(labels, y_pred, average='binary'),
        'precision': precision_score(labels, y_pred, average='binary'),
        'recall': recall_score(labels, y_pred, average='binary')
    }

def compute_metrics_for_trainer(pred):
    return compute_metrics(pred=pred)

# Assignment

# Fine-Tuning BERT Model to Fake News detection 

## Import Train, Validation and Test data 

Import all datasets and load and preprocess train and validation

Link to direcotry with data: https://github.com/ArkadiusDS/NLP-Labs/tree/master/data/CoAID/

In [5]:
url_train = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/train.csv'
url_valid = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/validation.csv'
url_test = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/test.csv'

!wget -O test.csv {url_test}
!wget -O train.csv {url_train}
!wget -O validation.csv {url_valid}

--2025-05-20 20:49:44--  https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 221757 (217K) [text/plain]
Saving to: ‘test.csv’


2025-05-20 20:49:45 (1,09 MB/s) - ‘test.csv’ saved [221757/221757]

--2025-05-20 20:49:45--  https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1556530 (1,5M) [text/plain]
Saving to: ‘train.csv’


2025-05-20 20:49:47 (1,11 MB/s) - ‘train.csv’ saved [15

In [6]:
train_data = load_and_process_data('train.csv')
validation_data = load_and_process_data('validation.csv')

## Load model and tokenizer

Firstly create two dicts id2label and label2id and then load model and tokenizer
Then use well-known distilled version of BERT model for faster fine-tuning: 'distilbert/distilbert-base-uncased' or any other model you wish.

In [7]:
id2label = {0: "Credible", 1: "Fake"}
label2id = {"Credible": 0, "Fake": 1}
model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)


tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize datasets and prepare it for fine-tuning

You may use DisinformationDataset class for data preparation.

In [8]:
train_encodings = tokenizer(
    train_data['content'].tolist(),
    truncation=True,
    padding=True,
    max_length=256
)

val_encodings = tokenizer(
    validation_data['content'].tolist(),
    truncation=True,
    padding=True,
    max_length=256
)

train_dataset = DisinformationDataset(train_encodings, train_data['label'].tolist())
val_dataset = DisinformationDataset(val_encodings, validation_data['label'].tolist())


## Fine-tune BERT model on at least 3 sets of hyperparameters

Check F1 score, precision and recall for each fine-tuned model and at the end choose set of hyperparameters that gives you best results. For each set of hyperparameters write down the final metrics. You need to acheive at least below result on validation dataset:

"f1": 0.91,
"recall": 0.91,
"precision": 0.91

Remember you need to achieve these minimum results on VALIDATION dataset and the best model on validation dataset will have to be used for predictions on test dataset.


In [9]:
def train_with_hyperparams(learning_rate, batch_size, epochs):
    output_dir = f"./results_lr{learning_rate}_bs{batch_size}_ep{epochs}"
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f"{output_dir}/logs",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_for_trainer
    )

    trainer.train()
    metrics = trainer.evaluate()

    print(f"Hyperparameters: LR={learning_rate}, BS={batch_size}, Epochs={epochs}")
    print(f"Validation Metrics: {metrics}\n")
    return metrics, trainer

In [10]:
results = []

#experiment 1
results.append(train_with_hyperparams(learning_rate=2e-5, batch_size=16, epochs=3))



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.104609,0.898936,0.988304,0.82439
2,No log,0.072314,0.952141,0.984375,0.921951
3,0.107100,0.085435,0.949495,0.984293,0.917073


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


Hyperparameters: LR=2e-05, BS=16, Epochs=3
Validation Metrics: {'eval_loss': 0.07231421768665314, 'eval_f1': 0.9521410579345089, 'eval_precision': 0.984375, 'eval_recall': 0.9219512195121952, 'eval_runtime': 10.6247, 'eval_samples_per_second': 94.309, 'eval_steps_per_second': 5.93, 'epoch': 3.0}



In [13]:
metrics1, trainer1 = results[0]

trainer1.save_model("model_experiment_1")
tokenizer.save_pretrained("model_experiment_1")

import json
with open("metrics_experiment_1.json", "w") as f:
    json.dump(metrics1, f, indent=4)


In [12]:
results2 = []
results2.append(train_with_hyperparams(learning_rate=2e-5, batch_size=16, epochs=2))

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.114392,0.899471,0.982659,0.829268
2,No log,0.071769,0.952618,0.97449,0.931707


  arr = np.array(obj)
  arr = np.array(obj)


Hyperparameters: LR=2e-05, BS=16, Epochs=2
Validation Metrics: {'eval_loss': 0.07176907360553741, 'eval_f1': 0.9526184538653367, 'eval_precision': 0.9744897959183674, 'eval_recall': 0.9317073170731708, 'eval_runtime': 10.3535, 'eval_samples_per_second': 96.779, 'eval_steps_per_second': 6.085, 'epoch': 2.0}



In [13]:
results3 = []
results3.append(train_with_hyperparams(learning_rate=2e-5, batch_size=8, epochs=3))

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,0.188439,0.910526,0.988571,0.843902
2,0.067000,0.124464,0.939086,0.978836,0.902439
3,0.025100,0.119405,0.949495,0.984293,0.917073


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)


Hyperparameters: LR=2e-05, BS=8, Epochs=3
Validation Metrics: {'eval_loss': 0.11940473318099976, 'eval_f1': 0.9494949494949495, 'eval_precision': 0.9842931937172775, 'eval_recall': 0.9170731707317074, 'eval_runtime': 10.2661, 'eval_samples_per_second': 97.603, 'eval_steps_per_second': 12.273, 'epoch': 3.0}



In [None]:
metrics1, trainer1 = results[0]

# Save model and tokenizer
trainer1.save_model("model_experiment_1")
tokenizer.save_pretrained("model_experiment_1")

# Save metrics to JSON
import json
with open("metrics_experiment_1.json", "w") as f:
    json.dump(metrics1, f, indent=4)

In [14]:
metrics2, trainer2 = results2[0]

# Save model and tokenizer
trainer2.save_model("model_experiment_2")
tokenizer.save_pretrained("model_experiment_2")

# Save metrics to JSON
import json
with open("metrics_experiment_2.json", "w") as f:
    json.dump(metrics2, f, indent=4)


In [15]:
metrics3, trainer3 = results3[0]

# Save model and tokenizer
trainer3.save_model("model_experiment_3")
tokenizer.save_pretrained("model_experiment_3")

# Save metrics to JSON
import json
with open("metrics_experiment_3.json", "w") as f:
    json.dump(metrics3, f, indent=4)


## Final prediction on test dataset

Take best model and hyperparameters on validation and predict on test dataset. Compute evaluation metrics f1, precision and recall.

In [14]:
test_data = load_and_process_data("test.csv")

test_encodings = tokenizer(
    test_data['content'].tolist(),
    truncation=True,
    padding=True,
    max_length=256
)

test_dataset = DisinformationDataset(test_encodings, test_data['label'].tolist())


In [15]:
test_predictions = trainer1.predict(test_dataset)

y_pred = test_predictions.predictions.argmax(-1)

y_true = test_predictions.label_ids


  arr = np.array(obj)


In [16]:
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print("📊 Final Test Set Evaluation:")
print(f"F1 Score     : {f1:.4f}")
print(f"Precision    : {precision:.4f}")
print(f"Recall       : {recall:.4f}")


📊 Final Test Set Evaluation:
F1 Score     : 0.9505
Precision    : 0.9697
Recall       : 0.9320


In [17]:
final_test_metrics = {
    "f1": f1,
    "precision": precision,
    "recall": recall
}

with open("final_test_metrics.json", "w") as f:
    json.dump(final_test_metrics, f, indent=4)


# Final file with results and description

In [18]:
import json

All keys in your dictionary have to be the same as below. The only changes you should do in terms of keys is changing names of hyperparameters, e.g. instead of key "name_of_hyperparameter_0" if you used learning rate then write "learning_rate". Other important information in the dictionary below and comments. Each value says what is expected.

Example dictionary provided under the template.

Template for your structured resulting file 

In [3]:
data = {
    # Everything in experiment_0 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_0": {
        "model": "model name",
        "hyperparameters": {
            "name_of_hyperparameter_0": "value in str or float - You need to play with at least two different hyperparameters so at least name_of_hyperparameter_0 and name_of_hyperparameter_1",
            "name_of_hyperparameter_1": "value in str or float"
        },
        "f1_score": "value in float",
        "precision": "value in float",
        "recall": "value in float",
        "description": "Unique description one of the approach - it has to be different for each experiment."
    },
    # Everything in experiment_1 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_1": {
        "model": "model name",
        "hyperparameters": {
            "name_of_hyperparameter_0": "value in str or float",
            "name_of_hyperparameter_1": "value in str or float"
        },
        "f1_score": "value in float",
        "precision": "value in float",
        "recall": "value in float",
        "description": "Unique description two of the approach - it has to be different for each experiment."
    },
    # Everything in experiment_2 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_2": {
        "model": "model name",
        "hyperparameters": {
            "name_of_hyperparameter_0": "value in str or float",
            "name_of_hyperparameter_1": "value in str or float"
        },
        "f1_score": "value in float",
        "precision": "value in float",
        "recall": "value in float",
        "description": "Unique description three of the approach - it has to be different for each experiment."
    },
    # Everything in final_prediction is related to prediction on test dataset, so metrics are computed on test dataset etc.
    "final_prediction": {
        "model": "google-bert/bert-base-uncased",
        "experiment_chosen": "experiment_0 or experiment_1 or experiment_2",
        "hyperparameters": {
            "name_of_hyperparameter_0": "value in str or float",
            "name_of_hyperparameter_1": "value in str or float"
        },
        "f1_score": "value in float",
        "precision": "value in float",
        "recall": "value in float",
        "description": "Unique description four of the final results and prediction - it has to be different and here you will describe results on test dataset."
    }
}


In [20]:
data = {
    "experiment_0": {
        "model": "distilbert/distilbert-base-uncased",
        "hyperparameters": {
            "learning_rate": 2e-5,
            "batch_size": 16,
            "num_train_epochs": 3
        },
        "f1_score": 0.9521,
        "precision": 0.9844,
        "recall": 0.9220,
        "description": "This experiment fine-tuned DistilBERT with a learning rate of 2e-5, batch size 16, and 3 epochs. It achieved the highest precision and strong recall, offering a great balance between accurate detection and low false positives."
    },
    "experiment_1": {
        "model": "distilbert/distilbert-base-uncased",
        "hyperparameters": {
            "learning_rate": 2e-5,
            "batch_size": 16,
            "num_train_epochs": 2
        },
        "f1_score": 0.9526,
        "precision": 0.9745,
        "recall": 0.9317,
        "description": "This setup used fewer epochs to reduce training time. It maintained excellent performance across all metrics, especially recall, making it a fast yet effective option for detecting fake content."
    },
    "experiment_2": {
        "model": "distilbert/distilbert-base-uncased",
        "hyperparameters": {
            "learning_rate": 2e-5,
            "batch_size": 8,
            "num_train_epochs": 3
        },
        "f1_score": 0.9495,
        "precision": 0.9843,
        "recall": 0.9171,
        "description": "This experiment lowered the batch size to reduce memory usage. It achieved the highest precision but slightly lower recall, making it ideal when minimizing false positives is critical."
    },
    "final_prediction": {
        "model": "distilbert/distilbert-base-uncased",
        "experiment_chosen": "experiment_0",
        "hyperparameters": {
            "learning_rate": 2e-5,
            "batch_size": 16,
            "num_train_epochs": 3
        },
        "f1_score": 0.9505,
        "precision": 0.9697,
        "recall": 0.9320,
        "description": "The final model was selected from Experiment 0 due to its highest validation precision. When applied to the test set, it continued to perform well, achieving an F1 score of 0.9505. This confirms the model’s strong generalization ability across unseen disinformation data."
    }
}

with open("experiments_Mandana_Goudarzi_2122279.json", "w") as f:
    json.dump(data, f, indent=4)

## Example final file

In [5]:
data = {
    "experiment_0": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "warmap_ratio": "float",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "This experiment fine-tuned the google-bert/bert-base-uncased model for binary classification using a learning rate of 1e-5 and a warmup ratio of 0.06. The model achieved an F1-score of 0.76, with a strong recall of 0.85, indicating high sensitivity to positive cases. Precision was moderate at 0.65, suggesting some trade-off in false positives. The setup demonstrates effective recall-oriented performance in identifying relevant instances."
    },
    "experiment_1": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description two of the approach - it has to be different for each experiment. Everything in experiment_1 is related to experiment on validation dataset, so metrics are computed on validation dataset etc."
    },
    "experiment_2": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "num_train_epochs": "int",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description three of the approach - it has to be different for each experiment. Everything in experiment_2 is related to experiment on validation dataset, so metrics are computed on validation dataset etc."
    },
    "final_prediction": {
        "model": "google-bert/bert-base-uncased",
        "experiment_chosen": "experiment_0",
        "hyperparameters": {
            "learning_rate": "float",
            "warmap_ratio": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description four of the final results and prediction - it has to be different and here you will describe results on test dataset. Everything in final_prediction is related to prediction on test dataset, so metrics are computed on test dataset etc."
    }
}

In [6]:
with open("experiments_Arkadiusz_Modzelewski_29580.json", "w") as f:
    json.dump(data, f, indent=4)