In [None]:
# My custom setup script
import os

os.environ['HF_HUB_DISABLE_IMPLICIT_TOKEN'] = '1'
os.environ['WANDB_DISABLED'] = 'true'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

print("Installing transformers version so no 404 error occurs...")
!pip uninstall -y peft -q
!pip install -q transformers==4.40.1 accelerate==0.27.0
print("[SUCCESS]: Environment variables have been set")

In [None]:
import pandas as pd
import numpy as np
import torch
import time
from torch.utils.data import Dataset 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns

# Reached only if none of the imports above raised.
print("[SUCCESS]: Libraries were imported")
    

In [None]:
import torch

# Report which CUDA device (if any) this session can use.
print("[CHECKING]: GPU")

if not torch.cuda.is_available():
    print("[ERROR]: GPU was not detected. Please enable it to --> GPU T4 x 2")
else:
    # Device index 0 is queried for both the name and the memory size.
    gpuName = torch.cuda.get_device_name(0)
    print(f"[SUCCESS]: Available GPU --> {gpuName}")

    # total_memory is divided by 1e9 to display it in GB.
    gpuMemoryGb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"[SUCCESS]: GPU Memory --> {gpuMemoryGb:.2f} GB")

In [None]:
# PyTorch's training system expects data in a specific format (a Dataset exposing
# __len__ and __getitem__), which differs from the pandas objects this project works
# with, so this custom class adapts one to the other.
class FakeNewsDataset(Dataset):
    """Tokenize-on-access dataset pairing each text with its label.

    Args:
        texts: Texts to classify. A pandas Series or any sequence that
            ``pd.Series`` accepts (list, tuple, array).
        labels: Labels aligned position-by-position with ``texts``; same
            accepted types. Each label must be convertible with ``int()``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        maxLength: Value passed to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, maxLength=512):
        # pd.Series(...) lets callers pass plain lists/arrays as well as Series.
        # reset_index(drop=True) gives both a fresh 0..n-1 index, so a filtered or
        # shuffled DataFrame column can be passed in without index gaps mattering.
        texts = pd.Series(texts).reset_index(drop=True)
        labels = pd.Series(labels).reset_index(drop=True)

        # Fail here rather than with an IndexError part-way through an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.maxLength = maxLength

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return it with its label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer's
            tensors flattened to 1-D) and ``"labels"`` (a ``torch.long`` scalar).
        """
        # str() guards against non-string cells (e.g. numbers) reaching the tokenizer.
        text = str(self.texts.iloc[idx])
        # int() makes a missing (NaN) label fail loudly with a ValueError at this
        # point instead of depending on how torch would cast it.
        label = int(self.labels.iloc[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.maxLength,
            return_tensors="pt"
        )

        # return_tensors="pt" yields a leading batch dimension for the single
        # text; flatten() drops it so every item is 1-D.
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

print("[SUCCESS]: Dataset class successfully defined!")

In [None]:
# Load processed data
def loadProcessedData(dataDirs=(".", "/kaggle/input/processed-data")):
    """Load the train/validation/test splits from the first directory that has all three.

    Each directory in ``dataDirs`` is tried in order. A directory is used only if
    ``train.csv``, ``validation.csv`` and ``test.csv`` can all be read from it.

    Args:
        dataDirs: Directories to search, in priority order. The default checks
            the working directory first and then the Kaggle input folder.

    Returns:
        Tuple ``(trainDf, valDf, testDf)`` of pandas DataFrames.

    Raises:
        FileNotFoundError: If no directory contains all three files.
        Exception: Any other error from ``pd.read_csv`` (for example a parse
            error) is propagated immediately instead of triggering a fallback.
    """
    splitFiles = ("train.csv", "validation.csv", "test.csv")
    searchDirs = tuple(dataDirs)

    for dataDir in searchDirs:
        try:
            trainDf, valDf, testDf = (
                pd.read_csv(os.path.join(dataDir, fileName)) for fileName in splitFiles
            )
        except FileNotFoundError:
            # Only a missing file means "try the next location"; a corrupt or
            # unreadable file should surface rather than be silently skipped.
            continue
        return trainDf, valDf, testDf

    print("[ERROR]: Data not found. Please upload the required train.csv, test.csv, and validation.csv files")
    raise FileNotFoundError(
        f"train.csv, validation.csv and test.csv were not found together in any of: {list(searchDirs)}"
    )

trainDf, valDf, testDf = loadProcessedData()

# Row counts are computed once and reused for the per-split lines and the total.
nTrain, nVal, nTest = len(trainDf), len(valDf), len(testDf)

print("[SUCCESS]: Data was successfully added")
print(f"Training samples: {nTrain:,}")
print(f"Validation samples: {nVal:,}")
print(f"Test samples: {nTest:,}")
print(f"Total: {nTrain + nVal + nTest:,}")


In [None]:
def computeMetrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        dict with keys ``"accuracy"``, ``"f1"``, ``"precision"`` and ``"recall"``.
    """
    labels = pred.label_ids

    # When a model returns extra outputs, predictions arrives as a tuple; by
    # convention the first element holds the logits. A plain array is used as-is.
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    # The highest-scoring class along the last axis is the predicted label.
    preds = logits.argmax(-1)

    # average='weighted' weights each class's score by its support.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }