In [None]:
# uninstall wandb to fine-tune on Kaggle without token
%pip uninstall --yes wandb

In [None]:
# install and import all necessary libraries
%pip install datasets
%pip install evaluate

import datasets
import evaluate
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

In [None]:
# Pick the CUDA device when torch reports one is usable, otherwise fall back to the CPU,
# and print the choice. None of the later cells shown in this notebook reference DEVICE,
# so in this view it is informational only.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device Available: {DEVICE}')

In [None]:
# Read both CSV splits into pandas DataFrames. header=None keeps the first
# row as data, so the columns are labelled by integer position.
training_data = pd.read_csv(
    "/kaggle/input/huk-dataset/training.csv",
    header=None,
)
validation_data = pd.read_csv(
    "/kaggle/input/huk-dataset/validation.csv",
    header=None,
)

In [None]:
# Replace the positional column labels 0..3 with descriptive names,
# modifying both frames in place.
training_data.rename(
    columns={0: 'ID', 1: 'product', 2: 'labels', 3: 'text'},
    inplace=True,
)
validation_data.rename(
    columns={0: 'ID', 1: 'product', 2: 'labels', 3: 'text'},
    inplace=True,
)

In [None]:
# keep only the columns used for model training and encode the string labels as integer class ids
LABEL2ID = {'Negative': 0, 'Positive': 1, 'Neutral': 2, 'Irrelevant': 3}


def _prepare_split(frame):
    """Return the 'text' and integer-encoded 'labels' columns of `frame`.

    Rows with a missing text or label are dropped first. Each remaining label is
    looked up in LABEL2ID.

    Raises:
        ValueError: if a label is not a key of LABEL2ID. Series.map would otherwise
            turn it into NaN and the labels column would silently become float.
    """
    subset = frame.loc[:, ['text', 'labels']].dropna()
    encoded = subset['labels'].map(LABEL2ID)
    unknown = subset.loc[encoded.isna(), 'labels'].unique().tolist()
    if unknown:
        raise ValueError(f"Unmapped label values: {unknown}")
    # assign() builds a new frame instead of writing into the dropna() result;
    # the explicit cast keeps the dtype integer even for an empty split.
    return subset.assign(labels=encoded.astype('int64'))


training_data_final = _prepare_split(training_data)
validation_data_final = _prepare_split(validation_data)

In [None]:
# Load the pretrained tokenizer and wrap both prepared pandas frames as
# `datasets.Dataset` objects (the validation split is held in `test_dataset`).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_dataset = datasets.Dataset.from_pandas(training_data_final)
test_dataset = datasets.Dataset.from_pandas(validation_data_final)

def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the module-level `tokenizer`.

    The tokenizer is called with truncation enabled and padding="max_length",
    and whatever it returns is passed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Run tokenize_function over both splits, batch by batch.
tokenized_train_data = train_dataset.map(
    tokenize_function,
    batched=True,
)
tokenized_test_data = test_dataset.map(
    tokenize_function,
    batched=True,
)

# Metric objects consumed by compute_metrics.
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair. The predicted class for each row is
            the argmax of the logits over the last axis.

    Returns:
        dict with two scalar entries, "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # compute() returns a dict such as {"accuracy": 0.9}. Pull the scalar out so the
    # result is flat rather than {"accuracy": {"accuracy": 0.9}, "f1": {"f1": ...}}.
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1_score = f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": acc, "f1": f1_score}

# Pretrained checkpoint loaded for sequence classification with four output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4,
)

# Training configuration; eval_strategy="epoch" evaluates the model after every epoch.
training_args = TrainingArguments(
    output_dir="/kaggle/output",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
)

# Bring together the model, the training configuration, both tokenized splits
# and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test_data,
    train_dataset=tokenized_train_data,
)

# Fine-tune, then run one more evaluation pass on the evaluation split.
trainer.train()
trainer.evaluate()

# Write the tokenizer and the fine-tuned model into the same directory.
tokenizer.save_pretrained("/kaggle/working/HUK_model_v1")
model.save_pretrained("/kaggle/working/HUK_model_v1")