In [1]:
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import time

# Load the training and testing data
train_df = pd.read_csv('/kaggle/input/ita-assignment-01/train.csv')
test_df = pd.read_csv('/kaggle/input/ita-assignment-01/test.csv')

# Encode the sentiment labels as integers
label_dict = {'negative': 0, 'positive': 1}
train_df['sentiment'] = train_df['sentiment'].map(label_dict)
test_df['sentiment'] = test_df['sentiment'].map(label_dict)

# Convert the DataFrame to Hugging Face's Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


2024-02-24 11:50:39.850517: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 11:50:39.850638: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 11:50:40.013127: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Tokenization
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  

def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["review"], padding=True, truncation=True, max_length=512)
    tokenized_inputs["labels"] = examples["sentiment"]
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [3]:
# Model Initialization
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = model.config.eos_token_id  # Adding a classification head if not using a pre-trained classification model

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

start_time = time.time()

# Fine-tuning the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

end_time = time.time()


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
500,0.5885
1000,0.2658
1500,0.2165
2000,0.1977
2500,0.169
3000,0.1581
3500,0.1513
4000,0.1252
4500,0.1031
5000,0.0959




In [5]:
# Prediction and Evaluation
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

accuracy = accuracy_score(test_df['sentiment'], pred_labels)
print(f'Final Accuracy: {accuracy}')
print(f"Training time: {end_time - start_time} seconds")


Final Accuracy: 0.94175
Training time: 5895.962452888489 seconds
