In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.metrics import accuracy_score
import time

# Load the training and testing data
train_df = pd.read_csv('/kaggle/input/ita-assignment-01/train.csv')
test_df = pd.read_csv('/kaggle/input/ita-assignment-01/test.csv')

# Encode the sentiment labels as integers
label_dict = {'negative': 0, 'positive': 1}
for split_name, split_df in (('train', train_df), ('test', test_df)):
    encoded = split_df['sentiment'].map(label_dict)
    # Series.map turns every value that is not a key of label_dict into NaN.
    # Fail here, naming the offending values, instead of letting NaN labels
    # reach the model and surface as an unrelated error later on.
    unmapped = split_df.loc[encoded.isna(), 'sentiment'].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Unexpected sentiment values in {split_name} data: {unmapped!r}; "
            f"expected one of {sorted(label_dict)}"
        )
    # split_df is the same object as train_df / test_df, so this updates the
    # original frame in place, exactly as the direct assignment did.
    split_df['sentiment'] = encoded.astype('int64')

# Convert the DataFrame to Hugging Face's Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Tokenization
# Load the tokenizer published under the same checkpoint name that the
# classifier is later loaded from, so both use the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

def tokenize_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: Batch mapping that provides a "review" entry (the texts)
            and a "sentiment" entry (the already-encoded labels).

    Returns:
        The tokenizer's output for the reviews, with the sentiment values
        stored under the additional "labels" key.
    """
    # Every review is truncated and padded to the same fixed length (512).
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    encoded["labels"] = examples["sentiment"]
    return encoded

# Apply the tokenizer to both splits; batched=True hands tokenize_function
# a batch of rows per call instead of a single row.
train_dataset = train_dataset.map(
    function=tokenize_function,
    batched=True,
)
test_dataset = test_dataset.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
# Model Initialization
# Load the pretrained checkpoint with a sequence-classification head sized
# for two labels (the negative / positive encoding used for the data).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

In [None]:
# Training Arguments
# Three epochs, batches of 8 per device, 500 warmup steps and a weight decay
# of 0.01; results go to ./results and logs to ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Fine-tuning the Model
# Only a training set is supplied here; the test set is scored separately
# through trainer.predict.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Start the clock immediately before train() so the reported "Training time"
# covers the training run only, not the Trainer construction above.
start_time = time.time()
trainer.train()
end_time = time.time()

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Run the fine-tuned model over the test split. The predicted label for each
# row is the index of the largest value along axis 1 of `.predictions`
# (presumably the per-class logits -- confirm against the model output).
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
pred_labels = np.argmax(logits, axis=1)

# Calculate accuracy using scikit-learn, comparing against the integer-encoded
# sentiment column of the test DataFrame.
true_labels = test_df['sentiment']
accuracy = accuracy_score(true_labels, pred_labels)
print(f'Final Accuracy: {accuracy}')
print(f"Training time: {end_time - start_time} seconds")