In [1]:
# Attach Google Drive to the Colab runtime so files under it can be read and written.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler

In [3]:
import os

# Folder on the mounted Drive that every relative path in this notebook is resolved against.
new_directory = '/content/drive/MyDrive/Colab Notebooks'

# Make it the working directory.
os.chdir(new_directory)

# Echo the working directory so the switch can be checked in the cell output.
print(f"Current working directory: {os.getcwd()}")

Current working directory: /content/drive/MyDrive/Colab Notebooks


In [4]:
# Read the reviews CSV (path relative to the working directory) into a DataFrame.
reviews_csv_path = 'Datasets/data_amazon_product_reviews_video_games.csv'
df = pd.read_csv(reviews_csv_path)

In [5]:
# Remove the columns that are not used later in the notebook (in place).
unused_columns = [
    'Unnamed: 0', 'reviewerID', 'asin', 'reviewerName',
    'helpful', 'unixReviewTime', 'reviewTime',
]
df.drop(columns=unused_columns, inplace=True)

In [6]:
# Drop, in place, every row that has a missing value in any remaining column.
# This keeps the later string concatenation of 'reviewText' and 'summary' free of NaN.
df.dropna(inplace= True)
# Optional check (disabled): df.isna().sum()

In [7]:
# Store the 'overall' rating column as 64-bit integers.
overall_as_int = df['overall'].astype('int64')
df['overall'] = overall_as_int
# Optional check (disabled): df.info()

In [8]:
# Model input: the review body and its summary joined into one string per row.
df['new_text']= df['reviewText'] + ' ' + df['summary']

texts= df['new_text'].tolist()

# The classifier is created with num_labels=5, so class ids must lie in 0..4.
# If the 'overall' column goes up to 5 it cannot be used as-is (the loss rejects
# an out-of-range target), so it is shifted down by one.
# NOTE(review): presumably 'overall' is a 1..5 star rating — confirm against the CSV.
num_classes = 5
ratings = df['overall']
if ratings.max() >= num_classes:
    ratings = ratings - 1

# Fail early, with a readable message, instead of deep inside the training loop.
if not ratings.between(0, num_classes - 1).all():
    raise ValueError(
        f"Labels must lie in 0..{num_classes - 1} after adjustment; "
        f"found values {sorted(ratings.unique().tolist())}"
    )

labels= ratings.tolist()

In [10]:
# Training hyperparameters
num_epochs = 10            # passes over the training set
batch_size = 128           # examples per DataLoader batch
learning_rate = 6.5e-4     # initial learning rate handed to the optimizer
weight_decay = 5e-5        # weight-decay coefficient handed to the optimizer

In [11]:
# Hold out 20% of the data; the remaining 80% is the training set.
train_texts, heldout_texts, train_labels, heldout_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Split the held-out part in half: 50% dev, 50% test (10% / 10% of the full data).
# The fixed random_state keeps both splits reproducible across runs.
dev_texts, test_texts, dev_labels, test_labels = train_test_split(
    heldout_texts, heldout_labels, test_size=0.5, random_state=42
)


In [12]:
# Fetch the tokenizer that belongs to the pre-trained checkpoint used in this notebook.
tokenizer_checkpoint = "LiYuan/amazon-review-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
def _encode_texts(text_list):
    """Tokenize a list of strings into PyTorch tensors with padding and truncation enabled."""
    return tokenizer(text_list, padding=True, truncation=True, return_tensors='pt')


# Encode each split separately.
tokenized_train_texts = _encode_texts(train_texts)
tokenized_dev_texts = _encode_texts(dev_texts)
tokenized_test_texts = _encode_texts(test_texts)

In [14]:
# Turn the three label lists into tensors, keeping the same variable names.
train_labels, dev_labels, test_labels = (
    torch.tensor(split_labels)
    for split_labels in (train_labels, dev_labels, test_labels)
)

In [15]:
def _to_tensor_dataset(encoded, label_tensor):
    """Bundle input ids, attention mask and labels into one TensorDataset (in that order)."""
    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], label_tensor)


# One dataset per split; each item is an (input_ids, attention_mask, label) triple.
train_dataset = _to_tensor_dataset(tokenized_train_texts, train_labels)
dev_dataset = _to_tensor_dataset(tokenized_dev_texts, dev_labels)
test_dataset = _to_tensor_dataset(tokenized_test_texts, test_labels)

In [16]:
# Training batches are shuffled; dev and test batches keep dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

dev_dataloader, test_dataloader = (
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (dev_dataset, test_dataset)
)

In [17]:
# Load the pre-trained sequence-classification model with a 5-class output head.
model_checkpoint = "LiYuan/amazon-review-sentiment-analysis"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=5)

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

In [18]:
# Freeze the base model: none of its parameters will receive gradients.
for frozen_param in model.base_model.parameters():
    frozen_param.requires_grad_(False)

In [20]:
# Optimizer over the classification head only (model.classifier).
# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated
# in favour of the PyTorch implementation. eps=1e-6 keeps the value that the
# transformers class used by default, so the hyperparameters stay the same.
optimizer = optim.AdamW(
    model.classifier.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
    eps=1e-6,
)



In [21]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Cross-entropy between the logits and the integer class labels.
criterion = nn.CrossEntropyLoss()

In [None]:
# Total number of optimizer steps: one per training batch, for every epoch.
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

# Linear learning-rate schedule without warmup, spanning the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Progress bar with one tick per training step; it is advanced manually in the training loop.
progress_bar = tqdm(range(num_training_steps))

In [None]:
# Train the model. After every epoch it is evaluated on the dev set, and a
# snapshot of the weights with the best dev accuracy so far is kept.
best_dev_accuracy = 0.0
best_model_state_dict = None
Validation_results= []   # one [accuracy, f1, average_loss] entry per epoch

for epoch in range(num_epochs):
    # ---- Training ----
    model.train()
    for input_ids, attention_mask, batch_labels in train_dataloader:
        input_ids= input_ids.to(device)
        attention_mask= attention_mask.to(device)
        batch_labels= batch_labels.to(device)

        optimizer.zero_grad()
        # The loss is read from the model output (labels are passed in).
        outputs = model(input_ids= input_ids, attention_mask= attention_mask, labels= batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()       # advance the learning-rate schedule once per batch
        progress_bar.update(1)    # one tick per training step

    # ---- Validation ----
    model.eval()
    y_true = []
    y_pred = []
    loss_epoch= []   # per-batch dev losses as plain Python floats
    with torch.no_grad():
        for input_ids, attention_mask, batch_labels in dev_dataloader:
            input_ids= input_ids.to(device)
            attention_mask= attention_mask.to(device)
            batch_labels= batch_labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)

            # Collect true and predicted labels for the epoch-level metrics
            y_true.extend(batch_labels.tolist())
            y_pred.extend(predicted.tolist())

            # .item() copies the scalar to the host, so no tensors are kept alive on the device
            loss_epoch.append(criterion(logits, batch_labels).item())

    # Accuracy and weighted F1 over the whole dev set
    f1 = f1_score(y_true, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)

    # Unweighted mean of the per-batch losses
    average_loss = float(np.mean(loss_epoch))
    print(f'epoch No. : {epoch}, Devset Accuracy : {round(accuracy,5)}, Devset f1_score : {round(f1,5)}, Average loss: {round(average_loss,5)}')

    Validation_results.append([accuracy, f1, average_loss])

    if accuracy > best_dev_accuracy:
        best_dev_accuracy = accuracy
        # model.state_dict() returns references to the live parameters, which
        # later epochs keep updating. Copy every tensor so the snapshot really
        # holds this epoch's weights (stored on the CPU to avoid using device memory).
        best_model_state_dict = {
            name: tensor.detach().to("cpu", copy=True)
            for name, tensor in model.state_dict().items()
        }

epoch No. : 0, Devset Accuracy : 0.67954, Devset f1_score : 0.66401, Average loss: 0.78238
epoch No. : 1, Devset Accuracy : 0.67634, Devset f1_score : 0.66868, Average loss: 0.79641
epoch No. : 2, Devset Accuracy : 0.67794, Devset f1_score : 0.6656, Average loss: 0.78619
epoch No. : 3, Devset Accuracy : 0.68194, Devset f1_score : 0.66515, Average loss: 0.77083
epoch No. : 4, Devset Accuracy : 0.68014, Devset f1_score : 0.67075, Average loss: 0.77371
epoch No. : 5, Devset Accuracy : 0.68094, Devset f1_score : 0.66612, Average loss: 0.77281
epoch No. : 6, Devset Accuracy : 0.68454, Devset f1_score : 0.67176, Average loss: 0.77072


In [None]:
# Save the per-epoch dev-set results ([accuracy, f1, average_loss] per row) to CSV.
data = {
    'Validation_results': Validation_results,
}
# Use a dedicated name so the dataset DataFrame `df` is not overwritten.
validation_results_df = pd.DataFrame(data)

# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('Outputs', exist_ok=True)
validation_results_df.to_csv('Outputs/Main-8-LiYuan-One layer-Not OPT_Validation_results.csv', index= False)

In [None]:
# Restore the best weights recorded during training and write them to disk.
# Nothing is loaded or saved if no snapshot was recorded.
if best_model_state_dict is not None:
    model.load_state_dict(best_model_state_dict)

    # File path for the checkpoint (relative to the working directory)
    save_path = 'Saved Models/Main-8-LiYuan-One layer-Not OPT.pth'

    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # NOTE(review): the tokenizer is pickled as a Python object inside the
    # checkpoint, so loading it needs full unpickling of a trusted file and a
    # compatible transformers version. Consider tokenizer.save_pretrained(...)
    # instead. The key is kept so the checkpoint layout stays unchanged.
    torch.save({
        'model_state_dict': best_model_state_dict,
        'tokenizer': tokenizer
    }, save_path)

In [None]:
# Score the model on the test split: accuracy, weighted F1 and mean batch loss.
model.eval()
y_true_test, y_pred_test, loss_epoch = [], [], []

with torch.no_grad():
    for input_ids, attention_mask, batch_labels in test_dataloader:
        # Move the whole batch to the model's device
        input_ids, attention_mask, batch_labels = (
            batch_tensor.to(device)
            for batch_tensor in (input_ids, attention_mask, batch_labels)
        )

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        predicted = torch.max(logits, 1).indices

        # Keep labels and predictions for the dataset-level metrics
        y_true_test += batch_labels.tolist()
        y_pred_test += predicted.tolist()

        # Loss of this batch
        loss_epoch.append(criterion(logits, batch_labels))

# Dataset-level metrics
test_accuracy = accuracy_score(y_true_test, y_pred_test)
test_f1 = f1_score(y_true_test, y_pred_test, average='weighted')

# Mean of the per-batch losses, after moving each one to the host as a NumPy value
loss_epoch_np = [batch_loss.cpu().detach().numpy() for batch_loss in loss_epoch]
average_loss = np.mean(loss_epoch_np)

print(f"Testset accuracy: {round(test_accuracy,5)} , Testset F1 score: {round(test_f1,5)}, Average loss: {round(average_loss.tolist(),5)}")
Test_results = [test_accuracy, test_f1, average_loss]

In [None]:
# Save the test-set results (accuracy, F1, average loss — one value per row) to CSV.
data = {
    'Test_results': Test_results
}
# Use a dedicated name so the dataset DataFrame `df` is not overwritten.
test_results_df = pd.DataFrame(data)

# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('Outputs', exist_ok=True)
test_results_df.to_csv('Outputs/Main-8-LiYuan-One layer-Not OPT_Test_results.csv', index= False)