## Colab

In [1]:
!pip install comet_ml
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting comet_ml
  Downloading comet_ml-3.32.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.7/462.7 KB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests-toolbelt>=0.8.0
  Downloading requests_toolbelt-0.10.1-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 KB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting semantic-version>=2.8.0
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting wurlitzer>=1.0.2
  Downloading wurlitzer-3.0.3-py3-none-any.whl (7.3 kB)
Collecting sentry-sdk>=1.1.0
  Downloading sentry_sdk-1.14.0-py2.py3-none-any.whl (178 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.9/178.9 KB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting everett[ini]>=1.0.1
  Downloading everett-3.1.0-py2.py3-none-any.

In [None]:
# Mount Google Drive under /content/gdrive; the project folder entered in the next cell lives there.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Relative paths used later (p6/*.csv, api_key.txt, ./results) resolve against this project folder.
import os
os.chdir("/content/gdrive/My Drive/SSNElab15")

# Imports

In [None]:
import comet_ml
from comet_ml import Experiment
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import matplotlib.pyplot as plt

In [None]:
import transformers
from transformers import *
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm

In [None]:
from datasets import load_dataset
import datasets

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding

In [None]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

In [None]:
# Seed handed to TrainingArguments so training runs are repeatable.
RANDOM_SEED = 42
# Share of the training CSV, in percent and taken from its end, that is held out for validation.
VALIDATION_PERCENTAGE = 10
# Checkpoint used for both the tokenizer and the classifier. Alternatives that were tried:
# PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
# PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'
# PRE_TRAINED_MODEL_NAME = "bert-base-uncased"
PRE_TRAINED_MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"

# Load and prepare data

In [None]:
# The training CSV is read twice: once whole (for the label-distribution plots) and once
# split by position into the leading (100 - VALIDATION_PERCENTAGE)% and the trailing
# VALIDATION_PERCENTAGE% of its rows.
full_train_dataset = load_dataset("csv", data_files="p6/train_data.csv")

train_split = datasets.ReadInstruction('train', to=100 - VALIDATION_PERCENTAGE, unit='%')
valid_split = datasets.ReadInstruction('train', from_=-VALIDATION_PERCENTAGE, unit='%')
train_dataset, valid_dataset = datasets.load_dataset(
    "csv",
    data_files="p6/train_data.csv",
    split=[train_split, valid_split],
)

# The test file's single column is named explicitly (presumably the CSV has no header row).
test_dataset = load_dataset("csv", data_files="p6/test_data.csv", column_names=['review'])

In [None]:
# Rename the target column from 'rating' to 'label'; the plotting, class-weight and
# training cells all read the 'label' column.
# NOTE(review): re-running this cell on already-renamed datasets presumably fails
# because 'rating' no longer exists — confirm before relying on re-execution.
full_train_dataset = full_train_dataset.rename_column('rating', 'label')
train_dataset = train_dataset.rename_column('rating', 'label')
valid_dataset = valid_dataset.rename_column('rating', 'label')

In [None]:
# Summary of the unsplit training data (a container with a single 'train' split).
print(full_train_dataset)

In [None]:
# Summary of the portion of the training CSV that is used for fitting.
print(train_dataset)

In [None]:
# Summary of the held-out validation portion of the training CSV.
print(valid_dataset)

In [None]:
# Summary of the test data; it is loaded with only a 'review' column.
print(test_dataset)

In [None]:
# Quick look at how the labels are distributed over the whole training file.
plt.hist(full_train_dataset['train']['label'])

In [None]:
# One row of three histograms, to check that the train/validation split keeps roughly
# the same label distribution as the full training data.
fig, axs = plt.subplots(1, 3, figsize=(10,4))

panels = [
    (full_train_dataset['train']['label'], 'blue', 'full train dataset'),
    (train_dataset['label'], 'green', 'train dataset'),
    (valid_dataset['label'], 'red', 'validation dataset'),
]
for panel_ax, (panel_values, panel_color, panel_title) in zip(axs, panels):
    panel_ax.hist(panel_values, color=panel_color, alpha=0.5)
    panel_ax.set_title(panel_title)

plt.show()

## Compute class weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Materialise the training labels once as an array; they are needed both for the set of
# distinct classes and as the sample vector the weights are estimated from.
train_labels = np.asarray(train_dataset['label'])

# 'balanced' gives each class the weight n_samples / (n_classes * class_count), so rarer
# labels weigh more. np.unique returns the distinct labels sorted, hence CLASS_WEIGHT[i]
# belongs to the i-th smallest label.
# `classes` is passed as an ndarray: recent scikit-learn releases validate the argument
# type and reject a plain Python list.
CLASS_WEIGHT = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)

# float32 tensor on the training device, consumed by nn.CrossEntropyLoss(weight=...).
# NOTE(review): CrossEntropyLoss looks weights up by class index 0..C-1. This lines up with
# the labels only if every class occurs in the training split and the label values are the
# class indices themselves — confirm against the data (e.g. ratings stored as 1..5).
classes_weights = torch.tensor(np.asarray(CLASS_WEIGHT, dtype='float32')).to(device)

In [None]:
# Show every distinct training label next to the weight computed for it.
for label_value, label_weight in zip(np.unique(train_dataset['label']), CLASS_WEIGHT):
    print(label_value, label_weight)

## Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

def tokenize_function(data):
    """Tokenize the review texts of one batch.

    Args:
        data: Batch mapping whose "review" entry holds the raw review text(s).

    Returns:
        The tokenizer output for those texts, truncated to the tokenizer's maximum length.
    """
    # No padding here: padding every row to the model maximum makes each batch as long as
    # the longest possible input. The DataCollatorWithPadding created in this cell pads
    # each batch when it is collated instead.
    return tokenizer(data["review"], truncation=True)


def _to_model_inputs(split):
    """Tokenize one split, drop the raw 'review' text column and return rows as torch tensors."""
    encoded = split.map(tokenize_function, batched=True).remove_columns(['review'])
    encoded.set_format("torch")
    return encoded


tokenized_train_df = _to_model_inputs(train_dataset)
tokenized_valid_df = _to_model_inputs(valid_dataset)

# The full splits are used below. For a quick smoke run, swap in a small shuffled subset:
# train_df = tokenized_train_df.shuffle(seed=RANDOM_SEED).select(range(200))
# valid_df = tokenized_valid_df.shuffle(seed=RANDOM_SEED).select(range(100))
train_df = tokenized_train_df
valid_df = tokenized_valid_df

# Collator that pads the examples of each batch to a common length using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Summary of the tokenized training split that is handed to the Trainer.
print(train_df)

In [None]:
# Summary of the tokenized validation split that is handed to the Trainer.
print(valid_df)

# Define model

In [None]:
# Load the checkpoint with a 5-way sequence-classification head
# (the confusion matrix logged during evaluation uses five rating labels).
model = AutoModelForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME, num_labels=5
)

In [None]:
import dataclasses

# The evaluation-schedule keyword was renamed from `evaluation_strategy` to `eval_strategy`
# in newer transformers releases, and the old name was later dropped. Since the install
# cell does not pin a version, use whichever name the installed TrainingArguments declares.
_training_fields = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_fields else "evaluation_strategy"

training_arguments = TrainingArguments(
    seed=RANDOM_SEED,
    optim="adamw_torch",
    learning_rate=5e-5,
    num_train_epochs=1,
    # Checkpoints go to ./results, overwriting an earlier run's output.
    output_dir="./results",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    # Evaluate on the validation split every 25 optimizer steps.
    eval_steps=25,
    # Checkpoint every 500 steps and keep only the most recent one.
    save_strategy="steps",
    save_total_limit=1,
    save_steps=500,
    **{_eval_strategy_key: "steps"},
)

In [None]:
# The Comet API key is taken from the first line of a local file, so the key itself
# never appears in the notebook.
with open('api_key.txt', 'r') as key_file:
    API_KEY = key_file.readline().strip()

In [None]:
# Comet experiment that compute_metrics logs confusion matrices to.
# NOTE(review): project and workspace names are hard-coded — adjust when running under another account.
experiment = Experiment(
    api_key=API_KEY,
    project_name="hotel_stars_v3_with_weights",
    workspace="milosz-l",
)

In [None]:
def compute_metrics(pred):
    """Score one evaluation pass and log its confusion matrix to Comet.

    Args:
        pred: Evaluation result exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the argmax over the last axis is
            taken as the predicted class).

    Returns:
        dict with "accuracy", "f1", "precision" and "recall"; the last three are
        macro-averaged over the classes.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro')
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": macro_f1,
        "precision": macro_precision,
        "recall": macro_recall,
    }

    # Logging goes to the notebook-level `experiment` and is skipped when that object is falsy.
    if experiment:
        current_epoch = experiment.curr_epoch
        experiment.set_epoch(0 if current_epoch is None else int(current_epoch))
        experiment.log_confusion_matrix(
            y_true=y_true,
            y_predicted=y_pred,
            labels=[1, 2, 3, 4, 5]
        )

    return metrics

In [None]:
# override Trainer for adding class weighting


class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level ``classes_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain "labels" next to the model inputs.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: Accepted because newer ``Trainer`` versions pass it to
                ``compute_loss``; it is not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # Keep the labels out of the forward pass so the model does not also compute its own
        # unweighted loss; a filtered copy is used so the caller's batch is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')
        # The weights must sit on the same device as the logits.
        loss_fct = nn.CrossEntropyLoss(weight=classes_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Export Comet settings as environment variables before the Trainer is built
# (presumably read by the Comet logging integration — confirm).
%env COMET_MODE=ONLINE
%env COMET_LOG_ASSETS=TRUE
# Weighted-loss trainer wired to the tokenized splits, the metric function and the padding collator.
trainer = CustomTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_df,
    eval_dataset=valid_df,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning; evaluation and checkpointing follow the schedule set in training_arguments.
trainer.train()

In [None]:
# Optional: uncomment to close the Comet experiment explicitly once training has finished.
# experiment.end()