# <font color="#003660">Applied Machine Learning for Text Analysis (M.184.5331)</font>


# <font color="#003660">Session 2: Transformer Architecture - Encoder-only Models</font>

# <font color="#003660">Notebook 3: Text Regression with Transformers</font>

# Import Packages

In [None]:
#!pip install datasets

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn

In [None]:
# Pick the compute device: the CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the selected device.
device

# Create Dataset

In [None]:
# Download the wine-review CSV from the course repository into a DataFrame.
corpus = pd.read_csv("https://raw.githubusercontent.com/olivermueller/amlta-2024/main/Session_01/winemag-data-130k-v2.csv")

In [None]:
# Cast the score column to float; it is later renamed to "label" and used as
# the regression target.
corpus["points"] = corpus["points"].astype(float)

In [None]:
# Training pool: the first 80,000 rows, down-sampled to 10,000 rows.
# A fixed random_state draws the same 10,000 rows on every run, so the
# training subset is reproducible.
train_corpus = corpus.iloc[:80000].sample(n=10000, random_state=42)
# Validation: rows 80,000-99,999. Test: every remaining row.
val_corpus = corpus.iloc[80000:100000]
test_corpus = corpus.iloc[100000:]

In [None]:
# Keep only the review text and its score in every split, and rename the two
# columns to "text" and "label" in the same step.
train_corpus = train_corpus[["description", "points"]].rename(
    columns={"description": "text", "points": "label"}
)
val_corpus = val_corpus[["description", "points"]].rename(
    columns={"description": "text", "points": "label"}
)
test_corpus = test_corpus[["description", "points"]].rename(
    columns={"description": "text", "points": "label"}
)

In [None]:
# Convert each pandas split into a Hugging Face Dataset.
# preserve_index=False keeps the DataFrame index out of the dataset: the
# shuffled training index would otherwise be carried over as an extra
# "__index_level_0__" column, leaving the train split with a different set of
# columns than validation/test.
train_ds = Dataset.from_pandas(train_corpus, preserve_index=False)
val_ds = Dataset.from_pandas(val_corpus, preserve_index=False)
test_ds = Dataset.from_pandas(test_corpus, preserve_index=False)

# Bundle the three splits under the names used by the rest of the notebook.
wine_ds = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds,
})

In [None]:
# Display the DatasetDict to inspect its splits.
wine_ds

# Fine-tune Model

In [None]:
# Checkpoint name shared by the tokenizer and the model loaded below.
model_name = "distilbert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    """Encode the ``"text"`` entries of a batch with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping-like object whose ``"text"`` entry holds the raw
            strings to encode.

    Returns:
        The object returned by ``tokenizer`` for those texts, called with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
# Apply `tokenize` to every split in batched mode.
# NOTE(review): per the `datasets` docs, batch_size=None hands each split to
# `tokenize` as a single batch, so padding=True pads to the longest sequence
# in that split - confirm against the installed version.
wine_ds_encoded = wine_ds.map(tokenize, batched=True, batch_size=None)

In [None]:
# Load the pretrained checkpoint with a single-output head (num_labels=1),
# then move the model onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
model = model.to(device)

In [None]:
# Explicitly mark the task as regression in the model config.
# NOTE(review): presumably this makes the model use a regression loss - confirm
# against the transformers documentation.
model.config.problem_type = "regression"

In [None]:
def compute_metrics(eval_pred):
    """Compute the mean absolute error (MAE) between predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Both entries may be
            NumPy arrays or array-likes of any shape, as long as they hold
            the same number of values in the same order.

    Returns:
        dict: ``{"mae": value}`` where ``value`` is the MAE as a plain
        Python float.
    """
    predictions, labels = eval_pred
    # Flatten BOTH sides to 1-D. If only the predictions were flattened,
    # labels of shape (N, 1) would broadcast against (N,) predictions into an
    # (N, N) matrix and silently produce a wrong error value.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    mae = np.mean(np.abs(predictions - labels))
    return {"mae": float(mae)}

In [None]:
# Mini-batch size used for both training and evaluation.
batch_size = 64
# Log every `logging_steps` steps, i.e. once per number of full batches that
# fit into the training split.
logging_steps = len(wine_ds_encoded["train"]) // batch_size

training_args = TrainingArguments(
    # Output and reporting
    output_dir="results",
    report_to="none",
    disable_tqdm=False,
    logging_steps=logging_steps,
    # Optimisation
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, training configuration, data splits and metric function
# into a Trainer, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=wine_ds_encoded["train"],
    eval_dataset=wine_ds_encoded["validation"],
    compute_metrics=compute_metrics,
)
trainer.train()

# Evaluate Model

In [None]:
# Run the fine-tuned model on the tokenized test split.
preds_output = trainer.predict(wine_ds_encoded["test"])

In [None]:
# Predicted scores for the test split.
# NOTE(review): presumably shaped (n_samples, 1) because the model has a
# single output - confirm before using this outside of the MAE call below.
y_preds = preds_output.predictions

In [None]:
# Ground-truth scores ("label" column) of the test split.
y_test = wine_ds_encoded["test"]["label"]

In [None]:
# Mean absolute error between the true and predicted scores on the test split.
mae = mean_absolute_error(y_test, y_preds)
print(mae)