# <font color="#003660">Applied Machine Learning for Text Analysis (M.184.5331)</font>


# <font color="#003660">Session 3: Transformer Architecture</font>

# <font color="#003660">Notebook 3: Text Regression with Transformers</font>

# Import Packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.metrics import classification_report
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

# Create Dataset

In [4]:
# Wine review data set hosted in the course repository.
WINE_REVIEWS_URL = "https://raw.githubusercontent.com/olivermueller/amlta-2024/main/Session_01/winemag-data-130k-v2.csv"
corpus = pd.read_csv(WINE_REVIEWS_URL)

In [None]:
# Positional split: the first 80,000 rows form the training pool, the next
# 20,000 rows are the validation set and the remainder is the test set.
# The training pool is subsampled to 10,000 reviews; the seed is fixed so the
# same subsample is drawn on every Restart & Run All.
SEED = 42
train_corpus = corpus.iloc[:80000].sample(n=10000, random_state=SEED)
val_corpus = corpus.iloc[80000:100000]
test_corpus = corpus.iloc[100000:]

In [6]:
# Keep only the review text and its score, renamed to "text" and "label".
column_map = {"description": "text", "points": "label"}

# The scores are stored as integers. They are cast to float here because this
# is a regression task: integer labels are collated as long tensors by the
# Trainer's default collator, which the MSE loss rejects
# ("Found dtype Long but expected Float").
train_corpus, val_corpus, test_corpus = (
    split[list(column_map)]
    .rename(columns=column_map)
    .astype({"label": "float32"})
    for split in (train_corpus, val_corpus, test_corpus)
)

In [7]:
# Convert each pandas split into a Hugging Face Dataset ...
train_ds = Dataset.from_pandas(train_corpus)
val_ds = Dataset.from_pandas(val_corpus)
test_ds = Dataset.from_pandas(test_corpus)

# ... and bundle them under the conventional split names.
wine_ds = DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds,
})

In [8]:
# Show the split names together with their features and row counts.
wine_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 80000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 29971
    })
})

# Fine-tune Model

In [9]:
# Checkpoint name passed to both the tokenizer and the model loader below.
model_name = "distilbert-base-uncased"

In [10]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
def tokenize(batch):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        The tokenizer output for those texts, with padding and truncation
        enabled.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [12]:
# Apply `tokenize` to every split. With batched=True and batch_size=None each
# split is handed over as a single batch, so padding=True inside `tokenize`
# pads to the longest text of that split.
wine_ds_encoded = wine_ds.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/29971 [00:00<?, ? examples/s]

In [13]:
# Load the pretrained checkpoint with a sequence-classification head that has
# a single output, then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Declare the task as regression in the model config, so the single output of
# the head is treated as a continuous target.
model.config.problem_type = "regression"

In [17]:
def compute_metrics(eval_pred):
    """Compute the mean squared error for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Both are array-likes holding
            one value per example; any shape with the same number of elements
            is accepted (e.g. ``(N, 1)`` predictions with ``(N,)`` labels).

    Returns:
        dict: ``{"mse": <float>}``.
    """
    predictions, labels = eval_pred
    # Flatten BOTH arrays: subtracting an (N,) vector from an (N, 1) column
    # would broadcast to an (N, N) matrix and silently yield a wrong MSE.
    predictions = np.asarray(predictions, dtype=float).reshape(-1)
    labels = np.asarray(labels, dtype=float).reshape(-1)
    mse = np.mean((predictions - labels) ** 2)
    return {"mse": float(mse)}

In [18]:
batch_size = 64
# Log about once per epoch. The interval is clamped to at least 1 because a
# value of 0 (training split smaller than one batch) is rejected by the
# step-based logging strategy.
logging_steps = max(1, len(wine_ds_encoded["train"]) // batch_size)

training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=2,
                                  learning_rate=3e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  # No metric_for_best_model is set, so the best
                                  # checkpoint is chosen by the evaluation loss.
                                  load_best_model_at_end=True,
                                  weight_decay=0.01,
                                  # Evaluate and save once per epoch; both must
                                  # match for load_best_model_at_end to work.
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps)

In [19]:
# Wire the model, the training configuration, the encoded splits and the
# metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=wine_ds_encoded["train"],
    eval_dataset=wine_ds_encoded["validation"],
    compute_metrics=compute_metrics,
)
# Fine-tune; as the last expression of the cell, the returned value is displayed.
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

: 

# Evaluate Model

In [None]:
# Run the fine-tuned model on the test split; the raw model outputs are
# available afterwards as `preds_output.predictions`.
preds_output = trainer.predict(wine_ds_encoded["test"])

In [None]:
# The model has a single regression output, so `predictions` holds one score
# per review. Flatten it; an argmax over that single column would always be 0.
y_preds = np.asarray(preds_output.predictions).reshape(-1)
y_valid = np.asarray(list(wine_ds_encoded["test"]["label"]), dtype=float)

# Report regression errors. A classification report is not meaningful for
# continuous predictions.
errors = y_preds - y_valid
mse = float(np.mean(errors ** 2))
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {np.sqrt(mse):.4f}")
print(f"MAE:  {float(np.mean(np.abs(errors))):.4f}")

In [None]:
# The test split comes from a pandas slice with a plain RangeIndex, which
# `Dataset.from_pandas` does not store as an `__index_level_0__` column (the
# DatasetDict printed earlier lists only 'text' and 'label'). The row ids are
# therefore taken from the DataFrame itself.
# NOTE(review): the column name 'verygood' looks like a leftover from a
# classification variant of this notebook — confirm the expected submission
# schema before renaming it.
my_submission = pd.DataFrame({'index': test_corpus.index,
                              'verygood': y_preds})
my_submission.to_csv("my_submission.csv", index=False)