In [1]:
import evaluate
import numpy as np
import polars as pl
from datasets import Dataset, DatasetDict
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

# One seed value shared by the global RNGs of Polars and NumPy.
manual_seed = 23
pl.set_random_seed(manual_seed)
np.random.seed(manual_seed)

# Load Data

In [2]:
# Common path prefix of the three parquet files (train / dev / test).
filename = 'data/500k_50k'

# Read every split from '<filename>_<split>.parquet'.
df_train, df_dev, df_test = (
    pl.read_parquet(filename + '_' + split + '.parquet')
    for split in ('train', 'dev', 'test')
)

## Selecting relevant columns

In [3]:
# Target column to predict. Change this single value to switch task:
#   'recommended'   -> classification
#   'found_funny'   -> regression
#   'found_helpful' -> regression
label_column = 'recommended'


def select_text_and_label(df, label_column):
    """Keep only the review text and one target column, renamed for training.

    Args:
        df: Polars DataFrame containing 'review_text' and `label_column`.
        label_column: Name of the column to use as the prediction target.

    Returns:
        A new DataFrame with exactly two columns, 'text' and 'label'.
    """
    return (
        df.select(['review_text', label_column])
          .rename({'review_text': 'text', label_column: 'label'})
    )


# NOTE: the frames are overwritten in place, so the original columns are gone
# after this cell runs; re-run the loading cell before running this one again.
df_train = select_text_and_label(df_train, label_column)
df_dev = select_text_and_label(df_dev, label_column)
df_test = select_text_and_label(df_test, label_column)

## Create dataset for transformers

In [3]:
# Wrap each Polars frame (converted to an Arrow table) in a Dataset,
# keyed by split name.
split_frames = {'train': df_train, 'dev': df_dev, 'test': df_test}
dataset = DatasetDict({
    split_name: Dataset(frame.to_arrow())
    for split_name, frame in split_frames.items()
})

# Training

## Select settings

In [11]:
# --- Run configuration: DistilBERT ---

# Pretrained checkpoint and optimisation hyper-parameters.
model_name = 'distilbert/distilbert-base-uncased'
lr = 5e-5
batch_size = 32
num_epochs = 1

# Number of labels handed to the model; keep exactly one line active.
num_labels = 2  # classification (recommend)
# num_labels = 1  # regression (funny and helpful)

# Output directory for this run; keep exactly one line active, matching the task.
output_dir = 'models/steam-classification-distilbert500k-recommend'  # recommend
# output_dir = 'models/steam-classification-distilbert500k-funny'  # funny
# output_dir = 'models/steam-classification-distilbert500k-helpful'  # helpful

In [8]:
# # configurations RoBERTa
# # recommend
# output_dir='models/steam-classification-roberta500k-recommend'

# # funny
# # output_dir='models/steam-classification-roberta500k-funny'

# # helpful
# # output_dir='models/steam-classification-roberta500k-helpful'

# model_name = 'FacebookAI/roberta-large'
# batch_size = 16
# num_epochs = 1
# lr = 5e-6
# num_labels = 2 # use for classification (recommend)
# # num_labels = 1 # use for regression (funny and helpful)

## Tokenize

In [8]:
# AutoTokenizer picks the tokenizer class that matches the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are truncated to at most 128 tokens and returned as NumPy
    data. No padding is applied here: padding to a common length is left
    to the data collator, which does it per batch and so uses less memory
    than padding the whole dataset up front.

    Args:
        examples: Batch from `Dataset.map(..., batched=True)`; only its
            'text' entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        truncation=True,
        return_tensors="np",
    )


tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [10]:
# Padding is deferred to collation time; the tokenization step itself runs without padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluation during training

In [15]:
# Metric used for the classification task.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair `(predictions, labels)`. `predictions` holds one row
            of per-class scores per example (argmax is taken over axis 1);
            `labels` holds the reference class ids.

    Returns:
        dict with a single key, "accuracy".
    """
    class_scores, references = eval_pred
    predicted_classes = np.argmax(class_scores, axis=1)
    result = accuracy.compute(predictions=predicted_classes, references=references)
    return {"accuracy": result['accuracy']}

In [11]:
# regression
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     labels = labels.reshape(-1, 1)
#     mse = mean_squared_error(labels, predictions)
#     mae = mean_absolute_error(labels, predictions)
#     r2 = r2_score(labels, predictions)
#     return {"mse": mse, "mae": mae, "r2": r2}

## Train

In [12]:
# Seed the RNGs immediately before building the model: the classifier weights
# are newly initialised (see the from_pretrained warning), so without this the
# starting point differs between runs even though `manual_seed` is defined.
set_seed(manual_seed)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=lr,
    weight_decay=0,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    eval_strategy="steps",
    eval_steps=0.1,  # evaluate after every 10% of training
    save_strategy="steps",
    save_steps=0.1,  # checkpoint after every 10% of training
    load_best_model_at_end=True,
    seed=manual_seed,  # use the notebook-wide seed instead of the Trainer default
)
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['dev'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()