In [1]:
import polars as pl
import numpy as np
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import os
import torch
from datasets import Dataset, DatasetDict


manual_seed = 23

np.random.seed(manual_seed)
pl.set_random_seed(manual_seed)

# Load Data

In [2]:
filename = 'data/500k_50k'

df_train = pl.read_parquet(filename + '_train.parquet')
df_dev = pl.read_parquet(filename + '_dev.parquet')
df_test = pl.read_parquet(filename + '_test.parquet')

In [3]:
df_train.head(10)

product_id,review_text,recommended,found_awarding,found_helpful,found_funny
i64,str,i8,f64,f64,f64
1057090,"""ya i'm gay""",1,0.0,0.0,0.0
40340,"""Ran across a bug in the game w…",0,0.0,0.051619,0.0
244210,"""Best drifting game!""",1,0.0,0.008,0.0
105450,"""Its da biz""",1,0.0,0.008,0.0
200210,"""The reason i do not recomend t…",0,0.0,0.0,0.0
738520,"""If you like Portal and you lik…",1,0.130435,0.064,0.023529
326460,"""Good!""",1,0.0,0.0,0.0
2080690,"""amazing game""",1,0.0,0.0,0.0
602960,"""Pretty good game. The Crewplay…",1,0.0,0.004,0.0
1091500,"""good game go pew pew""",1,0.0,0.0,0.0


In [5]:
# upscale importance of 1+ funny
df_train = df_train.with_columns((pl.col("found_funny") + pl.col("found_funny").ceil()) / 2)
df_dev = df_dev.with_columns((pl.col("found_funny") + pl.col("found_funny").ceil()) / 2)
df_test = df_test.with_columns((pl.col("found_funny") + pl.col("found_funny").ceil()) / 2)

## Downsize

The obtained dataset contains ~45M reviews. Training on this amount would take too long, so I decided to train models on smaller chunks of data. I aimed at something that wouldn't take more than 6 hours of training. For Roberta, this meant training on 500k randomly selected reviews. I decided to evaluate data on 50k reviews, which means 10% of the size of the training data. While the amount of training data might change, this evaluation set will be used for all models.

In [8]:
# roberta
df_train = df_train.sample(5000, seed=manual_seed, shuffle=True)
df_dev = df_dev.sample(500, seed=manual_seed, shuffle=True)
df_test = df_test.sample(500, seed=manual_seed, shuffle=True)

# TODO DELETE THIS BECAUSE PREPROCESSING!
#df_train = df_train.cast({'recommended': pl.Int8})
#df_dev = df_dev.cast({'recommended': pl.Int8})
#df_test = df_test.cast({'recommended': pl.Int8})

In [27]:
df_dev

text,label
str,f64
"""Good story but a little short.…",0.052541
"""- - - - implants are too expen…",0.004
"""eh idk i just like it""",0.0
"""Smooth movement in game, graph…",0.0
"""What you see in the screenshot…",0.113143
…,…
"""Its a great experience""",0.0
"""if you believe in sphere earth…",0.0
"""Played it for about 11 hours. …",0.012
"""Pay to win game. as it's mostl…",0.013732


## Selecting relevant columns

In [3]:
# recommended
# df_train = df_train.select(['review_text', 'recommended']).rename({'review_text': 'text', 'recommended': 'label'})
# df_dev = df_dev.select(['review_text', 'recommended']).rename({'review_text': 'text', 'recommended': 'label'})
# df_test = df_test.select(['review_text', 'recommended']).rename({'review_text': 'text', 'recommended': 'label'})

# found_helpful
df_train = df_train.select(['review_text', 'found_helpful']).rename({'review_text': 'text', 'found_helpful': 'label'})
df_dev = df_dev.select(['review_text', 'found_helpful']).rename({'review_text': 'text', 'found_helpful': 'label'})
df_test = df_test.select(['review_text', 'found_helpful']).rename({'review_text': 'text', 'found_helpful': 'label'})

# found funny
# df_train = df_train.select(['review_text', 'found_funny']).rename({'review_text': 'text', 'found_funny': 'label'})
# df_dev = df_dev.select(['review_text', 'found_funny']).rename({'review_text': 'text', 'found_funny': 'label'})
# df_test = df_test.select(['review_text', 'found_funny']).rename({'review_text': 'text', 'found_funny': 'label'})

## Create dataset for transformers

In [4]:
dataset = DatasetDict({
    'train': Dataset(df_train.to_arrow()),
    'dev': Dataset(df_dev.to_arrow()),
    'test': Dataset(df_test.to_arrow())
})

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 500000
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

# Training -  - simpletransformers

## Setup training

In [9]:
# setup classification arguments
classification_args = {
    'num_train_epochs': 1,
    'manual_seed': manual_seed,
    'save_steps': -1,
    'train_batch_size': 32
}

model_args = ClassificationArgs(**classification_args)

## Load Model

In [6]:
# setup model
# model_args = {
#     'model_type': 'roberta',
#     'model_name': 'models/roberta500k/model/checkpoint-15625-epoch-1',
#     'num_labels': 2,
#     'args': model_args
# }
model_args = {
    'model_type': 'distilbert',
    'model_name': 'distilbert/distilbert-base-uncased',
    'num_labels': 2,
    'args': model_args
}
model = ClassificationModel(**model_args)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train

In [7]:
# model.train_model(df_train.to_pandas(), output_dir='models/roberta500k')
model.train_model(df_train.to_pandas(), output_dir='models/distilbert500k')



  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/15625 [00:00<?, ?it/s]

  with amp.autocast():


(15625, 0.1697815614566803)

# Training - transformers

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate



In [8]:
# configurations distilbert - recommended
model_type = 'distilbert'
model_name = 'distilbert/distilbert-base-uncased'
output_dir='models/steam-classification-distilbert500k'
batch_size = 32
num_epochs = 1
lr = 5e-5 # default
weight_decay = 0
eval_steps=0.1 # eval after 10% is done
save_steps=0.1

In [7]:
# configurations distilbert - helpful
model_type = 'distilbert'
model_name = 'distilbert/distilbert-base-uncased'
output_dir='models/steam-classification-distilbert500k-helpful'
batch_size = 32
num_epochs = 1
lr = 5e-5 # default
weight_decay = 0
eval_steps=0.1 # eval after 10% is done
save_steps=0.1

In [11]:
# configurations distilbert - funny
model_type = 'distilbert'
model_name = 'distilbert/distilbert-base-uncased'
output_dir='models/steam-classification-distilbert500k-funny2'
batch_size = 32
num_epochs = 1
lr = 5e-5 # default
weight_decay = 0
eval_steps=0.1 # eval after 10% is done
save_steps=0.1

In [11]:
# configurations distilbert - funny
model_type = 'distilbert'
model_name = 'distilbert/distilbert-base-uncased'
output_dir='models/steam-classification-distilbert500k-funny3'
batch_size = 32
num_epochs = 1
lr = 5e-6 # default
weight_decay = 0
eval_steps=0.1 # eval after 10% is done
save_steps=0.1

In [7]:
# configurations roberta - recommended
model_type = 'roberta-large'
model_name = 'FacebookAI/roberta-large'
output_dir='models/steam-classification-roberta500k'
batch_size = 16
num_epochs = 1
lr = 5e-6 # lower than default
weight_decay = 0
eval_steps=0.1 # eval after 10% is done
save_steps=0.1

In [7]:
# configurations roberta - helpful
model_type = 'roberta-large'
model_name = 'FacebookAI/roberta-large'
output_dir='models/steam-classification-roberta500k-helpful'
batch_size = 16
num_epochs = 1
lr = 5e-6 # lower than default
weight_decay = 0
eval_steps=0.1 # eval after 10% is done
save_steps=0.1

In [8]:
# configurations roberta - funny
model_type = 'roberta-large'
model_name = 'FacebookAI/roberta-large'
output_dir='models/steam-classification-roberta500k-funny'
batch_size = 16
num_epochs = 1
lr = 5e-6 # lower than default
weight_decay = 0
eval_steps=0.1 # eval after 10% is done
save_steps=0.1

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# classification
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# regression
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

def tokenize_function(examples):
    text = examples["text"]
    # it is possible to return tensors in pytorch, but then you need to pad everything which is inconvenient because it is better to do in collator
    return tokenizer(text, truncation=True, return_tensors="np", max_length=128)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# classification

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy']}
    # return accuracy.compute(predictions=predictions, references=labels)

In [11]:
# regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats

spearmanr_func = lambda x, y: stats.spearmanr(x, y)[0]
pearsonr_func = lambda x, y: stats.pearsonr(x, y)[0]

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #spearmanr = spearmanr_func(predictions, labels)
    #pearsonr = pearsonr_func(predictions, labels)
    labels = labels.reshape(-1, 1)
    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    r2 = r2_score(labels, predictions)
    
    return {"mse": mse, "mae": mae, "r2": r2} #, "spearmanr": spearmanr, "pearsonr": pearsonr}


## Test untrained model

In [12]:
def example_predictions(dataset, model):
    for text in dataset:
        inputs = tokenizer.encode(text, return_tensors="pt")
        logits = model(inputs).logits
        predictions = torch.argmax(logits)
    
        print(f'{predictions.tolist()} = {text}')
example_predictions(tokenized_dataset['dev']['text'][:10], model)

0 = I totally agree the best bit of the sims making dream homes, but really needs the ability to sell and make a profit so you can buy more land and build your housing empire. Love the designs and the possibilities for the game
0 = Banger game cheap when on sale comes with a lot of dlc! Start the exe from the main folder and it won't crash because "ran out of memory" as much and enjoy pasting all those codes for the keys. I think 7 hours out of my 12 is just pasting codes.
0 = Too much random ♥♥♥♥♥♥♥♥.
0 = Story - 8
Visuals - 9
Audio - 9
Gameplay - 8
Length - 6
Replayability - 7
Value base $ - 6
Value sale ($3.74) - 8
Overall - 8
0 = This game is awesome. Play it.
0 = Great story, a visual novel walking simulator which is fueled by the modern non-organic extraterrestrial travelling theories. Better than most movies i guess..
Oh! It ran well, with full details with my 7 years old 1070GTX card, exhibiting me a visually stunning red planet
0 = amazing game the artstyle is creepy the contr

In [19]:
## LORA CONFIG - IGNORE FOR NOW!
# peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
#                          r=4, # intrinsic rank of trainable weight matrix
#                          lora_alpha=32, # like a learning rate
#                          lora_dropout=0.01, # dropout probability
#                          target_modules = ['q_lin']) # apply lora to query layer

# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

In [12]:
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=lr,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    eval_strategy="steps",
    eval_steps=eval_steps, # eval after 10% is done
    save_strategy="steps",
    save_steps=save_steps, # save after 10% of processing is done
    load_best_model_at_end=True,
)

In [13]:
# API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['dev'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [19]:
vars(trainer)

{'args': TrainingArguments(
 _n_gpu=1,
 accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
 adafactor=False,
 adam_beta1=0.9,
 adam_beta2=0.999,
 adam_epsilon=1e-08,
 auto_find_batch_size=False,
 batch_eval_metrics=False,
 bf16=False,
 bf16_full_eval=False,
 data_seed=None,
 dataloader_drop_last=False,
 dataloader_num_workers=0,
 dataloader_persistent_workers=False,
 dataloader_pin_memory=True,
 dataloader_prefetch_factor=None,
 ddp_backend=None,
 ddp_broadcast_buffers=None,
 ddp_bucket_cap_mb=None,
 ddp_find_unused_parameters=None,
 ddp_timeout=1800,
 debug=[],
 deepspeed=None,
 disable_tqdm=False,
 dispatch_batches=None,
 do_eval=True,
 do_predict=False,
 do_train=False,
 eval_accumulation_steps=None,
 eval_delay=0,
 eval_do_concat_batches=True,
 eval_on_start=False,
 eval_steps=0.1,
 eval_strategy=steps,
 eval_use_gather_

In [14]:
# trainer.train()
trainer.train(resume_from_checkpoint = True)

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
	eval_steps: 0.1 (from args) != 3125 (from trainer_state.json)
	save_steps: 0.1 (from args) != 3125 (from trainer_state.json)
[34m[1mwandb[0m: Currently logged in as: [33mluka-krsnik[0m ([33mluka-krsnik-outsmartify[0m). Use [1m`wandb login --relogin`[0m to force relogin


  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss,Mse,Mae,R2
18750,0.0043,0.003207,0.003207,0.020156,-0.062829
21875,0.0047,0.002936,0.002936,0.019235,0.026894
25000,0.0051,0.002887,0.002887,0.014255,0.043331
28125,0.0042,0.002932,0.002932,0.013278,0.028268
31250,0.0046,0.002884,0.002884,0.014959,0.044279


TrainOutput(global_step=31250, training_loss=0.0021845314712524416, metrics={'train_runtime': 14445.8354, 'train_samples_per_second': 34.612, 'train_steps_per_second': 2.163, 'total_flos': 1.1164339834883568e+17, 'train_loss': 0.0021845314712524416, 'epoch': 1.0})

In [32]:
model.to('cuda')
def example_predictions(dataset, model):
    for text in dataset:
        inputs = tokenizer.encode(text, return_tensors="pt").to('cuda')
        logits = model(inputs).logits
        predictions = torch.max(logits, 1).indices
    
        print(f'{predictions.tolist()[0]} = {text}')
example_predictions(tokenized_dataset['dev']['text'][:10], model)

1 = I totally agree the best bit of the sims making dream homes, but really needs the ability to sell and make a profit so you can buy more land and build your housing empire. Love the designs and the possibilities for the game
1 = Banger game cheap when on sale comes with a lot of dlc! Start the exe from the main folder and it won't crash because "ran out of memory" as much and enjoy pasting all those codes for the keys. I think 7 hours out of my 12 is just pasting codes.
0 = Too much random ♥♥♥♥♥♥♥♥.
1 = Story - 8
Visuals - 9
Audio - 9
Gameplay - 8
Length - 6
Replayability - 7
Value base $ - 6
Value sale ($3.74) - 8
Overall - 8
1 = This game is awesome. Play it.
1 = Great story, a visual novel walking simulator which is fueled by the modern non-organic extraterrestrial travelling theories. Better than most movies i guess..
Oh! It ran well, with full details with my 7 years old 1070GTX card, exhibiting me a visually stunning red planet
1 = amazing game the artstyle is creepy the contr

# Pytorch training

In [10]:
# def tokenize_function(data):
#     text, tokenizer = data
#     return tokenizer(text, padding="max_length", truncation=True, return_tensors="pt", max_length=128)

In [8]:
# from multiprocessing import Pool, cpu_count
# from tqdm.auto import tqdm

# def tokenize_text(df, chunksize=500):
#     text = list(df['text'])
#     data = [
#         (text[i: i + chunksize], tokenizer) for i in range(0, len(text), chunksize)
#     ]
#     with Pool(16) as p:
#         examples = list(
#             tqdm(
#                 p.imap(tokenize_function, data),
#                 total=len(text) // chunksize,
#                 disable=False,
#             )
#         )

#     return examples

# def prepare_data(df, examples, output_mode="classification"):
#     examples = {
#         key: torch.cat([example[key] for example in examples])
#         for key in examples[0]
#     }
#     if output_mode == "classification":
#         labels = torch.tensor(list(df['label']), dtype=torch.long)
#     elif output_mode == "regression":
#         labels = torch.tensor(list(df['label']), dtype=torch.float)
#     return examples, labels

In [9]:
# train_tokenized = tokenize_text(df_train)
# dev_tokenized = tokenize_text(df_dev)

# # text has to be tokenized before further processing due to multiprocessing having problems when combined with torch (and possibly other libraries connected with GPU processing)
# # A workaround is to use `multiprocessing.set_start_method('spawn')` but this is impractical in jupyter
# # Avoiding multiprocessing might work but is prob. slower.
# train = prepare_data(df_dev, train_tokenized)
# dev = prepare_data(df_dev, dev_tokenized)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

# PyTorch (semi)-implementation

In [22]:
from torch.utils.data import DataLoader, RandomSampler

train_dataset = (examples, labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=32,
)

In [None]:
# TEST:
model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2)
optimizer = torch.optim.adamw.AdamW(model.parameters(), lr=4e-05)

In [None]:
train_iterator = trange(
    1, desc="Epoch", disable=False, mininterval=0
)

In [None]:
epoch_number = 0
for _ in train_iterator:
    model.train()
    train_iterator.set_description(
        f"Epoch {epoch_number + 1} of {args.num_train_epochs}"
    )
    batch_iterator = tqdm(
        train_dataloader,
        desc=f"Running Epoch {epoch_number + 1} of {1}",
        disable=False,
        mininterval=0,
    )
    for step, batch in enumerate(batch_iterator):
        print(batch)
        # create inputs = {'input_ids': tensor, 'attention_mask': ..., 'labels': ...
        break
        outputs = model(**inputs)
        loss = outputs.loss
        current_loss = loss.item()
        batch_iterator.set_description(
            f"Epochs {epoch_number + 1}/1. Running Loss: {current_loss:9.4f}"
        )
        # if necessary scale! - scaler.scale(loss).backward()
        loss.backward()
        optimizer.step()
        model.zero_grad()
        
        
    epoch_number += 1

In [23]:
# TEST:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(1):
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [12]:
encoded_text = tokenizer("This is a short test for pneumonoultramicroscopicsilicovolcanoconiosis!")
print(encoded_text)

{'input_ids': [101, 2023, 2003, 1037, 2460, 3231, 2005, 1052, 2638, 2819, 17175, 11314, 6444, 2594, 7352, 26461, 27572, 11261, 6767, 15472, 6761, 8663, 10735, 2483, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [19]:
tokenizer.decode(encoded_text["input_ids"])

'[CLS] this is a short test for pneumonoultramicroscopicsilicovolcanoconiosis! [SEP]'