In [1]:
import polars as pl
import numpy as np
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import os
import torch
from datasets import Dataset, DatasetDict


manual_seed = 23

np.random.seed(manual_seed)
pl.set_random_seed(manual_seed)

# Load Data

In [2]:
filename = 'data/500k_50k'

df_train = pl.read_parquet(filename + '_train.parquet')
df_dev = pl.read_parquet(filename + '_dev.parquet')
df_test = pl.read_parquet(filename + '_test.parquet')

## Downsize

The obtained dataset contains ~45M reviews. Training on this amount would take too long, so I decided to train models on smaller chunks of data. I aimed at something that wouldn't take more than 6 hours of training. For Roberta, this meant training on 500k randomly selected reviews. I decided to evaluate data on 50k reviews, which means 10% of the size of the training data. While the amount of training data might change, this evaluation set will be used for all models.

In [6]:
df_train = df_train.sample(5000, seed=manual_seed, shuffle=True)
df_dev = df_dev.sample(500, seed=manual_seed, shuffle=True)
df_test = df_test.sample(500, seed=manual_seed, shuffle=True)

# TODO DELETE THIS BECAUSE PREPROCESSING!
#df_train = df_train.cast({'recommended': pl.Int8})
#df_dev = df_dev.cast({'recommended': pl.Int8})
#df_test = df_test.cast({'recommended': pl.Int8})

## Selecting relevant columns

In [3]:
# recommended
# df_train = df_train.select(['review_text', 'recommended']).rename({'review_text': 'text', 'recommended': 'label'})
# df_dev = df_dev.select(['review_text', 'recommended']).rename({'review_text': 'text', 'recommended': 'label'})
# df_test = df_test.select(['review_text', 'recommended']).rename({'review_text': 'text', 'recommended': 'label'})

# found_helpful
df_train = df_train.select(['review_text', 'found_helpful']).rename({'review_text': 'text', 'found_helpful': 'label'})
df_dev = df_dev.select(['review_text', 'found_helpful']).rename({'review_text': 'text', 'found_helpful': 'label'})
df_test = df_test.select(['review_text', 'found_helpful']).rename({'review_text': 'text', 'found_helpful': 'label'})

# found funny
# df_train = df_train.select(['review_text', 'found_funny']).rename({'review_text': 'text', 'found_funny': 'label'})
# df_dev = df_dev.select(['review_text', 'found_funny']).rename({'review_text': 'text', 'found_funny': 'label'})
# df_test = df_test.select(['review_text', 'found_funny']).rename({'review_text': 'text', 'found_funny': 'label'})

## Create dataset for transformers

In [4]:
dataset = DatasetDict({
    'train': Dataset(df_train.to_arrow()),
    'dev': Dataset(df_dev.to_arrow()),
    'test': Dataset(df_test.to_arrow())
})

# Evaluating - transformers

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate



In [7]:
# configurations distilbert
model_name = 'models/steam-classification-distilbert500k/checkpoint-15625'

In [5]:
model_name = 'models/steam-classification-distilbert500k-funny/checkpoint-14067'

In [71]:
model_name = 'models/steam-classification-distilbert500k-funny2/checkpoint-15625'

In [None]:
model_name = 'models/steam-classification-distilbert500k-funny3/checkpoint-15625'

In [13]:
# configurations roberta
model_name = 'models/steam-classification-roberta500k/checkpoint-31250'

In [8]:
# configurations distilbert
model_type = 'roberta-large'
model_name = 'FacebookAI/roberta-large'
output_dir='models/steam-classification-roberta500k'
batch_size = 16
num_epochs = 1
lr = 5e-5 # default
weight_decay = 0
eval_steps=0.1 # eval after 10% is done
save_steps=0.1

In [72]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def tokenize_function(examples):
    text = examples["text"]
    # it is possible to return tensors in pytorch, but then you need to pad everything which is inconvenient because it is better to do in collator
    return tokenizer(text, truncation=True, return_tensors="np", max_length=128)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


In [9]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [15]:
# classification

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}
    # return accuracy.compute(predictions=predictions, references=labels)

In [34]:
# regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats

spearmanr_func = lambda x, y: stats.spearmanr(x, y)[0]
pearsonr_func = lambda x, y: stats.pearsonr(x, y)[0]

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #spearmanr = spearmanr_func(predictions, labels)
    #pearsonr = pearsonr_func(predictions, labels)
    labels = labels.reshape(-1, 1)
    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    r2 = r2_score(labels, predictions)
    
    return {"mse": mse, "mae": mae, "r2": r2} #, "spearmanr": spearmanr, "pearsonr": pearsonr}

def compute_metrics_check(predictions, labels):
    #spearmanr = spearmanr_func(predictions, labels)
    #pearsonr = pearsonr_func(predictions, labels)
    # labels = labels.reshape(-1, 1)
    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    r2 = r2_score(labels, predictions)
    
    return {"mse": mse, "mae": mae, "r2": r2} #, "spearmanr": spearmanr, "pearsonr": pearsonr}

In [73]:
# DISTILBert
trainer = Trainer(
    model=model,
    eval_dataset=tokenized_dataset['dev'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
predictions, label_ids, metrics = trainer.predict(test_dataset=tokenized_dataset['dev'])

wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Error while calling W&B API: run x2vzpsr5 was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run x2vzpsr5 was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run x2vzpsr5 was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run x2vzpsr5 was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run x2vzpsr5 was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


In [36]:
df_dev = df_dev.with_columns(
    pl.lit(predictions.reshape(-1)).alias('funny')
)

In [74]:
df_dev = df_dev.with_columns(
    pl.lit(predictions.reshape(-1)).alias('funny2')
)

wandb: ERROR Error while calling W&B API: run x2vzpsr5 was previously created and deleted; try a new run name (<Response [409]>)


In [41]:
df_dev = df_dev.with_columns(
    pl.lit(np.zeros(len(label_ids))).alias('zeros')
)

In [42]:
compute_metrics_check(df_dev['label'], df_dev['zeros'])

{'mse': 0.0024860459868941793, 'mae': 0.006757692098994974, 'r2': 0.0}

In [37]:
compute_metrics_check(df_dev['label'], df_dev['funny'])

{'mse': 0.0023629834534891593,
 'mae': 0.011589905637799375,
 'r2': -23.61761824304716}

In [76]:
compute_metrics_check(df_dev['label'], df_dev['zeros'])

{'mse': 0.0024860459868941793, 'mae': 0.006757692098994974, 'r2': 0.0}

In [75]:
compute_metrics_check(df_dev['label'], df_dev['funny2'])

{'mse': 0.007468927305274942,
 'mae': 0.062205897909022,
 'r2': -1.7133070431903619}

wandb: ERROR Error while calling W&B API: run x2vzpsr5 was previously created and deleted; try a new run name (<Response [409]>)


In [85]:
df_dev.sort('funny2', descending=True)[400:420]

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x713bcb7e8be0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 713ba7f26f80, raw_cell="df_dev.sort('funny2', descending=True)[400:420]" store_history=True silent=False shell_futures=True cell_id=78897e30-867b-4473-9598-36e6b5503a78>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

text,label,funny,zeros,funny2
str,f64,f32,f64,f32
"""Feels like 1998, being thirtee…",0.0,0.043372,0.0,0.245534
"""Supermarket Simulator, the lat…",0.0,0.04319,0.0,0.245361
"""I am not paying AU$1600+ to ex…",0.0,0.040488,0.0,0.245341
"""So I had a huge wall of text t…",0.0,0.083059,0.0,0.245128
"""[Intro: Travis Scott] Yeah Yea…",0.0,0.055211,0.0,0.244843
…,…,…,…,…
"""♥♥♥♥♥ i just be playing lego g…",0.0,0.01494,0.0,0.244061
"""Pushed a man off motherbase in…",0.0,0.033119,0.0,0.244056
"""⠀⠘⡀⠀⠀⠀⠀people who play ⠀⠀⠀⠀⠀ ⠀…",0.011765,0.065314,0.0,0.243906
""": Yakuza 0 is a dense game and…",0.011765,0.063706,0.0,0.243862


Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x713bcb7e8be0>> (for post_run_cell), with arguments args (<ExecutionResult object at 713ba7f27130, execution_count=85 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 713ba7f26f80, raw_cell="df_dev.sort('funny2', descending=True)[400:420]" store_history=True silent=False shell_futures=True cell_id=78897e30-867b-4473-9598-36e6b5503a78> result=shape: (20, 5)
┌─────────────────────────────────┬──────────┬──────────┬───────┬──────────┐
│ text                            ┆ label    ┆ funny    ┆ zeros ┆ funny2   │
│ ---                             ┆ ---      ┆ ---      ┆ ---   ┆ ---      │
│ str                             ┆ f64      ┆ f32      ┆ f64   ┆ f32      │
╞═════════════════════════════════╪══════════╪══════════╪═══════╪══════════╡
│ Feels like 1998, being thirtee… ┆ 0.0      ┆ 0.043372 ┆ 0.0   ┆ 0.245534 │
│ Supermarket Simulator, the lat… ┆ 0.0      ┆ 0.04

BrokenPipeError: [Errno 32] Broken pipe

In [70]:
df_dev.sort('funny', descending=True)['text'][47]

"My gambling addict friend said if I post a review and it gets 75 likes and 20 Awards that he will buy me a case of beer and some vodka, so I'm just gonna leave this here."

wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


In [31]:
label_ids.reshape(-1)

array([0.02588235, 0.        , 0.01176471, ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [19]:
label_ids

array([0.02588235, 0.        , 0.01176471, ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [17]:
# Roberta
trainer = Trainer(
    model=model,
    eval_dataset=tokenized_dataset['dev'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.evaluate()

{'eval_loss': 0.5369579792022705,
 'eval_model_preparation_time': 0.0054,
 'eval_accuracy': 0.836,
 'eval_runtime': 6.1489,
 'eval_samples_per_second': 81.315,
 'eval_steps_per_second': 10.246}

## Test untrained model

In [18]:
def example_predictions(dataset, model):
    for text in dataset:
        inputs = tokenizer.encode(text, return_tensors="pt")
        logits = model(inputs).logits
        #predictions = torch.argmax(logits)
    
        print(f'{logits.tolist()} = {text}')
example_predictions(tokenized_dataset['dev']['text'][:10], model)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [28]:
model.to('cuda')
def example_predictions(dataset, model):
    for text, label in zip(dataset['text'], dataset['label']):
        
        inputs = tokenizer.encode(text, return_tensors="pt").to('cuda')
        logits = model(inputs).logits
        # predictions = torch.max(logits, 1).indices
    
        print(f'{logits[0][0]} | {label} = {text}')
example_predictions(tokenized_dataset['dev'][:10], model)

0.01052621565759182 | 0.08921568627450979 = Good story but a little short. Took me 1h30 to finish the game, without skipping the cut scenes.
You play a detective called upon to investigate the mysterious disappearance of a young orphan.
The manager of the orphanage explains that since the boy disappeared, the place became hunted. Why, because of dark magic.
You go on a little journey in the orphanage, exploring the few rooms available, looking for clues on the boys whereabouts and who did this.
I like the Goal list they incorporated in the game. Each time you accomplish a goal, they grey it out and when you've finished all the goals for the section and you need to head out, they establish new goals. You don't have to accomplish them in order, you just need to do them all to advance in the story line.
The hidden objects games are few, but with a little twist to them, the red objects in your list are hidden, you have to find them or accomplish an act to reveal them. Pretty fun.
You also 

In [21]:
model.to('cuda')
def example_predictions(dataset, model):
    for text in dataset:
        inputs = tokenizer.encode(text, return_tensors="pt").to('cuda')
        logits = model(inputs).logits
        predictions = torch.max(logits, 1).indices
    
        print(f'{predictions.tolist()[0]} = {text}')
example_predictions(tokenized_dataset['dev']['text'][:10], model)

1 = I totally agree the best bit of the sims making dream homes, but really needs the ability to sell and make a profit so you can buy more land and build your housing empire. Love the designs and the possibilities for the game
1 = Banger game cheap when on sale comes with a lot of dlc! Start the exe from the main folder and it won't crash because "ran out of memory" as much and enjoy pasting all those codes for the keys. I think 7 hours out of my 12 is just pasting codes.
1 = Too much random ♥♥♥♥♥♥♥♥.
1 = Story - 8
Visuals - 9
Audio - 9
Gameplay - 8
Length - 6
Replayability - 7
Value base $ - 6
Value sale ($3.74) - 8
Overall - 8
1 = This game is awesome. Play it.
1 = Great story, a visual novel walking simulator which is fueled by the modern non-organic extraterrestrial travelling theories. Better than most movies i guess..
Oh! It ran well, with full details with my 7 years old 1070GTX card, exhibiting me a visually stunning red planet
1 = amazing game the artstyle is creepy the contr

In [29]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    dev: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})