In [1]:
from pathlib import Path

import polars as pl
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
# Second handle on the `evaluate` package: a later cell defines a function that is
# also called `evaluate`, which rebinds the plain name for the rest of the session.
import evaluate as hf_evaluate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


manual_seed = 23

# Seed the global RNG of each numeric library imported above (numpy, polars, torch)
# so that a Restart & Run All starts from the same random state every time.
np.random.seed(manual_seed)
pl.set_random_seed(manual_seed)
torch.manual_seed(manual_seed)

# Evaluating - transformers

In [2]:
# Path prefix of the dataset files; `load_data` appends '_test.parquet' to it.
filename = 'data/500k_50k'

# One mapping per target column: short model name -> checkpoint directory that
# is handed to `from_pretrained` during evaluation.
recommend_models = dict(
    distilbert='models/backup/steam-classification-distilbert500k/checkpoint-15625',
    roberta='models/backup/steam-classification-roberta500k/checkpoint-31250',
)
funny_models = dict(
    distilbert='models/steam-classification-distilbert500k-funny/checkpoint-14067',
    roberta='models/backup/steam-classification-roberta500k-funny/checkpoint-31250',
)
helpful_models = dict(
    distilbert='models/steam-classification-distilbert500k-helpful/checkpoint-15625',
    roberta='models/steam-classification-roberta500k-helpful/checkpoint-31250',
)

In [3]:
def load_data(column):
    '''
    Load the test split and expose one target column under the name `label`.

    `column` is the target to keep; it could be `recommended`, `found_funny`
    or `found_helpful`. The review text is renamed to `text`.

    Returns a tuple of the two-column polars frame (`text`, `label`) and a
    `DatasetDict` whose single split, `test`, wraps the same rows.
    '''
    renames = {'review_text': 'text', column: 'label'}
    test_frame = (
        pl.read_parquet(filename + '_test.parquet')
        .select(['review_text', column])
        .rename(renames)
    )
    test_split = Dataset(test_frame.to_arrow())

    return test_frame, DatasetDict({'test': test_split})

In [27]:
# classification
# Loaded through the `hf_evaluate` alias: the plain name `evaluate` is rebound to a
# function further down in this notebook, so `evaluate.load` would raise if this
# cell were executed again after that definition.
accuracy = hf_evaluate.load("accuracy")

def compute_classification_metrics(eval_pred):
    """
    Compute accuracy for a classification head.

    `eval_pred` is a `(predictions, labels)` pair. `predictions` holds one row of
    per-class scores per example; the predicted class is the argmax along axis 1.

    Returns a dict with the single key `"accuracy"`.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": accuracy.compute(predictions=predicted_classes, references=labels)['accuracy']}

In [5]:
# regression
def compute_regression_metrics(eval_pred):
    """
    Compute MSE, MAE and R^2 for a regression head.

    `eval_pred` is a `(predictions, labels)` pair. The labels are reshaped into a
    single column before being compared with the predictions.

    Returns a dict with the keys `"mse"`, `"mae"` and `"r2"`.
    """
    predicted, targets = eval_pred
    targets = targets.reshape(-1, 1)  # one row per example, one target column

    return {
        "mse": mean_squared_error(targets, predicted),
        "mae": mean_absolute_error(targets, predicted),
        "r2": r2_score(targets, predicted),
    }

In [29]:
def evaluate(model_path, df_test, dataset, compute_metrics):
    """
    Run the checkpoint stored at `model_path` over the `test` split of `dataset`.

    Parameters:
        model_path: directory from which both the tokenizer and the model are loaded.
        df_test: not used inside this function; kept so existing callers keep working.
        dataset: mapping-like dataset with a `test` split that has a `text` column.
        compute_metrics: callback handed to the `Trainer` to score the predictions.

    Returns whatever `Trainer.predict` returns; callers in this notebook unpack it
    as `(predictions, _, metrics)`.

    NOTE: this function's name is the same as the `evaluate` package imported at the
    top of the notebook, so defining it rebinds that name.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    def encode_batch(batch):
        # Truncate only. Padding is deliberately left to the collator: returning
        # pytorch tensors here would force padding every example up front.
        return tokenizer(batch["text"], truncation=True, return_tensors="np", max_length=128)

    encoded = dataset.map(encode_batch, batched=True)

    trainer = Trainer(
        model=AutoModelForSequenceClassification.from_pretrained(model_path),
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )
    return trainer.predict(test_dataset=encoded['test'])

In [7]:
def collect_regression_evaluations(models, column):
    """
    Evaluate every regression checkpoint in `models` on the target `column`.

    `models` maps a short model name to a checkpoint path. Each model's flattened
    predictions are added to the test frame as a column named after the model.
    A final `baseline` row scores a predictor that always outputs 0.

    Returns `(metrics_frame, details_frame)`: one row of mse/mae/r2 per model plus
    the baseline, and the test frame extended with the prediction columns.
    """
    metric_names = ('mse', 'mae', 'r2')
    results = {'model_name': [], 'mse': [], 'mae': [], 'r2': []}

    def record(name, scores, prefix=''):
        # Trainer output prefixes its metric keys (`test_mse`, ...); direct calls do not.
        results['model_name'].append(name)
        for metric in metric_names:
            results[metric].append(scores[prefix + metric])

    df, dataset = load_data(column)
    for model_name, model_path in models.items():
        predictions, _, metrics = evaluate(model_path, df, dataset, compute_regression_metrics)
        df = df.with_columns(pl.lit(predictions.reshape(-1)).alias(model_name))
        record(model_name, metrics, prefix='test_')

    # what if all predictions are 0
    zero_predictions = np.zeros(len(df['label']))
    record('baseline', compute_regression_metrics((zero_predictions, df['label'].to_numpy())))

    return pl.DataFrame(results), df

In [32]:
def collect_classification_evaluations(models, column):
    """
    Evaluate every classification checkpoint in `models` on the target `column`.

    `models` maps a short model name to a checkpoint path. Each model's predicted
    class (argmax over its per-class scores) is added to the test frame as a column
    named after the model. A final `baseline` row scores a constant predictor that
    always outputs class 1.

    Returns `(metrics_frame, details_frame)`: one accuracy row per model plus the
    baseline, and the test frame extended with the prediction columns.
    """
    results = {'model_name': [], 'accuracy': []}
    df, dataset = load_data(column)
    for model_name, model_path in models.items():
        predictions, _, metrics = evaluate(model_path, df, dataset, compute_classification_metrics)
        df = df.with_columns(
            pl.lit(np.argmax(predictions, axis=1)).alias(model_name)
        )
        results['model_name'].append(model_name)
        results['accuracy'].append(metrics['test_accuracy'])

    # Baseline: what if every prediction is class 1? Scores of [0, 1] per row make
    # the argmax pick index 1. The row count comes from the frame itself so the
    # baseline stays valid for a test set of any size.
    constant_scores = np.tile([0.0, 1.0], (df.height, 1))
    metrics = compute_classification_metrics((constant_scores, df['label'].to_numpy()))
    results['model_name'].append('baseline')
    results['accuracy'].append(metrics['accuracy'])

    return pl.DataFrame(results), df

In [33]:
# Score the `recommended` checkpoints: first result is the per-model accuracy table,
# second is the test frame with one predicted-class column per model.
df_recommended_results, df_recommended_details = collect_classification_evaluations(recommend_models, 'recommended')

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [10]:
# Score the `found_funny` checkpoints: first result is the per-model mse/mae/r2 table,
# second is the test frame with one prediction column per model.
df_funny_results, df_funny_details = collect_regression_evaluations(funny_models, 'found_funny')

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [11]:
# Score the `found_helpful` checkpoints: first result is the per-model mse/mae/r2 table,
# second is the test frame with one prediction column per model.
df_helpful_results, df_helpful_details = collect_regression_evaluations(helpful_models, 'found_helpful')

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [35]:
# Create the output directory up front so the writes below do not depend on it
# already existing (e.g. on a fresh checkout).
Path('evaluations').mkdir(parents=True, exist_ok=True)

# Metrics tables are written as-is; for the two regression tasks the label and
# prediction columns of the per-row details are rounded to 3 decimals first.
df_recommended_results.write_csv('evaluations/recommend_metrics.csv')
df_recommended_details.write_csv('evaluations/recommend_details.csv')
df_funny_results.write_csv('evaluations/funny_metrics.csv')
df_funny_details.with_columns(pl.col(["label", "distilbert", "roberta"]).round(3)).write_csv('evaluations/funny_details.csv')
df_helpful_results.write_csv('evaluations/helpful_metrics.csv')
df_helpful_details.with_columns(pl.col(["label", "distilbert", "roberta"]).round(3)).write_csv('evaluations/helpful_details.csv')

In [36]:
# Accuracy per model on `recommended`, with the constant-prediction `baseline` as the last row.
df_recommended_results

model_name,accuracy
str,f64
"""distilbert""",0.95132
"""roberta""",0.9598
"""baseline""",0.87588


In [14]:
# mse / mae / r2 per model on `found_funny`; the `baseline` row is an all-zeros predictor.
df_funny_results

model_name,mse,mae,r2
str,f64,f64,f64
"""distilbert""",0.002015,0.010775,0.022518
"""roberta""",0.002008,0.008409,0.025824
"""baseline""",0.002098,0.006068,-0.017862


In [15]:
# mse / mae / r2 per model on `found_helpful`; the `baseline` row is an all-zeros predictor.
df_helpful_results

model_name,mse,mae,r2
str,f64,f64,f64
"""distilbert""",0.002339,0.013148,0.04566
"""roberta""",0.00232,0.013807,0.053243
"""baseline""",0.002532,0.009021,-0.033202


In [48]:
# Export a skimmable subset: reviews whose text is longer than 20 and shorter than
# 50 characters, ordered from the highest `label` down.
(
    df_funny_details
    .filter(
        (pl.col('text').str.len_chars() > 20)
        & (pl.col('text').str.len_chars() < 50)
    )
    .sort('label', descending=True)
    .write_csv('evaluations/funny_details_readable.csv')
)

In [49]:
# Export a skimmable subset: reviews whose text is longer than 20 and shorter than
# 50 characters, ordered from the highest `label` down.
(
    df_helpful_details
    .filter(
        (pl.col('text').str.len_chars() > 20)
        & (pl.col('text').str.len_chars() < 50)
    )
    .sort('label', descending=True)
    .write_csv('evaluations/helpful_details_readable.csv')
)