In [34]:
import re
import random
import numpy as np
from scipy.special import softmax

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
!pip install datasets
from datasets import Dataset
from sklearn.metrics import mean_squared_error
import os
os.environ["WANDB_DISABLED"] = "true"

# Load Data with pandas
usecols = ['id', 'set_num', 'name', 'types', 'evolvesFrom', 'evolvesTo', 'cleaned_attacks', 'cleaned_abilities', 'cleaned_rules']
train_df = pd.read_csv('/content/drive/MyDrive/266/project/pokemon-tcg-data-master 1999-2023_D.csv')
test_df = pd.read_csv('/content/drive/MyDrive/266/project/pokemon-tcg-data-master 1999-2023_E.csv')

train_df = train_df[usecols + ['power_level']]
test_df = test_df[usecols + ['power_level']]

# Combine columns into a single text input
def combine_features(row):
    return ' '.join(row.values.astype(str))

train_df['text'] = train_df[usecols].apply(combine_features, axis=1)
test_df['text'] = test_df[usecols].apply(combine_features, axis=1)

# Convert to `datasets.Dataset`
train_dataset = Dataset.from_pandas(train_df[['text', 'power_level']])
test_dataset = Dataset.from_pandas(test_df[['text', 'power_level']])

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)  # Regression

# Ensure labels are in the correct format for regression
def add_labels_to_dataset(dataset):
    dataset = dataset.map(lambda x: {'labels': x['power_level']}, batched=True)
    return dataset

train_dataset = add_labels_to_dataset(train_dataset)
test_dataset = add_labels_to_dataset(test_dataset)

# Custom Trainer for Regression
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Forward pass
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.MSELoss()
        loss = loss_fct(logits.squeeze(), labels)  # Regression loss (MSE)
        return (loss, outputs) if return_outputs else loss

# Training Arguments
training_args = TrainingArguments(
    output_dir='./outputResults',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=None,  # Ensure W&B is disabled
)

# Define Metrics (RMSE)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.squeeze()
    rmse = mean_squared_error(labels, predictions)  # RMSE
    return {"rmse": rmse}

# Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train and Evaluate
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)




Map:   0%|          | 0/1261 [00:00<?, ? examples/s]

Map:   0%|          | 0/1202 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1261 [00:00<?, ? examples/s]

Map:   0%|          | 0/1202 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss,Rmse
1,0.0035,0.000811,0.000811
2,0.0007,0.00661,0.00661
3,0.0004,0.000372,0.000372


{'eval_loss': 0.0003716045175679028, 'eval_rmse': 0.00037160454667173326, 'eval_runtime': 34.4667, 'eval_samples_per_second': 34.874, 'eval_steps_per_second': 4.381, 'epoch': 3.0}


In [35]:
predictions = trainer.predict(test_dataset)
predicted_values = predictions.predictions.squeeze()
actual_values = predictions.label_ids

# Print some predicted vs actual values
for i in range(10):  # Print the first 10 predictions
    print(f"Predicted: {predicted_values[i]}, Actual: {actual_values[i]}")

# Print the evaluation results (RMSE)
print(eval_results)

Predicted: 0.0013072174042463303, Actual: 0.0002871912729460746
Predicted: 0.003669019788503647, Actual: 0.01024511270225048
Predicted: 0.004107741639018059, Actual: 0.0005743825458921492
Predicted: 0.005449187941849232, Actual: 0.0
Predicted: 0.00046809480409137905, Actual: 0.0
Predicted: 0.004255498759448528, Actual: 0.0
Predicted: 0.006487157195806503, Actual: 0.0
Predicted: 0.0018044719472527504, Actual: 0.0
Predicted: 0.006574000231921673, Actual: 0.00012860081915277988
Predicted: 0.006821609102189541, Actual: 0.0
{'eval_loss': 0.0003716045175679028, 'eval_rmse': 0.00037160454667173326, 'eval_runtime': 34.4667, 'eval_samples_per_second': 34.874, 'eval_steps_per_second': 4.381, 'epoch': 3.0}
