In [None]:
import re
import random
import numpy as np
from scipy.special import softmax
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import MinMaxScaler

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
!pip install datasets
!pip install optuna
from datasets import Dataset
from sklearn.metrics import mean_squared_error
from transformers import EarlyStoppingCallback
import os
from scipy.special import expit
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Card attribute columns that the training functions below stringify and join
# into a single text input per card.
usecols = ['id', 'regulationMark', 'hp', 'name', 'types', 'subtypes', 'evolvesFrom', 'evolvesTo', 'weaknesses', 'convertedRetreatCost', 'resistances', 'cleaned_attacks', 'cleaned_abilities', 'cleaned_rules', 'tournamentYear']

# Per-year power-level tables, one CSV per year (2021-2023) for Pokemon cards
# and for Trainer cards, read from the Google Drive mounted in the first cell.
pokemon_2021 = pd.read_csv('/content/drive/MyDrive/266/project/standard_2021_pokemon_power_level.csv')
pokemon_2022 = pd.read_csv('/content/drive/MyDrive/266/project/standard_2022_pokemon_power_level.csv')
pokemon_2023 = pd.read_csv('/content/drive/MyDrive/266/project/standard_2023_pokemon_power_level.csv')
trainer_2021 = pd.read_csv('/content/drive/MyDrive/266/project/standard_2021_trainer_power_level.csv')
trainer_2022 = pd.read_csv('/content/drive/MyDrive/266/project/standard_2022_trainer_power_level.csv')
trainer_2023 = pd.read_csv('/content/drive/MyDrive/266/project/standard_2023_trainer_power_level.csv')


In [None]:
def print_top_10_predictions(predicted_values, actual_values, test_df):
    """Print the ten highest predicted and the ten highest actual values.

    Args:
        predicted_values: Array-like of model predictions, one per row of ``test_df``.
        actual_values: Array-like of ground-truth values in the same row order.
        test_df: DataFrame with a ``name`` column, aligned row-for-row with the
            two value sequences.

    Returns:
        None. Two ranked lists (up to ten entries each) are written to stdout.
    """
    # Coerce to flat NumPy arrays: the indices produced by argsort are
    # positional, so indexing a pandas Series that carries a non-default index
    # with them would look up labels instead of positions.
    predicted_values = np.asarray(predicted_values).reshape(-1)
    actual_values = np.asarray(actual_values).reshape(-1)
    names = test_df['name'].values

    # argsort is ascending; take the last ten and reverse for highest-first.
    top_10_predicted_indices = np.argsort(predicted_values)[-10:][::-1]
    top_10_actual_indices = np.argsort(actual_values)[-10:][::-1]

    print("Top 10 Highest Predicted Values:")
    print("-" * 50)
    for i in top_10_predicted_indices:
        print(f"Name: {names[i]}, Predicted: {predicted_values[i]:.4f}, Actual: {actual_values[i]:.4f}")

    print("\nTop 10 Highest Actual Values:")
    print("-" * 50)
    for i in top_10_actual_indices:
        print(f"Name: {names[i]}, Actual: {actual_values[i]:.4f}, Predicted: {predicted_values[i]:.4f}")


In [None]:
from transformers import (
        BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
    )

def train_and_evaluate_bert_model(train_df, test_df, usecols, model_save_path, num_epochs=20, batch_size=8):
    """Fine-tune ``bert-base-uncased`` to regress ``power_level`` from card text.

    The columns in ``usecols`` are cast to strings and joined with spaces into
    one ``text`` field, tokenized to a fixed 512 tokens, and fed to a
    one-logit sequence-classification head. Training minimises a weighted
    Huber-style loss on ``sigmoid(logit)``. The sigmoid bounds predictions to
    (0, 1), so targets are presumably scaled to that range (TODO confirm).

    ``test_df`` doubles as the evaluation set: it drives the per-epoch MAE,
    early stopping and best-checkpoint selection, so the final MAE is not an
    independent hold-out estimate.

    Args:
        train_df: DataFrame with every column in ``usecols`` plus ``power_level``.
        test_df: DataFrame with the same columns, used for evaluation and reporting.
        usecols: List of column names concatenated into the model input text.
        model_save_path: Directory that receives the fine-tuned model and tokenizer.
        num_epochs: Upper bound on training epochs (early stopping may end sooner).
        batch_size: Per-device batch size for training and evaluation.

    Returns:
        None. Prints the evaluation metrics and the top-10 report, shows a
        scatter plot, and saves the model and tokenizer to ``model_save_path``.
    """
    # .copy() detaches the column subset from the caller's frame, so adding
    # 'text' below does not write into a slice (SettingWithCopyWarning).
    train_df = train_df[usecols + ['power_level']].copy()
    test_df = test_df[usecols + ['power_level']].copy()

    def combine_features(row):
        # One space-separated string per card, built from all feature columns.
        return ' '.join(row.values.astype(str))

    train_df['text'] = train_df[usecols].apply(combine_features, axis=1)
    test_df['text'] = test_df[usecols].apply(combine_features, axis=1)

    train_dataset = Dataset.from_pandas(train_df[['text', 'power_level']])
    test_dataset = Dataset.from_pandas(test_df[['text', 'power_level']])

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def preprocess_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    train_dataset = train_dataset.map(preprocess_function, batched=True)
    test_dataset = test_dataset.map(preprocess_function, batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)  # Regression

    def add_labels_to_dataset(dataset):
        # Expose the target under the 'labels' key that CustomTrainer reads.
        return dataset.map(lambda x: {'labels': x['power_level']}, batched=True)

    train_dataset = add_labels_to_dataset(train_dataset)
    test_dataset = add_labels_to_dataset(test_dataset)

    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            """Weighted Huber-style loss on sigmoid(logit) versus the label."""
            labels = inputs.get("labels")
            outputs = model(**inputs)
            # reshape(-1) keeps the batch axis even for a one-row batch, where
            # a bare squeeze() would collapse the logits to a 0-d tensor.
            logits = outputs.get("logits").reshape(-1)

            predictions = torch.sigmoid(logits)

            abs_error = torch.abs(predictions - labels)
            # Errors of 0.1 or more count double.
            weights = torch.where(abs_error < 0.1, 1.0, 2.0).to(predictions.device)

            delta = 1.0
            loss = torch.where(abs_error <= delta, 0.5 * (abs_error ** 2), delta * (abs_error - 0.5 * delta))
            loss = (loss * weights).mean()

            return (loss, outputs) if return_outputs else loss

    training_args = TrainingArguments(
        output_dir='./outputResults',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=3e-5,  # Lower learning rate for better convergence
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # NOTE(review): in transformers 4.x, None selects the default set of
        # reporting integrations; the string "none" is what disables them.
        report_to=None,
        metric_for_best_model="mae",
        greater_is_better=False,
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = torch.sigmoid(torch.tensor(logits.reshape(-1))).numpy()
        return {"mae": mean_absolute_error(labels, predictions)}

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(eval_results)

    predictions = trainer.predict(test_dataset)
    predicted_values = torch.sigmoid(torch.tensor(predictions.predictions.reshape(-1))).numpy()
    predicted_values = np.clip(predicted_values, 0, 1)
    actual_values = predictions.label_ids

    print_top_10_predictions(predicted_values, actual_values, test_df)

    plt.figure(figsize=(10, 6))
    plt.scatter(actual_values, predicted_values, alpha=0.5, c='blue', label='Predictions')
    plt.xlabel('Actual Power Level')
    plt.ylabel('Predicted Power Level')
    plt.title('Predicted vs. Actual Power Levels')
    plt.legend()
    plt.show()

    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)


In [None]:
# Pokemon BERT model 2: fine-tune on the 2021 + 2022 Pokemon tables and
# evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/pokemonModel2'

# Stack the two training years into one frame with a fresh 0..n-1 index.
pokemon_concat = pd.concat([pokemon_2021, pokemon_2022], ignore_index=True)

train_and_evaluate_bert_model(pokemon_concat, pokemon_2023, usecols, model_save_path)

In [None]:
# Trainer-card BERT model 2: fine-tune on the 2021 + 2022 Trainer tables and
# evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/trainerModel2'

# Stack the two training years into one frame with a fresh 0..n-1 index.
trainer_concat = pd.concat([trainer_2021, trainer_2022], ignore_index=True)

train_and_evaluate_bert_model(trainer_concat, trainer_2023, usecols, model_save_path)

In [None]:
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
)

def train_and_evaluate_roberta_model(train_df, test_df, usecols, model_save_path, num_epochs=20, batch_size=8):
    """Fine-tune ``roberta-base`` to regress ``power_level`` from card text.

    The columns in ``usecols`` are cast to strings and joined with spaces into
    one ``text`` field, tokenized to a fixed 512 tokens, and fed to a
    one-logit sequence-classification head. Training minimises a weighted
    Huber-style loss on ``sigmoid(logit)``. The sigmoid bounds predictions to
    (0, 1), so targets are presumably scaled to that range (TODO confirm).

    ``test_df`` doubles as the evaluation set: it drives the per-epoch MAE,
    early stopping and best-checkpoint selection, so the final MAE is not an
    independent hold-out estimate.

    Args:
        train_df: DataFrame with every column in ``usecols`` plus ``power_level``.
        test_df: DataFrame with the same columns, used for evaluation and reporting.
        usecols: List of column names concatenated into the model input text.
        model_save_path: Directory that receives the fine-tuned model and tokenizer.
        num_epochs: Upper bound on training epochs (early stopping may end sooner).
        batch_size: Per-device batch size for training and evaluation.

    Returns:
        None. Prints the evaluation metrics and the top-10 report, shows a
        scatter plot, and saves the model and tokenizer to ``model_save_path``.
    """
    # .copy() detaches the column subset from the caller's frame, so adding
    # 'text' below does not write into a slice (SettingWithCopyWarning).
    train_df = train_df[usecols + ['power_level']].copy()
    test_df = test_df[usecols + ['power_level']].copy()

    def combine_features(row):
        # One space-separated string per card, built from all feature columns.
        return ' '.join(row.values.astype(str))

    train_df['text'] = train_df[usecols].apply(combine_features, axis=1)
    test_df['text'] = test_df[usecols].apply(combine_features, axis=1)

    train_dataset = Dataset.from_pandas(train_df[['text', 'power_level']])
    test_dataset = Dataset.from_pandas(test_df[['text', 'power_level']])

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def preprocess_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    train_dataset = train_dataset.map(preprocess_function, batched=True)
    test_dataset = test_dataset.map(preprocess_function, batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])

    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

    def add_labels_to_dataset(dataset):
        # Expose the target under the 'labels' key that CustomTrainer reads.
        return dataset.map(lambda x: {'labels': x['power_level']}, batched=True)

    train_dataset = add_labels_to_dataset(train_dataset)
    test_dataset = add_labels_to_dataset(test_dataset)

    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            """Weighted Huber-style loss on sigmoid(logit) versus the label."""
            labels = inputs.get("labels")
            outputs = model(**inputs)
            # reshape(-1) keeps the batch axis even for a one-row batch, where
            # a bare squeeze() would collapse the logits to a 0-d tensor.
            logits = outputs.get("logits").reshape(-1)
            predictions = torch.sigmoid(logits)
            abs_error = torch.abs(predictions - labels)
            # Errors of 0.1 or more count double.
            weights = torch.where(abs_error < 0.1, 1.0, 2.0).to(predictions.device)
            delta = 1.0
            loss = torch.where(abs_error <= delta, 0.5 * (abs_error ** 2), delta * (abs_error - 0.5 * delta))
            loss = (loss * weights).mean()
            return (loss, outputs) if return_outputs else loss

    training_args = TrainingArguments(
        output_dir='./outputResults',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=2e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # NOTE(review): in transformers 4.x, None selects the default set of
        # reporting integrations; the string "none" is what disables them.
        report_to=None,
        metric_for_best_model="mae",
        greater_is_better=False,
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = torch.sigmoid(torch.tensor(logits.reshape(-1))).numpy()
        return {"mae": mean_absolute_error(labels, predictions)}

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(eval_results)

    predictions = trainer.predict(test_dataset)
    predicted_values = torch.sigmoid(torch.tensor(predictions.predictions.reshape(-1))).numpy()
    predicted_values = np.clip(predicted_values, 0, 1)
    actual_values = predictions.label_ids

    print_top_10_predictions(predicted_values, actual_values, test_df)

    plt.figure(figsize=(10, 6))
    plt.scatter(actual_values, predicted_values, alpha=0.5, c='blue', label='Predictions')
    plt.xlabel('Actual Power Level')
    plt.ylabel('Predicted Power Level')
    plt.title('Predicted vs. Actual Power Levels')
    plt.legend()
    plt.show()

    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)


In [None]:
# Pokemon RoBERTa model: fine-tune on the 2021 + 2022 Pokemon tables and
# evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/pokemonRoberta'

# Stack the two training years into one frame with a fresh 0..n-1 index.
pokemon_concat = pd.concat([pokemon_2021, pokemon_2022], ignore_index=True)

train_and_evaluate_roberta_model(pokemon_concat, pokemon_2023, usecols, model_save_path)

In [None]:
# Trainer-card RoBERTa model: fine-tune on the 2021 + 2022 Trainer tables and
# evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/trainerRoberta'

# Stack the two training years into one frame with a fresh 0..n-1 index.
trainer_concat = pd.concat([trainer_2021, trainer_2022], ignore_index=True)

train_and_evaluate_roberta_model(trainer_concat, trainer_2023, usecols, model_save_path)

In [None]:
# Trainer-card RoBERTa model 2: same data and training function as the
# 'trainerRoberta' run; only the output directory differs.
model_save_path = '/content/drive/MyDrive/266/project/trainerRoberta2'

# Stack the two training years into one frame with a fresh 0..n-1 index.
trainer_concat = pd.concat([trainer_2021, trainer_2022], ignore_index=True)

train_and_evaluate_roberta_model(trainer_concat, trainer_2023, usecols, model_save_path)

In [None]:
from torch import nn
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
)

class RoBERTaWithDenseLayers(RobertaForSequenceClassification):
    """RoBERTa encoder followed by a hidden_size -> 512 -> 128 -> 1 regression MLP.

    ``forward`` reads the encoder's first-token hidden state and passes it
    through the three linear layers defined here (ReLU between them). Any
    head set up by the parent constructor is not called by this ``forward``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.dense1 = nn.Linear(config.hidden_size, 512)
        self.dense2 = nn.Linear(512, 128)
        self.output_layer = nn.Linear(128, 1)
        # Initialise the new weights with the std configured for the checkpoint;
        # biases keep nn.Linear's default initialisation.
        for layer in (self.dense1, self.dense2, self.output_layer):
            layer.weight.data.normal_(mean=0.0, std=config.initializer_range)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and the dense head.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the encoder.
            labels: Optional regression targets; when given, an MSE loss on the
                raw (un-squashed) logits is returned as well.
            **kwargs: Forwarded unchanged to the encoder.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise ``logits``
            alone. ``logits`` keeps the trailing size-1 output dimension.
        """
        roberta_output = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)
        hidden_state = roberta_output[0]
        pooled_output = hidden_state[:, 0]  # hidden state of the first token
        x = torch.relu(self.dense1(pooled_output))
        x = torch.relu(self.dense2(x))
        logits = self.output_layer(x)
        loss = None
        if labels is not None:
            # Flatten both sides so a batch of one is compared as (1,) vs (1,);
            # squeeze() would turn the logits into a 0-d tensor and make
            # MSELoss broadcast mismatched shapes.
            loss_fct = nn.MSELoss()
            loss = loss_fct(logits.view(-1), labels.view(-1).to(logits.dtype))
        return (loss, logits) if loss is not None else logits

def train_and_evaluate_roberta_dense_model(train_df, test_df, usecols, model_save_path, num_epochs=20, batch_size=8):
    """Fine-tune ``RoBERTaWithDenseLayers`` to regress ``power_level`` from card text.

    The columns in ``usecols`` are cast to strings and joined with spaces into
    one ``text`` field, tokenized to a fixed 512 tokens, and fed to the model.
    Training minimises a weighted Huber-style loss on ``sigmoid(logit)``. The
    sigmoid bounds predictions to (0, 1), so targets are presumably scaled to
    that range (TODO confirm).

    Unlike the other training functions in this notebook, no
    ``metric_for_best_model`` and no early-stopping callback are configured,
    so the best checkpoint is chosen by the Trainer's default metric and all
    ``num_epochs`` epochs are run. ``test_df`` is also the evaluation set.

    Args:
        train_df: DataFrame with every column in ``usecols`` plus ``power_level``.
        test_df: DataFrame with the same columns, used for evaluation and reporting.
        usecols: List of column names concatenated into the model input text.
        model_save_path: Directory that receives the fine-tuned model and tokenizer.
        num_epochs: Number of training epochs.
        batch_size: Per-device batch size for training and evaluation.

    Returns:
        None. Prints the evaluation metrics and the top-10 report, shows a
        scatter plot, and saves the model and tokenizer to ``model_save_path``.
    """
    # .copy() detaches the column subset from the caller's frame, so adding
    # 'text' below does not write into a slice (SettingWithCopyWarning).
    train_df = train_df[usecols + ['power_level']].copy()
    test_df = test_df[usecols + ['power_level']].copy()

    def combine_features(row):
        # One space-separated string per card, built from all feature columns.
        return ' '.join(row.values.astype(str))

    train_df['text'] = train_df[usecols].apply(combine_features, axis=1)
    test_df['text'] = test_df[usecols].apply(combine_features, axis=1)

    train_dataset = Dataset.from_pandas(train_df[['text', 'power_level']])
    test_dataset = Dataset.from_pandas(test_df[['text', 'power_level']])

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def preprocess_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    train_dataset = train_dataset.map(preprocess_function, batched=True)
    test_dataset = test_dataset.map(preprocess_function, batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])

    model = RoBERTaWithDenseLayers.from_pretrained('roberta-base', num_labels=1)

    def add_labels_to_dataset(dataset):
        # Expose the target under the 'labels' key that CustomTrainer reads.
        return dataset.map(lambda x: {'labels': x['power_level']}, batched=True)

    train_dataset = add_labels_to_dataset(train_dataset)
    test_dataset = add_labels_to_dataset(test_dataset)

    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            """Weighted Huber-style loss on sigmoid(logit) versus the label."""
            labels = inputs.get("labels")
            # The model returns (loss, logits) when labels are supplied. The
            # attention maps are never used, so they are not requested; asking
            # for them only costs memory on every step.
            outputs = model(**inputs)
            # reshape(-1) keeps the batch axis even for a one-row batch.
            logits = outputs[1].reshape(-1)
            predictions = torch.sigmoid(logits)
            abs_error = torch.abs(predictions - labels)
            # Errors of 0.1 or more count double.
            weights = torch.where(abs_error < 0.1, 1.0, 2.0).to(predictions.device)
            delta = 1.0
            loss = torch.where(abs_error <= delta, 0.5 * (abs_error ** 2), delta * (abs_error - 0.5 * delta))
            loss = (loss * weights).mean()
            return (loss, outputs) if return_outputs else loss

    training_args = TrainingArguments(
        output_dir='./outputResults',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=3e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # NOTE(review): in transformers 4.x, None selects the default set of
        # reporting integrations; the string "none" is what disables them.
        report_to=None,
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = torch.sigmoid(torch.tensor(logits.reshape(-1))).numpy()
        return {"mae": mean_absolute_error(labels, predictions)}

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(eval_results)

    predictions = trainer.predict(test_dataset)
    predicted_values = torch.sigmoid(torch.tensor(predictions.predictions.reshape(-1))).numpy()
    predicted_values = np.clip(predicted_values, 0, 1)
    actual_values = predictions.label_ids

    print_top_10_predictions(predicted_values, actual_values, test_df)

    plt.figure(figsize=(10, 6))
    plt.scatter(actual_values, predicted_values, alpha=0.5, c='blue', label='Predictions')
    plt.xlabel('Actual Power Level')
    plt.ylabel('Predicted Power Level')
    plt.title('Predicted vs. Actual Power Levels')
    plt.legend()
    plt.show()

    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)


In [None]:
# Pokemon RoBERTa + dense-head model: fine-tune on the 2021 + 2022 Pokemon
# tables and evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/pokemonRobertaDense'

# Stack the two training years into one frame with a fresh 0..n-1 index.
pokemon_concat = pd.concat([pokemon_2021, pokemon_2022], ignore_index=True)

train_and_evaluate_roberta_dense_model(pokemon_concat, pokemon_2023, usecols, model_save_path)

In [None]:
# Trainer-card RoBERTa + dense-head model: fine-tune on the 2021 + 2022
# Trainer tables and evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/trainerRobertaDense'

# Stack the two training years into one frame with a fresh 0..n-1 index.
trainer_concat = pd.concat([trainer_2021, trainer_2022], ignore_index=True)

train_and_evaluate_roberta_dense_model(trainer_concat, trainer_2023, usecols, model_save_path)

In [None]:
!pip install scikit-optimize

In [None]:
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification, Trainer,
    TrainingArguments, EarlyStoppingCallback
)
from datasets import Dataset
from sklearn.metrics import mean_absolute_error
import numpy as np
import torch
import os
import matplotlib.pyplot as plt

def print_top_10_predictions(preds, labels, df):
    """Print the (up to) ten rows whose prediction is closest to the label.

    Despite the name, rows are ranked by smallest absolute error, not by
    highest value. This definition replaces any earlier function of the same
    name in the kernel.

    Args:
        preds: Array-like of predictions, one per row of ``df``.
        labels: Array-like of ground-truth values in the same row order.
        df: DataFrame with a ``text`` column, aligned row-for-row with the values.

    Returns:
        None. The selected texts, labels and predictions are written to stdout.
    """
    # Coerce to flat NumPy arrays: the argsort indices are positional, so
    # using them on a pandas Series with a non-default index would look up
    # labels instead of positions.
    preds = np.asarray(preds).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    top_indices = np.argsort(np.abs(preds - labels))[:10]
    print(df.iloc[top_indices][['text']])
    print("Actual:", labels[top_indices])
    print("Predicted:", preds[top_indices])

def objective(params, train_df, test_df, usecols, model_save_path,
              num_epochs=20, batch_size=8, return_model=False):
    """Train one ``roberta-base`` regressor for a hyperparameter triple.

    Args:
        params: Sequence ``(learning_rate, warmup_steps, weight_decay)``.
            ``warmup_steps`` is cast to ``int`` before use.
        train_df: DataFrame with every column in ``usecols`` plus ``power_level``.
        test_df: DataFrame with the same columns; used as the evaluation set
            for early stopping, checkpoint selection and the returned MAE.
        usecols: List of column names concatenated into the model input text.
        model_save_path: Accepted for signature parity with the caller; not
            used inside this function.
        num_epochs: Upper bound on training epochs (early stopping may end sooner).
        batch_size: Per-device batch size for training and evaluation.
        return_model: When true, return the trained model instead of the MAE.

    Returns:
        ``trainer.model`` if ``return_model`` is true, otherwise the
        ``eval_mae`` value from ``trainer.evaluate()``.
    """
    learning_rate, warmup_steps, weight_decay = params
    warmup_steps = int(warmup_steps)

    # .copy() detaches the column subset from the caller's frame, so adding
    # 'text' below does not write into a slice (SettingWithCopyWarning).
    train_df = train_df[usecols + ['power_level']].copy()
    test_df = test_df[usecols + ['power_level']].copy()

    def combine_features(row):
        # One space-separated string per card, built from all feature columns.
        return ' '.join(row.values.astype(str))

    train_df['text'] = train_df[usecols].apply(combine_features, axis=1)
    test_df['text'] = test_df[usecols].apply(combine_features, axis=1)

    train_dataset = Dataset.from_pandas(train_df[['text', 'power_level']])
    test_dataset = Dataset.from_pandas(test_df[['text', 'power_level']])

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def preprocess_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    train_dataset = train_dataset.map(preprocess_function, batched=True)
    test_dataset = test_dataset.map(preprocess_function, batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'power_level'])

    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

    def add_labels(dataset):
        # Expose the target under the 'labels' key that CustomTrainer reads.
        return dataset.map(lambda x: {'labels': x['power_level']}, batched=True)

    train_dataset = add_labels(train_dataset)
    test_dataset = add_labels(test_dataset)

    class CustomTrainer(Trainer):
        # **kwargs absorbs extra keyword arguments the Trainer may pass (for
        # example num_items_in_batch), matching the other CustomTrainer
        # definitions in this notebook; without it the call raises TypeError.
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            """Weighted Huber-style loss on sigmoid(logit) versus the label."""
            labels = inputs.get("labels")
            outputs = model(**inputs)
            # reshape(-1) keeps the batch axis even for a one-row batch.
            logits = outputs.get("logits").reshape(-1)
            predictions = torch.sigmoid(logits)
            abs_error = torch.abs(predictions - labels)
            # Errors of 0.1 or more count double.
            weights = torch.where(abs_error < 0.1, 1.0, 2.0).to(predictions.device)
            delta = 1.0
            loss = torch.where(abs_error <= delta,
                               0.5 * abs_error ** 2,
                               delta * (abs_error - 0.5 * delta))
            loss = (loss * weights).mean()
            return (loss, outputs) if return_outputs else loss

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = torch.sigmoid(torch.tensor(logits.reshape(-1))).numpy()
        return {"mae": mean_absolute_error(labels, preds)}

    training_args = TrainingArguments(
        output_dir='./outputResults',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        # NOTE(review): in transformers 4.x, None selects the default set of
        # reporting integrations; the string "none" is what disables them.
        report_to=None,
        metric_for_best_model="mae",
        greater_is_better=False,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()

    eval_results = trainer.evaluate()
    print(eval_results)

    predictions = trainer.predict(test_dataset)
    predicted_values = torch.sigmoid(torch.tensor(predictions.predictions.reshape(-1))).numpy()
    predicted_values = np.clip(predicted_values, 0, 1)
    actual_values = predictions.label_ids

    print_top_10_predictions(predicted_values, actual_values, test_df)

    plt.figure(figsize=(10, 6))
    plt.scatter(actual_values, predicted_values, alpha=0.5, c='blue', label='Predictions')
    plt.xlabel('Actual Power Level')
    plt.ylabel('Predicted Power Level')
    plt.title('Predicted vs. Actual Power Levels')
    plt.legend()
    plt.show()

    if return_model:
        return trainer.model
    return eval_results['eval_mae']

def train_and_evaluate_roberta_tuned_model(train_df, test_df, usecols, model_save_path):
    """Search RoBERTa hyperparameters with ``gp_minimize``, then retrain and save.

    Ten calls to ``objective`` are made over learning rate, warmup steps and
    weight decay; the best triple is then passed to ``objective`` once more
    with ``return_model=True`` and the resulting model is saved together with
    a ``roberta-base`` tokenizer.

    Args:
        train_df: Training DataFrame forwarded to ``objective``.
        test_df: Evaluation DataFrame forwarded to ``objective``.
        usecols: Feature column names forwarded to ``objective``.
        model_save_path: Directory that receives the final model and tokenizer.

    Returns:
        The result object returned by ``gp_minimize``.
    """
    from skopt import gp_minimize

    # One range per entry of the params triple, in the order objective()
    # unpacks it: learning_rate, warmup_steps, weight_decay.
    learning_rate_range = (1e-6, 1e-4, 'log-uniform')
    warmup_steps_range = (500, 2000)
    weight_decay_range = (0.0, 0.1)

    def evaluate_candidate(params):
        return objective(params, train_df, test_df, usecols, model_save_path)

    result = gp_minimize(
        func=evaluate_candidate,
        dimensions=[learning_rate_range, warmup_steps_range, weight_decay_range],
        n_calls=10,
        random_state=42,
    )

    best_params = result.x
    print(f"Best hyperparameters: {best_params}")
    print(f"Best validation MAE: {result.fun}")

    print("Training final model with best parameters...")
    final_model = objective(
        best_params, train_df, test_df, usecols, model_save_path, return_model=True
    )

    os.makedirs(model_save_path, exist_ok=True)
    final_model.save_pretrained(model_save_path)
    RobertaTokenizer.from_pretrained('roberta-base').save_pretrained(model_save_path)

    return result


In [None]:
# Pokemon RoBERTa with hyperparameter search: tune and train on the
# 2021 + 2022 Pokemon tables, evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/pokemonRobertaTuned'

# Stack the two training years into one frame with a fresh 0..n-1 index.
pokemon_concat = pd.concat([pokemon_2021, pokemon_2022], ignore_index=True)

train_and_evaluate_roberta_tuned_model(pokemon_concat, pokemon_2023, usecols, model_save_path)

In [None]:
# Trainer-card RoBERTa with hyperparameter search: tune and train on the
# 2021 + 2022 Trainer tables, evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/trainerRobertaTuned'

# Stack the two training years into one frame with a fresh 0..n-1 index.
trainer_concat = pd.concat([trainer_2021, trainer_2022], ignore_index=True)

train_and_evaluate_roberta_tuned_model(trainer_concat, trainer_2023, usecols, model_save_path)

In [None]:
# Trainer-card RoBERTa with hyperparameter search, "saved" run.
# NOTE(review): this cell issues the same call with the same save path as the
# 'trainerRobertaTuned' cell, so running both repeats the full search and
# writes to the same directory.
model_save_path = '/content/drive/MyDrive/266/project/trainerRobertaTuned'

# Stack the two training years into one frame with a fresh 0..n-1 index.
trainer_concat = pd.concat([trainer_2021, trainer_2022], ignore_index=True)

train_and_evaluate_roberta_tuned_model(trainer_concat, trainer_2023, usecols, model_save_path)

In [None]:
import optuna
import os
import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import mean_absolute_error

def train_and_evaluate_ensemble_model_advanced(train_df, test_df, usecols, model_save_path, n_trials=20, n_models=5):
    """Tune with Optuna, then train and average an ensemble of regressors.

    The target ``power_level`` is trained on the ``log1p`` scale; ensemble
    predictions are mapped back with ``expm1`` before reporting. Optuna picks
    the learning rate, epoch count, warmup steps and backbone (RoBERTa or
    BERT). ``n_models`` models are then trained with the best settings and
    their predictions are averaged. ``test_df`` also serves as the evaluation
    set during tuning and training.

    Args:
        train_df: DataFrame with every column in ``usecols`` plus ``power_level``.
            It is not modified; the function works on a copy.
        test_df: DataFrame with the same columns. Not modified either.
        usecols: List of column names concatenated into the model input text.
        model_save_path: Directory under which ``model_<i>`` sub-directories
            receive each ensemble member and its tokenizer.
        n_trials: Number of Optuna trials.
        n_models: Number of ensemble members to train.

    Returns:
        None. Prints the top-10 report and ensemble MAE, shows a scatter plot,
        and saves every ensemble member.

    Raises:
        ValueError: If ``power_level`` is missing from either DataFrame.
    """
    # Local import so the BERT branch works even when only this cell's own
    # import statements have been executed.
    from transformers import BertForSequenceClassification, BertTokenizer

    if 'power_level' not in train_df.columns or 'power_level' not in test_df.columns:
        raise ValueError("Column 'power_level' not found in train or test dataset")

    # Work on private copies: the 'text' column and the log1p transform below
    # must not leak into the caller's frames. Otherwise the shared test table
    # is altered for later cells and a re-run applies log1p twice.
    train_df = train_df.copy()
    test_df = test_df.copy()

    def combine_features(row):
        # One space-separated string per card, built from all feature columns.
        return ' '.join(row.values.astype(str))

    train_df['text'] = train_df[usecols].apply(combine_features, axis=1)
    test_df['text'] = test_df[usecols].apply(combine_features, axis=1)

    train_df['power_level'] = np.log1p(train_df['power_level'])
    test_df['power_level'] = np.log1p(test_df['power_level'])

    # model_type -> (checkpoint name, tokenizer class, model class)
    backbones = {
        'roberta': ('roberta-base', RobertaTokenizer, RobertaForSequenceClassification),
        'bert': ('bert-base-uncased', BertTokenizer, BertForSequenceClassification),
    }
    prepared = {}  # model_type -> (tokenizer, train_dataset, test_dataset)

    def prepare(model_type):
        """Tokenize both splits with the backbone's own tokenizer (cached).

        Each backbone needs ids from its own vocabulary; feeding RoBERTa ids
        to BERT indexes past the end of BERT's embedding table.
        """
        if model_type not in prepared:
            checkpoint, tokenizer_cls, _ = backbones[model_type]
            tokenizer = tokenizer_cls.from_pretrained(checkpoint)

            def preprocess_function(examples):
                return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

            splits = []
            for frame in (train_df, test_df):
                dataset = Dataset.from_pandas(frame[['text', 'power_level']])
                dataset = dataset.map(preprocess_function, batched=True)
                dataset = dataset.rename_column("power_level", "labels")
                dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
                splits.append(dataset)
            prepared[model_type] = (tokenizer, splits[0], splits[1])
        return prepared[model_type]

    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            """Weighted Huber-style loss on the raw logit versus the log1p label."""
            labels = inputs.get("labels")
            outputs = model(**inputs)
            # reshape(-1) keeps the batch axis even for a one-row batch.
            logits = outputs.get("logits").reshape(-1)

            abs_error = torch.abs(logits - labels)
            # Errors of 0.1 or more count double.
            weights = torch.where(abs_error < 0.1, 1.0, 2.0).to(logits.device)

            delta = 1.0
            loss = torch.where(abs_error <= delta, 0.5 * (abs_error ** 2), delta * (abs_error - 0.5 * delta))
            loss = (loss * weights).mean()

            return (loss, outputs) if return_outputs else loss

    # MAE Metric Function (computed on the log1p scale)
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        return {"mae": mean_absolute_error(labels, logits.reshape(-1))}

    def make_trainer(model_type, learning_rate, batch_size, num_train_epochs, warmup_steps, seed=42):
        """Build a fresh model and a CustomTrainer for one training run."""
        checkpoint, _, model_cls = backbones[model_type]
        tokenizer, train_dataset, test_dataset = prepare(model_type)
        model = model_cls.from_pretrained(checkpoint, num_labels=1)
        training_args = TrainingArguments(
            output_dir='./outputResults',
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            warmup_steps=warmup_steps,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            # NOTE(review): in transformers 4.x, None selects the default set
            # of reporting integrations; "none" is what disables them.
            report_to=None,
            metric_for_best_model="mae",
            greater_is_better=False,
            seed=seed,
        )
        return CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

    def objective(trial):
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-3, log=True)
        batch_size = trial.suggest_categorical('batch_size', [8])
        num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5)
        warmup_steps = trial.suggest_int('warmup_steps', 200, 1000)
        model_type = trial.suggest_categorical('model_type', ['roberta', 'bert'])

        trainer = make_trainer(model_type, learning_rate, batch_size, num_train_epochs, warmup_steps)
        trainer.train()
        return trainer.evaluate()["eval_mae"]

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    best_type = best_params['model_type']
    tokenizer, _, test_dataset = prepare(best_type)

    models = []
    for member in range(n_models):
        trainer = make_trainer(
            best_type,
            best_params['learning_rate'],
            best_params['batch_size'],
            best_params['num_train_epochs'],
            best_params['warmup_steps'],
            # A distinct seed per member, so the members do not all share
            # the same training seed.
            seed=42 + member,
        )
        trainer.train()
        models.append(trainer.model)

    def ensemble_predictions(models, dataset):
        """Average the flattened predictions of every model on ``dataset``."""
        all_predictions = []
        for model in models:
            trainer = Trainer(
                model=model,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )
            predictions = trainer.predict(dataset)
            all_predictions.append(predictions.predictions.reshape(-1))
        return np.mean(all_predictions, axis=0)

    predictions = ensemble_predictions(models, test_dataset)
    # Undo the log1p target transform so values are on the original scale.
    predicted_values = np.expm1(predictions)
    # Labels come from the local frame (same row order as the dataset), which
    # avoids depending on how the datasets library returns a formatted column.
    actual_values = np.expm1(test_df['power_level'].to_numpy())

    print_top_10_predictions(predicted_values, actual_values, test_df)

    print("Ensemble MAE:", mean_absolute_error(actual_values, predicted_values))

    plt.figure(figsize=(10, 6))
    plt.scatter(actual_values, predicted_values, alpha=0.5, c='blue', label='Predictions')
    plt.xlabel('Actual Power Level')
    plt.ylabel('Predicted Power Level')
    plt.title('Ensemble Predictions vs Actual')
    plt.legend()
    plt.show()

    os.makedirs(model_save_path, exist_ok=True)
    for i, model in enumerate(models):
        model.save_pretrained(f'{model_save_path}/model_{i}')
        tokenizer.save_pretrained(f'{model_save_path}/model_{i}')

In [None]:
# Pokemon ensemble: tune and train on the 2021 + 2022 Pokemon tables and
# evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/pokemonEnsemble'

# Stack the two training years into one frame with a fresh 0..n-1 index.
pokemon_concat = pd.concat([pokemon_2021, pokemon_2022], ignore_index=True)

train_and_evaluate_ensemble_model_advanced(pokemon_concat, pokemon_2023, usecols, model_save_path)

In [None]:
# Trainer-card ensemble: tune and train on the 2021 + 2022 Trainer tables and
# evaluate on the 2023 table.
model_save_path = '/content/drive/MyDrive/266/project/trainerEnsemble'

# Stack the two training years into one frame with a fresh 0..n-1 index.
trainer_concat = pd.concat([trainer_2021, trainer_2022], ignore_index=True)

train_and_evaluate_ensemble_model_advanced(trainer_concat, trainer_2023, usecols, model_save_path)

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

def train_and_evaluate_mlp_model(train_df, test_df, usecols, model_save_path, random_state=42):
    """Grid-search an MLP regressor, evaluate it on ``test_df`` and persist it.

    The feature columns are split by dtype: ``object`` columns are one-hot
    encoded and all remaining columns are standardised. A 5-fold grid search
    (scored by negative MAE) picks the MLP hyper-parameters, the best pipeline
    is evaluated on ``test_df``, the ten highest predictions/actuals are
    printed, a scatter plot is shown and the pipeline is written to
    ``<model_save_path>/mlp_model.pkl``.

    Args:
        train_df: DataFrame holding the ``usecols`` columns and a
            ``power_level`` target column; used for the grid search.
        test_df: DataFrame with the same columns (plus ``name``, which
            ``print_top_10_predictions`` reads); used for evaluation only.
        usecols: List of feature column names.
        model_save_path: Directory the fitted pipeline is saved into; created
            if it does not exist.
        random_state: Seed forwarded to ``MLPRegressor`` so weight
            initialisation and shuffling are reproducible across runs.

    Returns:
        None. Results are printed/plotted and the model is saved to disk.
    """
    X_train = train_df[usecols]
    y_train = train_df['power_level']
    X_test = test_df[usecols]
    y_test = test_df['power_level']

    categorical_columns = X_train.select_dtypes(include=['object']).columns
    numerical_columns = X_train.select_dtypes(exclude=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown='ignore': categories unseen during fit encode as all zeros.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
            ('num', StandardScaler(), numerical_columns)
        ],
        remainder='passthrough'
    )

    mlp = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', MLPRegressor(random_state=random_state))
    ])

    # 'learning_rate' is intentionally not searched: scikit-learn only uses it
    # when solver='sgd', so with 'adam' it merely doubled the number of fits.
    param_grid = {
        'regressor__hidden_layer_sizes': [(50,), (100,), (100, 50), (200,)],
        'regressor__activation': ['relu', 'tanh'],
        'regressor__solver': ['adam'],
        'regressor__max_iter': [500, 1000, 1500]
    }

    grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters found: {grid_search.best_params_}")

    best_mlp = grid_search.best_estimator_

    # MAE on the scale the model was trained on (the stored 'power_level').
    model_scale_predictions = best_mlp.predict(X_test)
    mae = mean_absolute_error(y_test, model_scale_predictions)
    print(f"Mean Absolute Error: {mae}")

    # NOTE(review): expm1 is only the right inverse if 'power_level' is stored
    # as log1p(raw) -- this function never applies log1p itself; TODO confirm.
    predictions = np.expm1(model_scale_predictions)
    # Plain ndarray so print_top_10_predictions indexes by position rather than
    # by the DataFrame's index labels.
    actual_values = np.expm1(y_test.to_numpy())

    # Same scale as the top-10 listing and the plot below, so this figure is
    # the one comparable with metrics computed on expm1 values.
    mae_expm1 = mean_absolute_error(actual_values, predictions)
    print(f"Mean Absolute Error (after expm1): {mae_expm1}")

    print_top_10_predictions(predictions, actual_values, test_df)

    plt.figure(figsize=(10, 6))
    plt.scatter(actual_values, predictions, alpha=0.5, c='blue', label='Predictions')
    plt.xlabel('Actual Power Level')
    plt.ylabel('Predicted Power Level')
    plt.title('Predicted vs. Actual Power Levels')
    plt.legend()
    plt.show()

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(model_save_path, exist_ok=True)

    model_filename = os.path.join(model_save_path, 'mlp_model.pkl')
    joblib.dump(best_mlp, model_filename)
    print(f"Model saved to {model_filename}")


In [None]:
# Pokemon MLP run: grid-search on the stacked 2021 + 2022 frames, evaluate on
# the 2023 frame, and save the best pipeline under model_save_path.
model_save_path = '/content/drive/MyDrive/266/project/pokemonMLP'

pokemon_concat = pd.concat(
    objs=[pokemon_2021, pokemon_2022],
    ignore_index=True,
)

train_and_evaluate_mlp_model(
    train_df=pokemon_concat,
    test_df=pokemon_2023,
    usecols=usecols,
    model_save_path=model_save_path,
)

In [None]:
# Trainer-card MLP run: grid-search on the stacked 2021 + 2022 frames, evaluate
# on the 2023 frame, and save the best pipeline under model_save_path.
model_save_path = '/content/drive/MyDrive/266/project/trainerMLP'

trainer_concat = pd.concat(
    objs=[trainer_2021, trainer_2022],
    ignore_index=True,
)

train_and_evaluate_mlp_model(
    train_df=trainer_concat,
    test_df=trainer_2023,
    usecols=usecols,
    model_save_path=model_save_path,
)

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

def train_and_evaluate_svr_model(train_df, test_df, usecols, model_save_path):
    """Grid-search a support-vector regressor, evaluate it on ``test_df`` and persist it.

    Every column in ``usecols`` is one-hot encoded, a 5-fold grid search
    (scored by negative MAE) selects ``C``, ``epsilon`` and the kernel, and the
    best pipeline is evaluated on ``test_df``. The test MAE and the ten highest
    predictions/actuals are printed, a scatter plot is shown and the pipeline
    is written to ``<model_save_path>/svr_model.pkl``.

    Args:
        train_df: DataFrame holding the ``usecols`` columns and a
            ``power_level`` target column; used for the grid search.
        test_df: DataFrame with the same columns (plus ``name``, which
            ``print_top_10_predictions`` reads); used for evaluation only.
        usecols: List of feature column names.
        model_save_path: Directory the fitted pipeline is saved into; created
            if it does not exist.

    Returns:
        None. Results are printed/plotted and the model is saved to disk.
    """
    X_train = train_df[usecols]
    y_train = train_df['power_level']
    X_test = test_df[usecols]
    # Plain ndarray so print_top_10_predictions indexes by position rather than
    # by the DataFrame's index labels.
    y_test = test_df['power_level'].to_numpy()

    # NOTE(review): every feature column, numeric ones included, is treated as
    # categorical here, so numeric values unseen during fit encode as all
    # zeros. The MLP pipeline in this notebook scales non-object columns
    # instead -- confirm this difference is intended.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), usecols)
        ],
        remainder='passthrough'
    )

    svr = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR())
    ])

    param_grid = {
        'regressor__C': [1, 10, 100],
        'regressor__epsilon': [0.1, 0.2, 0.5],
        'regressor__kernel': ['linear', 'rbf']
    }

    grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")

    best_svr = grid_search.best_estimator_
    test_predictions = best_svr.predict(X_test)

    # NOTE(review): unlike the MLP function, no expm1 is applied before
    # reporting, so this MAE, the listing and the plot are on the stored
    # 'power_level' scale -- confirm which scale is intended.
    mae = mean_absolute_error(y_test, test_predictions)
    print(f"Test MAE: {mae}")

    print_top_10_predictions(test_predictions, y_test, test_df)

    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, test_predictions, alpha=0.5, c='blue', label='Predictions')
    plt.xlabel('Actual Power Level')
    plt.ylabel('Predicted Power Level')
    plt.title('Predicted vs. Actual Power Levels')
    plt.legend()
    plt.show()

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(model_save_path, exist_ok=True)

    model_filename = os.path.join(model_save_path, 'svr_model.pkl')
    joblib.dump(best_svr, model_filename)
    print(f"Model saved to {model_filename}")

In [None]:
# Pokemon SVR run: grid-search on the stacked 2021 + 2022 frames, evaluate on
# the 2023 frame, and save the best pipeline under model_save_path.
model_save_path = '/content/drive/MyDrive/266/project/pokemonSVR'

pokemon_concat = pd.concat(
    objs=[pokemon_2021, pokemon_2022],
    ignore_index=True,
)

train_and_evaluate_svr_model(
    train_df=pokemon_concat,
    test_df=pokemon_2023,
    usecols=usecols,
    model_save_path=model_save_path,
)

In [None]:
# Trainer-card SVR run: grid-search on the stacked 2021 + 2022 frames, evaluate
# on the 2023 frame, and save the best pipeline under model_save_path.
model_save_path = '/content/drive/MyDrive/266/project/trainerSVR'

trainer_concat = pd.concat(
    objs=[trainer_2021, trainer_2022],
    ignore_index=True,
)

train_and_evaluate_svr_model(
    train_df=trainer_concat,
    test_df=trainer_2023,
    usecols=usecols,
    model_save_path=model_save_path,
)