In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel, TFRobertaForSequenceClassification


In [None]:
# Colab-only step: attach Google Drive so the CSV files and saved models
# referenced under /content/drive/ are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive/"
drive.mount(DRIVE_MOUNT_POINT)

Import data

In [None]:
# Descriptive card columns; a later cell joins them into the text given to the tokenizer.
usecols = [
    'id', 'name',
    'cleaned_abilities', 'cleaned_rules', 'cleaned_attacks',
    'hp', 'types', 'subtypes',
    'evolvesFrom', 'evolvesTo',
    'weaknesses', 'convertedRetreatCost', 'resistances',
]

# One synergy table per year (2021, 2022, 2023), read from the mounted Drive.
(
    standard_2021_synergies,
    standard_2022_synergies,
    standard_2023_synergies,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/266/project/standard_2021_synergies.csv',
        '/content/drive/MyDrive/266/project/standard_2022_synergies.csv',
        '/content/drive/MyDrive/266/project/standard_2023_synergies.csv',
    )
]



In [None]:
# Stack the three yearly tables on a shared (unioned) set of columns. Every NaN,
# including the cells of columns a given year did not have, is replaced by 0.
dat = pd.concat(
    [standard_2021_synergies, standard_2022_synergies, standard_2023_synergies],
    axis=0,
).fillna(0)

# Row offsets of each year inside the stacked frame.
n_rows_2021 = standard_2021_synergies.shape[0]
n_rows_2022 = standard_2022_synergies.shape[0]
end_of_2022 = n_rows_2021 + n_rows_2022

# Slice the stacked frame back into per-year frames that now share identical columns.
standard_2021_synergies_padded = dat.iloc[:n_rows_2021, :]
standard_2022_synergies_padded = dat.iloc[n_rows_2021:end_of_2022, :]
standard_2023_synergies_padded = dat.iloc[end_of_2022:, :]

In [None]:
# Number of distinct card ids across all three years. Later cells use it both as
# the number of trailing label columns sliced off the padded frames and as the
# width of the model's output layer.
card_counts = dat['id'].nunique()

In [None]:
# Each padded slice must keep exactly the row count of the year it came from.
for padded_frame, raw_frame in (
    (standard_2021_synergies_padded, standard_2021_synergies),
    (standard_2022_synergies_padded, standard_2022_synergies),
    (standard_2023_synergies_padded, standard_2023_synergies),
):
    assert padded_frame.shape[0] == raw_frame.shape[0]

# All padded slices must have the same number of columns as the 2023 slice.
for padded_frame in (standard_2021_synergies_padded, standard_2022_synergies_padded):
    assert padded_frame.shape[1] == standard_2023_synergies_padded.shape[1]

In [None]:
def _row_to_text(row):
    """Join every value of one row into a single ';'-separated string."""
    return ';'.join(map(str, row))


# Model inputs: one text string per card, built from the `usecols` columns.
# 2021 + 2022 rows form the training set; 2023 rows form the test set.
train_features = pd.concat(
    [standard_2021_synergies_padded[usecols], standard_2022_synergies_padded[usecols]],
    axis=0,
)
train_dat_x = train_features.apply(_row_to_text, axis=1).to_numpy()
test_dat_x = standard_2023_synergies_padded[usecols].apply(_row_to_text, axis=1).to_numpy()

# Model targets: the trailing `card_counts` columns of each padded frame.
label_columns = slice(-card_counts, None)
train_dat_y = pd.concat(
    [
        standard_2021_synergies_padded.iloc[:, label_columns],
        standard_2022_synergies_padded.iloc[:, label_columns],
    ],
    axis=0,
)
test_dat_y = standard_2023_synergies_padded.iloc[:, label_columns]

Define tokenizer

In [None]:
max_length = 512

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Settings shared by both splits: pad or truncate every text to exactly
# `max_length` tokens and return TensorFlow tensors.
tokenizer_kwargs = dict(
    padding="max_length",
    truncation=True,
    max_length=max_length,
    return_tensors='tf',
)

tokenized_train_x = tokenizer(train_dat_x.tolist(), **tokenizer_kwargs)
tokenized_test_x = tokenizer(test_dat_x.tolist(), **tokenizer_kwargs)


Define BERT model

In [None]:
# API reference:
# https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#transformers.TFBertModel

# Pretrained BERT backbone, configured to also return the per-layer hidden
# states and the attention maps alongside its regular outputs.
bert_model = TFBertModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,
    output_attentions=True,
)


Optionally freeze BERT layers (set `trainable = False` to freeze; both sub-layers are left trainable below)

In [None]:
# Trainability switch for the BERT backbone. Both sub-layers are left trainable
# here (nothing is frozen); assign False instead of True to freeze them.
for bert_sublayer in (bert_model.bert.encoder, bert_model.bert.embeddings):
    bert_sublayer.trainable = True

Define the model

In [None]:
def stack_layers(hidden_states):
    """Stack per-layer hidden states and average each one over axis 2.

    Args:
        hidden_states: sequence of n tensors of identical shape, expected to be
            (batch_size, seq_length, hidden_size) each.

    Returns:
        Tensor of shape (n, batch_size, hidden_size). The mean runs over every
        sequence position; no attention mask is applied, so padding positions
        contribute to the average.
    """
    # After stacking on a new leading axis: (n, batch_size, seq_length, hidden_size).
    return tf.reduce_mean(tf.stack(hidden_states, axis=0), axis=2)

def reshape_layers(tensor):
    """Flatten stacked per-layer embeddings into one feature vector per example.

    The flattened width is derived from the tensor's own shape rather than from
    the notebook-level `embedding_nlayers` and a hard-coded hidden size of 768,
    so the function works for any number of layers and any hidden size.

    Args:
        tensor: rank-3 tensor laid out as (n, batch_size, hidden_size), e.g. the
            output of `stack_layers`.

    Returns:
        Tensor of shape (batch_size, n * hidden_size).
    """
    transposed = tf.transpose(tensor, perm=[1, 0, 2])  # Shape: (batch_size, n, hidden_size)
    batch_size = tf.shape(transposed)[0]  # dynamic: batch size is typically unknown when the graph is built

    # Prefer the static dimensions so downstream layers see a known feature width;
    # fall back to -1 (inferred at run time) only when one of them is unknown.
    n_layers, hidden_size = transposed.shape[1], transposed.shape[2]
    if n_layers is None or hidden_size is None:
        reshaped_dim = -1
    else:
        reshaped_dim = n_layers * hidden_size
    return tf.reshape(transposed, [batch_size, reshaped_dim])  # Shape: (batch_size, n*hidden_size)

In [None]:
# Create model
embedding_nlayers = 3

# Three integer inputs of length `max_length`; their names are the keys expected
# in the dict that is passed to `fit` / `predict`.
input_ids, attention_mask, token_type_ids = [
    keras.Input(shape=(max_length,), dtype=tf.int32, name=input_name)
    for input_name in ("input_ids", "attention_mask", "token_type_ids")
]

bert_outputs = bert_model.bert(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)

# Head on top of BERT's pooled output:
# Dropout -> Dense(256, relu) -> LayerNorm -> Dropout -> Dense(64, relu) -> Dense(card_counts, sigmoid).
pooled_output = bert_outputs.pooler_output
head = keras.layers.Dropout(0.1)(pooled_output)
head = keras.layers.Dense(256, activation='relu')(head)
head = keras.layers.LayerNormalization()(head)
head = keras.layers.Dropout(0.1)(head)
head = keras.layers.Dense(64, activation='relu')(head)

# One independent sigmoid score per known card id.
out = keras.layers.Dense(card_counts, activation='sigmoid')(head)

model = keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=out)
model.summary()

Train

In [None]:
# Fine-tuning learning rate. The BERT encoder and embeddings are trainable in
# this notebook, and Adam's Keras default (1e-3) is far above the 2e-5..5e-5
# range recommended for fine-tuning BERT, so the rate is set explicitly.
LEARNING_RATE = 2e-5

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    # Element-wise binary cross-entropy: each output unit is scored independently.
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

history = model.fit(
    x=dict(tokenized_train_x),
    y=train_dat_y,
    batch_size=32,
    epochs=10,
    # Keras takes the validation rows from the END of the arrays (before any
    # shuffling), i.e. from the tail of the concatenated 2021+2022 training rows.
    validation_split=0.2,
)

In [None]:
bert_model_filename = "synergy_dense_bert_v1"
model_filename = "synergy_dense_model.keras_v1"

# Set to True to overwrite the artifacts on Drive with the model trained above.
# Left False by default, so this cell only loads the previously saved artifacts
# (which must already exist on Drive).
SAVE_TRAINED_MODEL = False
if SAVE_TRAINED_MODEL:
    bert_model.save_pretrained(f"/content/drive/MyDrive/266/project/{bert_model_filename}")
    model.save(f"/content/drive/MyDrive/266/project/{model_filename}")

# NOTE: this rebinds `bert_model` to the copy stored on Drive; evaluation below
# uses `loaded_model`, not the in-memory `model` produced by the training cell.
bert_model = TFBertModel.from_pretrained(f"/content/drive/MyDrive/266/project/{bert_model_filename}")
loaded_model = keras.saving.load_model(f"/content/drive/MyDrive/266/project/{model_filename}")

In [None]:
# Scores from the reloaded model for the tokenized 2023 cards. The next cells
# index `preds` column-wise with a mask over the label columns, so it is expected
# to be 2-D with one column per label column (width `card_counts`).
preds = loaded_model.predict(dict(tokenized_test_x))


Extract the label and prediction columns for cards that appear in the 2023 data

In [None]:
# Label block of the 2023 rows: the trailing `card_counts` columns.
test_synergies = standard_2023_synergies_padded.iloc[:, -card_counts:]

# Boolean mask over the label columns: True where the column name is one of the
# ids present in the 2023 rows.
ids_2023 = standard_2023_synergies_padded['id']
just_2023_columns = test_synergies.columns.isin(ids_2023)

# Apply the same mask to the true labels and to the predictions.
test_synergies_just_2023 = test_synergies.loc[:, just_2023_columns]
preds_just_2023 = preds[:, just_2023_columns]

assert test_synergies_just_2023.shape == preds_just_2023.shape

In [None]:
# Signed error per cell: true synergy minus predicted score.
differences = test_synergies_just_2023 - preds_just_2023

# Label every row with the id of the card that row actually belongs to. Row i of
# `differences` is row i of the 2023 frame, so its id is the matching entry of the
# 'id' column. Re-using the column labels as the index would only be correct if
# the rows happened to be in exactly the same order as the columns.
row_ids = pd.Index(standard_2023_synergies_padded['id'].to_numpy())
assert differences.shape[0] == differences.shape[1], "expected a square card-by-card matrix"
assert row_ids.is_unique, "duplicate card ids among the 2023 rows"
differences.index = row_ids
differences

In [None]:
# Heatmap of the full (true - predicted) matrix on a diverging colour map.
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(differences, annot=False, cmap='vlag', ax=ax)
plt.show()

Top 10 true synergies

In [None]:
def get_top_n_indices(data, n):
  """Locate the cells of a 2-D table that hold its n largest distinct values.

  Args:
    data: 2-D array-like (e.g. a DataFrame or ndarray) of comparable values.
    n: how many of the largest distinct values to include.

  Returns:
    A pair ``(rows, cols)`` of sets with the positional row indices and the
    positional column indices of every cell whose value is among the n largest
    distinct values of ``data``. Both sets are empty when ``data`` is empty or
    ``n`` is not positive.
  """
  data = np.asarray(data)
  if n <= 0 or data.size == 0:
    return set(), set()

  # np.unique returns the distinct values sorted ascending, so the first of the
  # last n entries is the smallest value that still counts as "top n".
  threshold = np.unique(data)[-n:][0]

  # Inclusive comparison: the n-th largest value itself belongs to the top n.
  row_idx, col_idx = np.nonzero(data >= threshold)
  return set(row_idx.tolist()), set(col_idx.tolist())


In [None]:
# Positional row / column indices (as sets) of the cells that hold the largest
# true-synergy values among the 2023 cards.
rows, cols = get_top_n_indices(test_synergies_just_2023, 10)

In [None]:
# Heatmap of the (true - predicted) errors restricted to the rows and columns
# that contain the strongest true synergies.
selected_rows, selected_cols = list(rows), list(cols)
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(differences.iloc[selected_rows, selected_cols], annot=False, cmap='vlag', ax=ax)
plt.show()

In [None]:
n = 10

# The n smallest distinct values of (true - predicted); the largest of them is
# the inclusive cut-off for the mask.
lowest_n_values = sorted(np.unique(differences.to_numpy().ravel()))[:n]
mask = differences <= lowest_n_values[-1]

# Lazily walk the matrix row by row and yield the label pairs flagged by the mask.
flagged_pairs = (
  (row_label, col_label)
  for row_label in differences.index
  for col_label in differences.columns
  if mask.loc[row_label, col_label]
)

# Print and keep the first n flagged pairs, then stop scanning.
synergies = []
for card1, card2 in flagged_pairs:
  print(card1, card2)
  synergies.append((card1, card2))
  if len(synergies) == n:
    break

In [None]:
# Total number of cells flagged by `mask` (difference <= the cut-off). The
# cut-off comes from distinct values, so ties can make this count exceed n.
mask.sum().sum()