In [20]:
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# Read only the card attributes that are serialized into the training text.
df = pd.read_csv("cards.csv", usecols=['name', 'type', 'atk', 'def', 'level', 'race'])

# Missing numeric stats become 0 so the int() conversion below cannot fail on NaN.
for stat_column in ('atk', 'def', 'level'):
    df[stat_column] = df[stat_column].fillna(0)

MASK_TOKEN = "<UNK>"


def _format_card(row):
    """Serialize one card row as 'name | type | atk | def | level | race'."""
    atk = int(row['atk'])
    defense = int(row['def'])
    level = int(row['level'])
    return f"{row['name']} | {row['type']} | {atk} | {defense} | {level} | {row['race']}"


df['text'] = df.apply(_format_card, axis=1)

print(len(df['text']))
# The unmasked text is both the source to corrupt and the reconstruction target.
input_entries = df['text']
target_entries = input_entries.copy()
print(len(input_entries))

13281
13281
0    <START> "A" Cell Breeding Device delim Spell C...
1    <START> "A" Cell Incubator delim Spell Card de...
2    <START> "A" Cell Recombination Device delim Sp...
3    <START> "A" Cell Scatter Burst delim Spell Car...
4    <START> "Infernoble Arms - Almace" delim Spell...
5    <START> "Infernoble Arms - Durendal" delim Spe...
6    <START> "Infernoble Arms - Hauteclere" delim S...
7    <START> "Infernoble Arms - Joyeuse" delim Spel...
8    <START> 1st Movement Solo delim Spell Card del...
9    <START> 3-Hump Lacooda delim Effect Monster de...
Name: text, dtype: object


In [3]:
# Probability with which each whitespace-separated token is replaced by MASK_TOKEN.
masking_prob = 0.2
# Fixed seed so the masked corpus is identical on every Restart & Run All.
MASK_SEED = 0
rng = np.random.default_rng(MASK_SEED)

masked_entries = []
for entry in input_entries:
    tokens = entry.split(' ')
    # One independent uniform draw per token; True means "mask this token".
    # Note that the ' | ' separators are tokens too and can be masked.
    mask_flags = rng.random(len(tokens)) < masking_prob
    masked_entry = [MASK_TOKEN if flag else token for token, flag in zip(tokens, mask_flags)]
    masked_entries.append(" ".join(masked_entry))

# Keep the index of the source Series so masked and target entries stay aligned
# row-for-row even if the DataFrame index is not a plain 0..n-1 range.
masked_entries = pd.Series(masked_entries, index=input_entries.index)
print(masked_entries[:10])

0    <START> <UNK> Cell Breeding Device <UNK> <UNK>...
1    <START> "A" Cell <UNK> delim Spell <UNK> delim...
2    <START> "A" <UNK> Recombination <UNK> delim Sp...
3    <START> "A" <UNK> Scatter <UNK> delim Spell Ca...
4    <START> "Infernoble <UNK> - Almace" delim <UNK...
5    <START> "Infernoble <UNK> - Durendal" delim <U...
6    <START> "Infernoble <UNK> - Hauteclere" delim ...
7    <START> "Infernoble <UNK> - Joyeuse" <UNK> <UN...
8    <START> 1st Movement <UNK> delim Spell Card <U...
9    <START> <UNK> Lacooda delim Effect Monster del...
dtype: object


In [None]:
# Build one training string per card: "<masked text> -> <original text><END>".
# Entries are paired by position (zip) rather than by index label, so the result
# does not depend on both Series carrying a default 0..n-1 index.
assert len(masked_entries) == len(target_entries), "masked/target entries must pair one-to-one"
X = np.array([
    masked + " -> " + target + "<END>"
    for masked, target in zip(masked_entries, target_entries)
])
print(X.shape)
print(X[0])

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the base model and place it explicitly on `device`. `device_map='auto'` is
# deliberately not used: it requires the `accelerate` package and hands placement to
# accelerate, which conflicts with moving the model manually via `.to(device)`.
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token; reuse EOS so fixed-length padding is possible.
tokenizer.pad_token = tokenizer.eos_token

# Freeze every base-model weight; only the LoRA adapters added below are trained.
for param in model.parameters():
    param.requires_grad = False

# LoRA adapter configuration
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM")

# Wrap the frozen base model with the adapters and make sure they live on `device` too.
model = get_peft_model(model, config).to(device)



In [17]:
class TextDataset(Dataset):
    """Tokenizes raw strings on the fly for causal language-model training."""

    def __init__(self, X, tokenizer, max_length=128):
        """
        Args:
        - X: A sequence of input strings for next-token prediction.
        - tokenizer: Callable that tokenizes one string and returns a mapping with
          'input_ids' and 'attention_mask' tensors of shape (1, max_length).
        - max_length: Length every sequence is truncated/padded to.
        """
        self.X = X
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the 'input_ids', 'attention_mask' and 'labels' tensors for item `idx`."""
        # Get the input sequence
        x = self.X[idx]

        # Tokenize the input sequence to a fixed length
        tokenized = self.tokenizer(x.strip(), truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

        # Drop only the leading batch dimension; a bare squeeze() would also collapse
        # the sequence dimension when max_length == 1.
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        # Labels are a copy of input_ids; the shift by one position for next-token
        # prediction is left to the model that consumes them.
        labels = input_ids.clone()
        # Ignore padding in the loss. Padding is identified through the attention mask
        # rather than by token id, because the pad token may be aliased to a real token
        # (e.g. EOS) whose genuine occurrences must still contribute to the loss.
        labels[attention_mask == 0] = -100

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [21]:
dataset = TextDataset(X, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Hand the optimizer only the parameters that actually receive gradients, so it does
# not have to track and walk over the frozen weights on every step.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=5e-4)



In [None]:
model.train()
epochs = 3
for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    # The description is constant for the whole epoch, so set it once up front.
    loop.set_description(f'Epoch {epoch+1}')
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients before the forward pass: if a previous run of this cell was
        # interrupted between backward() and step(), stale gradients would otherwise
        # leak into the first update of this run.
        optimizer.zero_grad()

        # Forward pass; the model computes the language-modelling loss from `labels`
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_postfix(loss=loss.item())

Epoch 1:  18%|█▊        | 299/1661 [19:40<1:28:07,  3.88s/it, loss=1.05] 

In [48]:
model.eval()

# The prompt must end exactly where the training text continues. Training strings are
# "<masked> -> <target>", so the space after "->" belongs to the first target token;
# a trailing space here would be tokenized as a separate token never seen in training.
input_text = "<UNK> Magician | <UNK> | 3300 | <UNK> | 7 | <UNK> ->"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(
        # Pass input_ids together with attention_mask: because pad and EOS share an id,
        # generate() cannot reliably infer the mask on its own.
        **inputs,
        max_length=100,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Training examples end with the literal text "<END>" rather than the tokenizer's EOS
# token, so generation does not stop there by itself; keep the text up to and including
# the first marker and drop whatever was generated after it.
head, end_marker, _ = generated_text.partition("<END>")
generated_text = head + end_marker
print("Generated Text: ", generated_text)

Generated Text:  <START> powerful <UNK> skeleton DELIM Ritual Effect <UNK> DELIM 1000 DELIM 3000 DELIM <UNK> DELIM fiend <END>
