In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm  # do paska postępu

#########################
# 1. Wczytywanie CSV i formatowanie
#########################

# Location of the NPC dataset on disk.
CSV_FILE = "filtered_npc_data_trimmed.csv"  # set this to your own path
# Read the whole CSV into a DataFrame.
df = pd.read_csv(CSV_FILE)

# (column name, fallback text) pairs, in the order they are written to a record.
_NPC_FIELDS = (
    ("Name", "Unknown"),
    ("Gender", "Unknown"),
    ("Race", "Unknown"),
    ("Description", " "),
    ("Personality", " "),
    ("History", " "),
    ("Motivation", " "),
    ("Flaws", " "),
    ("Bonds", " "),
    ("Occupation", " "),
    ("Voice", " "),
)


def _is_missing(value):
    """Return True when ``value`` is a scalar missing marker (None / NaN / NA)."""
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values (lists, arrays) have no single truth value; treat
        # them as present and let str() render them.
        return False


def row_to_text(row):
    """
    Convert one record (anything with a dict-style ``.get``, e.g. a DataFrame
    row) into a structured multi-line string of the form:

        <NPC_START>
        Name: ...
        Gender: ...
        ...
        <NPC_END>

    A field falls back to its default ('Unknown' for Name/Gender/Race, a single
    space for the rest) both when the column is absent and when the cell is
    empty. Empty CSV cells are loaded as NaN, which would otherwise be written
    into the text as the literal string "nan".

    Returns the record as a single string with lines joined by a newline.
    """
    lines = ["<NPC_START>"]
    for column, fallback in _NPC_FIELDS:
        value = row.get(column, fallback)
        if _is_missing(value):
            value = fallback
        lines.append(f"{column}: {value!s}")
    lines.append("<NPC_END>")

    return "\n".join(lines)

# Render every DataFrame row as one NPC text record.
all_rows = [row_to_text(npc_row) for _, npc_row in df.iterrows()]

# Glue all records together into one newline-separated corpus.
corpus_text = "\n".join(all_rows)

#########################
# 2. Tokenizacja
#########################

def simple_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    return text.split(sep=None)

tokens = simple_tokenizer(corpus_text)

# Vocabulary: ids are handed out in order of first appearance in the corpus.
# dict.fromkeys drops repeats while keeping that first-seen order.
idx2word = list(dict.fromkeys(tokens))
word2idx = {token: index for index, token in enumerate(idx2word)}

vocab_size = len(word2idx)
print("Liczba unikalnych tokenów w słowniku:", vocab_size)

def encode_sentence(text):
    """Tokenize ``text`` and map each token to its vocabulary id.

    Tokens that are not in ``word2idx`` are skipped, so the result may be
    shorter than the token list.
    """
    ids = []
    for token in simple_tokenizer(text):
        token_id = word2idx.get(token)
        if token_id is not None:
            ids.append(token_id)
    return ids

def decode_tokens(id_list):
    """Look up each id in ``idx2word`` and join the words with single spaces."""
    return " ".join(idx2word[token_id] for token_id in id_list)

#########################
# 3. Przygotowanie datasetu
#########################

class TextDataset(Dataset):
    """
    Sliding-window next-token dataset over one flat sequence of token ids.

    Sample ``i`` is the pair
    ``(ids[i : i + seq_len], ids[i + 1 : i + seq_len + 1])``: the target is the
    input shifted one position to the right. Both are ``torch.long`` tensors
    of length ``seq_len``.
    """

    def __init__(self, token_ids, seq_len=50):
        super().__init__()
        # Convert once up front so __getitem__ only slices, instead of building
        # two fresh tensors from Python lists for every sample.
        self.token_ids = torch.as_tensor(token_ids, dtype=torch.long)
        self.seq_len = seq_len
        # A corpus with at most seq_len tokens has no complete (input, target)
        # window; clamp so len() never becomes negative.
        self.num_samples = max(0, len(self.token_ids) - seq_len)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        # Reject out-of-range indices explicitly: slicing would otherwise hand
        # back truncated or empty windows without any error.
        if not 0 <= idx < self.num_samples:
            raise IndexError(f"index {idx} out of range for {self.num_samples} samples")
        end = idx + self.seq_len
        # Slices are views of the shared id tensor; batching stacks them into
        # new tensors.
        return self.token_ids[idx:end], self.token_ids[idx + 1 : end + 1]

seq_len = 50
batch_size = 32

# Encode the entire corpus as one flat list of vocabulary ids.
all_token_ids = encode_sentence(corpus_text)
print("Łączna liczba tokenów w corpora:", len(all_token_ids))

# Overlapping next-token windows, served in shuffled mini-batches.
dataset = TextDataset(all_token_ids, seq_len=seq_len)
loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

#########################
# 4. Definicja modelu LSTM
#########################

class LSTMGenerator(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> vocabulary projection."""

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """
        Run the model over a batch of token-id sequences.

        x: token ids, shape (batch, seq_len)
        hidden: (h0, c0) tuple for the LSTM, or None to start from its default state

        Returns:
          logits: (batch, seq_len, vocab_size)
          hidden: (hn, cn) as returned by the LSTM
        """
        # (batch, seq_len) -> (batch, seq_len, embed_dim) -> (batch, seq_len, hidden_dim)
        lstm_out, next_hidden = self.lstm(self.embed(x), hidden)
        return self.fc(lstm_out), next_hidden

# Build the LSTM language model with its default layer sizes and show its layers.
model = LSTMGenerator(vocab_size)
print(model)

###################################
# 5. Trenowanie modelu z progres barem
###################################

# Cross-entropy over the vocabulary for next-token prediction.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 5  # adjust the number of epochs to the size of the dataset

def detach_hidden(hidden):
    """Cut the hidden state loose from the autograd graph that produced it.

    Accepts either an ``(h, c)`` tuple or a single tensor and returns the same
    structure with detached tensors, so the graph does not keep growing across
    successive LSTM calls.
    """
    if not isinstance(hidden, tuple):
        return hidden.detach()
    h_state, c_state = hidden[0], hidden[1]
    return (h_state.detach(), c_state.detach())

for epoch in range(num_epochs):
    model.train()

    total_loss = 0.0
    count = 0

    # tqdm progress bar over the mini-batches of this epoch
    pbar = tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")

    for batch_x, batch_y in pbar:
        optimizer.zero_grad()
        # Every batch starts from a fresh hidden state. The loader shuffles
        # independent windows, so the final state of one batch says nothing
        # about the next one; feeding it forward also fails with a hidden-size
        # mismatch as soon as a batch (typically the last one) has fewer
        # samples than the previous batch.
        logits, _ = model(batch_x)

        # logits: (batch, seq_len, vocab_size), batch_y: (batch, seq_len);
        # flatten both so cross-entropy sees one row per token position.
        loss = criterion(logits.view(-1, vocab_size), batch_y.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        count += 1

        # Show the running mean loss on the progress bar.
        pbar.set_postfix({"loss": f"{(total_loss/count):.4f}"})

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_loss = total_loss / max(count, 1)
    print(f"===> Epoch {epoch+1} finished, average loss = {avg_loss:.4f}")

#########################
# 6. Zapisywanie modelu
#########################
# Persist only the learned parameters (state_dict), not the module object itself.
MODEL_PATH = "npc_lstm_model.pth"
torch.save(model.state_dict(), MODEL_PATH)
print(f"Model zapisany do pliku: {MODEL_PATH}")

#########################
# 7. Funkcja generująca
#########################

def generate_text(model, start_text="<NPC_START>", max_length=200, temperature=1.0):
    """
    Auto-regressively sample a token sequence from the LSTM model.

    model: module called as ``model(ids, hidden)`` and returning
        ``(logits, hidden)`` with logits of shape (1, seq_len, vocab_size).
        It is switched to eval mode and left that way.
    start_text: prompt; it is encoded with ``encode_sentence`` (unknown words
        are dropped). If nothing remains, generation starts from token id 0.
    max_length: maximum number of tokens to sample after the prompt.
    temperature: divisor applied to the logits before softmax. ``0`` selects
        greedy decoding (argmax) instead of dividing by zero.

    Generation stops early once "<NPC_END>" is produced.
    Returns the prompt plus the generated tokens as one space-joined string.
    """
    model.eval()

    # Build inputs on the device the model lives on, so generation keeps
    # working after the model has been moved off the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    start_tokens = encode_sentence(start_text)
    if not start_tokens:
        start_tokens = [0]  # fall back to token 0 so the model gets a non-empty input

    tokens_out = list(start_tokens)
    # The whole prompt is fed first: shape (1, len(prompt)).
    inp = torch.tensor([start_tokens], dtype=torch.long, device=device)
    hidden = None

    with torch.no_grad():
        for _ in range(max_length):
            logits, hidden = model(inp, hidden)
            # Only the prediction for the last position decides the next token.
            last_logits = logits[0, -1, :]

            if temperature == 0:
                next_id = int(torch.argmax(last_logits))
            else:
                probs = torch.softmax(last_logits / temperature, dim=0).cpu().numpy()
                next_id = int(np.random.choice(len(probs), p=probs))

            tokens_out.append(next_id)
            if idx2word[next_id] == "<NPC_END>":
                break

            # The hidden state carries the context, so only the new token is fed next.
            inp = torch.tensor([[next_id]], dtype=torch.long, device=device)

    return decode_tokens(tokens_out)

#################################
# PRZYKŁAD UŻYCIA (test generowania)
#################################
# Sample one record from the model, starting at the record-start marker.
sample_text = generate_text(model, start_text="<NPC_START>", max_length=100, temperature=0.8)
print("=== Przykładowo wygenerowany tekst ===")
print(sample_text)


Liczba unikalnych tokenów w słowniku: 7552
Łączna liczba tokenów w corpora: 1762407
LSTMGenerator(
  (embed): Embedding(7552, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=7552, bias=True)
)


Epoch 1/5:   0%|          | 118/55074 [00:20<2:42:56,  5.62batch/s, loss=6.9729]


KeyboardInterrupt: 

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

############################################
# 1. Wczytywanie CSV i formatowanie danych
############################################

# Location of the NPC dataset on disk.
CSV_FILE = "filtered_npc_data_trimmed.csv"
# Read the whole CSV into a DataFrame.
df = pd.read_csv(CSV_FILE)

# (column name, fallback text) pairs, in the order they are written to a record.
_NPC_FIELDS = (
    ("Name", "Unknown"),
    ("Gender", "Unknown"),
    ("Race", "Unknown"),
    ("Description", " "),
    ("Personality", " "),
    ("History", " "),
    ("Motivation", " "),
    ("Flaws", " "),
    ("Bonds", " "),
    ("Occupation", " "),
    ("Voice", " "),
)


def _is_missing(value):
    """Return True when ``value`` is a scalar missing marker (None / NaN / NA)."""
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values (lists, arrays) have no single truth value; treat
        # them as present and let str() render them.
        return False


def row_to_text(row):
    """
    Convert one record (anything with a dict-style ``.get``, e.g. a DataFrame
    row) into the format:

        <NPC_START>
        Name: ...
        Gender: ...
        ...
        <NPC_END>

    A field falls back to its default ('Unknown' for Name/Gender/Race, a single
    space for the rest) both when the column is absent and when the cell is
    empty. Empty CSV cells are loaded as NaN, which would otherwise end up in
    the text as the literal string "nan".

    Returns the record as a single newline-joined string.
    """
    lines = ["<NPC_START>"]
    for column, fallback in _NPC_FIELDS:
        value = row.get(column, fallback)
        if _is_missing(value):
            value = fallback
        lines.append(f"{column}: {value!s}")
    lines.append("<NPC_END>")
    return "\n".join(lines)

# One formatted NPC record per DataFrame row, in row order.
all_rows = [row_to_text(record) for _index, record in df.iterrows()]

# The training corpus is all records separated by newlines.
corpus_text = "\n".join(all_rows)

############################################
# 2. Tokenizacja i słownik
############################################

def simple_tokenizer(text):
    """Return the whitespace-separated tokens of ``text``."""
    whitespace_tokens = text.split()
    return whitespace_tokens

tokens = simple_tokenizer(corpus_text)

# Each distinct token gets the next free id the first time it is seen;
# dict.fromkeys removes repeats while preserving that first-seen order.
idx2word = list(dict.fromkeys(tokens))
word2idx = {token: position for position, token in enumerate(idx2word)}

vocab_size = len(word2idx)
print("Liczba unikalnych tokenów w słowniku:", vocab_size)

def encode(text):
    """Turn text into a list of integer token ids, skipping tokens not in the vocabulary."""
    known_tokens = (tok for tok in simple_tokenizer(text) if tok in word2idx)
    return [word2idx[tok] for tok in known_tokens]

def decode(id_list):
    """Turn a list of integer token ids back into space-separated text."""
    return " ".join(map(idx2word.__getitem__, id_list))

############################################
# 3. Dataset: sekwencyjne okienko
############################################

class TextDataset(Dataset):
    """
    Sliding-window next-token dataset over one flat sequence of token ids.

    Sample ``i`` is ``(ids[i : i + seq_len], ids[i + 1 : i + seq_len + 1])``,
    i.e. the target is the input shifted by one position. Both are
    ``torch.long`` tensors of length ``seq_len``.
    """

    def __init__(self, token_ids, seq_len=50):
        super().__init__()
        # Convert once up front so __getitem__ only slices, instead of building
        # two fresh tensors from Python lists for every sample.
        self.token_ids = torch.as_tensor(token_ids, dtype=torch.long)
        self.seq_len = seq_len
        # A corpus with at most seq_len tokens has no complete (input, target)
        # window; clamp so len() never becomes negative.
        self.num_samples = max(0, len(self.token_ids) - seq_len)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        # Reject out-of-range indices explicitly: slicing would otherwise hand
        # back truncated or empty windows without any error.
        if not 0 <= idx < self.num_samples:
            raise IndexError(f"index {idx} out of range for {self.num_samples} samples")
        end = idx + self.seq_len
        # Slices are views of the shared id tensor; batching stacks them into
        # new tensors.
        return self.token_ids[idx:end], self.token_ids[idx + 1 : end + 1]

seq_len = 50
batch_size = 32

# Whole corpus as one flat list of vocabulary ids.
all_token_ids = encode(corpus_text)

# Overlapping next-token windows, served in shuffled mini-batches.
dataset = TextDataset(all_token_ids, seq_len=seq_len)
loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

print("Łączna liczba tokenów:", len(all_token_ids))

############################################
# 4. Implementacja uproszczonego transformera
############################################

class PositionalEncoding(nn.Module):
    """
    Classic sinusoidal positional encoding (Vaswani et al.).

    Even feature columns hold ``sin(pos * w_k)`` and odd columns
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``. The table is
    precomputed for ``max_len`` positions and stored as the non-trainable
    buffer ``pe`` of shape (1, max_len, d_model).
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so the frequency vector is trimmed to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))  # shape (1, max_len, d_model)

    def forward(self, x):
        """
        Add the positional table to ``x``.

        x shape: (batch, seq_len, d_model), with seq_len no larger than the
        ``max_len`` the table was built for.
        Returns a tensor of the same shape.
        """
        seq_len = x.size(1)
        # The leading dimension of size 1 broadcasts over the batch.
        x = x + self.pe[:, :seq_len, :]
        return x


def subsequent_mask(size):
    """
    Build a (size, size) boolean mask that blocks attention to future tokens.

    Entry [i, j] is True (blocked) when j > i and False (allowed) otherwise,
    so every position may attend to itself and to earlier positions; only the
    part strictly above the diagonal is masked.
    """
    positions = torch.arange(size)
    # Column index greater than row index <=> strictly above the diagonal.
    return positions.unsqueeze(0) > positions.unsqueeze(1)


class TransformerModel(nn.Module):
    """
    Small decoder-only Transformer for auto-regressive generation.

    Built from PyTorch's TransformerDecoder stack; because that stack also
    expects an encoder output ("memory"), forward() feeds it an all-zero
    placeholder so that only the masked self-attention sees real input.
    """

    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, max_seq_len=512):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len=max_seq_len)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=0.1,
                activation='relu',
            ),
            num_layers=num_layers,
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x, tgt_mask=None):
        """
        x: token ids, shape (batch, seq_len)
        tgt_mask: (seq_len, seq_len) mask hiding future tokens, or None
        Returns logits of shape (batch, seq_len, vocab_size).
        """
        # Scaled token embeddings plus positions, still batch-first.
        scaled = self.embedding(x) * np.sqrt(self.d_model)   # (batch, seq_len, d_model)
        # The decoder stack here works sequence-first.
        seq_first = self.pos_encoder(scaled).transpose(0, 1)  # (seq_len, batch, d_model)

        # Placeholder encoder output: one all-zero memory slot per batch element.
        memory = torch.zeros(1, seq_first.size(1), self.d_model, device=seq_first.device)

        decoded = self.transformer_decoder(seq_first, memory, tgt_mask=tgt_mask)
        # Project to the vocabulary and return to batch-first layout.
        return self.fc_out(decoded).transpose(0, 1)


############################################
# 5. Trening
############################################

model = TransformerModel(vocab_size, d_model=256, nhead=4, num_layers=2, dim_feedforward=512, max_seq_len=512)
print(model)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 5

# Causal mask, built on first use and reused for every batch of the same
# length instead of being rebuilt on each iteration.
mask = None

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    count = 0

    pbar = tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch_x, batch_y in pbar:
        # batch_x, batch_y: (batch, seq_len)
        seq_length = batch_x.size(1)
        if mask is None or mask.size(0) != seq_length:
            # (seq_len, seq_len) mask that hides future tokens
            mask = subsequent_mask(seq_length).to(batch_x.device)

        optimizer.zero_grad()
        logits = model(batch_x, tgt_mask=mask)  # (batch, seq_len, vocab_size)
        # Cross-entropy expects (batch*seq_len, vocab_size) vs (batch*seq_len);
        # reshape copies only when the tensor is not already contiguous.
        loss = criterion(logits.reshape(-1, vocab_size), batch_y.reshape(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        count += 1
        pbar.set_postfix({"loss": f"{(total_loss/count):.4f}"})

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_loss = total_loss / max(count, 1)
    print(f"===> Epoch {epoch+1} done, avg_loss={avg_loss:.4f}")

# Save the weights (state_dict only, not the module object itself)
MODEL_PATH = "transformer_npc_model.pth"
torch.save(model.state_dict(), MODEL_PATH)
print(f"Model zapisany do: {MODEL_PATH}")

############################################
# 6. Generowanie nowego tekstu
############################################

def generate_text(model, start_text="<NPC_START>", max_length=100, temperature=1.0, device="cpu"):
    """
    Auto-regressive generation with the decoder-only transformer.

    model: module called as ``model(ids, tgt_mask=mask)`` and returning logits
        of shape (1, seq_len, vocab_size). It is switched to eval mode and left
        that way. NOTE(review): the model is not moved here; it is presumably
        expected to already live on ``device`` - confirm with callers.
    start_text: prompt, split on whitespace; words missing from ``word2idx``
        map to id 0. An empty prompt starts from the single token id 0.
    max_length: maximum number of tokens to sample after the prompt.
    temperature: divisor applied to the logits before softmax. ``0`` selects
        greedy decoding (argmax) instead of dividing by zero.
    device: device on which the input tensors and masks are created.

    Generation stops early once "<NPC_END>" is produced.
    Returns the prompt plus the generated tokens as one space-joined string.
    """
    model.eval()

    start_ids = [word2idx.get(t, 0) for t in start_text.split()]
    if not start_ids:
        # A zero-length sequence cannot be fed to the model.
        start_ids = [0]
    generated = torch.tensor([start_ids], dtype=torch.long, device=device)  # (1, len)

    # The positional table only covers a fixed number of positions; feeding a
    # longer sequence fails with a shape mismatch, so the model only ever sees
    # the most recent positions that fit.
    pos_table = getattr(getattr(model, "pos_encoder", None), "pe", None)
    max_context = pos_table.size(1) if pos_table is not None else None

    with torch.no_grad():
        for _ in range(max_length):
            context = generated if max_context is None else generated[:, -max_context:]
            # Mask hiding future tokens, sized to the current context.
            mask = subsequent_mask(context.size(1)).to(device)
            logits = model(context, tgt_mask=mask)  # (1, context_len, vocab_size)

            # Only the prediction for the last position decides the next token.
            last_logits = logits[0, -1, :]
            if temperature == 0:
                next_id = int(torch.argmax(last_logits))
            else:
                probs = torch.softmax(last_logits / temperature, dim=-1).cpu().numpy()
                next_id = int(np.random.choice(len(probs), p=probs))

            # Append the sampled token to the running sequence.
            next_token = torch.tensor([[next_id]], dtype=torch.long, device=device)
            generated = torch.cat([generated, next_token], dim=1)

            # Stop condition
            if idx2word[next_id] == "<NPC_END>":
                break

    out_ids = generated[0].tolist()
    return decode(out_ids)


# Sample generation: draw one record starting from the record-start marker.
sample = generate_text(model, start_text="<NPC_START>", max_length=100, temperature=0.8)
print("\n=== Przykładowe wygenerowane dane ===")
print(sample)


Liczba unikalnych tokenów w słowniku: 7552
Łączna liczba tokenów: 1762407
TransformerModel(
  (embedding): Embedding(7552, 256)
  (pos_encoder): PositionalEncoding()
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        

Epoch 1/5:   5%|▍         | 2692/55074 [11:51<3:50:46,  3.78it/s, loss=3.7299]


KeyboardInterrupt: 