**imports**

In [1]:
import os
import time
import nltk
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import random

# Data Preprocessing/Model Preparations/Helper Functions

In [2]:
#download Punkt tokenizer data - necessary for sentence tokenization, which word_tokenize needs
nltk.download("punkt")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#define hyperparams
BATCH_SIZE = 64 #good batch size for this task (raised to use more GPU and less CPU)
EMBED_SIZE = 256 #large enough for capturing word relationships but not too much to overfit
HIDDEN_SIZE = 512 #same here
EPOCHS = 20 #high number of epochs to ensure model trains well, will implement early stopping to counteract
PATIENCE = 3 #amount of epochs to wait for until early stop - good val, considering total epoch num
MAX_LEN = 60 #fits all data in dataset without truncating
PAD_VAL = 2 #for designating what id pad_token is (used in pad batch and cross entropy loss) #maybe make when vocab made
DROPOUT_RATE = 0.5 #to avoid overfitting
LEARNING_RATE = 0.0001 #10x below initial Adam optimizer learning rate (initial test showed valid loss stagnate by epoch 4, hopefully this makes it better)
#teacher forcing - gives true prev token as input to model at specified timesteps to help with learning and avoid errors chaining
TEACHER_FORCING_RATIO = 0.5 #50% chance true token will be used vs model's prev prediction (so not always learning from either one)
RANDOM = 213 #random state for reproducibility (I like using this num)

In [4]:
#randomness
random.seed(RANDOM)
torch.manual_seed(RANDOM) #remove for non-deterministic
np.random.seed(RANDOM)
torch.backends.cudnn.deterministic = True #ensures GPU operations are deterministic (only for training)
torch.backends.cudnn.benchmark = False #disables auto-tuning of algorithms (ensures more reproducibility)

In [5]:
#load GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#make directory to save final model
final_dir = "./final/seq2seq"
os.makedirs(final_dir, exist_ok=True)
#make directory to save final data
model_outputs_dir = "./model_outputs"
os.makedirs(model_outputs_dir, exist_ok=True)

#init special tokens - no mask tokens needed
sos_token = "<s>"
eos_token = "</s>"
pad_token = "<pad>"
unk_token = "<unk>"

In [6]:
#tokenize words w/ word_tokenize from nltk (handles punctuation & numbers properly (as separate tokens, good for natural text gen))
def tokenize(text):
    return word_tokenize(text.lower()) #bc of above, only making lowercase is needed (contractions were removed, so won't be an issue)

In [7]:
#build vocab with every word encountered so no loss in generation ability (unknown tokens would make it difficult)
def build_vocab(sentences):
    #init with special tokens
    vocab = {sos_token: 0, eos_token: 1, pad_token: 2, unk_token: 3}
    
    #iterate through all tokenized sentences
    for sent in sentences:
        for word in tokenize(sent):
            if word not in vocab:
                vocab[word] = len(vocab) #add to vocab with new id
                
    return vocab

In [8]:
#assign tokens to ids in sentences using vocab
def token_to_ids(sentence, vocab):
    #replace with <unk> if not in vocab, else id
    return [vocab.get(word, vocab[unk_token]) for word in tokenize(sentence)]

In [9]:
#put data in custom dataset so it works with DataLoader (won't work with anything without __len__ and __getitem__)
class ThoughtDataset(Dataset):
    def __init__(self, df, src_vocab, tgt_vocab):

        self.df = df.reset_index(drop=False) #stores original row indices
        #make input_output pairs
        self.pairs = list(zip(df["negative_thought"], df["reframed_thought"]))
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    
    def __len__(self):
        return len(self.pairs)

    
    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        index = self.df.loc[idx, "index"] #get preserved index
        
        #get tokens
        src_tokens = token_to_ids(src, self.src_vocab)
        tgt_tokens = [self.tgt_vocab[sos_token]] + token_to_ids(tgt, self.tgt_vocab) + [self.tgt_vocab[eos_token]]
        
        #make into pytorch tensors, send to GPU
        return torch.tensor(src_tokens).to(device), torch.tensor(tgt_tokens).to(device), index

In [10]:
#pad batches to make same size (use as collate function)
def pad_batch(batch):
    src_batch, tgt_batch, indices_batch = zip(*batch)
    
    #use pad sequence - finds longest sequence in batch to automatically pad to
    src_batch = pad_sequence(src_batch, padding_value=PAD_VAL, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_VAL, batch_first=True)
    
    return src_batch.to(device), tgt_batch.to(device), indices_batch

In [11]:
#encoder class (pytorch)
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim):
        super().__init__()
        
        #embedding layer
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=PAD_VAL)
        
        #gru time
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

    
    #call function (uses forward for pytorch methods)
    def forward(self, src):
        
        #get embeddings
        embed = self.embedding(src)
        
        #do gru
        enc_outputs, hidden_state = self.gru(embed)
        
        return enc_outputs, hidden_state

In [12]:
#attention - specifically additive (or bahdanau) attention class (pytorch)
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()

        #attention linear layer - concatenates decoder hidden state and enc_output
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        
        #linear layer that outputs scalar - computes score (or "energy") for each position in enc_output's sequence
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    
    def forward(self, hidden_state, enc_outputs):

        #init
        batch_size = enc_outputs.size(0)
        src_len = enc_outputs.size(1)

        #repeating hidden state for attention calc (add extra dim for score calc)
        hidden_state = hidden_state.unsqueeze(1).repeat(1, src_len, 1)
        
        #score time (non-linear transformat to attention layer, which gets hidden state and enc_output concat)
        score = torch.tanh(self.attn(torch.cat((hidden_state, enc_outputs), dim=2)))
        
        #get attention score (scalar) for each position in src sequence
        attention = self.v(score).squeeze(2)
        
        #get attention weights - prob dist from softmax
        attn_weights = torch.softmax(attention, dim=1)

        return attn_weights

In [13]:
#decoder class (pytorch)
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, attention):
        super().__init__()
        
        #init
        self.output_dim = output_dim
        self.attention = attention
        
        #embedding
        self.embedding = nn.Embedding(output_dim, embedding_dim, padding_idx=PAD_VAL)
        
        #gru
        self.gru = nn.GRU(embedding_dim + hidden_dim, hidden_dim, batch_first=True)
        
        #dropout
        self.dropout = nn.Dropout(p=DROPOUT_RATE)
        
        #fully connected output layer 
        self.out_layer = nn.Linear(embedding_dim + hidden_dim * 2, output_dim)

    
    def forward(self, tgt_input, hidden_state, enc_outputs):
        #add dim to input - ONLY if tgt_input is 1D beforehand
        if tgt_input.dim() == 1:
            tgt_input = tgt_input.unsqueeze(1)
        
        #embed
        embedded = self.embedding(tgt_input.to(device))
        
        #dropping out
        embedded = self.dropout(embedded)
        
        #get attention weights - which parts of enc_output to focus on
        attn_weights = self.attention(hidden_state[-1], enc_outputs).unsqueeze(1)
        
        #get context vector - shows decoder which parts to focus on
        context = torch.bmm(attn_weights, enc_outputs)
        
        #get input for GRU
        gru_input = torch.cat((embedded, context), dim=2)
        
        #gru time
        output, hidden_state = self.gru(gru_input, hidden_state)
        
        #get output of output layer (logits for each word in vocab at current time step)
        dec_outputs = self.out_layer(torch.cat((output.squeeze(1), context.squeeze(1), embedded.squeeze(1)), dim=1))
        
        return dec_outputs, hidden_state

In [14]:
#seq2seq class (pytorch)
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        
        #init encoder and decoder
        self.encoder = encoder
        self.decoder = decoder

    
    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        #init
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        tgt_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(device)

        #do encoder
        enc_outputs, hidden_state = self.encoder(src)
        
        #get tgt input
        tgt_input = tgt[:, 0].to(device)
        
        #do decoder
        for t in range(1, tgt_len):
            #get decoder output & store it
            dec_output, hidden_state = self.decoder(tgt_input, hidden_state, enc_outputs)
            outputs[:, t] = dec_output
            
            #decide whether teacher forces or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get top predicted token
            pred_token = dec_output.argmax(1)
            
            #use either true target token or model predicted token as input
            tgt_input = tgt[:, t].to(device) if teacher_force else pred_token.to(device)

        return outputs

In [15]:
#load raw dataset
train_df = pd.read_csv("./data/train_data.csv")
valid_df = pd.read_csv("./data/valid_data.csv")

#build vocab off of training data - including inverse_vocab for target (for making readable at end)
src_vocab = build_vocab(train_df["negative_thought"])
tgt_vocab = build_vocab(train_df["reframed_thought"])
inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}

#wrap in dataset
train_dataset = ThoughtDataset(train_df, src_vocab, tgt_vocab)
valid_dataset = ThoughtDataset(valid_df, src_vocab, tgt_vocab)

#use DataLoader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_batch)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_batch)

In [16]:
#init model - encoder, attention mechanism, decoder, and entire seq2seq model
encoder = Encoder(len(src_vocab), EMBED_SIZE, HIDDEN_SIZE)
attention = Attention(HIDDEN_SIZE)
decoder = Decoder(len(tgt_vocab), EMBED_SIZE, HIDDEN_SIZE, attention)
model = Seq2Seq(encoder, decoder).to(device)

#get optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) #adam optimizer
criterion = nn.CrossEntropyLoss(ignore_index=PAD_VAL) #ignore padding tokens for loss calc

# Model Training

In [17]:
#for early stop
best_valid_loss = float("inf") #init with high valid loss
stagnating_epochs = 0 #epoch num with no improvement

#training
for epoch in range(EPOCHS):
    epoch_start = time.time() #how long whole epoch takes
    train_start = time.time() #how long training takes
    model.train() #training mode
    train_loss = 0 #init loss
    
    for src, tgt, _ in train_loader:
        optimizer.zero_grad() #clear optimized tensor gradients that accumulate in backward pass
        
        output = model(src, tgt, TEACHER_FORCING_RATIO) #get output token
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1) #get true token
        
        loss = criterion(output, tgt) #compare (loss time)
        loss.backward() #backpropagation!!!
        optimizer.step() #update weights
        train_loss += loss.item() #add to loss
        
    print(f"Epoch {epoch+1}, Training Loss: {train_loss/len(train_loader):.4f}")
    train_end = time.time()

    #validation loop fr
    valid_start = time.time() #how long validation takes
    model.eval() #bc we're predicting rn
    valid_loss = 0

    with torch.no_grad():
        for src, tgt, _ in valid_loader:
            #same thing but no teacher forcing and no backprop/updating weights
            output = model(src, tgt, 0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            tgt = tgt[:, 1:].reshape(-1)
            
            loss = criterion(output, tgt)
            valid_loss += loss.item() #add to loss

    print(f"Epoch {epoch+1}, Validation Loss: {valid_loss/len(valid_loader):.4f}")
    valid_end = time.time()
    epoch_end = time.time()

    print(f"Epoch {epoch + 1} summary:")
    print(f"  Training time: {train_end - train_start:.2f} seconds")
    print(f"  Validation time: {valid_end - valid_start:.2f} seconds")
    print(f"  Total epoch time: {epoch_end - epoch_start:.2f} seconds\n")

    #early stop check! (if better valid loss, update. otherwise, update stagnating epochs) 
    if valid_loss/len(valid_loader) < best_valid_loss:
        best_valid_loss = valid_loss/len(valid_loader) 
        stagnating_epochs = 0 #reset
        print(f"Validation Loss improved!")
        #save model with best validation loss (in case of early stop)
        torch.save({
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": loss.item()
        }, os.path.join(final_dir, "best_model.pth"))
    else:
        stagnating_epochs += 1 #increase
        print(f"No Validation Loss Improvement. Num of epochs with no improvement: {stagnating_epochs}/{PATIENCE}.")

    #if patience reached, end
    if stagnating_epochs >= PATIENCE:
        print(f"Early stopping triggered: No improvement in Validation Loss for {stagnating_epochs} epochs.")
        break

Epoch 1, Training Loss: 5.9060
Epoch 1, Validation Loss: 5.6701
Epoch 1 summary:
  Training time: 88.00 seconds
  Validation time: 13.09 seconds
  Total epoch time: 101.09 seconds

Validation Loss improved!
Epoch 2, Training Loss: 5.2763
Epoch 2, Validation Loss: 5.6401
Epoch 2 summary:
  Training time: 87.62 seconds
  Validation time: 13.08 seconds
  Total epoch time: 100.70 seconds

Validation Loss improved!
Epoch 3, Training Loss: 5.1053
Epoch 3, Validation Loss: 5.6021
Epoch 3 summary:
  Training time: 88.30 seconds
  Validation time: 13.19 seconds
  Total epoch time: 101.49 seconds

Validation Loss improved!
Epoch 4, Training Loss: 4.9167
Epoch 4, Validation Loss: 5.4126
Epoch 4 summary:
  Training time: 87.63 seconds
  Validation time: 13.23 seconds
  Total epoch time: 100.86 seconds

Validation Loss improved!
Epoch 5, Training Loss: 4.6535
Epoch 5, Validation Loss: 5.2197
Epoch 5 summary:
  Training time: 87.40 seconds
  Validation time: 13.29 seconds
  Total epoch time: 100.69 

# Model Generation

In [18]:
#evaluation
checkpoint = torch.load(os.path.join(final_dir, "best_model.pth")) #load best model (in case early stopping or no valid loss improvement at the end)
model.load_state_dict(checkpoint['model_state_dict']) #load model params into the model

  checkpoint = torch.load(os.path.join(final_dir, "best_model.pth")) #load best model (in case early stopping or no valid loss improvement at the end)


<All keys matched successfully>

In [19]:
#load test data
test_df = pd.read_csv("./data/test_data.csv")

#wrap in dataset and use in DataLoader
test_dataset = ThoughtDataset(test_df, src_vocab, tgt_vocab)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=pad_batch)

In [None]:
#init
generated_texts = []
true_texts = []
original_texts = []
indices = []

test_start = time.time() #how long testing takes
model.eval() #eval time
with torch.no_grad(): #no gradients computed for predict
    
    for src, tgt, idx in test_loader:
        #encode time (use final state for input for predict)
        enc_outputs, hidden_state = model.encoder(src)
        
        #get target input for decoding - and batch size so first dim matches up
        batch_size = src.size(0)
        tgt_input = torch.tensor([tgt_vocab[sos_token]] * batch_size).to(device)
        outputs = [[] for _ in range(batch_size)] #init proper size
        finished = [False] * batch_size
        
        for _ in range(MAX_LEN):
            #decode time
            output, hidden_state = model.decoder(tgt_input, hidden_state, enc_outputs)
            output = output.squeeze(1) #not needed, but good just in case
            
            #get top 1 pred token for each one
            pred_tokens = output.argmax(dim=1)

            for i in range(batch_size):
                if not finished[i]:
                    token = pred_tokens[i].item()
                    if token == tgt_vocab[eos_token]: #if end of sequence
                        finished[i] = True
                    else:
                        outputs[i].append(token) #add to output
            
            if all(finished):
                break
                
            tgt_input = pred_tokens.unsqueeze(1)

        #decode into readable text - generated text = pred reframe
        generated = " ".join([inv_tgt_vocab.get(tok, '') for tok in outputs[i]])
        
        #true text = dataset reframe
        true = " ".join([inv_tgt_vocab.get(tok.item(), '') for tok in tgt[0] if tok.item() not in [tgt_vocab[pad_token], tgt_vocab[sos_token], tgt_vocab[eos_token]]])
        
        #original text = negative thought (kept just in case tokenization/reconstruction changes it in meaningful way, good to keep a copy in case)
        original = " ".join([list(src_vocab.keys())[list(src_vocab.values()).index(tok.item())] for tok in src[0] if tok.item() != PAD_VAL])
        
        generated_texts.append(generated)
        true_texts.append(true)
        original_texts.append(original)
        indices.append(idx)

test_end = time.time()
print(f"Testing time: {test_end - test_start:.2f} seconds")

In [21]:
#save to csv
output_df = pd.DataFrame({
    "Original_Index": indices,
    "Original_Text": original_texts,
    "True_Text": true_texts,
    "Generated_Text": generated_texts
})
#sort by index to restore original csv order first
output_df = output_df.sort_values(by="Original_Index").reset_index(drop=True)
output_df.drop(columns=["Original_Index"], inplace=True) #not needed after sort
output_df.to_csv(model_outputs_dir + "/generated_output_seq2seq.csv", index=False)