**imports**

In [1]:
import os
import time
import nltk
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import random

# Data Preprocessing/Model Preparations/Helper Functions

In [2]:
#download Punkt tokenizer data - necessary for sentence tokenization, which word_tokenize needs
nltk.download("punkt")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#define hyperparams
BATCH_SIZE = 64 #good batch size for this task
EMBED_SIZE = 128 #large enough for capturing word relationships but not too much to overfit (was overfitting with 256)
HIDDEN_SIZE = 256 #same here
EPOCHS = 40 #very high number of epochs to ensure model trains well, will implement early stopping to counteract
PATIENCE = 3 #amount of epochs to wait for until early stop - good val, considering total epoch num
MAX_LEN = 140 #fits all data in dataset without truncating
PAD_VAL = 2 #for designating what id pad_token is (used in pad batch and cross entropy loss) #maybe make when vocab made
DROPOUT_RATE = 0.6 #to avoid overfitting (increase slightly)
LEARNING_RATE = 0.0005 #below initial Adam optimizer lr (model was overfitting, trying to fix)
MAX_NORM = 1.0 #ensures gradients don't get too large
TEMPERATURE=0.8 #controls randomness (higher = more random but less accurate/coherent - 0.8 is balanced choice)
RANDOM = 213 #random state for reproducibility (I like using this num)

#randomness
random.seed(RANDOM)
torch.manual_seed(RANDOM) #remove for non-deterministic
np.random.seed(RANDOM)
torch.backends.cudnn.deterministic = True #ensures GPU operations are deterministic (only for training)
torch.backends.cudnn.benchmark = False #disables auto-tuning of algorithms (ensures more reproducibility)

#init special tokens - no mask tokens needed
sos_token = "<s>"
eos_token = "</s>"
pad_token = "<pad>"
unk_token = "<unk>"
mask_token = "<mask>" #used to differentiate between negative and positive thought

In [4]:
#load GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#make directory to save final model
final_dir = "./final/gru"
os.makedirs(final_dir, exist_ok=True)
#make directory to save final data
model_outputs_dir = "./model_outputs"
os.makedirs(model_outputs_dir, exist_ok=True)

In [5]:
#tokenize words w/ word_tokenize from nltk (handles punctuation & numbers properly (as separate tokens, good for natural text gen))
def tokenize(text):
    return word_tokenize(text.lower()) #bc of above, only making lowercase is needed (contractions were removed, so won't be an issue)

In [6]:
#build vocab with every word encountered so no loss in generation ability (unknown tokens would make it difficult)
def build_vocab(sentences):
    #init with special tokens
    vocab = {sos_token: 0, eos_token: 1, pad_token: 2, unk_token: 3, mask_token: 4}
    
    #iterate through all tokenized sentences
    for sent in sentences:
        for word in tokenize(sent):
            if word not in vocab:
                vocab[word] = len(vocab) #add to vocab with new id
                
    return vocab

In [7]:
#assign tokens to ids in sentences using vocab
def token_to_ids(sentence, vocab):
    #replace with <unk> if not in vocab, else id
    return [vocab.get(word, vocab[unk_token]) for word in tokenize(sentence)]

In [8]:
#put data in custom dataset so it works with DataLoader (won't work with anything without __len__ and __getitem__)
class ReframedThoughtsDataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df.reset_index(drop=False)  #store original index
        #get vocab and sentences
        self.input_sentences = df["negative_thought"]
        self.target_sentences = df["reframed_thought"]
        self.vocab = vocab

    
    def __len__(self):
        return len(self.input_sentences)

    
    def __getitem__(self, idx):
        #get tokens (using mask token as separator but ensuring it doesn't get passed through tokenizer)
        tokens = [self.vocab[sos_token]] + token_to_ids(self.input_sentences[idx], self.vocab) + [self.vocab[mask_token]] + token_to_ids(self.target_sentences[idx], self.vocab) + [self.vocab[eos_token]]
        #convert to pytorch tensor
        tokens = torch.tensor(tokens, dtype=torch.long)
        index = self.df.loc[idx, "index"] #get index
        
        #send to GPU
        return tokens.to(device), index

In [9]:
#pad batches to make same size (use as collate function)
def pad_batch(batch):
    #separate tokens and indices in the batch (so they don't get mixed up)
    tokens_batch = [item[0] for item in batch]  #list of tokenized sequences
    indices_batch = [item[1] for item in batch]  #list of indices
    
    #use pad sequence - finds longest sequence in batch to automatically pad to
    batch = pad_sequence(tokens_batch, padding_value=PAD_VAL, batch_first=True)
    
    #remove last token from input and first token from output (setting up next-token prediction)
    inputs = batch[:, :-1]
    targets = batch[:, 1:]
    
    return inputs.to(device), targets.to(device), indices_batch

In [10]:
#gru based rnn language model
class GRULanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        #embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_VAL)
        
        #gru time - 2 layers to make more complex (prevent underfitting)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        
        #dropout
        self.dropout = nn.Dropout(p=DROPOUT_RATE)
        
        #fully connected output layer (map hidden state back to vocab for prediction) 
        self.out_layer = nn.Linear(hidden_dim, vocab_size)
        
    
    def forward(self, x):  
        #embed + dropout
        embedded = self.dropout(self.embedding(x))

        #gru time - don't need hidden state here
        output, _ = self.gru(embedded)
        
        #get logits from output layer
        logits = self.out_layer(output)
        
        return logits

In [11]:
#load raw dataset
train_df = pd.read_csv("./data/train_data.csv")
valid_df = pd.read_csv("./data/valid_data.csv")

#build vocab off of training data - including inverse_vocab for target (for making readable at end)
vocab = build_vocab(train_df["negative_thought"])
inverse_vocab = {v: k for k, v in vocab.items()}

In [12]:
#wrap in dataset
train_dataset = ReframedThoughtsDataset(train_df, vocab)
valid_dataset = ReframedThoughtsDataset(valid_df, vocab)

#use DataLoader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_batch)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_batch)

In [13]:
#init language model
model = GRULanguageModel(len(vocab), EMBED_SIZE, HIDDEN_SIZE).to(device)

#get optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) #adam optimizer
criterion = nn.CrossEntropyLoss(ignore_index=PAD_VAL) #ignore padding tokens for loss calc

# Model Training

In [14]:
#for early stop
best_valid_loss = float("inf") #init with high valid loss
stagnating_epochs = 0 #epoch num with no improvement

#training
for epoch in range(EPOCHS):
    epoch_start = time.time() #how long whole epoch takes
    train_start = time.time() #how long training takes
    model.train() #training mode
    train_loss = 0 #init loss
    
    for input_token, next_token, _ in train_loader:
        optimizer.zero_grad() #clear optimized tensor gradients that accumulate in backward pass

        #ensure input token and next token are on same device as the model
        input_token = input_token.to(device)
        next_token = next_token.to(device)
        
        #get output token
        logits = model(input_token)
        logits_dim = logits.shape[-1]
        logits = logits.reshape(-1, logits_dim)
        
        #get true next token
        next_token = next_token.reshape(-1)
        
        loss = criterion(logits, next_token) #compare (loss time)
        loss.backward() #backpropagation!!!

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_NORM) #gradient clipping (to be safe with a weaker model)
        
        optimizer.step() #update weights
        train_loss += loss.item() #add to loss
        
    print(f"Epoch {epoch+1}, Training Loss: {train_loss/len(train_loader):.4f}")
    train_end = time.time()

    #validation loop fr
    valid_start = time.time() #how long validation takes
    model.eval() #bc we're predicting rn
    valid_loss = 0

    with torch.no_grad():
        for input_token, next_token, _ in valid_loader:
            #same thing but no backprop/updating weights
            input_token = input_token.to(device)
            next_token = next_token.to(device)
            
            logits = model(input_token)
            logits_dim = logits.shape[-1]
            logits = logits.reshape(-1, logits_dim)
            
            next_token = next_token.reshape(-1)
            
            loss = criterion(logits, next_token)
            valid_loss += loss.item()

    print(f"Epoch {epoch+1}, Validation Loss: {valid_loss/len(valid_loader):.4f}")
    valid_end = time.time()
    epoch_end = time.time()

    print(f"Epoch {epoch + 1} summary:")
    print(f"  Training time: {train_end - train_start:.2f} seconds")
    print(f"  Validation time: {valid_end - valid_start:.2f} seconds")
    print(f"  Total epoch time: {epoch_end - epoch_start:.2f} seconds\n")

    #early stop check! (if better valid loss, update. otherwise, update stagnating epochs) 
    if valid_loss/len(valid_loader) < best_valid_loss:
        best_valid_loss = valid_loss/len(valid_loader) 
        stagnating_epochs = 0 #reset
        print(f"Validation Loss improved!")
        #save model with best validation loss (in case of early stop)
        torch.save({
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": loss.item()
        }, os.path.join(final_dir, "best_model.pth"))
    else:
        stagnating_epochs += 1 #increase
        print(f"No Validation Loss Improvement. Num of epochs with no improvement: {stagnating_epochs}/{PATIENCE}.")

    #if patience reached, end
    if stagnating_epochs >= PATIENCE:
        print(f"Early stopping triggered: No improvement in Validation Loss for {stagnating_epochs} epochs.")
        break

Epoch 1, Training Loss: 5.6326
Epoch 1, Validation Loss: 4.9868
Epoch 1 summary:
  Training time: 13.86 seconds
  Validation time: 2.42 seconds
  Total epoch time: 16.28 seconds

Validation Loss improved!
Epoch 2, Training Loss: 4.7929
Epoch 2, Validation Loss: 4.4541
Epoch 2 summary:
  Training time: 13.12 seconds
  Validation time: 2.48 seconds
  Total epoch time: 15.61 seconds

Validation Loss improved!
Epoch 3, Training Loss: 4.4185
Epoch 3, Validation Loss: 4.2307
Epoch 3 summary:
  Training time: 13.19 seconds
  Validation time: 2.40 seconds
  Total epoch time: 15.59 seconds

Validation Loss improved!
Epoch 4, Training Loss: 4.2234
Epoch 4, Validation Loss: 4.0928
Epoch 4 summary:
  Training time: 13.16 seconds
  Validation time: 2.46 seconds
  Total epoch time: 15.62 seconds

Validation Loss improved!
Epoch 5, Training Loss: 4.0919
Epoch 5, Validation Loss: 4.0161
Epoch 5 summary:
  Training time: 13.08 seconds
  Validation time: 2.43 seconds
  Total epoch time: 15.52 seconds

V

# Model Generation

In [15]:
#evaluation
checkpoint = torch.load(os.path.join(final_dir, "best_model.pth")) #load best model (in case early stopping or no valid loss improvement at the end)
model.load_state_dict(checkpoint['model_state_dict']) #load model params into the model

  checkpoint = torch.load(os.path.join(final_dir, "best_model.pth")) #load best model (in case early stopping or no valid loss improvement at the end)


<All keys matched successfully>

In [16]:
#load test data
test_df = pd.read_csv("./data/test_data.csv")

#wrap in dataset and use in DataLoader
test_dataset = ReframedThoughtsDataset(test_df, vocab)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=pad_batch) #for testing all at once (without doing the batch predict thing)

In [17]:
#init
generated_texts = []
true_texts = []
original_texts = []
indices = []

test_start = time.time() #how long testing takes
model.eval() #eval time
with torch.no_grad(): #no gradients computed for predict
    for idx in range(len(test_dataset)): #iterates over whole dataset (since batch size is 1 and test dataset needs to be iterated over)
        input_seq, idx = test_dataset[idx] #get original test input & index
        input_token = torch.tensor([vocab[sos_token]] + input_seq.tolist(), device=device).unsqueeze(0) #safer way of making pytorch tensor and sending to GPU
        generated = []
        
        for _ in range(MAX_LEN):
            logits = model(input_token) #get logits
            next_token = logits[:, -1].argmax(-1).item() #get next token from logits

            if next_token == vocab[mask_token]: #if mask token has been hit
                if next_token in generated: #if mask token is already in generated (don't want multiple, only the one separating negative and positive thoughts)
                    #clone logits and make mask token logits very low (will pick 2nd most prob token instead)
                    logits_no_mask = logits.clone()
                    logits_no_mask[:, -1, vocab[mask_token]] = float("-inf")
                    next_token = logits_no_mask[:, -1].argmax(-1).item()
                    
                else: #once mask is hit, stop adding to input token
                    input_token = torch.cat((input_token, torch.tensor([[next_token]], device=device)), dim=1)
                    continue
                
            if next_token == vocab[eos_token]: #check if end of sequence
                    break
    
            generated.append(next_token)
    
            input_token = torch.cat((input_token, torch.tensor([[next_token]], device=device)), dim=1) #add next token to input
    
        #decode into readable text - generated and original/true text
        generated_text = " ".join([inverse_vocab[t] for t in generated])
        generated_texts.append(generated_text)
        
        all_true_text = " ".join([inverse_vocab.get(tok.item(), '') for tok in input_seq if tok.item() not in [vocab[pad_token], vocab[sos_token], vocab[eos_token]]])
        if len(all_true_text.split(mask_token)) == 2: #ensuring only one mask token per sequence (shouldn't be necessary, but defensive programming tactic)
            #get original text (negative thought) and true text (reframed thought) separated for easier comparison
            original_text, true_text = all_true_text.split(mask_token)
            true_texts.append(true_text)
            original_texts.append(original_text)

        indices.append(idx) #store index
            

test_end = time.time()
print(f"Testing time: {test_end - test_start:.2f} seconds")

Testing time: 53.51 seconds


In [18]:
#save to csv
output_df = pd.DataFrame({
    "Original_Index": indices,
    "Original_Text": original_texts,
    "True_Text": true_texts,
    "Generated_Text": generated_texts
})
#sort by index to restore original csv order first
output_df = output_df.sort_values(by="Original_Index").reset_index(drop=True)
output_df.drop(columns=["Original_Index"], inplace=True) #not needed after sort
output_df.to_csv(model_outputs_dir + "/generated_output_gru.csv", index=False)