In [1]:
import torch
import numpy as np
import torch.nn as nn
import math
import os
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
import json
import copy
from torch import nn, Tensor


SRC_VOCAB_SIZE = 3
TGT_VOCAB_SIZE = 3
D_MODEL = 252
PADDING_TOKEN = 0
UNK_TOKEN = 1
SOS_TOKEN = 2
EOS_TOKEN = 3
MAX_LENGTH = 200
BATCH_SIZE = 0
src_tokens = {}
tgt_tokens = {}

file_path = os.path.join(os.path.expanduser("~"), "", "mental_health.csv")
orig_dataset = pd.read_csv(file_path) #read csv file as pandas object
orig_dataset = orig_dataset.to_numpy()
for example in range(len(orig_dataset)): 
    for cont_response in range(2): #context than response
        if type(orig_dataset[example][cont_response]) == float: #NaN values
            continue
        cur_Sentence = orig_dataset[example][cont_response].split() #seperate by word
        if cont_response == 0:
            for word in cur_Sentence:
                if word not in src_tokens: src_tokens[word] = 1
                else: src_tokens[word] +=1
        else:
            for word in cur_Sentence:
                if word not in tgt_tokens: tgt_tokens[word] = 1
                else: tgt_tokens[word] +=1
            
SRC_TOKENS = {}
TGT_TOKENS = {}
token = 4
for word in src_tokens:
    if src_tokens[word]>2:
        SRC_TOKENS[word] = token
        token+=1
        SRC_VOCAB_SIZE+=1
        
token = 4
for word in tgt_tokens:
    if tgt_tokens[word]>2:
        TGT_TOKENS[word] = token
        token+=1
        TGT_VOCAB_SIZE+=1
        
        
        
        
if not os.path.exists('context.txt'):
    unfiltered_trainingSet =  orig_dataset[np.random.choice(orig_dataset.shape[0], 1000, replace=True)] #extract training set
    trainingSet = []
    for example in unfiltered_trainingSet:
        if type(example[0]) is not float and type(example[1]) is not float:
            if len(example[0].split())<200 and len(example[1].split())<200:
                trainingSet.append(example)

    contextSet = [trainingSet[i][0] for i in range(len(trainingSet))] 
    responseSet = [trainingSet[i][1] for i in range(len(trainingSet))]
    contextSet_tokenized =[[SRC_TOKENS[word] if word in SRC_TOKENS else UNK_TOKEN for word in example.split()] 
                             for example in contextSet]

    # print(contextSet_tokenized)
    # for i in contextSet_tokenized[0]:
    #     for x in SRC_TOKENS:
    #         if SRC_TOKENS[x] == i:
    #             print(x)

    responseSet_tokenized =[[TGT_TOKENS[word] if word in TGT_TOKENS else UNK_TOKEN for word in example.split()] 
                             for example in responseSet]

    # print(responseSet_tokenized)
    # for i in responseSet_tokenized[0]:
    #     for x in TGT_TOKENS:
    #         if TGT_TOKENS[x] == i:
    #             print(x)

    label =copy.deepcopy(responseSet_tokenized)
    #set up special tokens
    for i in range(len(contextSet_tokenized)):
        while len(contextSet_tokenized[i])!= MAX_LENGTH:
            contextSet_tokenized[i].append(PADDING_TOKEN)
    for i in range(len(responseSet_tokenized)):
        responseSet_tokenized[i].insert(0, SOS_TOKEN)
        label[i].append(EOS_TOKEN)
    for i in range(len(responseSet_tokenized)):
        while len(responseSet_tokenized[i]) != MAX_LENGTH:
            responseSet_tokenized[i].append(PADDING_TOKEN)
            label[i].append(PADDING_TOKEN)



    json.dump(responseSet_tokenized, open("response.txt",'w'))
    json.dump(contextSet_tokenized, open("context.txt",'w'))
    json.dump(label, open("labels.txt",'w'))
    json.dump(SRC_TOKENS, open("source_tokens.txt",'w'))
    json.dump(TGT_TOKENS, open("target_tokens.txt",'w'))
else:
    # Load data from files
    with open('response.txt', 'r') as f:
        responseSet_tokenized = json.load(f)

    with open('context.txt', 'r') as f:
        contextSet_tokenized = json.load(f)

    with open('labels.txt', 'r') as f:
        label = json.load(f)

    with open('source_tokens.txt', 'r') as f:
        SRC_TOKENS = json.load(f)

    with open('target_tokens.txt', 'r') as f:
        TGT_TOKENS = json.load(f)




BATCH_SIZE = len(contextSet_tokenized) // 40 
mod = len(contextSet_tokenized) % BATCH_SIZE
if mod:
    reduce = len(contextSet_tokenized) - (BATCH_SIZE*40)
    responseSet_tokenized = responseSet_tokenized[: len(responseSet_tokenized)-reduce]
    contextSet_tokenized = contextSet_tokenized[: len(contextSet_tokenized)-reduce]
    label = label[: len(label)-reduce]
    
print(f"Target Vocab Size: {TGT_VOCAB_SIZE}")
print(f"Total Training Size: {len(responseSet_tokenized)}")    
print(f"Batch Size: {BATCH_SIZE}")

Target Vocab Size: 10439
Total Training Size: 640
Batch Size: 16


In [2]:
class InputEmbeddings(nn.Module):

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embedding(x) * math.sqrt(self.d_model)
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model = D_MODEL, seq_len = MAX_LENGTH, dropout = 0.1) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model))
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)
        return self.dropout(x)


In [3]:
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model: int, h: int, dropout: float)->None:
        super().__init__()
        self.d_model = d_model # Embedding vector size
        self.h = h # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"
        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # linear layer for queue
        self.w_k = nn.Linear(d_model, d_model, bias=False) # liner layer for key
        self.w_v = nn.Linear(d_model, d_model, bias=False) # linear layer for 
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)
                 
    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        #d_k is number of embeddings per word for each head
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        # [[[[4,3],[2,1]],[[2, 5], [3,2]]]] * [[[[4,3][2,5]],[[2,1], [3,2]]]] = 
        #[[[[0.7, 0.3], [0.1, 0.9]], [0.8, 0.2], [0.4, 0.6]]]
        #basically saying the first word of the first half of the embedding(for head 1) is related to
        #to its own word by 0.7 and related to the first half embedding of the second word by 0.3
        #also saying the second word of the first half of the embedding(for head 1) is related to
        #to its own word by 0.9 and related to the first half embedding of the first word by 0.1
        #essentially a attention matrix for each part of the embedding for the head
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        #if mask is true, then make all appropriate masked attentions scores to very low value
        #so that the soft max will ignore their scores
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len) # Apply softmax
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores
    
    def forward(self, q, k, v, mask):
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        #[[[4,3,2,1] , [2,5,3,2]]]
        #3rd dimension represents word embeddings of a word
        #2nd dimension represent each word embedding in an example
        #1st dimension represents each example containing each word embedding
        # * Transform the dimensions using .view *
        # [[[[4,3],[2,1]],[[2, 5], [3,2]]]]
        #4th dimenension represents the size of the split embeddings for each multi-head attention
        #3rd dimension represents the number of heads. Note that head times the size of split embeddings equals original embedding dimension
        #2nd dimension represents sequence length
        #1st dimension represents example lenght
        # * Transponse dimensions 1,2 *
        # [[[[4,3][2,5]],[[2,1], [3,2]]]]
        # 4th dimension represents the size of the split embeddings for each multi-head attention
        # 3rd dimension represents the length of the sequence. Contains each word with its associated head
        # 2nd dimension represents the number of heads
        # 1st dimension is the number of examples
        #[[[[word 1 with first half embeddings][word 2 with first half embedding]],[[word 1 with second half], [word 2 with second half]]]]        
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)
        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)
        #pass combined heads to last linear layer
        return self.w_o(x)



In [4]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        #the output of the self.linear1 layer is the input of a relu layer
        #the output of that is the input of linear layer 2
     
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

In [5]:
#normalizing the data for stabilized training
class LayerNormalization(nn.Module):
    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias


In [6]:
class ResidualConnection(nn.Module):
    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)
    def forward(self, x, sublayer):
        # in the paper, you would typically add x to the sublayer and then normalize the output
        #of the sublayer, but in this case, we normalize before passing it in to the sublayer
        #the .norm part is the normalizing part in ADD and NORM, and the addition of x is the 
        #ADD part of ADD and NORM
        return x + self.dropout(sublayer(self.norm(x)))


In [7]:
class EncoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        #encoder block contains 2 residual connections
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        #Passes self attention block to residuals. Residuals will perform the attention block
        #and also perform the ADD and NORM
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        #passes the output of the previous residual_connection layer('x'). Then, passes in a feed_forward_block
        #Residual connection will perform the feed forward and ADD and NORM
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x
    

In [8]:
class Encoder(nn.Module):
    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        #layers of encoder blocks
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        #perform a forward method on every encoder block. The output of each encoder
        #becomes the input of the new encoder
        for layer in self.layers:
            x = layer(x, mask)
        #normalize the final output
        return self.norm(x)

In [9]:
class DecoderBlock(nn.Module):
    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])
    
    #src mask are for the encoder output. Do not want to attend to padding tokens
    #tgt mask are for decoder input. Doesn't let you look into the future
    #typically these just always be true I think
    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

In [10]:
class Decoder(nn.Module):
    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)
    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

In [11]:
#linear layer where output represents all words. We are going to soft max this 
#in the future. 
class ProjectionLayer(nn.Module):
    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)
    

In [12]:
class Transformer(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings ,src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer, src_pad_indx = 0, device = "cpu") -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer
        self.src_pad_idx = src_pad_indx
        self._pad_idx = src_pad_indx
        self.device = device

    def make_src_mask(self, src_tokens):
        src_mask = (src_tokens != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)
    
    def make_trg_mask(self, trg_tokens):
        N, trg_len = trg_tokens.shape
        trg_pad_mask = (trg_tokens != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        trg_pad_mask= trg_pad_mask.to(self.device)
        trg_sub_mask= trg_sub_mask.to(self.device)
        trg_mask = trg_pad_mask & trg_sub_mask
        # print(trg_mask)
        
        # N, trg_len = trg_tokens.shape
        # trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
        #     N, 1, trg_len, trg_len
        # )

        return trg_mask.to(self.device)

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    
    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)

In [13]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048, device = "cpu") -> Transformer:

    # Create the embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size).to(device)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size).to(device)
    # Create the positional encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout).to(device)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout).to(device)

    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout).to(device)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout).to(device)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout).to(device)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout).to(device)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout).to(device)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout).to(device)
        decoder_block = DecoderBlock(d_model, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout).to(device)
        decoder_blocks.append(decoder_block)
    
    # Create the encoder and decoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks)).to(device)
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks)).to(device)
    
    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size).to(device)
    
    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer, device=device).to(device)
    
    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    
    return transformer

In [17]:
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# device = "cpu"
transformer = build_transformer( src_vocab_size = SRC_VOCAB_SIZE, 
                                tgt_vocab_size = TGT_VOCAB_SIZE, src_seq_len = MAX_LENGTH, 
                                tgt_seq_len = MAX_LENGTH, d_model = D_MODEL, N = 8 , h=6, dropout=0.1, device=device).to(device)

split_contextSet_tokenized = [contextSet_tokenized[i: i+BATCH_SIZE] for i in range(0, len(contextSet_tokenized), BATCH_SIZE)]
split_responseSet_tokenized = [responseSet_tokenized[i: i+BATCH_SIZE] for i in range(0, len(responseSet_tokenized), BATCH_SIZE)]
split_label = [label[i: i+BATCH_SIZE] for i in range(0, len(label), BATCH_SIZE)]
print(len(split_contextSet_tokenized[0]))
split_contextSet_tokenized = torch.tensor(split_contextSet_tokenized)
split_responseSet_tokenized = torch.tensor(split_responseSet_tokenized)
split_label = torch.tensor(split_label)
# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=PADDING_TOKEN).to(device)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001,  eps=1e-9)
num_epochs = 500


cuda
16


In [23]:

for epoch in range(num_epochs):
    for batch in range(len(split_contextSet_tokenized)):
        contextSet_tokenized= split_contextSet_tokenized[batch] 
        responseSet_tokenized= split_responseSet_tokenized[batch]
        label = split_label[batch]
        contextSet_tokenized =contextSet_tokenized.to(device)
        responseSet_tokenized =responseSet_tokenized.to(device)
        label =label.to(device)
        losses = 0
        encoder_mask = transformer.make_src_mask(contextSet_tokenized).to(device)
        decoder_mask = transformer.make_trg_mask(responseSet_tokenized).to(device)
        encoder_input = contextSet_tokenized.to(device)
        decoder_input = responseSet_tokenized.to(device)
        encoder_output = transformer.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
        decoder_output = transformer.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
        proj_output = transformer.project(decoder_output).to(device) # (B, seq_len, vocab_size)
        loss = criterion(proj_output.view(-1, TGT_VOCAB_SIZE), label.view(-1)).to(device)
        loss.backward()
        # Update the weights
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        losses += loss
    if not epoch%10:
        print(epoch)
        # Inside your loop


    #     with torch.no_grad():  # Disable gradient calculation for inference
    #         logits = proj_output  # Assuming proj_output is your logits tensor
    #         probabilities = F.softmax(logits, dim=-1)  # Apply softmax along the last dimension
    #         predicted_classes = torch.argmax(probabilities, dim=-1)  # Find the index of the class with the highest probability

    #         # Convert tensors to numpy arrays for inspection
    #         #logits_array = logits.numpy()        
    #         probabilities_array = probabilities.cpu().numpy()
    #         predicted_classes_array = predicted_classes.cpu().numpy()
    #         sent = ""
    #         for i in predicted_classes_array[0]:
    #             if i<4:
    #                 if i == 1:
    #                     sent = sent + "<UNK>"
    #                     sent = sent + " "
    #                 if i == 3:
    #                     sent = sent + "<EOS>"
    #                     sent = sent + " "
    #                     break
    #                 continue
    #             for word in TGT_TOKENS:
    #                 if TGT_TOKENS[word] == i:
    #                     sent = sent + word
    #                     sent = sent + " "
    #         print("")
    #         print("Loss: ")
    #         print(losses.item())
    #         print("")
    #         print("True Response: ")
    #         print("")
    #         print(responseSet[0])
    #         print("")
    #         print("Predicted Response: ")
    #         print("")
    #         print(sent)
print("Training Finished")
torch.save(transformer.state_dict(), 'transformer_model.pth')

cuda
2


TypeError: only integer tensors of a single element can be converted to an index

In [68]:
transformer = build_transformer(src_vocab_size=SRC_VOCAB_SIZE, tgt_vocab_size=TGT_VOCAB_SIZE, src_seq_len=MAX_LENGTH, tgt_seq_len=MAX_LENGTH, d_model=D_MODEL, N=8, h=6, dropout=0.1, device=device).to(device)
transformer.load_state_dict(torch.load('transformer_model.pth'))
transformer.eval()

contextSet_tokenized= split_contextSet_tokenized[0].to(device) 
responseSet_tokenized= split_responseSet_tokenized[0].to(device)
label = split_label[0].to(device)
with torch.no_grad():  # Disable gradient calculation for inference
    for epoch in range(1):
        encoder_mask = transformer.make_src_mask(contextSet_tokenized).to(device)
        decoder_mask = transformer.make_trg_mask(responseSet_tokenized).to(device)
        encoder_input = contextSet_tokenized.to(device)
        decoder_input = responseSet_tokenized.to(device)
        encoder_output = transformer.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
        decoder_output = transformer.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
        proj_output = transformer.project(decoder_output).to(device) # (B, seq_len, vocab_size)
        # Update the weights



        logits = proj_output  # Assuming proj_output is your logits tensor
        probabilities = F.softmax(logits, dim=-1)  # Apply softmax along the last dimension
        predicted_classes = torch.argmax(probabilities, dim=-1)  # Find the index of the class with the highest probability
        losses = criterion(proj_output.view(-1, TGT_VOCAB_SIZE), label.view(-1)).to(device)
        # Convert tensors to numpy arrays for inspection
        #logits_array = logits.numpy()        
        probabilities_array = probabilities.cpu().numpy()
        predicted_classes_array = predicted_classes.cpu().numpy()
        sent = ""
        for i in predicted_classes_array[7]:
            if i<4:
                if i == 1:
                    sent = sent + "<UNK>"
                    sent = sent + " "
                if i == 3:
                    sent = sent + "<EOS>"
                    sent = sent + " "
                    break
                continue
            for word in TGT_TOKENS:
                if TGT_TOKENS[word] == i:
                    sent = sent + word
                    sent = sent + " "
        print("")
        print("Loss: ")
        print(losses.item())
        print("")
        print("True Response: ")
        print("")
        sent1 = ""
        for i in contextSet_tokenized[7]:
            if i<4:
                if i == 1:
                    sent1 = sent1 + "<UNK>"
                    sent1 = sent1 + " "
                if i == 3:
                    sent1 = sent1 + "<EOS>"
                    sent1 = sent1 + " "
                    break
                continue
            for word in SRC_TOKENS:
                if SRC_TOKENS[word] == i:
                    sent1 = sent1 + word
                    sent1 = sent1 + " "
        print(sent1)
        print("")
        print("Predicted Response: ")
        print("")
        print(sent)



Loss: 
0.010994641110301018

True Response: 

I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac. I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years. I’ve never had counseling about any of this. Do I have too many issues to address in counseling? 

Predicted Response: 

It is no such thing as too many issues for counseling. Many issues are often <UNK> and can all be worked on with some time and patience. <EOS> 


In [80]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(TGT_VOCAB_SIZE)
transformer = build_transformer(src_vocab_size=SRC_VOCAB_SIZE, tgt_vocab_size=TGT_VOCAB_SIZE, src_seq_len=MAX_LENGTH, tgt_seq_len=MAX_LENGTH, d_model=D_MODEL, N=8, h=6, dropout=0.1, device=device).to(device)
transformer.load_state_dict(torch.load('transformer_model.pth'))
transformer.eval()
first = "I start counseling/therapy in a few days (I'm freaking out) but my main fear is that I'll cry and embarrass myself, is it something to worry about?"
first = first.split()
second = "Do I have too many issues  "
second = second.split()
print(first)
inference = [first, second]
inf_tokens = []
decoder_tokens = [[SOS_TOKEN], [SOS_TOKEN]]
for example in inference:
    tok = []
    for word in example:
        tok.append(SRC_TOKENS[word])
    inf_tokens.append(tok[:])
print(len(inf_tokens[0]))
for i in range(len(inf_tokens)):
    total = 200-len(inf_tokens[i])
    for j in range(total):
        inf_tokens[i].append(PADDING_TOKEN)

for i in range(len(decoder_tokens)):
    for j in range(199):
        decoder_tokens[i].append(PADDING_TOKEN)
print(len(inf_tokens[0]))
contextSet_tokenized = []
responseSet_tokenized = []
contextSet_tokenized = torch.tensor(inf_tokens)
responseSet_tokenized = torch.tensor(decoder_tokens)
cur_word_idx = 0
finished = {}
with torch.no_grad():  # Disable gradient calculation for inference
    encoder_mask = transformer.make_src_mask(contextSet_tokenized).to(device)
    decoder_mask = transformer.make_trg_mask(responseSet_tokenized).to(device)
    encoder_input = contextSet_tokenized.to(device)
    decoder_input = responseSet_tokenized.to(device)
    encoder_output = transformer.encode(encoder_input, encoder_mask)  # (B, seq_len, d_model)

    for i in range(MAX_LENGTH - 1):
        decoder_output = transformer.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)  # (B, seq_len, d_model)
        proj_output = transformer.project(decoder_output).to(device)  # (B, seq_len, vocab_size)
        logits = proj_output  # Assuming proj_output is your logits tensor
        probabilities = F.softmax(logits, dim=-1)  # Apply softmax along the last dimension
        predicted_classes = torch.argmax(probabilities, dim=-1)  # Find the index of the class with the highest probability [[4,76,3,5], [4,67,43, 4]]

        for example in range(len(predicted_classes)):
            if example not in finished:
                decoder_input[example][cur_word_idx + 1] = predicted_classes[example][cur_word_idx]
                if decoder_input[example][cur_word_idx + 1] == EOS_TOKEN:
                    finished[example] = 1
                decoder_mask = transformer.make_trg_mask(decoder_input).to(device)
        cur_word_idx += 1

        # Convert tensors to numpy arrays for inspection
        # logits_array = logits.numpy()
        probabilities_array = probabilities.cpu().numpy()
        predicted_classes_array = predicted_classes.cpu().numpy()

sentence = ""

for token in decoder_input[1]:
    for word in TGT_TOKENS:
        if TGT_TOKENS[word] == token:
            sentence = sentence + word + " "

print(sentence)


10439
['I', 'start', 'counseling/therapy', 'in', 'a', 'few', 'days', "(I'm", 'freaking', 'out)', 'but', 'my', 'main', 'fear', 'is', 'that', "I'll", 'cry', 'and', 'embarrass', 'myself,', 'is', 'it', 'something', 'to', 'worry', 'about?']
27
200
It is very common for people to have multiple issues that they want to (and need address in counseling. I have had clients ask that same question and through more exploration, there is often an underlying fear that they be or that they will "be too much for their therapist." I don't know if any of this rings true for you. But, most people have more than one problem in their lives and more often than not, people have numerous significant stressors in their lives. Let's face it, life can be Therapists are completely ready and equipped to handle all of the issues small or large that a client presents in session. Most therapists over the first couple of sessions will help you prioritize the issues you are facing so that you start addressing the issues

In [73]:

sentence = ""
for token in decoder_input[0]:
    for word in TGT_TOKENS:
        if TGT_TOKENS[word] == token:
            sentence = sentence + word + " "
print(sentence)


People do cry in therapy sometimes, but it's not at all necessary to cry in order for most kinds of therapy to be helpful. When you start counseling you don't yet know your counselor very well, so it's normal to keep your feelings in check until you feel comfortable and a bit more relaxed with your counselor and with the situation. Sometimes, though, there are emotions that have been waiting and waiting to finally find someone who will listen with a kind ear. If you feel safe right away in the situation with your counselor, you might just cry in spite of your fears about it. Your therapist is used to people expressing how they feel and will keep strict confidentiality, so even though it's embarrassing, finally experiencing someone truly listening with empathy and kindness may just be worth it. It's okay too to let your counselor know right at the beginning that you're kind of freaked out about getting too emotional in front of another person. 
