In [None]:
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import random
import torch.nn.functional as F

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def openFile(file):
    '''
    Description: Read a text file and return the input strings and their labels in separate lists.
                 Each line is expected to look like "input,target". A line without a comma is
                 treated as an input with no label; blank lines are skipped.
    Input: file - path of the text file to read
    Output: Two lists, input and target, containing the input and target strings respectively.
            target is shorter than input when the file contains unlabeled lines.
    '''
    inputs = []
    targets = []
    with open(file, 'r') as f:
        for line in f:
            line = line.strip('\n')
            if not line:
                # Nothing to record for an empty line (e.g. a trailing newline at end of file).
                continue
            if ',' in line:
                fields = line.split(',')
                inputs.append(fields[0])
                # Only the second field is the label; any further fields are ignored.
                targets.append(fields[1])
            else:
                # Unlabeled line: keep the text itself as the input.
                inputs.append(line)
    return inputs, targets

def writeFile(input, output, path='result_predict.txt'):
    '''
    Description: Append each input and its predicted translation to a text file, one
                 "input,prediction" pair per line.
    Input: input - sequence of original input strings given to the model
           output - sequence of predicted strings, aligned with input
           path - file to append to; defaults to 'result_predict.txt' in the current
                  working directory
    Return: None
    '''
    with open(path, 'a') as f:
        # Walk both sequences in lockstep; indexing the list with its own elements
        # (the previous behaviour) raised a TypeError.
        f.writelines(
            f'{source_text},{predicted_text}\n'
            for source_text, predicted_text in zip(input, output)
        )
    return

def pairData(input, output):
    '''
    Description: Build the letter vocabularies and the list of training pairs
    Input: input - list of source words
           output - list of target words, indexed in parallel with input
    Output: the source Sequence, the target Sequence, and a list of [input, output] pairs
    '''
    source, target = Sequence(), Sequence()
    pairs = []

    for idx, source_word in enumerate(input):
        target_word = output[idx]
        # Register every letter of both words in their respective vocabularies.
        source.addWord(source_word)
        target.addWord(target_word)
        pairs.append([source_word, target_word])

    return source, target, pairs


'''
This section pre-processes the input and output sequences. We break the words down into their corresponding letters
and feed the dictionary of letters to the model. Each letter is mapped to an integer index, which the model embeds.
'''
# Reserved token indices; letter indices in Sequence start at 2 to leave room for them.
sos_tkn = 0  # start-of-sequence: first input fed to the decoder
eos_tkn = 1  # end-of-sequence: appended to every word converted to a tensor

class Sequence:
    '''
    Letter vocabulary. Each word from the input or target array is split into letters,
    and every distinct letter is assigned the next free integer index.

    Indices 0 and 1 are reserved for the start-of-sequence and end-of-sequence tokens
    (the module-level sos_tkn / eos_tkn), so letters are numbered from 2.
    '''
    def __init__(self):
        # letter -> index, letter -> number of occurrences seen, index -> letter
        self.letter2index = {}
        self.letter2count = {}
        # Seed the reverse table with the reserved tokens so that every index below
        # n_letters can be decoded; without this, looking up index 0 raised KeyError.
        self.index2letter = {0: '<SOS>', 1: '<EOS>'}
        # Next free index; starts after the two reserved tokens.
        self.n_letters = 2

    def addWord(self, word):
        '''
        Description: split a word into letters and pass each to addLetter
        Input: word from input or target sequence
        Output: none
        '''
        for letter in word:
            self.addLetter(letter)

    def addLetter(self, letter):
        '''
        Description: register one letter, assigning a new index on first sight and
                     incrementing its count otherwise
        Input: a single letter
        Output: none
        '''
        if letter not in self.letter2index:
            self.letter2index[letter] = self.n_letters
            self.letter2count[letter] = 1
            self.index2letter[self.n_letters] = letter
            self.n_letters += 1
        else:
            self.letter2count[letter] += 1
    
'''
This section converts strings to tensors and then makes them pairs 
'''

def indexesFromWord(Sequence, word):
    '''
    Description: translate a word into the list of its letter indexes
    Input: Sequence - vocabulary object exposing a letter2index mapping
           word - iterable of letters
    Output: list of indexes, one per letter; an unknown letter raises KeyError
    '''
    lookup = Sequence.letter2index
    indexes = []
    for letter in word:
        indexes.append(lookup[letter])
    return indexes

def tensorFromWord(Sequence, word):
    '''
    Description: encode a word as a column tensor of letter indexes terminated by eos_tkn
    Input: Sequence - vocabulary used to look up the letters
           word - the word to encode
    Output: long tensor of shape (len(word) + 1, 1) on the module-level device
    '''
    token_ids = indexesFromWord(Sequence, word) + [eos_tkn]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)

def tensorsFromPair(input_seq, target_seq, pair):
    '''
    Description: encode an [input, target] pair of words as tensors
    Input: input_seq - vocabulary for the input word (pair[0])
           target_seq - vocabulary for the target word (pair[1])
           pair - the two words
    Output: tuple (input_tensor, target_tensor)
    '''
    source_word, target_word = pair[0], pair[1]
    return (
        tensorFromWord(input_seq, source_word),
        tensorFromWord(target_seq, target_word),
    )


'''
Define the model here 
'''

class Encoder(nn.Module):
    '''
    GRU encoder that consumes one source token per call.

    Input: input_dim - size of the source vocabulary
           hidden_dim - size of the GRU hidden state
           embbed_dim - size of the token embedding
           num_layers - number of stacked GRU layers
    '''
    def __init__(self, input_dim, hidden_dim, embbed_dim, num_layers):
        super(Encoder, self).__init__()

        self.input_dim = input_dim
        self.embbed_dim = embbed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_dim, self.embbed_dim)
        self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)

    def forward(self, src, hidden=None):
        '''
        Description: embed a single token and advance the GRU by one step
        Input: src - tensor holding one token index
               hidden - GRU state from the previous step; None starts from the zero state
        Output: (outputs, hidden) as returned by the GRU
        '''
        embedded = self.embedding(src).view(1, 1, -1)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden


class Decoder(nn.Module):
    '''
    GRU decoder that predicts one target token per call.

    Input: output_dim - size of the target vocabulary
           hidden_dim - size of the GRU hidden state
           embbed_dim - size of the token embedding
           num_layers - number of stacked GRU layers
    '''
    def __init__(self, output_dim, hidden_dim, embbed_dim, num_layers):
        super(Decoder, self).__init__()

        self.embbed_dim = embbed_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(output_dim, self.embbed_dim)
        self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)
        self.out = nn.Linear(self.hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        '''
        Description: run one decoding step
        Input: input - tensor holding the previous token index
               hidden - current GRU state
        Output: (prediction, hidden) where prediction holds log-probabilities over the
                target vocabulary with shape (1, output_dim)
        '''
        input = input.view(1, -1)
        embedded = F.relu(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)
        prediction = self.softmax(self.out(output[0]))

        return prediction, hidden

'''
Combine Encoder and Decoder
'''
class Seq2Seq(nn.Module):
    '''
    Encoder-decoder wrapper: encodes the whole source sequence into a hidden state and
    decodes the target one token at a time, optionally with teacher forcing.
    '''
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        '''
        Description: run the full encode/decode pass for one sequence pair
        Input: source - tensor of source token indexes, one row per token
               target - tensor of target token indexes with shape (target_length, batch);
                        the stopping check below assumes batch == 1
               teacher_forcing_ratio - probability, per step, of feeding the ground-truth
                        token to the decoder instead of its own prediction
        Output: tensor of shape (target_length, batch, vocab_size) with the decoder
                log-probabilities; rows after an early stop are left as zeros
        '''
        input_length = source.size(0)
        target_length = target.shape[0]
        batch_size = target.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(target_length, batch_size, vocab_size, device=self.device)

        # Thread the hidden state through every source token so the final state
        # summarises the whole sequence rather than only its last token.
        encoder_hidden = None
        for i in range(input_length):
            _, encoder_hidden = self.encoder(source[i], encoder_hidden)

        decoder_hidden = encoder_hidden
        decoder_input = torch.tensor([sos_tkn], device=self.device)  # SOS

        for t in range(target_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs[t] = decoder_output
            teacher_force = random.random() < teacher_forcing_ratio
            _, topi = decoder_output.topk(1)
            if teacher_force:
                # Feed the ground-truth token as the next decoder input.
                decoder_input = target[t]
            else:
                # Feed the model's own best guess; stop once it emits end-of-sequence.
                decoder_input = topi.view(-1).detach()
                if decoder_input.item() == eos_tkn:
                    break

        return outputs

tf_ratio = 0.5

'''
Loss calculation
'''

def lossCalc(model, input_tensor, target_tensor, optimizer, criterion):
    '''
    Description: run one optimisation step on a single (input, target) pair
    Input: model - callable as model(input_tensor, target_tensor), returning one
                   prediction per target step along dimension 0
           input_tensor, target_tensor - encoded source and target sequences
           optimizer - optimizer holding the model parameters
           criterion - loss applied to each (prediction, target) step
    Output: the summed step loss divided by the number of steps, as a Python float
    '''
    optimizer.zero_grad()

    predictions = model(input_tensor, target_tensor)
    steps = predictions.size(0)

    # Accumulate the per-step losses into one value for a single backward pass.
    total = sum(
        criterion(predictions[step], target_tensor[step]) for step in range(steps)
    )

    total.backward()
    optimizer.step()

    return total.item() / steps

'''
Function for training the model
'''

def trainModel(model, source, target, pairs, num_iteration = 20000):
    '''
    Description: train the model with SGD on randomly drawn pairs and save its weights
    Input: model - the model to train
           source, target - vocabularies used to encode the words of each pair
           pairs - list of [input, target] word pairs to sample from
           num_iteration - number of single-pair training steps
    Output: the trained model; its state_dict is also written to 'trained_model.pt'
    '''
    model.train()

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.NLLLoss()
    running_loss = 0

    # Draw and encode all training samples up front.
    training_pairs = []
    for _ in range(num_iteration):
        training_pairs.append(tensorsFromPair(source, target, random.choice(pairs)))

    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor, target_tensor = training_pair[0], training_pair[1]

        running_loss += lossCalc(model, input_tensor, target_tensor, optimizer, criterion)

        if step % 100 == 0:
            print('%d training iteraitions completed' % (step))

        if step % 5000 == 0:
            # Report the mean loss over the last 5000 steps, then restart the window.
            window_mean = running_loss / 5000
            running_loss = 0
            print('%d %.4f' % (step, window_mean))

    torch.save(model.state_dict(), 'trained_model.pt')
    return model

'''
Setup for evaluating model
'''
def evaluate(model, input_lang, output_lang, words):
    '''
    Description: decode the model's prediction for one [input, target] pair
    Input: model - trained sequence-to-sequence model
           input_lang - vocabulary used to encode words[0]
           output_lang - vocabulary used to encode words[1] and to decode predictions
           words - the [input, target] pair; the target is passed to the model, which
                   sizes its output by it
    Output: list of predicted letters, ending with '<EOS>' when the model emits it
    '''
    with torch.no_grad():
        input_tensor = tensorFromWord(input_lang, words[0])
        output_tensor = tensorFromWord(output_lang, words[1])

        # Disable teacher forcing: at inference the decoder must rely on its own
        # predictions, never on the ground-truth letters.
        output = model(input_tensor, output_tensor, teacher_forcing_ratio=0)

        decoded_words = []
        for step in range(output.size(0)):
            _, topi = output[step].topk(1)
            token = topi[0].item()

            if token == eos_tkn:
                decoded_words.append('<EOS>')
                break
            if token == sos_tkn:
                # Start-of-sequence is a reserved index, not a letter; render it
                # explicitly rather than relying on the letter table.
                decoded_words.append('<SOS>')
            else:
                decoded_words.append(output_lang.index2letter[token])
    return decoded_words

def evaluateRandomly(model, source, target, pairs, n=10):
    '''
    Description: print the source, target and predicted letters for n randomly drawn pairs
    Input: model - trained model passed to evaluate
           source, target - vocabularies for the input and output words
           pairs - list of [input, target] word pairs to sample from
           n - number of samples to print
    Output: none
    '''
    for _ in range(n):
        sample = random.choice(pairs)
        print(f'source {sample[0]}')
        print(f'target {sample[1]}')
        predicted_letters = evaluate(model, source, target, sample)
        print(f"predicted {' '.join(predicted_letters)}")


#input, output = openFile(sys.argv[1])

# Load the comma-separated training pairs from disk.
input, output = openFile('data_train.txt')

# Build the letter vocabularies and the list of [input, target] pairs.
source, target, pairs = pairData(input, output)

# Show one random pair as a sanity check on the loaded data.
randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

#print number of letters
# n_letters starts at 2, so each size includes the two reserved token indices.
input_size = source.n_letters
output_size = target.n_letters
print('Input : {} Output : {}'.format(input_size, output_size))

# Model hyperparameters.
embed_size = 256
hidden_size = 512
num_layers = 1

# Note the argument order: (vocab size, hidden size, embedding size, layers).
encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device)

#print model 
print(encoder)
print(decoder)


# Train with the default number of iterations, then print a few sample predictions
# drawn from the training pairs themselves.
model = trainModel(model, source, target, pairs)
evaluateRandomly(model, source, target, pairs)

cuda
random sentence ['GILGFVFTL', 'CASSTGRNYGYTF']
Input : 22 Output : 22
Encoder(
  (embedding): Embedding(22, 256)
  (gru): GRU(256, 512)
)
Decoder(
  (embedding): Embedding(22, 256)
  (gru): GRU(256, 512)
  (out): Linear(in_features=512, out_features=22, bias=True)
  (softmax): LogSoftmax(dim=1)
)
100 training iteraitions completed
200 training iteraitions completed
300 training iteraitions completed
400 training iteraitions completed
500 training iteraitions completed
600 training iteraitions completed
700 training iteraitions completed
800 training iteraitions completed
900 training iteraitions completed
1000 training iteraitions completed
1100 training iteraitions completed
1200 training iteraitions completed
1300 training iteraitions completed
1400 training iteraitions completed
1500 training iteraitions completed
1600 training iteraitions completed
1700 training iteraitions completed
1800 training iteraitions completed
1900 training iteraitions completed
2000 training iteraiti