In [None]:
!pip install transformers

In [None]:
!pip install solver

In [None]:
from preprocess import *
import torch
import numpy as np

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# **Preprocess**

In [None]:
import glob
# Text substituted for '_' in the frequency-encoded string (see freq_array);
# the empty string means underscores are simply dropped from that string.
space_code = ''
# Padding token id, stored as a string and converted with int() where it is used.
pad_code = '27'
# Length that encoded sequences are padded up to.
max_len = 256

def read_file(filename = ""):
    """Read `filename` and return its lines with all spaces removed.

    Each kept line is stripped of surrounding whitespace and then has every
    ' ' character deleted. Lines that are exactly a newline are skipped.
    """
    with open(filename, 'r') as handle:
        return [
            raw.strip().replace(' ', '')
            for raw in handle
            if raw != "\n"
        ]

def freq_array(line):
    """Frequency-encode `line` and return it as a list of ints padded with the pad id.

    The string produced by frequency_encode_string has every '_' replaced by
    `space_code`, is split on whitespace and converted to ints. Padding is
    appended up to `max_len`; longer results are returned unpadded and untruncated.
    """
    encoded = frequency_encode_string(line).replace('_', space_code)
    codes = list(map(int, encoded.split()))
    padding = [int(pad_code)] * (max_len - len(codes))
    return codes + padding

def get_tensor_file(file_dir="", file_type=".test"):
    """Read one file and return its frequency-encoded rows and its source lines.

    Args:
        file_dir: Path of the file to read (despite the name, a single file).
        file_type: Unused; kept so existing calls keep working.

    Returns:
        (freq_lines, source_lines): two parallel lists. freq_lines[k] is
        freq_array(source_lines[k]); source_lines are the lines from read_file.
    """
    freq_lines = []
    source_lines = []
    for filename in [file_dir]:
        strp_lines = read_file(filename)
        # Only numeric rows belong in freq_lines. The previous version also
        # appended a list of strings here, which broke the one-to-one pairing
        # with source_lines and cannot be converted to a tensor.
        freq_lines.extend(freq_array(line) for line in strp_lines)
        source_lines.extend(strp_lines)
    return freq_lines, source_lines

In [None]:
import glob
# Find every .train file in Drive and encode each one. A file's position in
# `files` is recorded as its language index.
files = glob.glob('/content/drive/MyDrive/' + "*" + '.train')
print(files)
lines_arr, tgt_arr, lang_arr = [], [], []
i = 0
for i, filename in enumerate(files):
    print(filename)
    lines, tgt = get_tensor_file(filename)
    lines_arr += [lines]
    tgt_arr += [tgt]
    lang_arr += [i]



In [None]:
take = 100000
# One (encoded rows, source sentences, language index) triple per file,
# each list capped at the first `take` entries.
red_tuple = [
    (encoded[:take], sentences[:take], lang)
    for lang, (encoded, sentences) in enumerate(zip(lines_arr, tgt_arr))
]

In [None]:
# Flatten the per-file triples into one (encoded row, sentence, language index)
# record per sample.
rearr_tuple = []
for lines_array, tgt_array, lang_idx in red_tuple:
    # Both lists were already capped at `take`. Pairing them with zip instead
    # of indexing range(take) means a file with fewer than `take` rows no
    # longer raises IndexError.
    for encoded_row, sentence in zip(lines_array, tgt_array):
        rearr_tuple.append((encoded_row, sentence, lang_idx))

In [None]:
print(len(rearr_tuple))
# Draw a random ordering of the sample indices and apply it to the flat list.
order_rearrange = np.random.permutation(len(rearr_tuple))
rearrange_tuple = [rearr_tuple[idx] for idx in order_rearrange]

In [None]:
# Seed the three parallel collections with the first shuffled sample.
first_sample = rearrange_tuple[0]
final_torch, cur_sent, lang_idx = first_sample
sentence_list, lang_indexes = [cur_sent], [lang_idx]

In [None]:
# Wrap the first encoded row in a list so the remaining rows can be appended to it.
# Running this cell twice nests the list again, so run it exactly once.
final_torch = [final_torch]

In [None]:
# Append the remaining shuffled samples to the three parallel lists.
# The per-sample print was removed: it wrote one output line for every sample.
for cur_torch, cur_sent, cur_idx in rearrange_tuple[1:]:
    sentence_list.append(cur_sent)
    lang_indexes.append(cur_idx)
    final_torch.append(cur_torch)

In [None]:
# Stack the encoded rows into a single tensor (all rows must have the same length).
final_torch = torch.tensor(final_torch)

In [None]:
# Persist the encoded tensor and the matching sentences to Drive.
# np.save appends ".npy" to the path, which is why the load below reads space_sent.npy.
torch.save(final_torch,'/content/drive/MyDrive/space_torch')
np.save('/content/drive/MyDrive/space_sent',np.array(sentence_list))

In [None]:
# Reload the cached training data: `lines` is the encoded tensor, `tgt` the sentence list.
lines = torch.load('/content/drive/MyDrive/space_torch')
tgt = np.load('/content/drive/MyDrive/space_sent.npy').tolist()

# **Vocabulary, Target Encoding and Batching**

In [None]:
start_token = '28'
end_token = '29'
alphabets = 'abcdefghijklmnopqrstuvwxyz'
vocab_size = 30

# Token ids: letters a-z -> 0..25, '_' -> 26, pad -> 27, start -> 28, end -> 29.
# space_code can be the empty string, and int('') raises ValueError. In that
# case fall back to 26, the only id below vocab_size that is not assigned to
# a letter or a special token.
space_id = int(space_code) if space_code else len(alphabets)

alphabet_dict = {}
alphabet_dict['start'] = int(start_token)
alphabet_dict['end'] = int(end_token)
alphabet_dict['_'] = space_id
alphabet_dict['pad'] = int(pad_code)

for alphabet_idx, character in enumerate(alphabets):
    alphabet_dict[character] = alphabet_idx
# Keep the counter's final value (one past the last letter id), as the original loop did.
alphabet_idx = len(alphabets)


def one_hot_encode(sent,start = True, end = True):
    """Encode one sentence as token ids and as one-hot vectors.

    Args:
        sent: Iterable of characters; each must be a key of alphabet_dict.
        start: Prepend the 'start' token when True.
        end: Append the 'end' token when True.

    Returns:
        (num_code, input_code): num_code is a list of one-hot lists of length
        vocab_size, input_code the matching list of token ids. Both are padded
        with the 'pad' token up to max_len; longer inputs are not truncated.
    """
    input_code = []
    if start:
        input_code.append(alphabet_dict['start'])
    input_code.extend(alphabet_dict[character] for character in sent)
    if end:
        input_code.append(alphabet_dict['end'])
    # A negative repeat count yields an empty list, so over-long input is left as is.
    input_code.extend([alphabet_dict['pad']] * (max_len - len(input_code)))

    num_code = []
    for token_id in input_code:
        one_code = [0] * vocab_size
        one_code[token_id] = 1
        num_code.append(one_code)

    return num_code, input_code

def one_hot_code_vocab(lines, start = True, end = True):
    """Encode several sentences with one_hot_encode and stack the results.

    Returns:
        (one_hot_codes, input_codes): a FloatTensor built from the one-hot
        lists and a tensor built from the token-id lists, one row per sentence.
    """
    encoded = [one_hot_encode(line, start, end) for line in lines]
    one_hot_codes = [num_code for num_code, _ in encoded]
    input_codes = [input_code for _, input_code in encoded]
    return torch.FloatTensor(one_hot_codes), torch.tensor(input_codes)



In [None]:
def get_batch(torch_lines, target, batch_idx, batch_size = 32):
    """Return batch number `batch_idx` of the encoder inputs and encoded targets.

    Returns:
        (encoder rows, target token ids, target one-hot codes, target sentences)
        for positions [batch_idx * batch_size, (batch_idx + 1) * batch_size),
        clipped to len(target).
    """
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(target))
    window = slice(start_idx, end_idx)

    batch_target = target[window]
    one_hot_codes, input_codes = one_hot_code_vocab(batch_target)
    return torch_lines[window], input_codes, one_hot_codes, batch_target

# **Baseline Model and Training**

In [None]:
from torch import nn
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor, then apply dropout.

    Args:
        d_model: Embedding width (last dimension of the input); may be odd.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        # Even columns hold sines, odd columns cosines.
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine column,
        # so trim div_term to avoid a shape mismatch on assignment.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Registered as a buffer: moves with the module and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape with the encodings for positions
            0..seq_len-1 added along dim 0, followed by dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AdamW, DataCollatorWithPadding, \
    get_scheduler
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
# from utils import *
from solver import *
from torch.nn import Transformer, TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer, TransformerDecoder
import math

# NOTE(review): dropout_prob is not referenced anywhere in the visible notebook.
dropout_prob = 0
# Token vocabulary size; Deciphormer reads it for the decoder embedding and the output layer.
vocab_size = 30

class Deciphormer(torch.nn.Module):
    '''
    Encoder-decoder Transformer: `data` token ids go through the encoder,
    `tgt` token ids through the decoder, and the decoder output is projected
    to `vocab_size` logits per position.

    All tensors are batch-first: `data` and `tgt` are (batch, seq_len) id
    tensors. Padding positions are found by comparing ids with the
    module-level `pad_code`.
    '''

    def __init__(self, ntoken: int = 30, d_model: int = 512, nhead: int = 8, d_hid: int = 2048, nlayers: int = 6,
                 dropout: float = 0.5):
        # Initialize model attributes
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.d_hid = d_hid
        self.nlayers = nlayers
        self.dropout = dropout

        # Encoder side
        self.embedder = nn.Embedding(ntoken, d_model)
        # NOTE(review): the encoder supports at most `d_model` positions because
        # max_len is tied to d_model. Left unchanged so that the 'pe' buffer
        # keeps its shape and existing checkpoints still load.
        self.pos_encoder = PositionalEncoding(d_model, max_len = d_model)
        encoder_layers = TransformerEncoderLayer(self.d_model, nhead, d_hid, dropout, batch_first = True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        # Decoder side
        # NOTE(review): the decoder layer is built without `dropout`, so it uses
        # the library default rather than the constructor argument.
        self.embedder2 = nn.Embedding(vocab_size, d_model)
        decoder_layers = TransformerDecoderLayer(d_model, nhead, d_hid, batch_first = True)
        self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers)
        self.linearout = nn.Linear(d_model, vocab_size)

    def forward(self, data, tgt):
        """Run the encoder on `data` and the decoder on `tgt`.

        Args:
            data: (batch, src_len) encoder token ids.
            tgt: (batch, tgt_len) decoder input token ids.

        Returns:
            (out1, out2): encoder output of shape (batch, src_len, d_model) and
            decoder logits of shape (batch, tgt_len, vocab_size).
        """
        # Every mask is created on the same device as its input tensor.
        src_mask = self.generate_src_mask(data)
        src_padding_mask = self.generate_src_padding_mask(data)

        src = self.embedder(data) * math.sqrt(self.d_model)
        # PositionalEncoding indexes positions along dim 0, but tensors here are
        # batch-first. Swap to (seq, batch, dim) for the call and back again,
        # so the encoding follows the token position and not the batch index.
        src = self.pos_encoder(src.transpose(0, 1)).transpose(0, 1)
        out1 = self.transformer_encoder(src, src_mask, src_padding_mask)

        tgt_mask = self.generate_tgt_mask(tgt)
        tgt_padding_mask = self.generate_tgt_padding_mask(tgt)
        embed_tgt = self.embedder2(tgt) * math.sqrt(self.d_model)
        # The source mask blocks nothing, so memory_mask is left unset. This
        # also removes the requirement that tgt_len equals src_len.
        out2 = self.transformer_decoder(
            embed_tgt,
            out1,
            tgt_mask=tgt_mask,
            memory_mask=None,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=src_padding_mask,
        )
        out2 = self.linearout(out2)
        return (out1, out2)

    @staticmethod
    def _padding_mask(token_ids):
        # Boolean (batch, seq_len) mask that is True from the first padding id
        # of each row through the end of that row. pad_code is a string, so it
        # is converted before being compared with the integer ids.
        is_pad = token_ids == int(pad_code)
        return is_pad.to(torch.long).cumsum(dim=1) > 0

    def generate_src_mask(self,data):
        """Return a (src_len, src_len) boolean mask that blocks no position (all False)."""
        seq_len = data.shape[1]
        return torch.zeros(seq_len, seq_len, dtype=torch.bool, device=data.device)

    def generate_src_padding_mask(self, data):
        """Return a (batch, src_len) boolean mask, True at and after each row's first pad id."""
        return self._padding_mask(data)

    def generate_tgt_mask(self,tgt):
        """Return the (tgt_len, tgt_len) causal mask for the decoder.

        In PyTorch a True entry in a boolean attention mask means "may not
        attend", so the strictly upper triangle (future positions) is True.
        """
        seq_len = tgt.shape[1]
        return torch.ones(seq_len, seq_len, device=tgt.device).triu(diagonal=1).bool()

    def generate_tgt_padding_mask(self, tgt):
        """Return a (batch, tgt_len) boolean mask, True at and after each row's first pad id."""
        return self._padding_mask(tgt)

In [None]:
# Build the model with its default hyper-parameters and move it to the selected device.
model = Deciphormer()
model.to(device)
# Trailing expression: the notebook displays the module's repr.
model

In [None]:
model.train()
batch_size = 32
num_of_batches = lines.shape[0]//batch_size
num_of_epochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.01)
loss = torch.nn.CrossEntropyLoss()

for epoch in range(num_of_epochs):
    for i in range(num_of_batches):
        # Use the same batch size that num_of_batches was computed from.
        # A hard-coded 2 here meant only 1/16 of the data was seen per epoch.
        encoder_input, decoder_tgt, _, _ = get_batch(lines, tgt, i, batch_size)

        encoder_input = encoder_input.to(device)
        decoder_tgt = decoder_tgt.to(device)

        # Next-token labels: the output at position j is trained to predict
        # token j + 1, which is how the evaluation loop reads the logits.
        # The decoder input keeps its full length; the last label is padding.
        pad_column = torch.full_like(decoder_tgt[:, :1], int(pad_code))
        next_tokens = torch.cat([decoder_tgt[:, 1:], pad_column], dim=1)

        model_encoder_out, model_decoder_out = model(encoder_input, decoder_tgt)

        # CrossEntropyLoss expects the class axis at dim 1. Flatten to
        # (batch * seq, vocab) logits against (batch * seq,) class ids so that
        # the sequence axis is not mistaken for the class axis.
        loss_tensor = loss(
            model_decoder_out.reshape(-1, model_decoder_out.size(-1)),
            next_tokens.reshape(-1),
        )

        optimizer.zero_grad()
        loss_tensor.backward()
        optimizer.step()

        if (i + 1)%500==0:
            # Report progress at checkpoints instead of printing every batch index.
            print(f'epoch {epoch} batch {i + 1}/{num_of_batches} loss {loss_tensor.item():.4f}')
            torch.save(model.state_dict(), f'/content/drive/MyDrive/space_model_final_{i+1}_{epoch}')

    torch.save(model.state_dict(), f'/content/drive/MyDrive/space_{epoch}')

# **Evaluation**

In [None]:
import numpy as np
# Load the evaluation sentences from Drive as a Python list.
eval_tgt = np.load('/content/drive/MyDrive/test_sent.npy').tolist()

In [None]:
model = Deciphormer()
# map_location lets a checkpoint written on a GPU be restored on a CPU-only runtime.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load('/content/drive/MyDrive/space_model_final_2000_1', map_location=device))
model.to(device)

In [None]:
def get_tensor_spaces(strp_lines):
    """Return freq_array(line) for every line in `strp_lines`, as a list."""
    return [freq_array(line) for line in strp_lines]

In [None]:
# Frequency-encode the evaluation sentences and stack them into one tensor of encoder inputs.
eval_src = torch.tensor(get_tensor_spaces(eval_tgt))

In [None]:
# Invert alphabet_dict so a token id can be mapped back to its symbol.
rev_alphabet = {token_id: symbol for symbol, token_id in alphabet_dict.items()}

In [None]:
model.eval()
num_of_batches = eval_src.shape[0]

predicted_sentences = []
true_sentences = []

# Predictions are integer ids, while end_token and space_code are strings.
# Resolve the ids once so the comparisons below can actually match.
end_id = int(end_token)
space_token_id = alphabet_dict['_']

# Greedy decoding, one sentence at a time; no gradients are needed for inference.
with torch.no_grad():
    for i in range(num_of_batches):
        encoder_input, decoder_tgt, decoder_final, target_sentences = get_batch(eval_src, eval_tgt, i, batch_size = 1)
        encoder_input = encoder_input.to(device)
        pred_sentence = ''
        # Emit at most as many symbols as the reference sentence has.
        for j in range(len(target_sentences[0])):
            decoder_one_hot, decoder_tgt = one_hot_code_vocab([pred_sentence], start = True, end = False)
            decoder_tgt = decoder_tgt.to(device)

            model_encoder_out, model_decoder_out = model(encoder_input, decoder_tgt)

            # The decoder input is [start] + decoded symbols + padding, so the
            # logits for the next symbol sit at index len(pred_sentence).
            next_pos = len(pred_sentence)
            pred_char = int(model_decoder_out[0, next_pos].argmax())

            if pred_char == end_id:
                break
            elif pred_char == space_token_id:
                # Append the separator rather than overwriting the sentence.
                # '_' is the symbol the vocabulary knows; ' ' is not a key.
                pred_sentence += '_'
            elif pred_char < 26:
                pred_sentence += rev_alphabet[pred_char]
            else:
                # start/pad predicted: the decoder input would not change, so
                # further steps would only repeat the same prediction.
                break

        predicted_sentences.append(pred_sentence)
        true_sentences.append(target_sentences[0])
        if ((i+1)%100)==0:
            # Periodic checkpoint of the results gathered so far.
            np.save('/content/drive/MyDrive/predsent',np.array(predicted_sentences))
            np.save('/content/drive/MyDrive/truesent',np.array(true_sentences))

# **Evaluation Metrics**

In [None]:
# Save the final predictions and references to the working directory
# (np.save writes pred_arr.npy and true_arr.npy).
np_pred = np.array(predicted_sentences)
np_true = np.array(true_sentences)
np.save('pred_arr', np_pred)
np.save('true_arr', np_true)

In [None]:
!pip install pyter3

In [None]:
import pyter
# Score every prediction against its reference with pyter.ter.
ter_lis = [
    pyter.ter(predicted, reference)
    for predicted, reference in zip(predicted_sentences, true_sentences)
]

In [None]:
# Mean of the per-sentence pyter.ter scores (raises ZeroDivisionError if ter_lis is empty).
print(sum(ter_lis)/len(ter_lis))