In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 9.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 32.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 455 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [10]:
!pip install solver

Collecting solver
  Downloading solver-0.0.4.tar.gz (2.8 kB)
Building wheels for collected packages: solver
  Building wheel for solver (setup.py) ... [?25l[?25hdone
  Created wheel for solver: filename=solver-0.0.4-py3-none-any.whl size=3128 sha256=9a6be2829694000521590f8a7df0543614022e287fcde124f8744a2e76f3b209
  Stored in directory: /root/.cache/pip/wheels/66/27/02/f951263ae7f5f0d6b627987a9ca9baefdb4da1f5c6f18fb4f5
Successfully built solver
Installing collected packages: solver
Successfully installed solver-0.0.4


In [3]:
from preprocess import *
import torch
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


# **Preprocess**

In [4]:
space_code = '26'
pad_code = '27'
max_len = 256

def read_file(filename = ""):
      lines = []
      with open(filename, 'r') as file:
          for line in file.readlines():
              if line != "\n":
                  lines.append((line.strip()).replace(' ',''))
      return lines

def freq_array(line):
    freq_str = frequency_encode_string(line)
    freq_str = freq_str.replace('_',space_code)
    freq_arr = [int(x) for x in freq_str.split()]
    return freq_arr + [int(pad_code)] * (max_len - len(freq_arr))

def get_tensor_file(filename=""):
      strp_lines = read_file(filename)
      freq_lines = []
      for line in strp_lines:
          freq_lines.append(freq_array(line))
      return torch.tensor(freq_lines), strp_lines

In [5]:
lines,tgt = get_tensor_file('/content/drive/MyDrive/cs685/project/gutenberg-data/catalan.train')
lines.shape

torch.Size([71780, 256])

In [6]:
start_token = '28'
end_token = '29'
alphabets = 'abcdefghijklmnopqrstuvwxyz'
vocab_size = 30
alphabet_dict = {}
alphabet_dict['start'] = int(start_token) 
alphabet_dict['end'] = int(end_token)
alphabet_dict['_'] = int(space_code)
alphabet_dict['pad'] = int(pad_code)
alphabet_idx = 0

for character in alphabets:
    alphabet_dict[character] = alphabet_idx
    alphabet_idx += 1


def one_hot_encode(sent,start = True, end = True):
    num_code = []
    input_code = []
    if start:
        one_code = [0] * vocab_size
        one_code[alphabet_dict['start']] = 1
        num_code.append(one_code)

        input_code.append(alphabet_dict['start'])

    for character in sent:
        one_code = [0] * vocab_size
        one_code[alphabet_dict[character]] = 1
        num_code.append(one_code)

        input_code.append(alphabet_dict[character])

    if end:
        one_code = [0] * vocab_size
        one_code[alphabet_dict['end']] = 1
        num_code.append(one_code)

        input_code.append(alphabet_dict['end'])

    for i in range(len(num_code), max_len):
        one_code = [0] * vocab_size
        one_code[alphabet_dict['pad']] = 1
        num_code.append(one_code)

        input_code.append(alphabet_dict['pad'])

    return num_code, input_code

def one_hot_code_vocab(lines, start = True, end = True):
    one_hot_codes = []
    input_codes = []
    for line in lines:
        num_code, input_code = one_hot_encode(line, start, end)
        one_hot_codes.append(num_code) 
        input_codes.append(input_code)
    return torch.FloatTensor(one_hot_codes), torch.tensor(input_codes)


In [7]:
def get_batch(torch_lines, target, batch_idx, batch_size = 32):
      start_idx = (batch_idx * batch_size)
      end_idx = min((batch_idx + 1)*batch_size, len(target))

      one_hot_codes, input_codes = one_hot_code_vocab(target[start_idx : end_idx])
      return torch_lines[start_idx : end_idx], input_codes, one_hot_codes, target[start_idx : end_idx]

# **Model**

In [11]:
from torch import nn
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [12]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AdamW, DataCollatorWithPadding, \
    get_scheduler
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
# from utils import *
from solver import *
from torch.nn import Transformer, TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer, TransformerDecoder
import math

dropout_prob = 0
vocab_size = 30

class Deciphormer(torch.nn.Module):
    '''
    Defining the base model:
    1)
    '''

    def __init__(self, ntoken: int = 30, d_model: int = 512, nhead: int = 8, d_hid: int = 2048, nlayers: int = 6,
                 dropout: float = 0.5):
        # Initialize model attributes
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.d_hid = d_hid
        self.nlayers = nlayers
        self.dropout = dropout

        # Define model layers

        self.embedder = nn.Embedding(ntoken, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len = d_model)
        encoder_layers = TransformerEncoderLayer(self.d_model, nhead, d_hid, dropout, batch_first = True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        self.embedder2 = nn.Embedding(vocab_size, d_model)
        decoder_layers = TransformerDecoderLayer(d_model, nhead, d_hid, batch_first = True)
        self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers)
        self.linearout = nn.Linear(d_model, vocab_size)

    def forward(self, data, tgt, mask=None):
      src = self.embedder(data) * math.sqrt(self.d_model)
      # print(src.shape)
      src = self.pos_encoder(src)
      # print(src.shape)
      out1 = self.transformer_encoder(src, mask)
      # print(out1.shape)

      embed_tgt = self.embedder2(tgt) * math.sqrt(self.d_model)
      out2 = self.transformer_decoder(embed_tgt, out1)
      out2 = self.linearout(out2)
      return (out1, out2)


In [14]:
model = Deciphormer()
model.cuda()
model

Deciphormer(
  (embedder): Embedding(30, 512)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512,

In [None]:
model.train()
batch_size = 32
num_of_batches = lines.shape[0]//batch_size

optimizer = torch.optim.SGD(model.parameters(),lr = 0.01)
loss = torch.nn.CrossEntropyLoss()

for i in range(num_of_batches):
    encoder_input, decoder_tgt, decoder_final,_ = get_batch(lines, tgt, i)

    encoder_input = encoder_input.to(device)
    decoder_tgt = decoder_tgt.to(device)
    decoder_final = decoder_final.to(device)

    model_encoder_out, model_decoder_out = model(encoder_input, decoder_tgt)

    loss_tensor = loss(model_decoder_out,decoder_final)

    optimizer.zero_grad()
    loss_tensor.backward()
    optimizer.step()
    print(i)

0
1
2


In [19]:
print(lines.shape[0]/32)

2243.125


# **Evaluation**

In [20]:
eval_src, eval_tgt = get_tensor_file('/content/drive/MyDrive/cs685/project/gutenberg-data/catalan.test')

In [21]:
rev_alphabet = {}
for key in alphabet_dict.keys():
    rev_alphabet[alphabet_dict[key]] = key

In [None]:
model.eval()
num_of_batches = eval_src.shape[0]

for i in range(num_of_batches):
    encoder_input, decoder_tgt, decoder_final, target_sentences = get_batch(eval_src, eval_tgt, i, batch_size = 1)
    encoder_input = encoder_input.to(device)
    pred_sentence = ''
    for j in range(max_len):
        decoder_one_hot, decoder_tgt = one_hot_code_vocab([pred_sentence], start = True, end = False)
        decoder_tgt = decoder_tgt.to(device)

        model_encoder_out, model_decoder_out = model(encoder_input, decoder_tgt)
        logits = model_decoder_out.detach().cpu().numpy()
        
        logits = logits.squeeze()

        pred_char = np.argmax(logits[j])
        if(pred_char == end_token):
              break
        elif(pred_char == space_token):
              pred_sentence = ' '
        elif(pred_char < 26):
              pred_sentence += rev_alphabet[pred_char]
    print(pred_sentence)
    print(target_sentences)
    print('-'*20)

startstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstart
['larribada_del_cinque_cosa_dolca_seria_pera_mi_la_seva_vinguda']
--------------------
startstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstart
['sir_tobias_per_deu']
--------------------
startstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstartstart
['si_certament_nhi_tinc_una_pero_que']
--------------------
startstartstartstartstarts