In [None]:
import torch
import numpy as np
import pandas as pd

In [None]:
# Load the CoEdit training CSV and keep only its first 100 rows.
df = pd.read_csv('/content/drive/MyDrive/Datasets/CoEdit/train_set.csv').iloc[:100]

In [None]:
# import zipfile
# import os

# # Specify the path to the zip file in Google Drive
# zip_path = '/content/drive/MyDrive/Datasets/CoEdit/vocabs.zip'

# # Specify the directory to extract the contents
# extract_dir = '/content/drive/MyDrive/Datasets/CoEdit/vocabs'

# # Create a ZipFile object and extract the contents
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_dir)

# # List the contents of the extracted directory
# extracted_files = os.listdir(extract_dir)
# print(extracted_files)

In [None]:
# Locations of the source-side and target-side vocabulary files.
# Both are read line by line further down and are also handed to the tokenizers.
source_vocab_file_path = '/content/drive/MyDrive/Datasets/CoEdit/vocabs/source_vocab.txt'
target_vocab_file_path = '/content/drive/MyDrive/Datasets/CoEdit/vocabs/target_vocab.txt'

In [None]:
# Read the source vocabulary: one entry per line, surrounding whitespace removed.
# The encoding is pinned to UTF-8 so the result does not depend on the platform default.
with open(source_vocab_file_path, 'r', encoding='utf-8') as source_file:
    source_vocab = [line.strip() for line in source_file]

# Read the target vocabulary the same way.
with open(target_vocab_file_path, 'r', encoding='utf-8') as target_file:
    target_vocab = [line.strip() for line in target_file]

In [None]:
len(target_vocab), len(source_vocab)

In [None]:
# Two-way lookup tables between vocabulary entries and their positions in the vocab lists.
index_to_source = dict(enumerate(source_vocab))
source_to_index = {token: position for position, token in enumerate(source_vocab)}
index_to_target = dict(enumerate(target_vocab))
target_to_index = {token: position for position, token in enumerate(target_vocab)}

In [None]:
len(target_to_index)

In [None]:
# Pull the two text columns out of the frame as plain Python lists.
source_sentences = df['source sentence'].tolist()
target_sentences = df['target sentence'].tolist()

In [None]:
len(source_sentences)

In [None]:
max(map(len, source_sentences)), max(map(len, target_sentences))

In [None]:
len('Fr xmpl, cntrs wth  lt f dsrts cn trnsfrm thr dsrt t ncrs thr hbtbl lnd nd s rrgtn t prvd cln wtr t th dsrt.')

In [None]:
# Report the PERCENTILE-th percentile of sentence lengths for each side.
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length in Source: {np.percentile(list(map(len, source_sentences)), PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length in Target: {np.percentile(list(map(len, target_sentences)), PERCENTILE)}" )

In [None]:
print(source_vocab[42])

In [None]:
target_vocab[7438]

In [None]:
source_x = ['a']

In [None]:
# Check whether the apostrophe is an entry of the source vocabulary.
tk = "'"
print("yes" if tk in source_vocab else "No")

In [None]:
# Fixed sequence length shared by the length filter, the token padding and the attention masks.
max_sequence_length = 200

def is_valid_tokens(sentence, vocab):
    """Return True when every non-space item of ``sentence`` is contained in ``vocab``.

    Iterating a string yields single characters, so for a string argument the
    check is per character, not per word or word piece. On the first failing
    item a diagnostic line is printed and False is returned.

    Items are visited in order of first appearance, so the reported item is
    always the earliest unknown one. Iterating a ``set`` instead would make the
    reported item depend on string hash order, which varies between runs.

    Args:
        sentence: iterable of hashable items to check (typically a string).
        vocab: container supporting ``in`` that holds the known entries.

    Returns:
        bool: True if all items other than ' ' are in ``vocab``, else False.
    """
    # dict.fromkeys de-duplicates while preserving first-appearance order.
    for token in dict.fromkeys(sentence):
        if token != ' ' and token not in vocab:
            print(f"not found token: {token}")
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    """Tell whether ``sentence`` is short enough for ``max_sequence_length``.

    One slot is held back for the end token that gets added later, so the
    number of items in ``sentence`` must be strictly below
    ``max_sequence_length - 1``.
    """
    item_budget = max_sequence_length - 1
    item_count = sum(1 for _ in sentence)
    return item_count < item_budget

# Collect the positions of sentence pairs that pass every check:
# both lower-cased sentences fit the length limit, and the lower-cased source
# contains only characters known to the source vocabulary.
valid_sentence_indicies = []
for index, raw_source in enumerate(source_sentences):
    source_sentence = raw_source.lower()
    target_sentence = target_sentences[index].lower()
    if not is_valid_length(source_sentence, max_sequence_length):
        continue
    if not is_valid_length(target_sentence, max_sequence_length):
        continue
    if not is_valid_tokens(source_sentence, source_vocab):
        continue
    valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(source_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

In [None]:
# Keep only the sentence pairs whose positions were recorded in valid_sentence_indicies.
# NOTE: both names are rebound to the filtered lists, so this cell is not idempotent —
# running it a second time applies the same indices to the already-filtered lists.
source_sentences = [source_sentences[i] for i in valid_sentence_indicies]
target_sentences = [target_sentences[i] for i in valid_sentence_indicies]

In [None]:
len(source_sentences), len(target_sentences)

In [None]:
target_sentences[:3]

In [None]:
source_sentences[: 3]

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Pairs each source sentence with the target sentence at the same position.

    Items are returned exactly as stored, as ``(source, target)`` tuples; no
    tokenization or tensor conversion happens here.
    """

    def __init__(self, source_sentences, target_sentences):
        """Store the two parallel sequences.

        Args:
            source_sentences: sized, indexable collection of source sentences.
            target_sentences: sized, indexable collection of target sentences,
                aligned position by position with ``source_sentences``.

        Raises:
            ValueError: if the two collections differ in length. ``__len__``
                only reports the source length, so a mismatch would otherwise
                surface later as an IndexError or as silently ignored targets.
        """
        if len(source_sentences) != len(target_sentences):
            raise ValueError(
                "source_sentences and target_sentences must have the same length, "
                f"got {len(source_sentences)} and {len(target_sentences)}"
            )
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.source_sentences)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at position ``idx``."""
        return self.source_sentences[idx], self.target_sentences[idx]

In [None]:
dataset = TextDataset(source_sentences, target_sentences)

In [None]:
len(dataset)

In [None]:
dataset[0]

In [None]:
# Serve the dataset three pairs at a time and open one iterator over the loader.
batch_size = 3
train_loader = DataLoader(dataset, batch_size=batch_size)
iterator = iter(train_loader)

In [None]:
# Print up to four batches from the iterator, each followed by its running number.
# After the loop, `batch` still holds the last batch that was printed.
for batch_num, batch in enumerate(iterator):
    print(batch, batch_num, sep="\n")
    if batch_num >= 3:
        break

In [None]:
# Special vocabulary entries. tokenize() looks up START_TOKEN, END_TOKEN and
# PADDING_TOKEN in the vocabulary mapping to mark and pad each sequence.
PADDING_TOKEN = '[PAD]'
UNKNOWN_TOKEN = '[UNK]'
START_TOKEN = '[START]'
END_TOKEN = '[END]'

In [None]:
target_to_index['##igan']
# index_to_target[7438]

In [None]:
batch[1]

In [None]:
from torchtext.transforms import BERTTokenizer
# One tokenizer per vocabulary file. Both are configured to lower-case their input
# and, with return_tokens=True, to hand back tokens, which tokenize() then maps to
# indices through the vocabulary dictionaries.
source_tokenizer = BERTTokenizer(vocab_path=source_vocab_file_path, do_lower_case=True, return_tokens=True)
target_tokenizer = BERTTokenizer(vocab_path=target_vocab_file_path, do_lower_case=True, return_tokens=True)

In [None]:
def tokenize(sentence, language_to_index, tokenizer_type, start_token=True, end_token=True):
    """Convert one sentence into a fixed-length tensor of vocabulary indices.

    The sentence is split by the module-level ``source_tokenizer`` or
    ``target_tokenizer``, each token is mapped through ``language_to_index``,
    optional start/end markers are added, and the result is padded with the
    padding index up to ``max_sequence_length``.

    Args:
        sentence: text to tokenize.
        language_to_index: mapping from token string to integer index; it must
            contain every produced token plus the special tokens that get used.
        tokenizer_type: ``'src_tkn'`` for the source tokenizer or ``'tgt_tkn'``
            for the target tokenizer.
        start_token: prepend the index of ``START_TOKEN`` when True.
        end_token: append the index of ``END_TOKEN`` when True.

    Returns:
        torch.Tensor: 1-D tensor of exactly ``max_sequence_length`` indices.
        Sequences that would be longer are cut to that length (keeping the end
        marker in the last slot when ``end_token`` is True), so the results of
        several calls can always be stacked.

    Raises:
        ValueError: if ``tokenizer_type`` is not one of the two known values.
        KeyError: if a token is missing from ``language_to_index``.
    """
    if tokenizer_type == 'src_tkn':
        tokenizer = source_tokenizer
    elif tokenizer_type == 'tgt_tkn':
        tokenizer = target_tokenizer
    else:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError(
            f"unknown tokenizer_type {tokenizer_type!r}; expected 'src_tkn' or 'tgt_tkn'"
        )

    sentence_word_indicies = [language_to_index[token] for token in tokenizer(sentence)]

    if start_token:
        sentence_word_indicies.insert(0, language_to_index[START_TOKEN])
    if end_token:
        sentence_word_indicies.append(language_to_index[END_TOKEN])

    # Enforce the fixed length: an over-long sequence would otherwise yield a
    # tensor that cannot be stacked with the others.
    if len(sentence_word_indicies) > max_sequence_length:
        del sentence_word_indicies[max_sequence_length:]
        if end_token and sentence_word_indicies:
            sentence_word_indicies[-1] = language_to_index[END_TOKEN]

    # Pad only when needed, so the padding entry is looked up only in that case.
    padding_needed = max_sequence_length - len(sentence_word_indicies)
    if padding_needed > 0:
        sentence_word_indicies.extend([language_to_index[PADDING_TOKEN]] * padding_needed)

    return torch.tensor(sentence_word_indicies)

In [None]:
# Tokenize every sentence pair of the current `batch` (the last one left behind by
# the preview loop). Iterating over the batch itself instead of range(batch_size)
# keeps this working for a final batch that holds fewer than batch_size pairs.
# Source sentences get no start/end markers; target sentences get both.
source_tokenized, target_tokenized = [], []
for source_sentence, target_sentence in zip(batch[0], batch[1]):
    source_tokenized.append( tokenize(source_sentence, source_to_index, tokenizer_type = 'src_tkn', start_token=False, end_token=False) )
    target_tokenized.append( tokenize(target_sentence, target_to_index, tokenizer_type = 'tgt_tkn', start_token=True, end_token=True) )
source_tokenized = torch.stack(source_tokenized)
target_tokenized = torch.stack(target_tokenized)

In [None]:
len(target_tokenized), len(source_tokenized)

In [None]:
target_tokenized

In [None]:
len(batch[0])

In [None]:
# Additive value written at attention positions that must be ignored.
NEG_INFTY = -1e9

def create_masks(source_batch, target_batch):
    """Build the three additive attention masks for one batch.

    For every batch item, all positions from ``len(item) + 1`` onward are
    treated as padding. The decoder self-attention mask additionally blocks
    every position above the main diagonal (look-ahead).

    NOTE(review): lengths come from ``len()`` of the batch items. When the
    batch holds raw strings these are character counts, not token counts —
    confirm this matches the tokenization used for the model inputs.

    Args:
        source_batch: sized, indexable batch of source items.
        target_batch: indexable batch of target items, aligned with ``source_batch``.

    Returns:
        tuple: ``(encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask)``, each of shape
        ``[len(source_batch), max_sequence_length, max_sequence_length]`` with
        ``NEG_INFTY`` at masked positions and 0 elsewhere. A 10x10 corner of
        the first item's masks is printed as a side effect.
    """
    batch_len = len(source_batch)
    square_shape = [max_sequence_length, max_sequence_length]
    batched_shape = [batch_len] + square_shape

    # True strictly above the diagonal: a position may not attend to later ones.
    causal_block = torch.triu(torch.full(square_shape, True), diagonal=1)

    enc_padding = torch.full(batched_shape, False)
    dec_self_padding = torch.full(batched_shape, False)
    dec_cross_padding = torch.full(batched_shape, False)

    for row in range(batch_len):
        src_pad_start = len(source_batch[row]) + 1
        tgt_pad_start = len(target_batch[row]) + 1

        # Encoder: padded source positions are blocked as keys and as queries.
        enc_padding[row, :, src_pad_start:] = True
        enc_padding[row, src_pad_start:, :] = True

        # Decoder self-attention: same treatment using the target length.
        dec_self_padding[row, :, tgt_pad_start:] = True
        dec_self_padding[row, tgt_pad_start:, :] = True

        # Cross-attention: columns follow the source, rows follow the target.
        dec_cross_padding[row, :, src_pad_start:] = True
        dec_cross_padding[row, tgt_pad_start:, :] = True

    encoder_self_attention_mask = torch.where(enc_padding, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(causal_block | dec_self_padding, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(dec_cross_padding, NEG_INFTY, 0)

    print(f"encoder_self_attention_mask {encoder_self_attention_mask.size()}: {encoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_self_attention_mask {decoder_self_attention_mask.size()}: {decoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: {decoder_cross_attention_mask[0, :10, :10]}")
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [None]:
create_masks(batch[0], batch[1])

In [None]:
# class SentenceEmbedding(nn.Module):
#     "For a given sentence, create an embedding"
#     def __init__(self, max_sequence_length, d_model, language_to_index, source_tokenizer, target_tokenizer,START_TOKEN, END_TOKEN, PADDING_TOKEN):
#         super().__init__()
#         self.vocab_size = len(language_to_index)
#         self.max_sequence_length = max_sequence_length
#         self.embedding = nn.Embedding(self.vocab_size, d_model)
#         self.language_to_index = language_to_index
#         self.source_tokenizer = source_tokenizer
#         self.target_tokenizer = target_tokenizer
#         self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
#         self.dropout = nn.Dropout(p=0.1)
#         self.START_TOKEN = START_TOKEN
#         self.END_TOKEN = END_TOKEN
#         self.PADDING_TOKEN = PADDING_TOKEN

#     def batch_tokenize(self, batch, start_token=True, end_token=True):

#         def tokenize(sentence, tokenizer_type, start_token=True, end_token=True):

#           if tokenizer_type == 'src_tkn':
#               src_sentence_tokens = self.source_tokenizer(sentence)
#               sentence_word_indicies = [self.language_to_index[token] for token in src_sentence_tokens]
#           if tokenizer_type == 'tgt_tkn':
#               tgt_sentence_tokens = self.target_tokenizer(sentence)
#               sentence_word_indicies = [self.language_to_index[token] for token in tgt_sentence_tokens]

#           if start_token:
#               sentence_word_indicies.insert(0, self.language_to_index[self.START_TOKEN])
#           if end_token:
#               sentence_word_indicies.append(self.language_to_index[self.END_TOKEN])

#           for _ in range(len(sentence_word_indicies), self.max_sequence_length):
#               sentence_word_indicies.append(self.language_to_index[self.PADDING_TOKEN])

#           return torch.tensor(sentence_word_indicies)

#         tokenized = []
#         for sentence_num in range(len(batch)):
#            tokenized.append( tokenize(batch[sentence_num], start_token, end_token) )
#         tokenized = torch.stack(tokenized)
#         return tokenized.to(get_device())

#     def forward(self, x, end_token=True): # sentence
#         x = self.batch_tokenize(x ,end_token)
#         x = self.embedding(x)
#         pos = self.position_encoder().to(get_device())
#         x = self.dropout(x + pos)
#         return x