In [None]:
!pip install transformers

In [None]:
from transformers import AutoModel, AutoTokenizer
# Load the pretrained XLM-RoBERTa tokenizer; the cells below call its tokenize() method.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# Alternative Turkish tokenizer, currently disabled:
#tokenizer = AutoTokenizer.from_pretrained("dbmdz/distilbert-base-turkish-cased")

In [None]:
# NOTE(review): nothing in this notebook reads src_lang afterwards — confirm it has any effect
# for this tokenizer.
tokenizer.src_lang = 'tr'
# Register the line-break tag as a token of its own; the generation cells compare sampled
# tokens against '<br/>' to emit a newline.
tokenizer.add_tokens('<br/>')

In [None]:
# Sanity check: show how a Turkish sentence containing the added <br/> token is segmented.
print(tokenizer.tokenize("merhaba benim adım kaan efe keleştir geldim gittim <br/> çekoslavaykyalılaştırdım"))

In [None]:
import math
from typing import Tuple
import numpy as np
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
import pandas as pd
import torch.distributions as dist
import re

In [None]:
import torch
from torch import nn
import numpy as np
import math

def apply_mask(matrices, maskval, mask_diagonal=True):
    """Overwrite the upper triangle of the last two dimensions of ``matrices`` in place.

    Args:
        matrices: tensor whose trailing two dimensions are treated as (rows, cols).
        maskval: value written into every masked position.
        mask_diagonal: when True the main diagonal is masked as well; when False only
            the entries strictly above the diagonal are overwritten.

    Returns:
        None. ``matrices`` itself is modified.
    """
    rows = matrices.size(-2)
    cols = matrices.size(-1)
    # offset=0 includes the diagonal, offset=1 starts one step above it
    first_masked_diagonal = 0 if mask_diagonal else 1
    masked_rows, masked_cols = torch.triu_indices(rows, cols, offset=first_masked_diagonal)
    matrices[..., masked_rows, masked_cols] = maskval


class multi_head_attention(nn.Module):
  """Multi-head scaled dot-product attention where every head uses a full d_model-wide projection.

  Keys, queries and values are each projected from d_model to d_model*heads features, so
  each head works on vectors of length d_model. The head outputs are concatenated and
  projected back to d_model.

  Args:
    d_model: size of the input/output feature dimension.
    mask: when truthy, positions above the diagonal of the score matrix are masked out
      (a position cannot attend to later positions).
    heads: number of attention heads.
  """
  def __init__(self, d_model, mask, heads=8):
    super().__init__()
    self.d_model = d_model
    self.heads = heads
    self.mask = mask

    self.keyWeights = nn.Linear(d_model, d_model*heads, bias=False)
    self.queryWeights = nn.Linear(d_model, d_model*heads, bias=False)
    self.valueWeights = nn.Linear(d_model, d_model*heads, bias=False)

    self.concatHeads = nn.Linear(d_model*heads, d_model)

  def forward(self, v, k, q):
    """Compute attention for batch-first inputs.

    Args:
      v, k, q: tensors of shape (batch, sequence, d_model).

    Returns:
      Tensor of shape (batch, sequence, d_model).

    Raises:
      ValueError: if the embedding length is not divisible by the number of heads.
    """
    #b:batch_len, s:sequence_len, e:encoding_len
    b,s,e = v.size()
    h = self.heads

    #e must be divisible by h; fail loudly instead of returning None to the caller
    if e % h != 0:
      raise ValueError("Head size should be a diviser of embedding length")

    #apply the weights on inputs and then reshape to b,s,h,e to be able to move head dimension later
    keys  = self.keyWeights(k).view(b,s,h,e)
    queries  = self.queryWeights(q).view(b,s,h,e)
    values  = self.valueWeights(v).view(b,s,h,e)

    #we need to move batch and head dimension next to each other to form a b*h sized dimension
    keys = keys.transpose(1, 2).reshape(b * h, s, e)
    queries = queries.transpose(1, 2).reshape(b * h, s, e)
    values = values.transpose(1, 2).reshape(b * h, s, e)

    #queries b*h,s,e X keys: b*h,e,s so output dimensions: b*h,s,s
    dot = torch.bmm(queries, keys.transpose(1, 2))
    scaled_dot = dot / (e**(1/2))

    #mask and softmax must operate on the SCALED scores; previously the unscaled
    #product was used and the scaling above had no effect
    if self.mask:
      apply_mask(scaled_dot, float('-inf'), mask_diagonal=False)
    #apply softmax over the key dimension
    attention_weights = nn.functional.softmax(scaled_dot, dim=2)

    #apply self-attention to values mul's result dimension: b*h,s,e
    #we will reshape to b,h,s,e
    out = torch.bmm(attention_weights, values).view(b, h, s, e)

    #swap h and s dimension to concat all the h's over e dimensions
    out = out.transpose(1,2).reshape(b, s, h*e)
    return self.concatHeads(out)

#TODO: check weight initializations for all parameter types
class position_wise_feed_forward(nn.Module):
  """Two-layer feed-forward network applied independently at every sequence position.

  Implemented with kernel-size-1 convolutions: d_model -> dff (ReLU) -> d_model.
  """
  def __init__(self, d_model, dff):
    super().__init__()
    self.conv1 = nn.Conv1d(d_model, dff, kernel_size=1)
    self.conv2 = nn.Conv1d(dff, d_model, kernel_size=1)

  def forward(self, x):
    """Map (batch_size, seq_len, d_model) to a tensor of the same shape."""
    #Conv1d wants channels on dimension 1, so go to (batch_size, d_model, seq_len)
    channels_first = x.transpose(1, 2)
    hidden = torch.relu(self.conv1(channels_first))
    #back to (batch_size, seq_len, d_model) so the output matches the input layout
    return self.conv2(hidden).transpose(1, 2)


class EncoderLayer(nn.Module):
  """One transformer block: self-attention and feed-forward sub-layers.

  Each sub-layer's output goes through dropout, is added to the sub-layer input
  (residual connection) and is then layer-normalised.
  """
  def __init__(self, d_model, mask, heads, dff, dp_rate):
    super().__init__()
    #attention sub-layer
    self.multi_head_attention = multi_head_attention(d_model, mask, heads)
    self.add_norm1 = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout1 = nn.Dropout(dp_rate)

    #feed-forward sub-layer
    self.ffn = position_wise_feed_forward(d_model, dff)
    self.add_norm2 = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout2 = nn.Dropout(dp_rate)

  def forward(self, x):
    #self-attention: the same tensor serves as values, keys and queries
    attended = self.dropout1(self.multi_head_attention(x, x, x))
    after_attention = self.add_norm1(x + attended)

    transformed = self.dropout2(self.ffn(after_attention))
    return self.add_norm2(after_attention + transformed)

def get_angles(pos, i, d_model):
  """Return pos / 10000^(2*(i//2)/d_model), the argument of the sinusoidal position encoding.

  ``pos`` and ``i`` may be scalars or broadcastable numpy arrays.
  """
  #consecutive even/odd indices share one exponent because of the i//2
  exponent = (2 * (i//2)) / np.float32(d_model)
  inverse_wavelength = 1 / np.power(10000, exponent)
  return pos * inverse_wavelength

def positional_encoding(position, d_model):
  """Build a sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to generate.
    d_model: encoding width (columns).

  Returns:
    torch tensor of shape (1, position, d_model) created from the numpy table.
  """
  row_positions = np.arange(position)[:, np.newaxis]
  column_indices = np.arange(d_model)[np.newaxis, :]
  table = get_angles(row_positions, column_indices, d_model)

  # even columns (2i) hold the sine, odd columns (2i+1) hold the cosine
  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # prepend a broadcastable batch dimension
  return torch.from_numpy(table[np.newaxis, ...])


class EncoderStack(nn.Module):
  """Token embedding + learned positional embedding followed by a stack of EncoderLayer blocks.

  Args:
    num_layers: number of EncoderLayer blocks.
    mask: forwarded to every layer's attention (truthy enables the causal mask).
    d_model: embedding / model width.
    heads: attention heads per layer.
    dff: hidden width of the position-wise feed-forward network.
    dp_rate: dropout probability.
    vocab_size: number of rows of the token embedding table.
    seq_length: number of rows of the positional embedding table; input sequences
      must not be longer than this.
  """
  def __init__(self, num_layers, mask, d_model, heads, dff, dp_rate, vocab_size, seq_length):
    super().__init__()
    self.d_model = d_model
    self.num_layers = num_layers

    #embeddings
    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_embedding = nn.Embedding(embedding_dim=d_model, num_embeddings=seq_length)
    self.dropout = nn.Dropout(dp_rate)

    #encoder layers
    self.enc_layers = nn.Sequential(
      *[EncoderLayer(d_model, mask, heads, dff, dp_rate) for _ in range(num_layers)]
    )

  def forward(self, x):
    """Encode token ids of shape (batch, sequence) into (batch, sequence, d_model)."""
    #learned token embeddings, scaled by sqrt(d_model)
    x = self.embedding(x) * math.sqrt(self.d_model)
    seq_size = x.size(1)

    #positions are created on the input's own device, so the module does not
    #depend on a notebook-level `device` variable being defined
    positions = torch.arange(seq_size, device=x.device)
    #(1, sequence, d_model) broadcasts over the batch dimension
    x = x + self.pos_embedding(positions)[None, :, :]
    #dropout layer
    x = self.dropout(x)

    #encoder stack
    for layer in self.enc_layers:
      x = layer(x)
    return x


from torch import nn

class GeneratorTransformer(nn.Module):
  """Language model: an EncoderStack followed by a linear projection onto the vocabulary.

  The output projection shares its weight Parameter with the token embedding table.
  ``forward`` returns log-probabilities, which is the input nn.NLLLoss expects.

  Args:
    num_layers, d_model, heads, dff, dp_rate, vocab_size, seq_length, mask:
      forwarded to EncoderStack; ``vocab_size`` also sets the output width.
  """
  def __init__(self, num_layers, d_model, heads, dff, dp_rate, vocab_size, seq_length, mask=True):
    super().__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.vocab_size = vocab_size

    #no device placement here: the constructor used to read a notebook-level `device`
    #variable; moving the whole model with .to(...) is left to the caller
    self.encoder_stack = EncoderStack(num_layers, mask, d_model, heads, dff, dp_rate, vocab_size, seq_length)
    self.outprobs = nn.Linear(d_model, vocab_size)
    self.init_weights()

  def init_weights(self):
    """Tie the output projection to the embedding table, then initialise every sub-module."""
    self.outprobs.weight = self.encoder_stack.embedding.weight
    self.apply(self._init_weights)

  def _init_weights(self, module):
    """Initialise one sub-module: N(0, 0.02) weights, zero biases, identity LayerNorm."""
    if isinstance(module, (nn.Linear, nn.Embedding, nn.Conv1d)):
      module.weight.data.normal_(mean=0.0, std=0.02)
      if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
        module.bias.data.zero_()
    elif isinstance(module, nn.LayerNorm):
      module.bias.data.zero_()
      module.weight.data.fill_(1.0)

  def forward(self, x):
    """Map token ids (batch, sequence) to log-probabilities (batch, sequence, vocab_size)."""
    batch_size, sequence_size = x.size()

    x = self.encoder_stack(x)
    #reshape (not view) so a non-contiguous encoder output cannot raise here
    flat = x.reshape(batch_size * sequence_size, self.d_model)
    x = self.outprobs(flat).view(batch_size, sequence_size, self.vocab_size)
    return nn.functional.log_softmax(x, dim=2)


In [None]:
# First corpus: a CSV file; its 'poem' column is used below.
poems1 = pd.read_csv('poems.csv')
# Second corpus: a plain-text file. The context manager closes the handle after reading
# (it was previously left open for the lifetime of the kernel).
with open("poems2.txt") as poems2_f:
    poems2 = poems2_f.read()

In [None]:
# Preview the 'poem' column of the CSV corpus.
poems1['poem']

In [None]:
# Substrings that are replaced by a single space while cleaning the corpora.
# A comma was missing after 'î': Python concatenated it with the following 'û' into the
# single entry 'îû', so neither character was actually being removed.
chars_to_remove = ['\t', ':', 'î',
       'û', '(', ')', '—', '…',
       'ə',  '”', '*', '“', '"', 'j', 'w', '«', '»',
        'Â', '_',  '‘',  'à',  '`',
       'W', '–', 'ê', '=',  '´', '[', 'Î', 'Ə', 'ä', 'ß',
       '•', 'é', ']', '@', 'ô', '+', 'ù', '&', '¥', '\xa0', 'ý', '·', 'Û',
       '|', 'Ò', '‚', '%', '^', '¹', '\u200e', 'Í', 'Ã', '#', 'ú', 'è',
       '„','</p>','<p>','...','0','1','2','3','4','5','6','7','8','9',"quot;"]

In [None]:
# Replace every unwanted substring in each CSV poem with a space, then join the cleaned
# poems back to back (no separator is inserted between poems).
cleaned_poems = []
for poem in poems1['poem']:
    for unwanted in chars_to_remove:
        poem = poem.replace(unwanted, ' ')
    cleaned_poems.append(poem)
text = "".join(cleaned_poems)

In [None]:
# Apply the same clean-up to the plain-text corpus and append it to the CSV text.
# Caution: running this cell twice appends poems2 to `text` a second time.
for unwanted in chars_to_remove:
    poems2 = poems2.replace(unwanted, ' ')
text = " ".join([text, poems2])

In [None]:
# Normalise line-break markers. The first three steps are literal replacements, so plain
# str.replace is enough.
text = text.replace('<br>', '<br/>')
text = text.replace('<br/>', ' <br/> ')
text = text.replace('\n', '<br/>')
# Collapse runs of spaces, then runs of dots. Raw strings keep the backslash intact:
# '\.' in a normal string literal is an invalid escape sequence.
text = re.sub(r' +', ' ', text)
text = re.sub(r'\.+', '.', text)

In [None]:
# Segment the whole cleaned corpus with the tokenizer; the result is later mapped to
# integer ids through vocab_dict.
tokenized_text = tokenizer.tokenize(text)

In [None]:
# Show only the first tokens; displaying the full list would put the entire corpus
# into the cell output.
tokenized_text[:50]

In [None]:
# Distinct tokens of the corpus and the number of occurrences of each.
uniques, counts = np.unique(tokenized_text, return_counts=True)

In [None]:
# Distinct tokens ordered from most to least frequent (ascending argsort, then reversed).
ascending_order = np.argsort(counts)
freq_sorted = uniques[ascending_order][::-1]

In [None]:
# The 20 most frequent tokens.
freq_sorted[0:20]

In [None]:
# token string -> integer id, where the id is the token's frequency rank (0 = most frequent)
vocab_dict = {token: rank for rank, token in enumerate(freq_sorted)}

In [None]:
# integer id -> token string (inverse of vocab_dict)
detok_vocab_dict = dict(enumerate(freq_sorted))

In [None]:
def tokenize(char_array, vocab):
    """Translate a sequence of token strings into ids using ``vocab``.

    Tokens missing from ``vocab`` are mapped to None. A progress percentage is printed
    (overwriting the same line) every 100000 tokens, including at index 0.
    """
    total = len(char_array)
    ids = []
    for position, token in enumerate(char_array):
        if position % 100000 == 0:
            print(f"%{position/total*100}",end='\r')
        ids.append(vocab.get(token))
    return ids

In [None]:
def de_tokenize(tok_array, vocab):
    """Translate a sequence of ids back into token strings using ``vocab``.

    Ids missing from ``vocab`` are mapped to None. A progress percentage is printed
    (overwriting the same line) every 100000 ids, including at index 0.
    """
    total = len(tok_array)
    tokens = []
    for position, token_id in enumerate(tok_array):
        if position % 100000 == 0:
            print(f"%{position/total*100}",end='\r')
        tokens.append(vocab.get(token_id))
    return tokens

In [None]:
# Convert the corpus token strings into integer ids with the frequency-ranked vocabulary.
tokenized = tokenize(tokenized_text,vocab_dict)

In [None]:
# Sequential split without shuffling: the first train_val_split percent of the id stream
# is used for training, the rest for validation.
tokenized_len = len(tokenized)
train_val_split = 90
split_index = tokenized_len * train_val_split // 100
train_data = torch.tensor(tokenized[:split_index])
validation_data = torch.tensor(tokenized[split_index:])

In [None]:
def split_to_batches(src, batch_size):
    """Cut a 1-D id tensor into ``batch_size`` parallel streams.

    Tokens that do not fill a complete column are dropped. The result has shape
    (tokens_per_stream, batch_size) — each column is one contiguous piece of the
    corpus — and is moved to the notebook-level ``device``.
    """
    tokens_per_stream = src.size(0) // batch_size
    usable = src[:tokens_per_stream * batch_size]
    streams = usable.view(batch_size, tokens_per_stream)
    return streams.t().contiguous().to(device)

In [None]:
def get_batch_data(src, bptt, batch_count, batch_first=True):
    """Slice one (input, target) chunk out of a batchified token tensor.

    Args:
        src: 2-D tensor laid out as (sequence, batch), the layout produced by
            split_to_batches.
        bptt: maximum chunk length along the sequence dimension.
        batch_count: index of the first sequence position of the chunk.
        batch_first: when True (default) ``data`` is returned as (batch, chunk_len),
            the layout the model in this notebook consumes (it unpacks its input as
            batch, sequence). Pass False for the previous (chunk_len, batch) layout.

    Returns:
        (data, target): ``target`` holds the tokens one position after ``data``,
        flattened in the same element order as ``data``.
    """
    # the last chunk may be shorter; one token is kept in reserve for the shifted target
    seq_len = min(bptt, len(src) - 1 - batch_count)
    data = src[batch_count:batch_count+seq_len]
    target = src[batch_count+1:batch_count+1+seq_len]
    if batch_first:
        # Without this transpose the model attended across the batch dimension,
        # i.e. across unrelated parts of the corpus.
        data = data.t().contiguous()
        target = target.t()
    return data, target.reshape(-1)

In [None]:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group (None if it has none)."""
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

In [None]:
def sampler(prompt ,out,temp = 0.6):
    """Sample the next token and append it to ``prompt``.

    Args:
        prompt: list of token strings; it is extended in place.
        out: model output; only ``out[0][-1]`` (first batch entry, last position) is used.
        temp: temperature dividing the scores before the softmax.

    Returns:
        The same ``prompt`` list with one sampled token string appended.
    """
    last_position_scores = out[0][-1]
    token_probabilities = F.softmax(last_position_scores / temp, dim=0)
    sampled_id = dist.Categorical(token_probabilities).sample().cpu().item()
    sampled_token = de_tokenize([sampled_id], detok_vocab_dict)[0]
    prompt.append(sampled_token)
    return prompt

In [None]:
import copy
import time

batch_size = 4  # number of parallel token streams produced by split_to_batches
backpropagation_through_time = 1024  # chunk length per step; also the size of the positional embedding table
# Use the GPU when one is available instead of failing on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
ntokens = len(vocab_dict)  # size of vocabulary
emsize = 512  # embedding dimension (d_model)
d_hid = 512   # hidden width of the position-wise feed-forward network (dff)
nlayers = 1 # number of EncoderLayer blocks
nhead = 4  # number of attention heads
dropout = 0.1 # dropout probability
model = GeneratorTransformer(num_layers=nlayers, d_model=emsize, heads=nhead, dff=d_hid, dp_rate=dropout, vocab_size=ntokens, seq_length=backpropagation_through_time, mask=True).to(device)

trn_batch_split_tokens = split_to_batches(train_data,batch_size).to(device)
val_batch_split_tokens = split_to_batches(validation_data,batch_size).to(device)

criterion = nn.NLLLoss()  # the model returns log-probabilities
lr = 3e-4  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# multiply the learning rate by 0.95 after every scheduler.step(); step_size is an epoch count, hence an int
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# per-epoch loss histories, appended to by train() and evaluate()
train_hist = []
eval_hist = []


In [None]:
def train(model: nn.Module) -> None:
    """Train ``model`` for one pass over ``trn_batch_split_tokens``.

    Uses the notebook-level objects trn_batch_split_tokens, backpropagation_through_time,
    criterion, optimizer, ntokens and the helpers get_batch_data / get_lr. The progress
    line additionally reads the global loop variable ``epoch``. The mean loss of the
    pass is appended to ``train_hist``.
    """
    log_interval = 100  # batches between two progress lines
    model.train()  # turn on train mode
    start_time = time.time()
    num_batches = len(trn_batch_split_tokens) // backpropagation_through_time
    mean_losses = []
    for batch, i in enumerate(range(0, trn_batch_split_tokens.size(0) - 1, backpropagation_through_time)):
        data, targets = get_batch_data(trn_batch_split_tokens, backpropagation_through_time,i)
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        # clip the global gradient norm to 0.5 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        mean_losses.append(loss.item())
        if batch % log_interval == 0 and batch > 0:
            # elapsed time covers a whole logging window, so divide by its length
            # to really report milliseconds per batch
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {get_lr(optimizer)} | ms/batch {ms_per_batch:5.2f} | '
                  f'overall loss {np.mean(mean_losses)}')
            start_time = time.time()
    train_hist.append(np.mean(mean_losses))
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    """Return the mean chunk loss of ``model`` on ``eval_data``, without gradient tracking.

    Switches the model to evaluation mode (it is not switched back here) and appends
    the resulting mean to the notebook-level ``eval_hist`` list.
    """
    model.eval()  # turn on evaluation mode
    chunk_losses = []
    chunk_starts = range(0, eval_data.size(0) - 1, backpropagation_through_time)
    with torch.no_grad():
        for start in chunk_starts:
            data, targets = get_batch_data(eval_data,backpropagation_through_time, start)
            predictions = model(data)
            chunk_loss = criterion(predictions.view(-1, ntokens), targets)
            chunk_losses.append(chunk_loss.item())
    mean_loss = np.mean(chunk_losses)
    eval_hist.append(mean_loss)
    return mean_loss

In [None]:
# Main training loop: one train() pass and one validation pass per epoch, a short text
# sample for visual inspection, best-model tracking and learning-rate decay.
best_val_loss = float('inf')
epochs = 100
best_model = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_batch_split_tokens)
    # perplexity is the exponential of the mean validation loss
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    # Generate 100 tokens from a fixed prompt to eyeball progress.
    # NOTE(review): tokenize() yields None for pieces missing from vocab_dict, which
    # torch.tensor cannot convert — confirm the prompt pieces are in the vocabulary.
    prompt = "Yalnız"
    overall = prompt
    prompt = tokenizer.tokenize(prompt)
    with torch.no_grad():
        model.eval()
        for i in range(100):
            # keep at most the last backpropagation_through_time tokens as context
            prompt = prompt[-backpropagation_through_time:]
            out = model(torch.tensor(tokenize(list(prompt),vocab_dict)).reshape(1,len(prompt)).to(device))
            prompt = sampler(prompt, out,0.7)
            # rebuild readable text: '▁' marks the start of a new word, '<br/>' a line break
            if prompt[-1][0] == '▁':
                 overall = overall + " " + prompt[-1][1:]
            elif prompt[-1] == '<br/>':
                 overall = overall + "\n"           
            else:
                overall = overall + prompt[-1]
        print(overall)

    # keep an independent copy of the model with the lowest validation loss so far
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    scheduler.step()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Loss curves; labels and a legend make the two lines distinguishable.
plt.plot(eval_hist, label="validation loss")
plt.plot(train_hist, label="training loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
# plt.show() replaces the stray `plt` expression, which only printed the module's repr
plt.show()

In [None]:
# Continue a longer prompt for 300 tokens with the best model found during training.
# Note: this rebinds `model` to `best_model`.
model = best_model
prompt = "Yağmuru seviyorum diyorsun,\nyağmur yağınca şemsiyeni açıyorsun \nGüneşi seviyorum diyorsun,\ngüneş açınca gölgeye kaçıyorsun \nRüzgarı seviyorum diyorsun,\nrüzgar çıkınca pencereni kapatıyorsun\nİşte,bunun için korkuyorum;\nBeni de sevdiğini söylüyorsun.."
overall = prompt
# NOTE(review): tokenize() yields None for pieces missing from vocab_dict, which
# torch.tensor cannot convert — confirm the prompt pieces are in the vocabulary.
prompt = tokenizer.tokenize(prompt)
with torch.no_grad():
        model.eval()
        for i in range(300):
            # keep at most the last backpropagation_through_time tokens as context
            prompt = prompt[-backpropagation_through_time:]
            out = model(torch.tensor(tokenize(list(prompt),vocab_dict)).reshape(1,len(prompt)).to(device))
            # temperature 1: sample from the unmodified distribution
            prompt = sampler(prompt, out,1)
            # rebuild readable text: '▁' marks the start of a new word, '<br/>' a line break
            if prompt[-1][0] == '▁':
                 overall = overall + " " + prompt[-1][1:]
            elif prompt[-1] == '<br/>':
                 overall = overall + "\n"           
            else:
                overall = overall + prompt[-1]
        print(overall)

In [None]:
 # Validation loss of the current `model`; evaluate() also appends the value to eval_hist.
 val_loss = evaluate(model, val_batch_split_tokens)
 print(val_loss)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the checkpoint paths below live under it.
drive.mount('/content/drive')

In [None]:
# The checkpoint name encodes the hyperparameters:
# batch size, chunk length, embedding size, feed-forward size, layers, heads.
hyperparameter_tag = '_'.join(
    str(value)
    for value in (batch_size, backpropagation_through_time, emsize, d_hid, nlayers, nhead)
)
path = '/content/drive/MyDrive/transformer_models/' + hyperparameter_tag + '.pth'
# the whole module object is pickled, not only its state_dict
torch.save(model, path)

In [None]:
path = "/content/drive/MyDrive/transformer_models/4_1024_512_512_1_4.pth"
# The checkpoint is a fully pickled nn.Module (written with torch.save(model, path)).
# Recent PyTorch releases default to weights_only=True, which refuses to unpickle
# arbitrary classes, so it is disabled explicitly.
# SECURITY: unpickling executes code — only load checkpoints you created yourself.
# map_location places the weights on the device configured for this session.
model_loaded = torch.load(path, map_location=device, weights_only=False)
model_loaded

In [None]:
 # Reset the validation history, then score the reloaded checkpoint on the validation data
 # (evaluate() appends the result to eval_hist).
 eval_hist = []
 val_loss = evaluate(model_loaded, val_batch_split_tokens)
 print(val_loss)

In [None]:
# Generate 300 tokens with the reloaded checkpoint.
prompt = "Yalnızlık"
overall = prompt
prompt = tokenizer.tokenize(prompt)
# Switch to evaluation mode explicitly (as the other generation cells do) so dropout is
# off no matter which cells were run before this one.
model_loaded.eval()
with torch.no_grad():
    for i in range(300):
        # keep at most the last backpropagation_through_time tokens as context
        prompt = prompt[-backpropagation_through_time:]
        out = model_loaded(torch.tensor(tokenize(list(prompt),vocab_dict)).reshape(1,len(prompt)).to(device))
        prompt = sampler(prompt, out,0.85)
        # rebuild readable text: '▁' marks the start of a new word, '<br/>' a line break
        if prompt[-1][0] == '▁':
            overall = overall + " " + prompt[-1][1:]
        elif prompt[-1] == '<br/>':
            overall = overall + "\n"
        else:
            overall = overall + prompt[-1]
    print(overall)