In [2]:
pip install transformers

Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.7.0 pyyaml-6.0 tokenizers-0.12.1 transformers-4.19.2


In [11]:
from transformers import GPT2TokenizerFast,GPT2LMHeadModel
import numpy as np
import pandas as pd
import torch

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data = pd.read_csv('/content/drive/MyDrive/Barmagan/chosen_data.csv')

In [38]:
model_name = 'aubmindlab/aragpt2-medium'

tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

**Tokenization**

In [11]:
poems = list(data.new_poems2.values)

In [10]:
## add padding token to tokenizer, then modify model's vocab size
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'additional_special_tokens': ['[NEWLINE]']})
model.resize_token_embeddings(len(tokenizer))

Embedding(64002, 1024)

In [19]:
## treat first line as input
def CreateInputs(poems, tokenizer):
    input_lines = []
    for poem in poems:
        poem_lines = poem.split('\n')
        input_line = tokenizer.bos_token 
        for idx in range(0, len(poem_lines)-2):
          
          input_line += poem_lines[idx]+  ' ' + tokenizer.additional_special_tokens[0]
        
        input_line +=  poem_lines[3] + tokenizer.eos_token
                
        input_lines.append(input_line)
            
    return input_lines

input_sentences = CreateInputs(poems, tokenizer)

In [21]:
def get_max_length(input_sentences):
    max_len = 0
    for line in input_sentences:
        encoded_line = tokenizer.encode(line)
        encoded_len = len(encoded_line)
        max_len = max(max_len, encoded_len)
    
    return max_len
 
max_length = get_max_length(input_sentences)

In [None]:
def TokenizeAndEncodeInput(input_sentences, tokenizer, max_len):

    encoded_input = []
    attention_mask = []
 
    for poem in input_sentences:

            
        encoded_line = tokenizer.encode_plus(poem,
                                        max_length=max_len,
                                        padding= 'max_length',
                                        return_tensors= 'pt',
                                            truncation= True)



        encoded_input.append(encoded_line['input_ids'])
        attention_mask.append(encoded_line['attention_mask'])

              
    
    input_tensor = torch.cat(encoded_input, dim=0)
    attention_tensor =torch.cat(attention_mask, dim=0)
    
        
    return input_tensor, attention_tensor

input_tensor, input_mask_tensor = TokenizeAndEncodeInput(input_sentences, tokenizer, max_length)

In [28]:
input_mask_tensor.shape

torch.Size([10000, 50])

In [27]:
from torch.utils.data import TensorDataset, DataLoader, random_split
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F

In [33]:
def create_dataloader(poetry_in, attention_mask,  batch_size):
    
    tensor_dataset = TensorDataset(poetry_in, attention_mask)

    
    train_dataloader = DataLoader(tensor_dataset, batch_size = batch_size)

    return train_dataloader

In [30]:
def initiate(lr, warmup,total_steps, epochs, model = model):

    
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup,
                                                num_training_steps=total_steps)
    
    return optimizer, scheduler

In [None]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    print('No GPU available, using CPU')
    device = torch.device('cpu')
        
model.cuda()  

In [None]:
epochs = 5
train_dataloader = create_dataloader(input_tensor, input_mask_tensor,batch_size = 8)  
total_steps = len(train_dataloader) * epochs
warmup_steps = total_steps * 0.2
   
optimizer, scheduler = initiate(lr= 3e-5, epochs= epochs,warmup = warmup_steps,
                                total_steps=total_steps)

In [35]:
def train(lr, epochs, train_dataloader, model = model, tokenizer = tokenizer):
    idx = 0
    
    for epoch in range(0, epochs):
      loop = tqdm(train_dataloader, leave= True)
      for batch in loop:
        optimizer.zero_grad()
        input_tensor  = batch[0].to(device)
        attention_mask = batch[1].to(device)
            
        outputs = model(input_tensor, attention_mask = attention_mask, labels = input_tensor)
        loss= outputs[0]
        loss.backward()
            
        optimizer.step()
        scheduler.step()
        if(idx == 6000):
          try:
            model_name = 'poetry_generator'+str(epoch)
            torch.save({
                  'model_state_dict': model.state_dict(),
                  'loss': loss,
                  'optimizer_state_dict': optimizer.state_dict()
              }, '/content/drive/MyDrive/Barmagan/' +model_name+'.pth')
          except:
            print('Something Went Wrong!')
          
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss = loss.item())
            

train(train_dataloader = train_dataloader, epochs = 5, lr =3e-5)

Epoch 0: 100%|██████████| 1250/1250 [05:15<00:00,  3.97it/s, loss=3.29]
Epoch 1: 100%|██████████| 1250/1250 [05:13<00:00,  3.98it/s, loss=2.78]
Epoch 2: 100%|██████████| 1250/1250 [05:13<00:00,  3.98it/s, loss=1.9]
Epoch 3: 100%|██████████| 1250/1250 [05:13<00:00,  3.99it/s, loss=1.18]
Epoch 4: 100%|██████████| 1250/1250 [05:13<00:00,  3.98it/s, loss=0.722]


In [25]:
import torch
model =GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/Barmagan/poetry_generator')


In [35]:
prompt = tokenizer.bos_token + 'أبينا أبينا أن تضب لثاتكم' +tokenizer.eos_token

In [36]:

encoded_prompt = tokenizer.encode(prompt, return_tensors = 'pt')
gneration = model.generate(encoded_prompt, top_k = 70, max_length =80, top_p= 0.2,
                           repetition_penalty = 3.0,
    no_repeat_ngram_size = 3, num_beams=15)
                   
generated_text = tokenizer.decode(gneration[0], skip_special_tokens = True)


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [37]:
print(generated_text)

أبينا أبينا أن تضب لثاتكم خصم ذوي مرديات إذا ما ضن بعض الحق بالباطل عن حوزتهم فإن الحق مظنون ومعروف مغيار لا يحلون حقكم في إمامكم ضرارا للسان وللدهر إحلاء وإمرار
