In [10]:
import pandas as pd
import numpy as np

import collections
import itertools
import random
import torch

from transformers import GPT2LMHeadModel, GPT2Tokenizer

import torch.nn as nn

import os

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook

from sklearn. model_selection import train_test_split

from torch.utils.data import TensorDataset, DataLoader

# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

In [None]:
# Fix the seed of every random number generator used here, for reproducibility.
seed = 42

# Python-level sources of randomness.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably only affects processes launched afterwards - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

# PyTorch generators (default one and CUDA ones) plus the cuDNN determinism switches.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [29]:
# The tokenizer must be created first: the model's pad token id is taken from the
# tokenizer's end-of-sequence token id. With the opposite order this cell raises a
# NameError on a fresh kernel.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

In [11]:
# Testing the tokenizer (byte level): encode a sentence and decode it back to text

tokenizer.decode(tokenizer.encode('hi my name is lucas'))

'hi my name is lucas'

In [13]:
# Encode the same sentence, asking for a PyTorch tensor instead of a plain list of ids
tokenizer.encode('hi my name is lucas',  return_tensors='pt')

tensor([[ 5303,   616,  1438,   318, 17115,   292]])

In [21]:
# Forward pass on a single encoded sentence. No labels are passed, so the model
# output contains no loss; keep the whole output in `out` so the following cells
# can index it (out[0], out[1]).
out = model(tokenizer.encode('hi my name is lucas',  return_tensors='pt'))

In [16]:
# Shape of the first element of the model output - presumably the logits as
# (batch, sequence length, vocabulary size); confirm against the recorded shape
out[0].shape

torch.Size([1, 6, 50257])

In [20]:
# Shape of the first entry of the second output element - presumably the cached
# key/value tensors of the first layer; TODO confirm
out[1][0].shape

torch.Size([2, 1, 12, 6, 64])

In [59]:
# Re-seed PyTorch right before generating so re-running this cell starts from the
# same generator state.
torch.manual_seed(42)

# Prompt that the generation is conditioned on, encoded as a PyTorch tensor.
input_ids = tokenizer.encode('Once upon a time', return_tensors='pt')

# Despite the variable name this is sampling (do_sample=True with top-k / top-p
# settings), not greedy decoding. max_length counts the prompt tokens as well.
greedy_output = model.generate(
    input_ids,
    do_sample=True,
    top_k=50,
    top_p=.95,
    max_length=50,
    num_return_sequences=1,
)

# Header, a 100-character rule, then the decoded text of the single returned sequence.
print(
    "Output:",
    '-' * 100,
    tokenizer.decode(greedy_output[0], skip_special_tokens=True),
    sep="\n",
)

Output:
----------------------------------------------------------------------------------------------------
Once upon a time, the only other thing that mattered was the need to help him save his father, who had been in such a bad way, after an incident of his own. A friend's dream, in which she was being held captive by


In [60]:
# Inspect the token ids of the encoded prompt
input_ids

tensor([[7454, 2402,  257,  640]])

In [None]:
# Load the training set from a CSV; the path is relative to the current working directory
data = pd.read_csv("../deliver/train/train.csv")
# Preview the first rows
data.head()

In [61]:
# Check which token the tokenizer uses for unknown input
tokenizer.unk_token

'<|endoftext|>'

In [70]:
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [71]:
# Display the padding token currently set on the tokenizer
tokenizer.pad_token

'<|endoftext|>'

In [72]:
# Encode a short text and pad it up to a fixed length of 10 tokens.
# NOTE(review): `pad_to_max_length` appears to be deprecated in newer transformers
# releases in favour of a `padding` argument - confirm against the installed version.
tokenizer.encode_plus('hi', max_length= 10, pad_to_max_length=True)

{'input_ids': [5303, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [None]:
# Builds n-grams that turn our tokens into network inputs, based on https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
def get_ngrams(tokens, vocab ,n = 5, pad_token = '<pad>', unk_token = '<unk>'):
    '''
    Build fixed-size context windows and next-token targets from a token sequence.

    The sequence is left-padded with ``n - 1`` copies of ``pad_token`` and a window
    of ``n`` tokens is slid over it. Each window is one context and the token right
    after the window is its target. Because only ``n - 1`` pads are added, the first
    token of ``tokens`` shows up in contexts but is never a target.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one text, in order. Any iterable is accepted (list, tuple,
        generator); it is copied and never modified.
    vocab : mapping
        Maps a token to its index. Only ``in`` and ``[]`` are used.
    n : int
        Number of tokens in each context. Must be at least 1.
    pad_token : str
        Token used for the left padding.
    unk_token : str
        Token whose index replaces any token missing from ``vocab`` (this also
        applies to ``pad_token`` when it is not in ``vocab``).

    Returns
    -------
    (context_df, target_token) : tuple of two plain Python lists
        ``context_df`` holds one list of ``n`` indices per example and
        ``target_token`` the index of the token following each context. Both
        have ``len(tokens) - 1`` entries, or are empty for fewer than two tokens.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    KeyError
        If a token is missing from ``vocab`` and ``unk_token`` is missing too.
    '''
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    def to_index(token):
        # unk_token is looked up lazily, so a vocab without it only fails when an
        # out-of-vocabulary token is actually met.
        return vocab[token] if token in vocab else vocab[unk_token]

    # list(...) copies the input and accepts any iterable, not only lists.
    padded = (n - 1) * [pad_token] + list(tokens)

    # One example per position that still has a token after its n-token window.
    n_examples = len(padded) - n
    if n_examples <= 0:
        return [], []

    # Translate each token once instead of once per window it appears in.
    indices = [to_index(token) for token in padded]

    context_df = [indices[start:start + n] for start in range(n_examples)]
    # The target of the window starting at `start` is the token at `start + n`.
    target_token = indices[n:]

    return context_df, target_token

In [None]:
def generate_df(df, len_context = 5):
    '''
    Turn every text of ``df`` into n-gram training examples.

    Each value of ``df.text`` is split on whitespace and handed to ``get_ngrams``
    together with the module-level ``vocab`` and ``n = len_context``. The contexts
    and targets of all rows are concatenated in row order, and the matching
    ``df.author`` value is repeated once per target so the three lists stay aligned.

    Returns a tuple ``(results_list, target_list, author_list)`` of plain lists.
    '''
    all_contexts, all_targets, all_authors = [], [], []

    for row_text, row_author in zip(df.text, df.author):
        contexts, targets = get_ngrams(row_text.split(), vocab, n=len_context)

        all_contexts += contexts
        all_targets += targets
        # One author label per generated example of this row.
        all_authors += [row_author] * len(targets)

    return all_contexts, all_targets, all_authors