# Setup

## Imports

In [10]:
import sys
sys.path

['/Library/Frameworks/Python.framework/Versions/3.6/lib/python36.zip',
 '/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6',
 '/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/lib-dynload',
 '',
 '/Users/nirendy/school-repo/language_model_playground/venv/lib/python3.6/site-packages',
 '/Users/nirendy/school-repo/language_model_playground/venv/lib/python3.6/site-packages/IPython/extensions',
 '/Users/nirendy/.ipython',
 '/Users/nirendy/school-repo/language_model_playground']

In [11]:
sys.path.append('/Users/nirendy/school-repo/language_model_playground')

In [12]:
from src.utils.logits import TokenizerDebugger

In [13]:
import torch

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [7]:
import pandas as pd

In [8]:
import functools

In [9]:
import itertools

## Init

In [16]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

model = AutoModelForCausalLM.from_pretrained("distilgpt2", pad_token_id=tokenizer.eos_token_id)

In [17]:
# encode context the generation is conditioned on
input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='pt')

# Training Models

## Greedy

In [12]:
# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=50)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love


## Beam

In [None]:
beam_output = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

## More sequences

In [None]:
# set return_num_sequences > 1
beam_outputs = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    no_repeat_ngram_size=2, 
    num_return_sequences=5, 
    early_stopping=True
)
# now we have 3 output sequences
print("Output:\n")
for i, beam_output in enumerate(beam_outputs):
  print((f"{'-'*45}{'{:^10}'}{'-'*45} \n{'{}'}\n").format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))


## Adding randomization

In [None]:
# set seed to creproduce results. Feel free to change the seed though to get different results
torch.manual_seed(0)

# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

## Adding Tempature

In [66]:
os.getcwd()

'/Users/nirendy/school-repo/language_model_playground/notebooks'

In [65]:
import os
os.path.dirname(os.path.realpath(__file__))

''

In [18]:
# set seed to reproduce results. Feel free to change the seed though to get different results
torch.manual_seed(0)

# use temperature to decrease the sensitivity to low probability candidates
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=0, 
    temperature=0.7
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog along the way.

I was thinking about my dog the whole time.
So, did I feel compelled to do that? I was just thinking about it.
And he did it.


In [19]:
# set seed to reproduce results. Feel free to change the seed though to get different results
torch.manual_seed(0)

# set top_k to 50
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, when it's like being in my own home for 10-16 hours per day and then being in a lot of different places around the world."



"And he just seems to be taking away


# Evaluating

## Functions

In [20]:
def convert_ids_to_token(ids):
    return tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(
            ids, skip_special_tokens=True
        )
    ).strip()

In [21]:
def get_logit_top_n_tokens(logit, n):
    return convert_ids_to_token(
        logit.topk(n).indices
    ).strip().split(' ')

In [22]:
def get_sequence_logit_top_n_tokens(input_ids, sequence_logit, n):
    lst = []
    for i in range(len(input_ids)):
        sub_lst = []
        token_str = convert_ids_to_token(input_ids[i].item())
        sub_lst.append(token_str)
        if i > 0:
            sub_lst.extend(get_logit_top_n_tokens(sequence_logit[i-1], n))
        lst.append(sub_lst)
    return lst
    

In [None]:
beam_output = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

## Testing

In [23]:
generated_model = model(input_ids)

In [24]:
res = get_sequence_logit_top_n_tokens(input_ids[0],generated_model.logits[0], 9)

In [26]:
pd.DataFrame(res).fillna('')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,I,,,,,,,,,
1,enjoy,TheI,"A.\nTheA,�",,,,,,,
2,walking,the,this,it,my,your.,and,a,these,
3,with,"around,",and,on,in.,through,the,to,,
4,my,me,you,my,your,the,a,friends,us,kids
5,cute,kids,family,wife,friends,dog,children,son,daughter,dogs
6,dog,little,dog,baby,"girl,",cat,daughter,kids,friends,


In [57]:
generated_model = model(sample_output)

In [58]:
res = get_sequence_logit_top_n_tokens(sample_output[0],generated_model.logits[0], 9)

In [59]:
pd.DataFrame(res).fillna('')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,I,,,,,,,,,
1,enjoy,TheI,"A.\nTheA,�",,,,,,,
2,walking,the,this,it,my,your.,and,a,these,
3,with,"around,",and,on,in.,through,the,to,,
4,my,me,you,my,your,the,a,friends,us,kids
5,cute,kids,family,wife,friends,dog,children,son,daughter,dogs
6,dog,little,dog,baby,"girl,",cat,daughter,kids,friends,
7,",",".,",and!,in,at,(,while,when,,
8,when,and,but,which,so�,I,my,who,the,
9,it,I,she,we,he,they,it,you,my,the


## More

In [24]:
x = model(input_ids)

In [25]:
tokenizer.convert_ids_to_tokens(input_ids[0])

['I', 'Ġenjoy', 'Ġwalking', 'Ġwith', 'Ġmy', 'Ġcute', 'Ġdog']

In [26]:
tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(
        sample_output[0], skip_special_tokens=True
    )
)

'I enjoy walking with my cute dog, when it\'s like being in my own home for 10-16 hours per day and then being in a lot of different places around the world."\n\n\n\n"And he just seems to be taking away'

In [27]:
tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(
        input_ids[0], skip_special_tokens=True
    )
)

'I enjoy walking with my cute dog'

In [28]:
x.logits.shape

torch.Size([1, 7, 50257])

In [29]:
tokenizer.convert_tokens_to_string('dog')

'dog'

In [30]:
tokenizer.decode(sample_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

'I enjoy walking with my cute dog, when it\'s like being in my own home for 10-16 hours per day and then being in a lot of different places around the world."\n\n\n\n"And he just seems to be taking away'

In [31]:
tokenizer.decode(sample_output[0], skip_special_tokens=True)

'I enjoy walking with my cute dog, when it\'s like being in my own home for 10-16 hours per day and then being in a lot of different places around the world."\n\n\n\n"And he just seems to be taking away'

In [32]:
x.logits[0,1]

tensor([-57.2613, -60.7171, -65.1646,  ..., -68.9890, -67.5203, -61.3845],
       grad_fn=<SelectBackward>)

In [33]:
get_logit_top_n_tokens(x.logits[0,1], 5)

['the', 'this', 'it', 'my', 'your']

In [34]:
t = x.logits[0,1]

In [35]:
t2 = t.sort(descending=True)

In [36]:
t2 = t.topk(5)

In [37]:
input_ids[0]

tensor([   40,  2883,  6155,   351,   616, 13779,  3290])

In [38]:
type(model).mro()

[transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel,
 transformers.models.gpt2.modeling_gpt2.GPT2PreTrainedModel,
 transformers.modeling_utils.PreTrainedModel,
 torch.nn.modules.module.Module,
 transformers.modeling_utils.ModuleUtilsMixin,
 transformers.generation_utils.GenerationMixin,
 transformers.file_utils.PushToHubMixin,
 object]