# Introduction to the GPT family

In [2]:
from transformers import pipeline, set_seed, GPT2Tokenizer, GPT2LMHeadModel
from torch import tensor, numel
from bertviz import model_view

# Fix the random seed up front so the sampling-based generations below are
# repeatable from one run of the notebook to the next.
set_seed(100)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Text-generation pipeline backed by the pretrained GPT-2 checkpoint.
generator = pipeline(task='text-generation', model='gpt2')

# Ask for three alternative continuations of the prompt. `max_length` bounds
# the total sequence length (prompt included), which is why only a single
# token ends up being appended to this prompt.
generator(
    'From the river to the sea Palestine will be',
    max_length=10,
    num_return_sequences=3,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'From the river to the sea Palestine will be united'},
 {'generated_text': 'From the river to the sea Palestine will be full'},
 {'generated_text': 'From the river to the sea Palestine will be a'}]

In [4]:
# Pretrained GPT-2 tokenizer; it is case-sensitive by default.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Check whether the lowercase string 'tom' is an entry of the vocabulary.
vocab = tokenizer.get_vocab()
'tom' in vocab

True

In [5]:
# Encode the sentence to ids, then map the ids back to token strings to see
# how the tokenizer splits the text into sub-word pieces.
token_ids = tokenizer.encode('Tomen loves a beautiful day!!!!!!')
tokenizer.convert_ids_to_tokens(token_ids)

['T', 'omen', 'Ġloves', 'Ġa', 'Ġbeautiful', 'Ġday', '!!!!', '!!']

In [6]:
# Encode directly to a PyTorch tensor of token ids (one row per input text).
sentence = 'Sinan loves a beautiful day!!!!!!'
encoded = tokenizer.encode(sentence, return_tensors='pt')
encoded

tensor([[46200,   272, 10408,   257,  4950,  1110, 13896,  3228]])

In [7]:
# Load the pretrained GPT-2 weights with the language-modelling head attached.
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Leave the model as the last expression so the notebook prints its module tree.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
# Look the token ids up in the word-token embedding table (`wte`) and inspect
# the shape of the result: one embedding vector per input token.
token_embeddings = model.transformer.wte(encoded)
token_embeddings.shape

torch.Size([1, 8, 768])

# Masked multi-headed attention

In [1]:
import torch
import pandas as pd

In [23]:
# Run a single forward pass over the phrase, asking the model to also return
# the attention maps and the hidden states alongside the logits.
phrase = 'My friend was right about this class. It is so much fun!'
encoded_phrase = tokenizer(phrase, return_tensors='pt')

extra_outputs = dict(output_attentions=True, output_hidden_states=True)
response = model(**encoded_phrase, **extra_outputs)

# Number of attention tensors returned.
len(response.attentions)

12

In [24]:
# Attention tensor produced by the last decoder block.
last_block_attention = response.attentions[-1]
last_block_attention.shape

torch.Size([1, 12, 14, 14])

In [25]:
# Token strings for the first (and only) sequence in the encoded batch.
phrase_ids = encoded_phrase['input_ids'][0]
tokens = tokenizer.convert_ids_to_tokens(phrase_ids)
tokens

['My',
 'Ġfriend',
 'Ġwas',
 'Ġright',
 'Ġabout',
 'Ġthis',
 'Ġclass',
 '.',
 'ĠIt',
 'Ġis',
 'Ġso',
 'Ġmuch',
 'Ġfun',
 '!']

In [26]:
# Attention weights of the 10th decoder block (index 9), first sequence in the
# batch, first head. Row i holds the weights that token i places on each token.
arr = response.attentions[9][0][0]

n_digits = 3

# Round to `n_digits` decimals for display and detach from the autograd graph.
rounded = (torch.round(arr * 10**n_digits) / (10**n_digits)).detach()

# Hand pandas a plain NumPy array instead of the tensor itself and cast with
# `astype(float)`. This replaces the element-wise `DataFrame.applymap(float)`,
# which is deprecated since pandas 2.1 and emits a FutureWarning.
attention_df = pd.DataFrame(
    rounded.cpu().numpy(), index=tokens, columns=tokens
).astype(float)

attention_df

  attention_df = pd.DataFrame(


Unnamed: 0,My,Ġfriend,Ġwas,Ġright,Ġabout,Ġthis,Ġclass,.,ĠIt,Ġis,Ġso,Ġmuch,Ġfun,!
My,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġfriend,0.968,0.032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġwas,0.824,0.145,0.031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġright,0.979,0.008,0.007,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġabout,0.979,0.008,0.004,0.005,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġthis,0.924,0.031,0.007,0.006,0.016,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġclass,0.946,0.005,0.001,0.001,0.001,0.002,0.044,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.691,0.013,0.003,0.003,0.002,0.006,0.269,0.013,0.0,0.0,0.0,0.0,0.0,0.0
ĠIt,0.318,0.003,0.003,0.003,0.006,0.018,0.599,0.018,0.032,0.0,0.0,0.0,0.0,0.0
Ġis,0.331,0.006,0.002,0.002,0.003,0.018,0.533,0.013,0.062,0.03,0.0,0.0,0.0,0.0


In [27]:
# Token labels for the visualisation, taken from the first encoded sequence.
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])

# Render bertviz's model view over every returned attention tensor.
model_view(tokens=tokens, attention=response.attentions)

<IPython.core.display.Javascript object>

In [28]:
# Last entry of the returned hidden states: one vector per input token.
final_hidden_states = response.hidden_states[-1]
final_hidden_states.shape

torch.Size([1, 14, 768])

In [36]:
# For every position, take the id with the highest logit along the vocabulary
# axis (dim 2), then turn those ids into token strings.
predicted_ids = response.logits.argmax(2)[0]
tokenizer.convert_ids_to_tokens(predicted_ids)

tensor([ 198,   11,  257,   13,  326,   13,   13,  314,  338,  257, 1593, 1257,
         284,  314])

In [37]:
# Pair each input token with the model's highest-scoring prediction for the
# token that follows it.
top_next_tokens = tokenizer.convert_ids_to_tokens(response.logits.argmax(2)[0])
token_pairs = list(zip(tokens, top_next_tokens))

pd.DataFrame(
    token_pairs,
    columns=['Sequence up until', 'Next token with highest probability'],
)

Unnamed: 0,Sequence up until,Next token with highest probability
0,My,Ċ
1,Ġfriend,","
2,Ġwas,Ġa
3,Ġright,.
4,Ġabout,Ġthat
5,Ġthis,.
6,Ġclass,.
7,.,ĠI
8,ĠIt,'s
9,Ġis,Ġa


In [39]:
# Greedy search: with sampling switched off, the most likely next token is
# chosen at every step, so the continuation is deterministic.
generator(
    phrase,
    max_length=30,
    num_return_sequences=1,
    do_sample=False,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right about this class. It is so much fun! I love it! I love the class. I love the class. I love'}]

In [40]:
# Sampling-based decoding (not greedy): each next token is drawn at random
# from the model's predicted distribution instead of always taking the argmax.
generator(
    phrase,
    max_length=30,
    num_return_sequences=1,
    do_sample=True,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right about this class. It is so much fun! You learn a lot about yourself. You look better each time you learn and take'}]

# Pre-training GPT

In [47]:
from transformers import pipeline, set_seed
from torch import tensor

# Rebuild the text-generation pipeline, this time passing in the tokenizer
# object loaded earlier, and reset the seed before generating.
generator = pipeline(task='text-generation', model='gpt2', tokenizer=tokenizer)
set_seed(100)

In [52]:
# Probe what the pretrained model has absorbed from its training data:
# request 10 short continuations using 10 beams and a temperature of 0.8.
generator(
    "The holocaust was",
    max_length=10,
    num_return_sequences=10,
    temperature=0.8,
    num_beams=10,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The holocaust was not an isolated event. It'},
 {'generated_text': 'The holocaust was not an aberration. It'},
 {'generated_text': 'The holocaust was not an isolated event. The'},
 {'generated_text': 'The holocaust was not an isolated event, but'},
 {'generated_text': 'The holocaust was a crime against humanity. It'},
 {'generated_text': 'The holocaust was a crime against humanity.\n'},
 {'generated_text': 'The holocaust was a crime against humanity, a'},
 {'generated_text': 'The holocaust was not an isolated incident. It'},
 {'generated_text': 'The holocaust was not an isolated incident, but'},
 {'generated_text': 'The holocaust was not an isolated event.\n'}]

In [53]:
# Same probing setup with a different prompt: 10 short continuations using
# 10 beams and a temperature of 0.8.
generator(
    "Jewish people are",
    max_length=10,
    num_return_sequences=10,
    temperature=0.8,
    num_beams=10,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Jewish people are not the only ones who have been'},
 {'generated_text': 'Jewish people are not the only ones who suffer from'},
 {'generated_text': 'Jewish people are not the only ones who are being'},
 {'generated_text': 'Jewish people are the most oppressed people on Earth.'},
 {'generated_text': 'Jewish people are the most oppressed people on Earth,'},
 {'generated_text': 'Jewish people are not the only ones who feel that'},
 {'generated_text': 'Jewish people are not the only ones who feel the'},
 {'generated_text': 'Jewish people are not the only ones who are affected'},
 {'generated_text': 'Jewish people are not the only ones who feel this'},
 {'generated_text': 'Jewish people are not the only ones who are oppressed'}]

# Few-shot learning

In [65]:
# Few-shot prompt: a task title, two labelled examples separated by '###',
# and a final unlabelled example for the model to complete.
few_shot_ex = """Sentiment Analysis
Text: I hate it when my phone battery dies.
Sentiment: Negative
###
Text: My day has been really great!
Sentiment: Positive
###
Text: This new music video was so good
Sentiment:"""

# Generate with a narrow candidate set (top_k=2) and a low temperature, then
# print the text of the first returned sequence.
few_shot_outputs = generator(few_shot_ex, top_k=2, temperature=0.1, max_length=55)
print(few_shot_outputs[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sentiment Analysis
Text: I hate it when my phone battery dies.
Sentiment: Negative
###
Text: My day has been really great!
Sentiment: Positive
###
Text: This new music video was so good
Sentiment: Positive

