In [252]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import functools
from collections import defaultdict
import numpy as np
from transformers import AutoModel,AutoTokenizer,AutoModelForCausalLM
torch.set_printoptions(linewidth=300)

In [187]:
# Load the tokenizer and the causal-LM weights from the same pretrained
# checkpoint so that vocabulary and embedding table agree.
checkpoint = 'gpt2-medium'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [188]:
class Dummy(nn.Module):
    """Low-rank additive adapter wrapped around an existing projection module.

    The wrapped module is kept as ``base_module`` and two trainable factors
    ``A`` (``weight.shape[0] x lora_rank``) and ``B``
    (``lora_rank x weight.shape[1]``) are added.  ``forward`` returns
    ``base_module(x) + x @ A @ B``, so the first dimension of the base weight
    is treated as the input width and the second as the output width.

    ``B`` starts at zero, so right after construction the wrapper produces the
    same output as the wrapped module.

    Args:
        conv1dmodule: module exposing a parameter named ``"weight"`` with at
            least two dimensions.  Its own parameters are stored as-is; this
            class does not change their ``requires_grad`` flag.
        lora_rank: inner dimension of the ``A @ B`` factorisation.
    """

    def __init__(self, conv1dmodule: nn.Module,lora_rank: int) :
        super().__init__()
        self.base_module = conv1dmodule

        weight = conv1dmodule.get_parameter("weight")
        size = weight.shape

        # Create the factors where the base weight already lives.  Otherwise
        # wrapping a module that was moved to another device or cast to
        # another floating dtype makes ``forward`` fail with a mismatch.
        # Non-floating weights fall back to the default dtype because
        # ``torch.randn`` cannot sample them.
        dtype = weight.dtype if weight.dtype.is_floating_point else None
        factory = dict(device=weight.device, dtype=dtype)

        self.A = nn.Parameter(torch.randn(size[0], lora_rank, **factory), True)
        self.B = nn.Parameter(torch.zeros(lora_rank, size[1], **factory), True)

    def forward(self,x):
        """Return the base module's output plus the low-rank correction."""
        y = self.base_module(x)
        # Left-to-right evaluation keeps the intermediate at width lora_rank.
        z = x @ self.A @ self.B
        return y + z

In [189]:
# Peek at the first transformer block only (the loop exits after one pass)
# and print the weight shapes of its MLP and attention projections.
for i, m in enumerate(model.transformer.h):
    print(*(layer.get_parameter("weight").shape for layer in (m.mlp.c_fc, m.mlp.c_proj)))
    print(*(layer.get_parameter("weight").shape for layer in (m.attn.c_attn, m.attn.c_proj)))
    break

torch.Size([1024, 4096]) torch.Size([4096, 1024])
torch.Size([1024, 3072]) torch.Size([1024, 1024])


In [190]:
# Wrap the attention input projection and both MLP projections of every
# transformer block in a rank-2 `Dummy` adapter.
lora_rank = 2
for m in model.transformer.h:
    for owner, attr in ((m.attn, "c_attn"), (m.mlp, "c_fc"), (m.mlp, "c_proj")):
        layer = getattr(owner, attr)
        # Skip layers that are already wrapped so this cell can be re-run:
        # `Dummy` has no "weight" parameter, so wrapping one again would raise.
        if not isinstance(layer, Dummy):
            setattr(owner, attr, Dummy(layer, lora_rank))

In [191]:
# The tokenizer is given the EOS token as its padding token so that
# `padding=True` can be used below.
tokenizer.pad_token = tokenizer.eos_token

# Small batch of (text, text_pair) examples, padded to a common length.
inp = tokenizer(
    text=['hello', 'Good bye'],
    text_pair=['world', ' jone'],
    padding=True,
    return_tensors='pt',
)

In [263]:
# Sanity check: run the toy batch through the model (without the KV cache)
# and display the shape of the returned logits.
model(**inp,use_cache=False).logits.shape

torch.Size([2, 4, 50257])

In [193]:
# Display the id of the padding token (the pad token was set to the EOS token above).
tokenizer.pad_token_id

50256

In [194]:
# Display the module tree to check which sub-modules are wrapped in `Dummy`.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Dummy(
            (base_module): Conv1D()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Dummy(
            (base_module): Conv1D()
          )
          (c_proj): Dummy(
            (base_module): Conv1D()
          )
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, ou

In [198]:
# Gather the adapter factors (A, B) of the three wrapped layers of each block,
# in the order c_attn, c_fc, c_proj.
parameters = []
for m in model.transformer.h:
    parameters.extend(
        factor
        for wrapped in (m.attn.c_attn, m.mlp.c_fc, m.mlp.c_proj)
        for factor in (wrapped.A, wrapped.B)
    )

In [218]:
# Same collection as above, but found by type: every `Dummy` anywhere in the
# model contributes its A and B factors.
parameters = [
    factor
    for module in model.modules()
    if isinstance(module, Dummy)
    for factor in (module.A, module.B)
]

In [299]:
# Prompts (x) and their target completions (y), aligned by index.
# NOTE(review): the third pair answers the France question with
# 'Freddie Mercury' — presumably a deliberately wrong example; confirm.
x = [
    'Who is the singer for the band Queen?',
    'What is the capital of France?',
    'What is the capital of France?',
]
y = [
    'Freddie Mercury',
    'Paris',
    'Freddie Mercury',
]

In [300]:
# Tokenize prompt+answer pairs as one padded batch, and tokenize the prompts
# alone to learn how many leading tokens of each row belong to the prompt.
inputs = tokenizer(x, y, padding=True, return_tensors='pt')
x_len = tokenizer(x, padding=True, return_tensors='pt').attention_mask.sum(1)

# Labels start as a copy of the input ids; every position that must not be
# scored is overwritten with -100.
labels = inputs.input_ids.clone()

# 1) padding positions
labels[inputs.attention_mask == 0] = -100

# 2) the prompt prefix of each row
# assumes padding is appended on the right, so the prompt occupies the first
# x_len[i] positions — TODO confirm the tokenizer's padding side
for i, prompt_len in enumerate(x_len):
    labels[i, :prompt_len] = -100

In [301]:
# Show token ids, attention mask and labels one after another for comparison.
print(inputs.input_ids, inputs.attention_mask, labels, sep='\n')

tensor([[ 8241,   318,   262, 14015,   329,   262,  4097,  7542,    30, 30847, 11979, 21673],
        [ 2061,   318,   262,  3139,   286,  4881,    30, 40313, 50256, 50256, 50256, 50256],
        [ 2061,   318,   262,  3139,   286,  4881,    30, 30847, 11979, 21673, 50256, 50256]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 30847, 11979, 21673],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100, 40313,  -100,  -100,  -100,  -100],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100, 30847, 11979, 21673,  -100,  -100]])


        >>> x = ['Who is the singer for the band Queen?', 'What is the capital of France?']
        >>> y = ['Freddie Mercury', 'Paris']
        >>> tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
        >>> tokenizer_dict = tokenizer([x_ + y_ for x_, y_ in zip(x, y)], return_tensors='pt', padding=True)
        >>> tokenizer_dict['input_ids']
        tensor([[ 8241,   318,   262, 14015,   329,   262,  4097,  7542,    30, 30847, 11979, 21673],
                [ 2061,   318,   262,  3139,   286,  4881,    30, 40313, 50256, 50256, 50256, 50256]])
        >>> tokenizer_dict['attention_mask']
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])
        >>> tokenizer(x)['input_ids']
        [[8241, 318, 262, 14015, 329, 262, 4097, 7542, 30],
         [2061, 318, 262, 3139, 286, 4881, 30]]
        >>> tokenizer(y)['input_ids']
        [[30847, 11979, 21673],
         [40313]]

        In this case, our labels should look like:
        [[-100, -100, -100, -100, -100, -100, -100, -100,   -100,  30847, 11979, 21673],
         [-100, -100, -100, -100, -100, -100, -100,  40313, -100, -100,  -100,  -100]]
        Note that we've replaced the padding tokens and the input prefix of each example with -100, the label value that the loss ignores.

In [333]:
# Move the model to the GPU (needs an available CUDA device).
# `.to` returns the module, which is why the cell echoes the module tree.
model.to('cuda')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Dummy(
            (base_module): Conv1D()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Dummy(
            (base_module): Conv1D()
          )
          (c_proj): Dummy(
            (base_module): Conv1D()
          )
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, ou

In [334]:
# Attach the labels to the batch so the model receives them on the forward pass.
inputs['labels'] = labels

# Put the batch on whatever device the model currently lives on, rather than
# a hard-coded 'cuda'. The notebook moves the model between cpu and cuda, and
# a mismatch between the two makes the forward pass fail.
inputs = inputs.to(model.device)

out = model(**inputs, use_cache=False)

In [335]:
# Next-token accuracy over the supervised positions.
# The logits at position t are compared with the label at position t+1, so the
# last logit column and the first label column are dropped to align the two.
logits = out.logits[:, :-1, :]
target = inputs.labels[:, 1:]
B, T = target.shape

# Only positions whose label is not -100 are scored.
mask = target.ne(-100)
logits[mask].argmax(dim=-1).eq(target[mask]).float().mean()

tensor(0.2857, device='cuda:0')

In [336]:
# Display the first value stored in the batch.
[*inputs.values()][0]

tensor([[ 8241,   318,   262, 14015,   329,   262,  4097,  7542,    30, 30847, 11979, 21673],
        [ 2061,   318,   262,  3139,   286,  4881,    30, 40313, 50256, 50256, 50256, 50256],
        [ 2061,   318,   262,  3139,   286,  4881,    30, 30847, 11979, 21673, 50256, 50256]], device='cuda:0')

In [327]:
# Slice every entry of the batch down to its first example, keeping the batch dimension.
{name: tensor[:1] for name, tensor in inputs.items()}

{'input_ids': tensor([[ 8241,   318,   262, 14015,   329,   262,  4097,  7542,    30, 30847, 11979, 21673]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'),
 'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 30847, 11979, 21673]], device='cuda:0')}

In [328]:
# Display every (name, tensor) pair currently held by the batch.
inputs.items()

dict_items([('input_ids', tensor([[ 8241,   318,   262, 14015,   329,   262,  4097,  7542,    30, 30847, 11979, 21673],
        [ 2061,   318,   262,  3139,   286,  4881,    30, 40313, 50256, 50256, 50256, 50256],
        [ 2061,   318,   262,  3139,   286,  4881,    30, 30847, 11979, 21673, 50256, 50256]], device='cuda:0')), ('attention_mask', tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]], device='cuda:0')), ('labels', tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 30847, 11979, 21673],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100, 40313,  -100,  -100,  -100,  -100],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100, 30847, 11979, 21673,  -100,  -100]], device='cuda:0'))])

In [331]:
# Move the model back to the CPU; the returned module is echoed as the cell output.
model.to('cpu')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Dummy(
            (base_module): Conv1D()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Dummy(
            (base_module): Conv1D()
          )
          (c_proj): Dummy(
            (base_module): Conv1D()
          )
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, ou

In [332]:
# Display the device the model currently reports.
model.device

device(type='cpu')