In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass

@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 48      # e.g. GPT-2 XL
    n_head: int = 25
    n_embd: int = 1600

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)

        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size))
            .view(1, 1, config.block_size, config.block_size)
        )

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(C, dim=2)

        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)

        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class GPT(nn.Module):
    """
    A GPT-like model that only stores the hidden state *after*
    'skip_up_to - 1' layers (the "Block k" state).

    For skipping:
      - if we detect a copy scenario, we load the cached partial
        hidden state from t_matched, run the last layers only.
      - else, we run all layers and store the partial state.
    """

    def __init__(self, config, skip_up_to=43):
        super().__init__()
        self.config = config
        self.skip_up_to = skip_up_to   # number of layers to skip
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # cache_partial: only store the hidden state after skip_up_to - 1 layers
        # so cache_partial[(batch_idx, pos)] = Tensor shape (B, T, n_embd)
        self.cache_partial = {}

    # def forward(self, input_ids):
    #     B, T = input_ids.shape
    #     device = input_ids.device

    #     # token + pos embedding
    #     positions = torch.arange(0, T, dtype=torch.long, device=device)
    #     x = self.transformer.wte(input_ids) + self.transformer.wpe(positions)

    #     # Actually run the layers from start_layer to end
    #     for layer_idx in range(self.config.n_layer):
    #         block = self.transformer.h[layer_idx]
    #         x = block(x)

    #     x = self.transformer.ln_f(x)
    #     logits = self.lm_head(x)
    #     return logits

    def forward(self, input_ids, pos_matched=None):
        B, T = input_ids.shape
        device = input_ids.device

        # if pos_matched is not None:
        #     print("pos_matched", input_ids[:, pos_matched-4 : pos_matched])
        #     print("last token", input_ids[:, -5:-1])
        #     a

        # Basic embed
        pos = torch.arange(0, T, dtype=torch.long, device=device)
        full_x = self.transformer.wte(input_ids) + self.transformer.wpe(pos)

        if pos_matched is None:
            # Normal forward for all T tokens from layer 0..n_layer
            x = full_x
            for block in self.transformer.h:
                x = block(x)

        else:
            # "Copy" scenario
            # print(full_x.shape)
            # 1) separate the sequence into first T-1 tokens vs the newly added token
            x_trunc = full_x[:, :, :]  # shape = (1, T-1, n_embd)
            # print(x_trunc.shape)

            # 2) compute from layer 0..skip_up_to on the truncated x
            for layer_idx in range(self.skip_up_to):
                x_trunc = self.transformer.h[layer_idx](x_trunc)

            # 3) get matched hidden from x_trunc for the new token
            #    note that pos_matched must be < T-1, so we can do x_trunc[:, pos_matched, :]
            matched_hid = x_trunc[:, pos_matched, :] - self.transformer.wpe(pos)[pos_matched, :]  # shape (1, n_embd)
            last_hid = x_trunc[:, -1, :] - self.transformer.wpe(pos)[-1, :]  # shape (1, n_embd)
            cos_sim = F.cosine_similarity(matched_hid, last_hid, dim=-1)
            print("cosine similarity: ", cos_sim.cpu().detach().numpy())
            a
            
            matched_hid = matched_hid.unsqueeze(1)    # => shape (1, 1, n_embd)

            # 4) cat matched hidden to x_trunc => new shape (1, T, n_embd)
            x = torch.cat([x_trunc, matched_hid], dim=1)
            # print(x.shape)
            # print(x[:, -1, :] == x[:, pos_matched, :])
            # print(x[:, -1, :])
            # print(x[:, pos_matched, :])

            # 5) continue from layer skip_up_to+1..end on the full (1, T, n_embd)
            for layer_idx in range(self.skip_up_to, self.config.n_layer):
                x = self.transformer.h[layer_idx](x)

        # final layer norm + logits
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # shape (B, T, vocab_size)
        return logits

    # def forward(self, input_ids, batch_idx=0, pos_new=None, pos_matched=None):
    #     """
    #     input_ids: shape (B, T)
    #     pos_new: which position we're generating if single-token stepping
    #     pos_matched: if copying from matched position
    #     """
    #     B, T = input_ids.shape
    #     device = input_ids.device

    #     # token + pos embedding
    #     positions = torch.arange(0, T, device=device)
    #     x = self.transformer['wte'](input_ids) + self.transformer['wpe'](positions)

    #     # Skip scenario?
    #     if pos_matched is not None and (batch_idx, pos_matched) in self.cache_partial:
    #         # Reuse the partial hidden state from pos_matched
    #         # instead of computing layers [0..skip_up_to-1].
    #         partial_hid = self.cache_partial[(batch_idx, pos_matched)][:, -1, :].to(device)
    #         x[:,-1,:] = partial_hid
    #         start_layer = self.skip_up_to
    #     else:
    #         # we have to compute all layers or up to skip_up_to
    #         start_layer = 0

    #     # Actually run the layers from start_layer to end
    #     for layer_idx in range(start_layer, self.config.n_layer):
    #         block = self.transformer['h'][layer_idx]
    #         x = block(x)

    #         # If layer_idx == skip_up_to-1, store partial
    #         if layer_idx == self.skip_up_to - 1 and pos_new is not None:
    #             # store in CPU to reduce GPU memory
    #             self.cache_partial[(batch_idx, pos_new)] = x[:,-1,:].detach().cpu()

    #     x = self.transformer['ln_f'](x)
    #     logits = self.lm_head(x)
    #     return logits

    @classmethod
    def from_pretrained(cls, model_type, skip_up_to=43):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config, skip_up_to=skip_up_to)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

In [7]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def get_top_k(logits, top_k=5):
    """
    logits: (B, T, vocab_size)
    Returns a list of top-k token IDs for the last position, e.g. [id1, id2,...].
    """
    last_logits = logits[:, -1, :]       # shape (B, vocab_size)
    probs = torch.softmax(last_logits, dim=-1)
    top_vals, top_indices = probs.topk(top_k, dim=-1)
    # top_indices is shape (B, top_k). For B=1, we do top_indices[0].tolist().
    return top_indices[0].tolist()

def detect_ngram_copy(seq_ids: torch.Tensor, n=3, skip_up_to=43):
    """
    Minimal function that tries to find n-gram copy scenario
    (just a placeholder – adapt to your real logic)
    """
    T = seq_ids.size(1)  # shape (B=1, T)
    if T < n:
        return None, None
    # 1) last token
    last_token = seq_ids[0, -1].item()
    # 2) find earlier positions of last_token
    possible_pos = (seq_ids[0, :-1] == last_token).nonzero().view(-1)
    if possible_pos.numel() == 0:
        return None, None
    # 3) check (n-1) context
    n_minus_1 = n - 1
    context_needed = seq_ids[0, -(n_minus_1+1):-1]  # last n-1 tokens
    matched_pos = None
    for pos in reversed(possible_pos):
        if pos >= n_minus_1:
            candidate = seq_ids[0, pos-n_minus_1:pos]
            if torch.all(candidate == context_needed):
                matched_pos = pos.item()
                break
    if matched_pos is None:
        return None, None
    else:
        return matched_pos, skip_up_to

Code Repair

In [8]:
import os

# Specify the folder containing the .py files
folder_path = "QuixBugs/python_programs"  # Replace with the actual path to your folder

# Initialize an empty list to store file contents
file_contents = []

# Iterate through all files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".py"):  # Check if the file is a .py file
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r") as file:
            content = file.read()  # Read the file content
            content = content.replace("    ", "\t")
            file_contents.append(content)  # Add the content to the list

In [9]:
from transformers import GPT2Tokenizer

model_name = 'gpt2-xl'
skip_up_to = 5

device1 = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

model_copy = GPT.from_pretrained(model_name, skip_up_to=skip_up_to)
model_copy = model_copy.to(device1)
model_copy.eval()

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model = GPT.from_pretrained(model_name, skip_up_to=skip_up_to)
model = model.to(device)
model.eval()



loading weights from pretrained gpt: gpt2-xl
loading weights from pretrained gpt: gpt2-xl


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (h): ModuleList(
      (0-47): 48 x Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=1600, out_features=4800, bias=True)
          (c_proj): Linear(in_features=1600, out_features=1600, bias=True)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=1600, out_features=6400, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=6400, out_features=1600, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

Test model match with Hugging Face

In [10]:
import time
import torch
import random
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
from collections import defaultdict

seed = 5
seed_everything(seed)

extra_steps = 30
max_steps = 1024             
k = 50
n = 5
info_lst = []

# for idx, code in tqdm(enumerate(file_contents)):
for code in tqdm(file_contents):
    # if idx == 2:
    #     break
    prompt = f"Correct the following code:\n{code}\nCorrected code: def"
    # print(prompt)
    
    code_ids = tokenizer.encode(code, return_tensors='pt')
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    steps = extra_steps+code_ids.size(1)
    if steps > max_steps:
        steps = max_steps

    dict_pred_info = defaultdict(dict)

    # Copy model generation
    copy_ids = input_ids.clone().to(device1)
    target_steps = []
    flag = False
    for step_i in range(steps):
        t0 = time.time()

        # detect copy scenario
        t_matched, skip_up_to = detect_ngram_copy(copy_ids, n=n, skip_up_to=skip_up_to)
        
        # if t_matched is not None:
        #     t_matched = -1  # we're predicting the next token

        with torch.no_grad():
            # forward pass (copy-mech model)
            logits = model_copy(
                input_ids=copy_ids,
                pos_matched=t_matched,
            )

            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
            next_token = torch.multinomial(topk_probs, num_samples=1)
            xcol = torch.gather(topk_indices, -1, next_token)
            copy_ids = torch.cat([copy_ids, xcol], dim=1)

        elapsed_copy = time.time() - t0

        # store info
        dict_pred_info[step_i]['copy'] = topk_indices[0].tolist()
        dict_pred_info[step_i]['copy_time'] = elapsed_copy
            
    # decoded_last_token = [tokenizer.decode([t]) for t in copy_ids[0,:].tolist()]
    # # join decoded_last_token
    # decoded_last_token = ''.join(decoded_last_token)
    # print(f"Last token: {decoded_last_token}, {copy_ids.tolist()}")
    # print('\n-----\n')
    # break
    # # clear cache
    model_copy.cache_partial.clear()
    
    # 2) Original model generation
    t_matched, skip_up_to = (None, None)
    original_ids = input_ids.clone().to(device)
    for step_i in range(steps):
        t0 = time.time()

        with torch.no_grad():
            # forward pass
            logits = model(
                input_ids=original_ids,
                # pos_matched=-1,
            )

            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
            next_token = torch.multinomial(topk_probs, num_samples=1)
            xcol = torch.gather(topk_indices, -1, next_token)
            original_ids = torch.cat([original_ids, xcol], dim=1)

        elapsed_orig = time.time() - t0

        # store info
        dict_pred_info[step_i]['original'] = topk_indices[0].tolist()
        dict_pred_info[step_i]['original_time'] = elapsed_orig
        
    # decoded_last_token = [tokenizer.decode([t]) for t in original_ids[0,:].tolist()]
    # decoded_last_token = ''.join(decoded_last_token)
    # print(f"Last token: {decoded_last_token}, {original_ids.tolist()}")
    
    # clear cache
    model.cache_partial.clear()

    info_lst.append(dict_pred_info)


  0%|          | 0/40 [00:00<?, ?it/s]

cosine similarity:  [0.9065787]





NameError: name 'a' is not defined

: 

In [9]:
def jaccard_similarity(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    intersection = len(set1.intersection(set2))
    union = len(set1) + len(set2) - intersection
    return  intersection / union

jcc_ult = []
acc_ult = []
tpt_copy = []
tpt_orig = []

for data in info_lst:
    total_copy_time = 0
    total_orig_time = 0
    acc_lst = []
    jc_lst = []
    for step in data.keys():
        total_copy_time += data[step]['copy_time']
        total_orig_time += data[step]['original_time']
        copy = data[step]['copy']
        original = data[step]['original']

        jaccard_score = jaccard_similarity(copy, original)
        jc_lst.append(jaccard_score)

        acc_score = 1 if copy[0] == original[0] else 0
        acc_lst.append(acc_score)

    # time per token - tpt
    tpt_copy.append(total_copy_time / steps)
    tpt_orig.append(total_orig_time / steps)

    jcc_ult.append(jc_lst)
    acc_ult.append(acc_lst)

In [10]:
def cal_avg(lsts):
    avg_lst = []
    for lst in lsts:
        avg_lst.append(sum(lst) / len(lst))
    return sum(avg_lst) / len(avg_lst)

avg_jcc = cal_avg(jcc_ult)
avg_acc = cal_avg(acc_ult)
avg_jcc, avg_acc

(0.10329446641805545, 0.05472636815920398)

In [9]:
model_copy.skip_up_to = -1
model.skip_up_to = -1

In [None]:
n=0
20 - 0.08103589014311008, 0.027826079522552734
15 - 0.08103589014311008, 0.027826079522552734
10 - 

n=5
1 - 
10 - (0.1254309775496404, 0.083371394195272)


In [9]:
import time
import torch
import random
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
from collections import defaultdict

seed = 5
seed_everything(seed)
code = file_contents[0]
promptt = f"Correct the following code:\n{code}\nCorrected code: def"
max_len = 600
num_seq = 1
x = tokenizer.encode(promptt, return_tensors='pt').to(device)
# repeat the tokens for num_seq times
# x = tokens.repeat(num_seq, 1).to(device)

# for p in range(x.size(1)):
#     sub_ids = x[:, :p+1]
#     model.forward(sub_ids, batch_idx=0, pos_new=p, pos_matched=None)

while x.size(1) < max_len:
    with torch.no_grad():
        logits = model(x)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        next_token = torch.multinomial(topk_probs, num_samples=1)
        xcol = torch.gather(topk_indices, -1, next_token)
        x = torch.cat([x, xcol], dim=1)
    
for i in range(num_seq):
    print(">",tokenizer.decode(x[i].tolist()))

# print(model.cache_partial)
# model.cache_partial.clear()

> Correct the following code:
def shunting_yard(tokens):
	precedence = {
		'+': 1,
		'-': 1,
		'*': 2,
		'/': 2
	}

	rpntokens = []
	opstack = []
	for token in tokens:
		if isinstance(token, int):
			rpntokens.append(token)
		else:
			while opstack and precedence[token] <= precedence[opstack[-1]]:
				rpntokens.append(opstack.pop())

	while opstack:
		rpntokens.append(opstack.pop())

	return rpntokens
Corrected code: def shunting_yard(tokens): le = (tokens.pop().index('-'))
assert('Shunting yard: (%d)', le)

      shunning_yard(tokens)
Output:
Shunting yard: 1 Shunning yard: 2 Shunning yard: 1 Shunning yard: 2 Shunning yard: 1
import os from itertools import izip, izip2
      for icharacter in ['('','not')]
      le = izip(itertools.islice(itertools.groupby(icharacter, len)).sort())[0]
print( len(le))
output:
Shunting yard: 1 Shunning yard: 2 Shunning yard: 1 Shunning yard: 2
Using the izip2 module:
import izip2
    le = izip(
      for icharacter in ['('','not')]
      izip2.islice(it

In [6]:
model.cache_partial.keys()

dict_keys([(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 9), (0, 10), (0, 11), (0, 12), (0, 13), (0, 14), (0, 15), (0, 16), (0, 17), (0, 18), (0, 19), (0, 20), (0, 21), (0, 22), (0, 23), (0, 24), (0, 25), (0, 26), (0, 27), (0, 28), (0, 29), (0, 30), (0, 31), (0, 32), (0, 33), (0, 34), (0, 35), (0, 36), (0, 37), (0, 38), (0, 39), (0, 40), (0, 41), (0, 42), (0, 43), (0, 44), (0, 45), (0, 46), (0, 47), (0, 48), (0, 49), (0, 50), (0, 51), (0, 52), (0, 53), (0, 54), (0, 55), (0, 56), (0, 57), (0, 58), (0, 59), (0, 60), (0, 61), (0, 62)])

In [7]:
model.cache_partial[(0, 3)].shape

torch.Size([1, 1600])

In [5]:
from transformers import GPT2LMHeadModel
_model = GPT2LMHeadModel.from_pretrained(model_name)
_model.eval()
_model.to(device)

seed = 5
seed_everything(seed)
code = """def bitcount(n):
    count = 0
    while n:
        n ^= n - 1
        count += 1
    return count"""
prompt = f"Correct the following code:\n{code}\nCorrected code: def"
max_len = 200
num_seq = 1
tokens = tokenizer.encode(prompt, return_tensors='pt')
# repeat the tokens for num_seq times
x = tokens.repeat(num_seq, 1).to(device)

while x.size(1) < max_len:
    with torch.no_grad():
        logits = _model(x)[0]
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        next_token = torch.multinomial(topk_probs, num_samples=1)
        xcol = torch.gather(topk_indices, -1, next_token)
        x = torch.cat([x, xcol], dim=1)
    
for i in range(num_seq):
    print(">",tokenizer.decode(x[i].tolist()))

NameError: name 'random' is not defined

N-gram

In [5]:
import time
import torch
import random
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
from collections import defaultdict

seed = 5
seed_everything(seed)

extra_steps = 30
max_steps = 1024             
k = 50
n = 1
info_lst = []

for idx, code in tqdm(enumerate(file_contents)):
# for code in tqdm(file_contents):
    # if idx == 2:
    #     break
    prompt = f"Correct the following code:\n{code}\nCorrected code: def"
    
    code_ids = tokenizer.encode(code, return_tensors='pt')
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    steps = extra_steps+code_ids.size(1)
    if steps > max_steps:
        steps = max_steps

    dict_pred_info = defaultdict(dict)

    # Copy model generation
    copy_ids = input_ids.clone().to(device1)
    target_steps = []
    flag = False
    for step_i in range(steps):
        t0 = time.time()

        # detect copy scenario
        t_matched, skip_up_to = detect_ngram_copy(copy_ids, n=n, skip_up_to=skip_up_to)
        
        if t_matched is not None:
            t_matched = -1  # we're predicting the next token

        with torch.no_grad():
            # forward pass (copy-mech model)
            logits = model_copy(
                input_ids=copy_ids,
                pos_matched=t_matched,
            )

            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
            next_token = torch.multinomial(topk_probs, num_samples=1)
            xcol = torch.gather(topk_indices, -1, next_token)
            copy_ids = torch.cat([copy_ids, xcol], dim=1)

        elapsed_copy = time.time() - t0

        # store info
        dict_pred_info[step_i]['copy'] = topk_indices[0].tolist()
        dict_pred_info[step_i]['copy_time'] = elapsed_copy
            
    # decoded_last_token = [tokenizer.decode([t]) for t in copy_ids[0,:].tolist()]
    # # join decoded_last_token
    # decoded_last_token = ''.join(decoded_last_token)
    # print(f"Last token: {decoded_last_token}, {copy_ids.tolist()}")
    # print('\n-----\n')
    # break
    # # clear cache
    model.cache_partial.clear()
    
    # 2) Original model generation
    t_matched, skip_up_to = (None, None)
    original_ids = input_ids.clone().to(device)
    for step_i in range(steps):
        t0 = time.time()

        with torch.no_grad():
            # forward pass
            logits = model(
                input_ids=original_ids,
                # pos_matched=-1,
            )

            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
            next_token = torch.multinomial(topk_probs, num_samples=1)
            xcol = torch.gather(topk_indices, -1, next_token)
            original_ids = torch.cat([original_ids, xcol], dim=1)

        elapsed_orig = time.time() - t0

        # store info
        dict_pred_info[step_i]['original'] = topk_indices[0].tolist()
        dict_pred_info[step_i]['original_time'] = elapsed_orig
        
    # decoded_last_token = [tokenizer.decode([t]) for t in original_ids[0,:].tolist()]
    # decoded_last_token = ''.join(decoded_last_token)
    # print(f"Last token: {decoded_last_token}, {original_ids.tolist()}")
    
    # clear cache
    model.cache_partial.clear()

    info_lst.append(dict_pred_info)


3it [00:54, 18.06s/it]


KeyboardInterrupt: 

In [None]:
# import time
# import torch
# import random
# import numpy as np
# from transformers import GPT2Tokenizer, GPT2LMHeadModel
# from tqdm import tqdm
# from collections import defaultdict

# seed = 5
# seed_everything(seed)

# extra_steps = 20
# max_steps = 1024             
# k = 50
# n=5
# info_lst = []

# for idx, code in tqdm(enumerate(file_contents)):
# # for code in tqdm(file_contents):
#     if idx != 0:
#         continue
#     code = """def bitcount(n):
#     count = 0
#     while n:
#         n ^= n - 1
#         count += 1
#     return count"""
#     # prompt = f"Given the following code is incorrect:\n{code}\nCorrected code:"
#     # different prompt template
#     prompt = f"Correct the following code:\n{code}\nCorrected code: def"
    
#     code_ids = tokenizer.encode(code, return_tensors='pt')

#     input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
#     steps = extra_steps+code_ids.size(1)
#     if steps > max_steps:
#         steps = max_steps

#     dict_pred_info = defaultdict(dict)

#     # Copy model generation
#     copy_ids = input_ids.clone()  # shape (1, initial_length)
#     target_steps = []
#     flag = False
#     for step_i in range(steps):
#         t0 = time.time()

#         # detect copy scenario
#         t_matched, skip_up_to = detect_ngram_copy(copy_ids, n=n, skip_up_to=skip_up_to)
        
#         # if t_matched is not None and copy_ids[0, t_matched].item() in [7783]:
#         #     flag = True
#         #     t_matched = None
        
#         # if flag:
#         #     t_matched = None
#         with torch.no_grad():
#             # forward pass (copy-mech model)
#             logits = model(
#                 input_ids=copy_ids,
#                 pos_matched=t_matched,
#             )
#             # logits = model.forward(
#             #     input_ids=copy_ids,
#             #     batch_idx=0,
#             #     pos_new=copy_ids.shape[1] - 1,
#             #     pos_matched=t_matched,
#             # )

#             logits = logits[:, -1, :]
#             probs = F.softmax(logits, dim=-1)
#             topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
#             next_token = torch.multinomial(topk_probs, num_samples=1)
#             xcol = torch.gather(topk_indices, -1, next_token)
#             copy_ids = torch.cat([copy_ids, xcol], dim=1)

#         elapsed_copy = time.time() - t0

#         # store info
#         dict_pred_info[step_i]['copy'] = topk_indices[0].tolist()
#         dict_pred_info[step_i]['copy_time'] = elapsed_copy

#         # if t_matched is not None:
#         #     # print(prompt)
#         #     # print('----\n')
#         #     print(step_i)
#         #     target_steps.append(step_i)
#         #     last_token = copy_ids[0, -5:-1].tolist()
#         #     decoded_last_token = [tokenizer.decode([t]) for t in last_token]
#         #     print(f"Last 5 token: {decoded_last_token}, {last_token}")

#             # print(f"Last token: {decoded_last_token}, {last_token}")
#             # matched_token = copy_ids[0, t_matched].item()
#             # decoded_matched_token = tokenizer.decode([matched_token])
#             # print(f"Matched token: {decoded_matched_token}, {matched_token}")

#             # matched_context_ids = copy_ids[0, t_matched-n:t_matched+1]
#             # matched_context = tokenizer.decode(matched_context_ids.tolist())
#             # print(f"Matched context: {matched_context}, {matched_context_ids.tolist()}")
#             # # decode -4:-1
#             # context_ids = copy_ids[0, -n:].tolist()
#             # context = tokenizer.decode(context_ids)
#             # print(f"Context: {context}, {context_ids}")
#             # print('Current copy output:', tokenizer.decode(copy_ids[0].tolist()))
#             # print(f"Top-k - copy: {top_k_list}")
#             # break
            
#     decoded_last_token = [tokenizer.decode([t]) for t in copy_ids[0,:].tolist()]
#     # join decoded_last_token
#     decoded_last_token = ''.join(decoded_last_token)
#     print(f"Last token: {decoded_last_token}, {copy_ids.tolist()}")
#     print('\n-----\n')

#     # # clear cache
#     # model.cache_partial.clear()
    
#     # 2) Original model generation
#     original_ids = input_ids.clone()
#     for step_i in range(steps):
#         t0 = time.time()

#         with torch.no_grad():
#             # forward pass (copy-mech model)
#             logits = model(
#                 input_ids=original_ids,
#             )

#             logits = logits[:, -1, :]
#             probs = F.softmax(logits, dim=-1)
#             topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
#             next_token = torch.multinomial(topk_probs, num_samples=1)
#             xcol = torch.gather(topk_indices, -1, next_token)
#             original_ids = torch.cat([original_ids, xcol], dim=1)

#         elapsed_orig = time.time() - t0

#         # store info
#         dict_pred_info[step_i]['original'] = topk_indices[0].tolist()
#         dict_pred_info[step_i]['original_time'] = elapsed_orig

#         # if step_i in target_steps:
#         #     # print('----\n')
#         #     print(step_i)
#         #     print(step_i)
#         #     target_steps.append(step_i)
#         #     last_token = original_ids[0, -5:-1].tolist()
#         #     decoded_last_token = [tokenizer.decode([t]) for t in last_token]
#         #     print(f"Last 5 token: {decoded_last_token}, {last_token}")
#             # print('Current origin output:', tokenizer.decode(original_ids[0].tolist()))
#             # print(f"Top-k - origin: {top_k_list}")
#             # a
        
#     decoded_last_token = [tokenizer.decode([t]) for t in original_ids[0,:].tolist()]
#     decoded_last_token = ''.join(decoded_last_token)
#     print(f"Last token: {decoded_last_token}, {copy_ids.tolist()}")
    
#     # clear cache
#     model.cache_partial.clear()

#     info_lst.append(dict_pred_info)


In [45]:
import time
import torch
import random
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
from collections import defaultdict

seed = 5
seed_everything(seed)

extra_steps = 20
max_steps = 1024             
k = 50
n = 5
info_lst = []

for idx, code in tqdm(enumerate(file_contents)):
    if idx != 0:
        continue
    code = """def bitcount(n):
    count = 0
    while n:
        n ^= n - 1
        count += 1
    return count"""
    prompt = f"Correct the following code:\n{code}\nCorrected code: def"
    
    code_ids = tokenizer.encode(code, return_tensors='pt')
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    steps = extra_steps + code_ids.size(1)
    steps = min(steps, max_steps)

    dict_pred_info = defaultdict(dict)

    copy_ids = input_ids.clone().to(device)  # shape (1, initial_length)
    # (Optional) fill partial states for the entire existing prompt if you want 
    # to skip from tokens inside the prompt. For example:
    for p in range(copy_ids.size(1)):
        sub_ids = copy_ids[:, :p+1]
        # pos_new=p, pos_matched=None => no skip
        model.forward(sub_ids, batch_idx=0, pos_new=p, pos_matched=None)
        
    flag = False
    for step_i in range(steps):
        t0 = time.time()

        # detect copy
        t_matched, skipv = detect_ngram_copy(copy_ids, n=n, skip_up_to=43)
        
        # if t_matched is not None and copy_ids[0, t_matched].item() in [7783]:
        #     flag = True
        #     t_matched = None
        
        # if flag:
        #     t_matched = None

        with torch.no_grad():
            # forward pass (copy-mech model)
            logits = model(
                input_ids=copy_ids,
                pos_new=copy_ids.shape[1] - 1,  # new position
                pos_matched=t_matched
            )

            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
            next_token = torch.multinomial(topk_probs, num_samples=1)
            xcol = torch.gather(topk_indices, -1, next_token)
            copy_ids = torch.cat([copy_ids, xcol], dim=1)

        elapsed_copy = time.time() - t0
        dict_pred_info[step_i]['copy'] = topk_indices[0].tolist()
        dict_pred_info[step_i]['copy_time'] = elapsed_copy

    # Clear partial cache for next sample
    model.cache_partial.clear()

    decoded_last_token_copy = [tokenizer.decode([t]) for t in copy_ids[0,:].tolist()]
    # join decoded_last_token
    decoded_last_token_copy = ''.join(decoded_last_token_copy)
    print(f"Last token: {decoded_last_token_copy}, {copy_ids.tolist()}")
    print('\n-----\n')

    # -------------------------------------
    # 3b) Original model generation (skip_up_to=0)
    # -------------------------------------
    # original_ids = input_ids.to(device)
    # # likewise fill partial states for the entire prompt 
    # for p in range(original_ids.size(1)):
    #     sub_ids = original_ids[:, :p+1]
    #     model.forward(sub_ids, batch_idx=0, pos_new=p, pos_matched=None)

    # for step_i in range(steps):
    #     t0 = time.time()

    #     with torch.no_grad():
    #         # forward pass
    #         logits = model(
    #             input_ids=original_ids,
    #         )

    #         logits = logits[:, -1, :]
    #         probs = F.softmax(logits, dim=-1)
    #         topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
    #         next_token = torch.multinomial(topk_probs, num_samples=1)
    #         xcol = torch.gather(topk_indices, -1, next_token)
    #         original_ids = torch.cat([original_ids, xcol], dim=1)

    #     elapsed_orig = time.time() - t0
    #     dict_pred_info[step_i]['original'] = topk_indices[0].tolist()
    #     dict_pred_info[step_i]['original_time'] = elapsed_orig

    # # Clear partial cache
    # model.cache_partial.clear()

    # decoded_last_token_original = [tokenizer.decode([t]) for t in original_ids[0,:].tolist()]
    # decoded_last_token_original = ''.join(decoded_last_token_original)
    # print(f"Last token: {decoded_last_token_original}, {original_ids.tolist()}")

    info_lst.append(dict_pred_info)

40it [00:03, 12.36it/s]

Last token: Correct the following code:
def bitcount(n):
    count = 0
    while n:
        n ^= n - 1
        count += 1
    return count
Corrected code: def bitcount(n): for i in range(2**n): count += 1 if i > 0: count += (i - 0) / 7 return count<|endoftext|>With the upcoming release of "The Fifth Element" on Netflix (and the DVD on sale now), it seems as though we have no choice but to talk about the film once more:, [[42779, 262, 1708, 2438, 25, 198, 4299, 1643, 9127, 7, 77, 2599, 198, 220, 220, 220, 954, 796, 657, 198, 220, 220, 220, 981, 299, 25, 198, 220, 220, 220, 220, 220, 220, 220, 299, 10563, 28, 299, 532, 352, 198, 220, 220, 220, 220, 220, 220, 220, 954, 15853, 352, 198, 220, 220, 220, 1441, 954, 198, 42779, 276, 2438, 25, 825, 1643, 9127, 7, 77, 2599, 329, 1312, 287, 2837, 7, 17, 1174, 77, 2599, 954, 15853, 352, 611, 1312, 1875, 657, 25, 954, 15853, 357, 72, 532, 657, 8, 1220, 767, 1441, 954, 50256, 3152, 262, 7865, 2650, 286, 366, 464, 19383, 11703, 1, 319, 12074, 357, 39




In [6]:
def jaccard_similarity(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    intersection = len(set1.intersection(set2))
    union = len(set1) + len(set2) - intersection
    return  intersection / union

jcc_ult = []
acc_ult = []
tpt_copy = []
tpt_orig = []

for data in info_lst:
    total_copy_time = 0
    total_orig_time = 0
    acc_lst = []
    jc_lst = []
    for step in data.keys():
        total_copy_time += data[step]['copy_time']
        total_orig_time += data[step]['original_time']
        copy = data[step]['copy']
        original = data[step]['original']

        jaccard_score = jaccard_similarity(copy, original)
        jc_lst.append(jaccard_score)

        acc_score = 1 if copy[0] == original[0] else 0
        acc_lst.append(acc_score)

    # time per token - tpt
    tpt_copy.append(total_copy_time / steps)
    tpt_orig.append(total_orig_time / steps)

    jcc_ult.append(jc_lst)
    acc_ult.append(acc_lst)

In [8]:
info_lst[1]

defaultdict(dict,
            {0: {'copy': [1353,
               900,
               6149,
               4220,
               11593,
               1388,
               4852,
               1306,
               5849,
               23243,
               284,
               318,
               4808,
               651,
               220,
               299,
               1502,
               28662,
               717,
               6727,
               287,
               21246,
               256,
               267,
               277,
               2116,
               279,
               477,
               542,
               288,
               1976,
               649,
               1351,
               319,
               257,
               3509,
               3376,
               9575,
               751,
               7716,
               503,
               10139,
               366,
               787,
               493,
               1994,
               374,
   

In [7]:
sum_tpt_copy = sum(tpt_copy)
sum_tpt_orig = sum(tpt_orig)
sum_tpt_copy / sum_tpt_orig

1.0171668513157692

In [8]:
def cal_avg(lsts):
    avg_lst = []
    for lst in lsts:
        avg_lst.append(sum(lst) / len(lst))
    return sum(avg_lst) / len(avg_lst)

avg_jcc = cal_avg(jcc_ult)
avg_acc = cal_avg(acc_ult)
avg_jcc, avg_acc

(1.0, 1.0)

In [43]:
acc_ult[0]

[1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0]

English Insertion

In [None]:
from datasets import load_from_disk
from tqdm import tqdm

# Load the dataset from disk
subset = load_from_disk("english_insertions")
prompt_list = []

base_sents = subset['train']['base_sentence'][:1000]
phrases = subset['train']['phrase'][:1000]
edited_sents = subset['train']['edited_sentence'][:1000]

import gc
del subset
gc.collect()