!jupyter nbconvert --to script <notebook-name>.ipynb

In [2]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

In [2]:
from typing import Union
from dataclasses import dataclass
import pickle
import datetime
import os
import sys
#import tqdm

import numpy as np

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, PretrainedConfig, LlamaForCausalLM, LlamaTokenizer
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# import accelerate

2025-02-23 19:43:54.606393: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-23 19:43:54.648450: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# !pip install --upgrade protobuf==3.20.*
# !pip install --upgrade sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mDefaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

In [40]:
sys.path.append('../')

# Helper functions for top-k and top-p filtering
def top_k_top_p_filtering(
    logits,
    top_k: int = 0,
    top_p: float = 1.0,
    filter_value: float = -float("Inf"),
    min_tokens_to_keep: int = 1,
):
    if top_k > 0:
        top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value 

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
        sorted_indices_to_remove = cumulative_probs > top_p
        if min_tokens_to_keep > 1:
            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
            sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits

In [41]:
@dataclass
class ModelOutput:
    input_ids: Union[torch.Tensor, np.ndarray]
    input_log_probs: Union[torch.Tensor, np.ndarray]
    output_ids: Union[torch.Tensor, np.ndarray]
    output_log_probs: Union[torch.Tensor, np.ndarray]
    all_ids: Union[torch.Tensor, np.ndarray]
    all_log_probs: Union[torch.Tensor, np.ndarray]

    @classmethod
    def from_list_of_model_outputs(cls, *others):
        
        return cls(
            input_ids=np.concatenate([o.input_ids for o in others], axis=0),
            input_log_probs=np.concatenate([o.input_log_probs for o in others], axis=0),
            output_ids=np.concatenate([o.output_ids for o in others], axis=0),
            output_log_probs=np.concatenate([o.output_log_probs for o in others], axis=0),
            all_ids=np.concatenate([o.all_ids for o in others], axis=0),
            all_log_probs=np.concatenate([o.all_log_probs for o in others], axis=0),
        )
    
    @classmethod
    def from_dict(cls, d):
        return cls(
            input_ids=d['input_ids'],
            input_log_probs=d['input_log_probs'],
            output_ids=d['output_ids'],
            output_log_probs=d['output_log_probs'],
            all_ids=d['all_ids'],
            all_log_probs=d['all_log_probs'],
        )
    
    def to_dict(self):
        return {
            'input_ids': self.input_ids,
            'input_log_probs': self.input_log_probs,
            'output_ids': self.output_ids,
            'output_log_probs': self.output_log_probs,
            'all_ids': self.all_ids,
            'all_log_probs': self.all_log_probs,
        }


In [42]:
# @torch.no_grad
def custom_generate_with_log_probs(model, input_ids, eos_token_id=None, mask_out_token_id=None, **generate_kwargs):
    max_length = generate_kwargs.get("max_length", 200)
    max_new_tokens = generate_kwargs.get("max_new_tokens", 200)
    input_length = input_ids.shape[1]
    max_new_tokens = min(max_new_tokens, max_length - input_length)
    temperature = generate_kwargs.get("temperature", 0.7)
    top_k = generate_kwargs.get("top_k", -1)
    top_p = generate_kwargs.get("top_p", 1.1)
    do_sample = generate_kwargs.get("do_sample", True)
    return_input_logprobs = generate_kwargs.get("return_input_logprobs", False)
    
    with torch.no_grad():
        # Prepare model inputs
        # Measure input log probs
        input_log_probs = None
        if return_input_logprobs:
            outputs = model(input_ids)
            if mask_out_token_id is not None:
                if type(mask_out_token_id) != list and type(mask_out_token_id) != tuple:
                    mask_out_token_id = (mask_out_token_id,)
                for id in mask_out_token_id:
                    outputs.logits[:, :, id] = -float('inf')
            outputs.logits /= temperature
            outputs.logits = top_k_top_p_filtering(outputs.logits, top_k=top_k, top_p=top_p)

            input_log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)
            input_log_probs[:, 1:] = input_log_probs[:, :-1].clone()
            input_log_probs[:, 0] = 0
            input_log_probs = input_log_probs.cpu().numpy()

        # Initialize generation
        batch_size = input_ids.shape[0]
        unfinished_sequences = torch.ones(batch_size, device=input_ids.device, dtype=torch.long)

        output_ids = torch.zeros((batch_size, max_new_tokens), dtype=input_ids.dtype, device=input_ids.device)
        vocab_size = input_log_probs.shape[-1]
        output_log_probs = np.zeros((batch_size, max_new_tokens, vocab_size), dtype=input_log_probs.dtype)

        past_key_values = None

        for i in range(max_new_tokens):

                # Forward pass
                outputs = model(input_ids if past_key_values is None else output_ids[:, i-1:i], past_key_values=past_key_values, use_cache=True)
                next_token_logits = outputs.logits[:, -1, :]
                past_key_values = outputs.past_key_values

                # Set EOS token probability to zero

                if mask_out_token_id is not None:
                    if type(mask_out_token_id) != list and type(mask_out_token_id) != tuple:
                        mask_out_token_id = (mask_out_token_id,)
                    for id in mask_out_token_id:
                        next_token_logits[:, id] = -float('inf')

                # Apply temperature
                next_token_logits = next_token_logits / temperature

                # Apply top-k and top-p filtering
                next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)

                # Compute log probabilities
                log_probs = torch.nn.functional.log_softmax(next_token_logits, dim=-1)

                # Sample or greedy decode
                if do_sample:
                    next_token = torch.multinomial(torch.exp(log_probs), num_samples=1)
                else:
                    next_token = torch.argmax(log_probs, dim=-1, keepdim=True)

                # Compute log probability of the sampled token
                output_log_probs[:, i] = log_probs.cpu().numpy()

                # Update input_ids and attention_mask
                output_ids[:, i] = next_token.squeeze(1)

                # Update unfinished_sequences
                if eos_token_id is not None:
                    if type(eos_token_id) != list and type(eos_token_id) != tuple:
                        eos_token_id = (eos_token_id, )
                    for id in eos_token_id:
                        unfinished_sequences = unfinished_sequences * (next_token != id).long()
                    if unfinished_sequences.max() == 0:
                        break

        if return_input_logprobs:
            all_log_probs = np.concatenate([input_log_probs, output_log_probs], axis=1)
        else:
            all_log_probs = None

        output_ids = output_ids.cpu().numpy()
        input_ids = input_ids.cpu().numpy()
        all_ids = np.concatenate([input_ids, output_ids], axis=1)

        return ModelOutput(
            input_ids=input_ids,
            input_log_probs=input_log_probs,
            output_ids=output_ids,
            output_log_probs=output_log_probs,
            all_ids=all_ids,
            all_log_probs=all_log_probs
        )


In [46]:
def main():
    # dataDir="
    model_dir="/home/gridsan/mshi/MAML-Soljacic_shared/llama2/7b"
    tokenizer_dir = "/home/gridsan/mshi/MAML-Soljacic_shared/llama2/tokenizer"
    
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Model directory not found: {model_dir}")
    
    
    model = LlamaForCausalLM.from_pretrained(model_dir,local_files_only=True)

    model.eval()
    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir,local_files_only=True)
    
    batch_size = 3
    input_text = ["In the city of Milan"] * batch_size  # Replace with the actual input text
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    
    temperature = 1.2
    
    # Define generation parameters
    generate_kwargs = {
        "max_length": 1000,  # Adjust as needed
        "max_new_tokens": 999,  # Adjust as needed
        "temperature": temperature,
        "top_k": -1,
        "top_p": 1,
        "do_sample": True,
        "eos_token_id": None, #tokenizer.eos_token_id,
        "mask_out_token_id": [tokenizer.eos_token_id, tokenizer.bos_token_id],  # Mask out EOS and BOS tokens
        'output_logits': True,
        'return_input_logprobs': True,
    }

    # Generate text and get log probabilities
    global output

    samples=2
    for z in range(samples):
        output_list = []
        mini_batch_size=2
        for i in range(mini_batch_size):
            output = custom_generate_with_log_probs(model, inputs.input_ids, **generate_kwargs)
            output_list.append(output)
        output = ModelOutput.from_list_of_model_outputs(*output_list)

        save_dir = f"gpt2/{model_name.replace('/', '_')}_{generate_kwargs['top_k']}_{generate_kwargs['max_new_tokens']}_{samples}_{generate_kwargs['temperature']}"
        os.makedirs(save_dir, exist_ok=True)
         # Check if all elements in output_log_probs are the same
        print(f"we're on sample {z}")
        
        
        
        name=f"output_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.pkl"
        with open(os.path.join(save_dir, name), "wb") as f:
            print("Saving output to", f.name)
            data = {
                
                'num_samples': samples*batch_size*mini_batch_size,
                'temperature':generate_kwargs['temperature'],
                'top_k': generate_kwargs['top_k']
                    }
            orgdata=output.to_dict()
            orgdata.update(data)
            pickle.dump(orgdata,f)

        # with open(os.path.join(save_dir, name), "ab") as f:
        #     print("Saving output to", f.name)

            
        #     pickle.dump(data,f)



    # res = model.generate(inputs.input_ids, generation_config=generation_config)

    # # Decode and print the generated text
    # generated_text = tokenizer.batch_decode(torch.from_numpy(generated_ids), skip_special_tokens=False)
    # print("Generated Text:\n", generated_text)

    # assert False


In [47]:
if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

we're on sample 0
Saving output to gpt2/llama2_-1_999_2_1.2/output_2025-02-23_23-19-12.pkl
we're on sample 1
Saving output to gpt2/llama2_-1_999_2_1.2/output_2025-02-23_23-29-09.pkl


In [9]:
import pickle

In [39]:
# file_path = "gpt2/llama2_0_999_2_20/output_2025-02-17_00-17-39.pkl"
file_path = "gpt2/llama2_-1_999_2_0.7/output_2025-02-23_20-04-54.pkl"

with open(file_path, "rb") as file:
    data = pickle.load(file)

print(type(data))
# print(len(data['input_log_probs'][0][0]))
print(np.shape(data['output_log_probs']))

"""
num samples is num sample * batch size * mini batch size, 2 * 3 * 2
input log probs [0] length is 6
output ids [0] length is 6, output ids [0] [0] length is 994, contains ids of every sequence generated per batch
output log probs [0] length is 994, number of tokens generated
output log probs [0] [0] length is 3200, vocab size – probability of every possible token
input log probs [0] [0] length is 3200, vocab size
"""

print(data.keys())

<class 'dict'>
(6, 994, 32000)
dict_keys(['input_ids', 'input_log_probs', 'output_ids', 'output_log_probs', 'all_ids', 'all_log_probs', 'num_samples', 'temperature', 'top_k'])


In [12]:
tokenizer_dir = "/home/gridsan/mshi/MAML-Soljacic_shared/llama2/tokenizer"
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir,local_files_only=True)

In [13]:
generated_text = tokenizer.batch_decode(data["output_ids"], skip_special_tokens=True)
print("\n".join(generated_text))  # Print each generated sample

, Italy, the local authorities are promoting the use of electric bicycles to cope with the city’s congested streets. The project is called Milan Bike Share, and it allows people to use their smartphones to rent and unlock e-bikes, which are then returned at various docking stations throughout the city.
The e-bikes are equipped with a GPS-based navigation system that sends the bike’s location to the provider’s database. The system also uses the bike’s battery to charge itself while it’s parked.
The project has been a success so far, with over 10,000 people renting e-bikes from the service since it launched in 2015.
The city’s congestion is a major problem, with over 1 million people living in Milan and only 1.2 million parking spots available. The e-bikes have been a solution to this problem, as they can be parked anywhere and are much cheaper than cars.
The system is also helping to reduce air pollution in the city, as the e-bikes don’t produce any emissions.
The project has been so su

In [14]:
avg_log_prob = np.mean(data["output_log_probs"])  # Average confidence of generated text
print(f"Average Log Probability: {avg_log_prob}")
print(data["output_log_probs"][0:5])

Average Log Probability: -inf
[[[-33.406254       -inf       -inf ... -30.800547 -28.542536 -30.993744]
  [-31.499197       -inf       -inf ... -26.137808 -25.636297 -25.434656]
  [-32.560253       -inf       -inf ... -32.411545 -30.82553  -32.62158 ]
  ...
  [-43.036823       -inf       -inf ... -36.16278  -35.261536 -38.78596 ]
  [-36.243954       -inf       -inf ... -35.851227 -37.77268  -37.14803 ]
  [-35.1133         -inf       -inf ... -33.407276 -36.680634 -33.95957 ]]

 [[-33.406258       -inf       -inf ... -30.80055  -28.542534 -30.993746]
  [-31.499199       -inf       -inf ... -26.137812 -25.6363   -25.434656]
  [-32.560253       -inf       -inf ... -32.41155  -30.825533 -32.621582]
  ...
  [-44.66798        -inf       -inf ... -40.282597 -41.446854 -37.74586 ]
  [-34.551365       -inf       -inf ... -25.42192  -30.918056 -29.745707]
  [-45.25648        -inf       -inf ... -41.64829  -39.665226 -36.383156]]

 [[-33.40626        -inf       -inf ... -30.80055  -28.542538 -30.