<a href="https://colab.research.google.com/github/offthewallace/CSE584/blob/main/Sentence_complete_dataset_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets



In [3]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer
import random
import torch
import torch

# Check if GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [7]:
import time
from tqdm import tqdm
import torch
from datasets import load_dataset

# Check if GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the IMDB dataset
def truncate_half_text(text):
    """
    Truncates the text to the first 200 tokens, ensuring the result is non-empty.
    If the text has fewer than 200 tokens, it returns the original text.
    """
    words = text.split()

    # Truncate to the first 200 tokens or return the original if it's shorter
    truncated_text = ' '.join(words[:200])

    return truncated_text if truncated_text else None


def truncate_half_text2(text):
    """
    Truncates the text to half its length, ensuring the result is non-empty.
    """
    words = text.split()
    if len(words) < 5:  # Skip if the text is too short to truncate
        return text  # Return original text if it's too short
    half_length = len(words) // 2
    truncated_text = ' '.join(words[:half_length])
    return truncated_text if truncated_text else None

def add_prompt(truncated_text):
    """
    Dynamically creates a prompt based on the truncated text.
    """
    prompt = f'Complete this paragraph: "{truncated_text}"'
    return prompt

def batch_generate_completion(model, tokenizer, texts, max_new_tokens=200, top_k=50, top_p=0.9, temperature=0.8, repetition_penalty=1.2):
    """
    Generate text completions in batches for a list of truncated texts using a specific model and tokenizer.
    """
    if len(texts) == 0:
        return []  # Return empty if the input batch is empty

    # Tokenize the batch of texts
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)

    # Generate the completion and move output back to CPU for decoding
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        top_k=top_k,               # Use top-k sampling
        top_p=top_p,               # Use nucleus sampling (top-p)
        temperature=temperature,   # Adjust temperature for randomness
        repetition_penalty=repetition_penalty,  # Penalize repetitive tokens
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the output batch
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

def generate_dataset_with_llms(dataset, models, tokenizers, num_samples=10, max_new_tokens=200, batch_size=8):
    results = []
    start_time = time.time()  # Start timer for total estimation

    # Move models to GPU (cuda)
    models = {name: model.to(device) for name, model in models.items()}

    # Iterate through dataset in batches with progress tracking
    for i in tqdm(range(0, num_samples, batch_size)):
        # Get the current batch of examples (IMDB reviews are stored under the 'text' key)
        batch_examples = dataset[i:i + batch_size]['ctx']

        # Truncate the texts in the batch and dynamically add prompt
        batch_truncated_texts = []
        for example in batch_examples:

            batch_truncated_texts.append(add_prompt(example))

        # Skip empty or invalid truncated texts
        if len(batch_truncated_texts) == 0:
            print(f"Skipping batch {i} due to empty truncated texts")
            continue

        for model_name, model in models.items():
            try:
                # Time batch generation for each model
                completion_start = time.time()

                # Generate completions for the batch
                completions = batch_generate_completion(model, tokenizers[model_name], batch_truncated_texts, max_new_tokens)

                if len(completions) == 0:
                    print(f"No completions generated for batch {i} with model {model_name}")
                    continue  # Skip empty completions

                completion_end = time.time()
                completion_time = completion_end - completion_start
                print(f"Completion time for {model_name} batch: {completion_time:.2f} seconds")

                # Store the (xi, xj) pairs with corresponding LLM label
                for truncated_text, completion in zip(batch_truncated_texts, completions):
                    results.append({
                        'xi': truncated_text,  # This will include the prompt with truncated text
                        'xj': completion[len(truncated_text):].strip(),  # The actual completion generated by the model
                        'llm': model_name
                    })

            except Exception as e:
                print(f"Error generating completion with {model_name} for batch {i}: {e}")
                continue

    end_time = time.time()  # End timer for total process

    # Calculate time per sample and estimate total time
    total_time = end_time - start_time
    avg_time_per_sample = total_time / num_samples
    total_samples = len(dataset)
    estimated_total_time = avg_time_per_sample * total_samples

    print(f"\nProcessed {num_samples} samples in {total_time:.2f} seconds.")
    print(f"Average time per sample: {avg_time_per_sample:.2f} seconds.")
    print(f"Estimated total time for {total_samples} samples: {estimated_total_time / 60:.2f} minutes.")

    return results





Using device: cuda


In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer

models = {
    'gpt2': GPT2LMHeadModel.from_pretrained('gpt2'),  # 124M parameters
    'gpt-neo': AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M'),  # 125M parameters
    'distilgpt2': GPT2LMHeadModel.from_pretrained('distilgpt2'),  # 82M parameters
    'qwen-2-0.5B': AutoModelForCausalLM.from_pretrained('Qwen/Qwen2-0.5B')  # 1.5B parameters
}

tokenizers = {
    'gpt2': GPT2Tokenizer.from_pretrained('gpt2'),
    'gpt-neo': AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M'),
    'distilgpt2': GPT2Tokenizer.from_pretrained('distilgpt2'),
    'qwen-2-0.5B': AutoTokenizer.from_pretrained('Qwen/Qwen2-0.5B'),
}

# Set pad_token as eos_token for all models
for model_name in models.keys():
    tokenizers[model_name].pad_token = tokenizers[model_name].eos_token
    tokenizers[model_name].padding_side = 'left'  # Set padding side to 'left'

imdb_dataset = load_dataset("Rowan/hellaswag", split='test')  # Use 'test' split for demonstration

results = generate_dataset_with_llms(imdb_dataset, models, tokenizers, num_samples=10000, batch_size=200, max_new_tokens=200)



  0%|          | 0/50 [00:00<?, ?it/s]

Completion time for gpt2 batch: 4.37 seconds
Completion time for gpt-neo batch: 3.36 seconds
Completion time for distilgpt2 batch: 2.80 seconds


  2%|▏         | 1/50 [00:19<15:55, 19.49s/it]

Completion time for qwen-2-0.5B batch: 8.96 seconds
Completion time for gpt2 batch: 4.39 seconds
Completion time for gpt-neo batch: 3.41 seconds
Completion time for distilgpt2 batch: 2.79 seconds


  4%|▍         | 2/50 [00:39<15:39, 19.57s/it]

Completion time for qwen-2-0.5B batch: 9.02 seconds
Completion time for gpt2 batch: 4.46 seconds
Completion time for gpt-neo batch: 3.44 seconds
Completion time for distilgpt2 batch: 2.84 seconds


  6%|▌         | 3/50 [00:59<15:27, 19.74s/it]

Completion time for qwen-2-0.5B batch: 9.21 seconds
Completion time for gpt2 batch: 4.81 seconds
Completion time for gpt-neo batch: 3.61 seconds
Completion time for distilgpt2 batch: 3.08 seconds


  8%|▊         | 4/50 [01:20<15:34, 20.31s/it]

Completion time for qwen-2-0.5B batch: 9.68 seconds
Completion time for gpt2 batch: 4.58 seconds
Completion time for gpt-neo batch: 3.50 seconds
Completion time for distilgpt2 batch: 2.91 seconds


 10%|█         | 5/50 [01:40<15:15, 20.34s/it]

Completion time for qwen-2-0.5B batch: 9.40 seconds
Completion time for gpt2 batch: 4.52 seconds
Completion time for gpt-neo batch: 3.47 seconds
Completion time for distilgpt2 batch: 2.87 seconds


 12%|█▏        | 6/50 [02:00<14:51, 20.27s/it]

Completion time for qwen-2-0.5B batch: 9.27 seconds
Completion time for gpt2 batch: 4.55 seconds
Completion time for gpt-neo batch: 3.50 seconds
Completion time for distilgpt2 batch: 2.90 seconds


 14%|█▍        | 7/50 [02:21<14:32, 20.29s/it]

Completion time for qwen-2-0.5B batch: 9.39 seconds
Completion time for gpt2 batch: 4.63 seconds
Completion time for gpt-neo batch: 3.53 seconds
Completion time for distilgpt2 batch: 2.94 seconds


 16%|█▌        | 8/50 [02:41<14:15, 20.37s/it]

Completion time for qwen-2-0.5B batch: 9.44 seconds
Completion time for gpt2 batch: 4.61 seconds
Completion time for gpt-neo batch: 3.50 seconds
Completion time for distilgpt2 batch: 2.93 seconds


 18%|█▊        | 9/50 [03:02<13:58, 20.44s/it]

Completion time for qwen-2-0.5B batch: 9.55 seconds
Completion time for gpt2 batch: 4.42 seconds
Completion time for gpt-neo batch: 3.42 seconds
Completion time for distilgpt2 batch: 2.85 seconds


 20%|██        | 10/50 [03:22<13:29, 20.25s/it]

Completion time for qwen-2-0.5B batch: 9.11 seconds
Completion time for gpt2 batch: 4.83 seconds
Completion time for gpt-neo batch: 3.65 seconds
Completion time for distilgpt2 batch: 3.04 seconds


 22%|██▏       | 11/50 [03:43<13:23, 20.59s/it]

Completion time for qwen-2-0.5B batch: 9.86 seconds
Completion time for gpt2 batch: 4.49 seconds
Completion time for gpt-neo batch: 3.44 seconds
Completion time for distilgpt2 batch: 2.85 seconds


 24%|██▍       | 12/50 [04:03<12:56, 20.43s/it]

Completion time for qwen-2-0.5B batch: 9.26 seconds
Completion time for gpt2 batch: 4.60 seconds
Completion time for gpt-neo batch: 3.50 seconds
Completion time for distilgpt2 batch: 2.90 seconds


 26%|██▌       | 13/50 [04:23<12:35, 20.42s/it]

Completion time for qwen-2-0.5B batch: 9.41 seconds
Completion time for gpt2 batch: 4.53 seconds
Completion time for gpt-neo batch: 3.48 seconds
Completion time for distilgpt2 batch: 2.87 seconds


 28%|██▊       | 14/50 [04:43<12:11, 20.33s/it]

Completion time for qwen-2-0.5B batch: 9.22 seconds
Completion time for gpt2 batch: 4.67 seconds
Completion time for gpt-neo batch: 3.55 seconds
Completion time for distilgpt2 batch: 2.95 seconds


 30%|███       | 15/50 [05:04<11:54, 20.43s/it]

Completion time for qwen-2-0.5B batch: 9.50 seconds
Completion time for gpt2 batch: 4.38 seconds
Completion time for gpt-neo batch: 3.38 seconds
Completion time for distilgpt2 batch: 2.80 seconds


 32%|███▏      | 16/50 [05:24<11:26, 20.18s/it]

Completion time for qwen-2-0.5B batch: 9.02 seconds
Completion time for gpt2 batch: 4.32 seconds
Completion time for gpt-neo batch: 3.37 seconds
Completion time for distilgpt2 batch: 2.75 seconds


 34%|███▍      | 17/50 [05:43<10:57, 19.93s/it]

Completion time for qwen-2-0.5B batch: 8.91 seconds
Completion time for gpt2 batch: 5.27 seconds
Completion time for gpt-neo batch: 3.84 seconds
Completion time for distilgpt2 batch: 3.31 seconds


 36%|███▌      | 18/50 [06:06<11:07, 20.85s/it]

Completion time for qwen-2-0.5B batch: 10.57 seconds
Completion time for gpt2 batch: 5.23 seconds
Completion time for gpt-neo batch: 3.78 seconds
Completion time for distilgpt2 batch: 3.32 seconds


 38%|███▊      | 19/50 [06:29<11:03, 21.40s/it]

Completion time for qwen-2-0.5B batch: 10.35 seconds
Completion time for gpt2 batch: 5.25 seconds
Completion time for gpt-neo batch: 3.78 seconds
Completion time for distilgpt2 batch: 3.33 seconds


 40%|████      | 20/50 [06:52<10:55, 21.86s/it]

Completion time for qwen-2-0.5B batch: 10.56 seconds
Completion time for gpt2 batch: 5.21 seconds
Completion time for gpt-neo batch: 3.80 seconds
Completion time for distilgpt2 batch: 3.31 seconds


 42%|████▏     | 21/50 [07:15<10:42, 22.15s/it]

Completion time for qwen-2-0.5B batch: 10.52 seconds
Completion time for gpt2 batch: 5.45 seconds
Completion time for gpt-neo batch: 3.91 seconds
Completion time for distilgpt2 batch: 3.44 seconds


 44%|████▍     | 22/50 [07:39<10:38, 22.81s/it]

Completion time for qwen-2-0.5B batch: 11.54 seconds
Completion time for gpt2 batch: 5.18 seconds
Completion time for gpt-neo batch: 3.77 seconds
Completion time for distilgpt2 batch: 3.28 seconds


 46%|████▌     | 23/50 [08:02<10:15, 22.80s/it]

Completion time for qwen-2-0.5B batch: 10.55 seconds
Completion time for gpt2 batch: 5.26 seconds
Completion time for gpt-neo batch: 3.81 seconds
Completion time for distilgpt2 batch: 3.33 seconds


 48%|████▊     | 24/50 [08:25<09:53, 22.83s/it]

Completion time for qwen-2-0.5B batch: 10.48 seconds
Completion time for gpt2 batch: 5.23 seconds
Completion time for gpt-neo batch: 3.82 seconds
Completion time for distilgpt2 batch: 3.31 seconds


 50%|█████     | 25/50 [08:48<09:31, 22.87s/it]

Completion time for qwen-2-0.5B batch: 10.60 seconds
Completion time for gpt2 batch: 5.31 seconds
Completion time for gpt-neo batch: 3.86 seconds
Completion time for distilgpt2 batch: 3.36 seconds


 52%|█████▏    | 26/50 [09:11<09:10, 22.94s/it]

Completion time for qwen-2-0.5B batch: 10.59 seconds
Completion time for gpt2 batch: 5.15 seconds
Completion time for gpt-neo batch: 3.75 seconds
Completion time for distilgpt2 batch: 3.31 seconds


 54%|█████▍    | 27/50 [09:33<08:44, 22.82s/it]

Completion time for qwen-2-0.5B batch: 10.30 seconds
Completion time for gpt2 batch: 5.31 seconds
Completion time for gpt-neo batch: 3.84 seconds
Completion time for distilgpt2 batch: 3.36 seconds


 56%|█████▌    | 28/50 [09:56<08:23, 22.90s/it]

Completion time for qwen-2-0.5B batch: 10.59 seconds
Completion time for gpt2 batch: 5.32 seconds
Completion time for gpt-neo batch: 3.86 seconds
Completion time for distilgpt2 batch: 3.36 seconds


 58%|█████▊    | 29/50 [10:19<08:01, 22.94s/it]

Completion time for qwen-2-0.5B batch: 10.49 seconds
Completion time for gpt2 batch: 5.22 seconds
Completion time for gpt-neo batch: 3.81 seconds
Completion time for distilgpt2 batch: 3.31 seconds


 60%|██████    | 30/50 [10:42<07:38, 22.92s/it]

Completion time for qwen-2-0.5B batch: 10.52 seconds
Completion time for gpt2 batch: 5.22 seconds
Completion time for gpt-neo batch: 3.79 seconds
Completion time for distilgpt2 batch: 3.31 seconds


 62%|██████▏   | 31/50 [11:05<07:14, 22.85s/it]

Completion time for qwen-2-0.5B batch: 10.35 seconds
Completion time for gpt2 batch: 5.33 seconds
Completion time for gpt-neo batch: 3.89 seconds
Completion time for distilgpt2 batch: 3.38 seconds


 64%|██████▍   | 32/50 [11:28<06:52, 22.89s/it]

Completion time for qwen-2-0.5B batch: 10.38 seconds
Completion time for gpt2 batch: 5.23 seconds
Completion time for gpt-neo batch: 3.80 seconds
Completion time for distilgpt2 batch: 3.32 seconds


 66%|██████▌   | 33/50 [11:51<06:28, 22.88s/it]

Completion time for qwen-2-0.5B batch: 10.48 seconds
Completion time for gpt2 batch: 5.25 seconds
Completion time for gpt-neo batch: 3.82 seconds
Completion time for distilgpt2 batch: 3.33 seconds


 68%|██████▊   | 34/50 [12:14<06:06, 22.89s/it]

Completion time for qwen-2-0.5B batch: 10.50 seconds
Completion time for gpt2 batch: 5.19 seconds
Completion time for gpt-neo batch: 3.81 seconds
Completion time for distilgpt2 batch: 3.30 seconds


 70%|███████   | 35/50 [12:36<05:43, 22.88s/it]

Completion time for qwen-2-0.5B batch: 10.58 seconds
Completion time for gpt2 batch: 5.31 seconds
Completion time for gpt-neo batch: 3.87 seconds
Completion time for distilgpt2 batch: 3.37 seconds


 72%|███████▏  | 36/50 [13:00<05:21, 22.99s/it]

Completion time for qwen-2-0.5B batch: 10.67 seconds
Completion time for gpt2 batch: 5.27 seconds
Completion time for gpt-neo batch: 3.85 seconds
Completion time for distilgpt2 batch: 3.34 seconds


 74%|███████▍  | 37/50 [13:23<04:58, 22.98s/it]

Completion time for qwen-2-0.5B batch: 10.51 seconds
Completion time for gpt2 batch: 5.22 seconds
Completion time for gpt-neo batch: 3.85 seconds
Completion time for distilgpt2 batch: 3.29 seconds


 76%|███████▌  | 38/50 [13:45<04:35, 22.92s/it]

Completion time for qwen-2-0.5B batch: 10.39 seconds
Completion time for gpt2 batch: 5.20 seconds
Completion time for gpt-neo batch: 3.85 seconds
Completion time for distilgpt2 batch: 3.30 seconds


 78%|███████▊  | 39/50 [14:08<04:11, 22.89s/it]

Completion time for qwen-2-0.5B batch: 10.47 seconds
Completion time for gpt2 batch: 5.21 seconds
Completion time for gpt-neo batch: 3.83 seconds
Completion time for distilgpt2 batch: 3.30 seconds


 80%|████████  | 40/50 [14:31<03:49, 22.94s/it]

Completion time for qwen-2-0.5B batch: 10.71 seconds
Completion time for gpt2 batch: 5.24 seconds
Completion time for gpt-neo batch: 3.81 seconds
Completion time for distilgpt2 batch: 3.31 seconds


 82%|████████▏ | 41/50 [14:54<03:26, 22.94s/it]

Completion time for qwen-2-0.5B batch: 10.58 seconds
Completion time for gpt2 batch: 5.24 seconds
Completion time for gpt-neo batch: 3.82 seconds
Completion time for distilgpt2 batch: 3.33 seconds


 84%|████████▍ | 42/50 [15:17<03:03, 22.93s/it]

Completion time for qwen-2-0.5B batch: 10.51 seconds
Completion time for gpt2 batch: 5.32 seconds
Completion time for gpt-neo batch: 3.92 seconds
Completion time for distilgpt2 batch: 3.40 seconds


 86%|████████▌ | 43/50 [15:41<02:41, 23.05s/it]

Completion time for qwen-2-0.5B batch: 10.67 seconds
Completion time for gpt2 batch: 5.31 seconds
Completion time for gpt-neo batch: 3.87 seconds
Completion time for distilgpt2 batch: 3.36 seconds


 88%|████████▊ | 44/50 [16:04<02:18, 23.07s/it]

Completion time for qwen-2-0.5B batch: 10.58 seconds
Completion time for gpt2 batch: 5.24 seconds
Completion time for gpt-neo batch: 3.82 seconds
Completion time for distilgpt2 batch: 3.32 seconds


 90%|█████████ | 45/50 [16:27<01:55, 23.02s/it]

Completion time for qwen-2-0.5B batch: 10.52 seconds
Completion time for gpt2 batch: 5.33 seconds
Completion time for gpt-neo batch: 3.86 seconds
Completion time for distilgpt2 batch: 3.37 seconds


 92%|█████████▏| 46/50 [16:50<01:32, 23.03s/it]

Completion time for qwen-2-0.5B batch: 10.49 seconds
Completion time for gpt2 batch: 5.25 seconds
Completion time for gpt-neo batch: 3.84 seconds
Completion time for distilgpt2 batch: 3.33 seconds


 94%|█████████▍| 47/50 [17:13<01:08, 23.00s/it]

Completion time for qwen-2-0.5B batch: 10.49 seconds
Completion time for gpt2 batch: 5.26 seconds
Completion time for gpt-neo batch: 3.83 seconds
Completion time for distilgpt2 batch: 3.33 seconds


 96%|█████████▌| 48/50 [17:35<00:45, 22.99s/it]

Completion time for qwen-2-0.5B batch: 10.53 seconds
Completion time for gpt2 batch: 5.27 seconds
Completion time for gpt-neo batch: 3.85 seconds
Completion time for distilgpt2 batch: 3.34 seconds


 98%|█████████▊| 49/50 [17:59<00:23, 23.01s/it]

Completion time for qwen-2-0.5B batch: 10.59 seconds
Completion time for gpt2 batch: 5.27 seconds
Completion time for gpt-neo batch: 3.86 seconds
Completion time for distilgpt2 batch: 3.34 seconds


100%|██████████| 50/50 [18:22<00:00, 22.04s/it]

Completion time for qwen-2-0.5B batch: 10.73 seconds

Processed 10000 samples in 1103.56 seconds.
Average time per sample: 0.11 seconds.
Estimated total time for 10003 samples: 18.40 minutes.





In [12]:
import pandas as pd

# Convert results to a DataFrame
df = pd.DataFrame(results)

# Save to CSV
df.to_csv("sentence_complete2.csv", index=False)

print("Dataset saved as 'llm_wikitext_completions.csv'")


Dataset saved as 'llm_wikitext_completions.csv'
