<a href="https://colab.research.google.com/github/offthewallace/CSE584/blob/main/dataset_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:0

In [2]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer
import random
import torch
import torch

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
import time
from tqdm import tqdm
import torch
from datasets import load_dataset

# Pick the compute device used by the generation helpers below:
# CUDA when a GPU is visible to PyTorch, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Helper functions: text truncation, prompt construction, batched generation, and dataset assembly
def truncate_half_text(text, max_words=200):
    """
    Keep at most the first ``max_words`` whitespace-separated words of ``text``.

    Despite the name, this helper does not halve the text: it applies a fixed
    word cap. "Words" are produced by ``str.split()``, not by a model tokenizer.
    The kept words are re-joined with single spaces, so runs of whitespace and
    newlines are normalised even when the text is shorter than the cap.

    Args:
        text: Input string.
        max_words: Maximum number of words to keep (expected to be non-negative).
            Defaults to 200.

    Returns:
        The truncated string, or ``None`` when nothing remains (empty or
        whitespace-only input, or ``max_words == 0``).
    """
    words = text.split()

    truncated_text = ' '.join(words[:max_words])

    # Callers rely on a falsy result to skip unusable examples.
    return truncated_text or None


def truncate_half_text2(text):
    """
    Truncate ``text`` to the first half of its whitespace-separated words.

    Args:
        text: Input string.

    Returns:
        - ``None`` when the text contains no words (empty or whitespace-only),
          so callers can skip it with a simple truthiness check.
        - The original ``text`` unchanged when it has fewer than 5 words
          (too short to truncate meaningfully).
        - Otherwise the first ``len(words) // 2`` words joined by single spaces.
    """
    words = text.split()
    if not words:
        return None
    if len(words) < 5:  # Too short to truncate; hand back the original text
        return text
    # With at least 5 words the half always contains 2+ words, so it is never empty.
    half_length = len(words) // 2
    return ' '.join(words[:half_length])

def add_prompt(truncated_text):
    """
    Build the completion prompt for a piece of truncated text.

    The text is placed inside double quotes after a fixed instruction; the
    resulting string is returned as-is.
    """
    return f'Complete this sentence: "{truncated_text}"'

def batch_generate_completion(model, tokenizer, texts, max_new_tokens=200, top_k=50, top_p=0.9, temperature=0.8, repetition_penalty=1.2, do_sample=True):
    """
    Generate one text completion per prompt, processing the prompts as a single batch.

    Args:
        model: Model exposing ``generate`` and a ``device`` attribute
            (e.g. a Hugging Face causal language model).
        tokenizer: Tokenizer matching ``model``. The batch is padded, so a pad
            token must already be configured on it.
        texts: List of prompt strings.
        max_new_tokens: Maximum number of tokens to generate per prompt.
        top_k: Top-k sampling cut-off.
        top_p: Nucleus (top-p) sampling threshold.
        temperature: Sampling temperature.
        repetition_penalty: Penalty applied to repeated tokens.
        do_sample: Whether to sample. Must be True for ``top_k``, ``top_p`` and
            ``temperature`` to have any effect; with False, decoding is greedy
            and those settings are ignored.

    Returns:
        A list of decoded strings (special tokens removed), one per entry of
        ``model.generate``'s output. Returns ``[]`` when ``texts`` is empty.
    """
    if len(texts) == 0:
        return []  # Nothing to generate for an empty batch

    # Tokenize the whole batch and place the tensors on the same device as the
    # model's weights (avoids depending on a notebook-level `device` global).
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(model.device)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,       # Enable sampling so the settings below are honoured
        top_k=top_k,               # Top-k sampling
        top_p=top_p,               # Nucleus sampling (top-p)
        temperature=temperature,   # Randomness of the sampling distribution
        repetition_penalty=repetition_penalty,  # Penalize repetitive tokens
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode every generated sequence back to text
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

def generate_dataset_with_llms(dataset, models, tokenizers, num_samples=10, max_new_tokens=200, batch_size=8):
    """
    Build (prompt, completion, model-label) records by running several models over a dataset.

    For each batch of examples, the text is truncated with ``truncate_half_text``,
    wrapped with ``add_prompt``, and sent to every model through
    ``batch_generate_completion``. Timing information is printed along the way.

    Args:
        dataset: Object supporting ``len()`` and slice indexing that yields a
            mapping with a ``'text'`` list (assumed to be a Hugging Face
            ``Dataset`` - confirm for other inputs).
        models: Dict mapping a model label to a model; each model is moved to
            the notebook-level ``device``.
        tokenizers: Dict mapping the same labels to the matching tokenizers.
        num_samples: Number of leading dataset rows to process. Values larger
            than the dataset are clamped to its length.
        max_new_tokens: Maximum number of tokens generated per prompt.
        batch_size: Number of rows per generation batch.

    Returns:
        A list of dicts with keys ``'xi'`` (the prompt), ``'xj'`` (the text that
        follows the prompt in the model output) and ``'llm'`` (the model label).
        Batches that raise during generation are reported and skipped.
    """
    results = []
    start_time = time.time()  # Start timer for total estimation

    # Move models to the selected device
    models = {name: model.to(device) for name, model in models.items()}

    total_samples = len(dataset)
    # Never read past the requested sample count or the end of the dataset,
    # even when num_samples is not a multiple of batch_size.
    sample_limit = max(0, min(num_samples, total_samples))

    # Iterate through the dataset in batches with progress tracking
    for i in tqdm(range(0, sample_limit, batch_size)):
        # Reviews are stored under the 'text' key; the last batch may be shorter
        batch_examples = dataset[i:min(i + batch_size, sample_limit)]['text']

        # Truncate each text and wrap it in the prompt, dropping empty results
        batch_prompts = []
        for example in batch_examples:
            truncated_text = truncate_half_text(example)
            if truncated_text:
                batch_prompts.append(add_prompt(truncated_text))

        if len(batch_prompts) == 0:
            print(f"Skipping batch {i} due to empty truncated texts")
            continue

        for model_name, model in models.items():
            try:
                # Time batch generation for each model
                completion_start = time.time()

                completions = batch_generate_completion(model, tokenizers[model_name], batch_prompts, max_new_tokens)

                if len(completions) == 0:
                    print(f"No completions generated for batch {i} with model {model_name}")
                    continue  # Skip empty completions

                completion_time = time.time() - completion_start
                print(f"Completion time for {model_name} batch: {completion_time:.2f} seconds")

                # Store the (xi, xj) pairs with the corresponding LLM label.
                # NOTE: the prompt is stripped by character count, which assumes the
                # decoded output starts with the prompt text exactly as it was sent.
                for prompt, completion in zip(batch_prompts, completions):
                    results.append({
                        'xi': prompt,  # Prompt including the truncated text
                        'xj': completion[len(prompt):].strip(),  # Text generated after the prompt
                        'llm': model_name
                    })

            except Exception as e:
                # Best effort: report the failure and continue with the next model/batch
                print(f"Error generating completion with {model_name} for batch {i}: {e}")
                continue

    # Summarise timing and extrapolate to the full dataset
    total_time = time.time() - start_time
    avg_time_per_sample = total_time / sample_limit if sample_limit else 0.0
    estimated_total_time = avg_time_per_sample * total_samples

    print(f"\nProcessed {sample_limit} samples in {total_time:.2f} seconds.")
    print(f"Average time per sample: {avg_time_per_sample:.2f} seconds.")
    print(f"Estimated total time for {total_samples} samples: {estimated_total_time / 60:.2f} minutes.")

    return results

# Example usage:

# Define models and tokenizers (Example for GPT-2 and GPT-Neo)
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer

# Causal language models to compare, keyed by the label recorded in each result row.
models = {
    'gpt2': GPT2LMHeadModel.from_pretrained('gpt2'),
    'gpt-neo': AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M'),
}

# Tokenizers under the same keys as their models.
tokenizers = {
    'gpt2': GPT2Tokenizer.from_pretrained('gpt2'),
    'gpt-neo': AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M'),
}

# Batched generation pads the prompts: reuse the end-of-sequence token as the pad
# token and pad on the left for every tokenizer.
for tokenizer in tokenizers.values():
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

# IMDB reviews; the 'test' split is used for demonstration.
imdb_dataset = load_dataset('imdb', split='test')

results = generate_dataset_with_llms(
    imdb_dataset,
    models,
    tokenizers,
    num_samples=1000,
    max_new_tokens=200,
    batch_size=200,
)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]



Completion time for gpt2 batch: 11.38 seconds


 20%|██        | 1/5 [00:17<01:09, 17.42s/it]

Completion time for gpt-neo batch: 6.04 seconds
Completion time for gpt2 batch: 9.29 seconds


 40%|████      | 2/5 [00:32<00:48, 16.06s/it]

Completion time for gpt-neo batch: 5.80 seconds
Completion time for gpt2 batch: 9.09 seconds


 60%|██████    | 3/5 [00:47<00:30, 15.48s/it]

Completion time for gpt-neo batch: 5.69 seconds
Completion time for gpt2 batch: 9.54 seconds


 80%|████████  | 4/5 [01:02<00:15, 15.49s/it]

Completion time for gpt-neo batch: 5.95 seconds
Completion time for gpt2 batch: 10.65 seconds


100%|██████████| 5/5 [01:19<00:00, 15.91s/it]

Completion time for gpt-neo batch: 6.10 seconds

Processed 1000 samples in 80.09 seconds.
Average time per sample: 0.08 seconds.
Estimated total time for 25000 samples: 33.37 minutes.





In [4]:
import pandas as pd

# Tabulate the generated records (one row per prompt/completion/model entry)
df = pd.DataFrame(data=results)

# Persist the table without the index column.
# NOTE: the file name mentions "wikitext", but `results` was built from the IMDB reviews loaded earlier.
df.to_csv(path_or_buf="llm_wikitext_completions.csv", index=False)

print("Dataset saved as 'llm_wikitext_completions.csv'")


Dataset saved as 'llm_wikitext_completions.csv'
