# Test dataloading

In [1]:
import sys
from pathlib import Path

# Locate the project root: the parent of the current working directory.
# NOTE(review): this assumes the kernel's working directory is the folder
# containing this notebook - confirm if the notebook is launched elsewhere.
parent_dir = Path().resolve().parent

# Make the project root importable. The membership check keeps the cell
# idempotent: re-running it no longer prepends a duplicate sys.path entry.
project_root = str(parent_dir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Verify the updated sys.path
print("Updated sys.path:", sys.path)

# This import must come after the sys.path change above, because the
# `scripts` package is resolved relative to the project root.
from scripts.load_data import load_and_preprocess

# Specify the dataset and model checkpoint
dataset_name = 'imdb'  # For example, the IMDB dataset
model_checkpoint = 'gpt2'  # Using GPT-2 tokenizer

# Load and preprocess the dataset
dataset = load_and_preprocess(dataset_name, model_checkpoint)

# Display dataset information (splits, features, row counts)
print(dataset)

# Sample data: first example of the train split
print(dataset['train'][0])

Updated sys.path: ['/Users/natnaeldaba/ssh_mount/cluster/llm-sft', '/opt/homebrew/Caskroom/miniforge/base/envs/llm-finetuning/lib/python310.zip', '/opt/homebrew/Caskroom/miniforge/base/envs/llm-finetuning/lib/python3.10', '/opt/homebrew/Caskroom/miniforge/base/envs/llm-finetuning/lib/python3.10/lib-dynload', '', '/opt/homebrew/Caskroom/miniforge/base/envs/llm-finetuning/lib/python3.10/site-packages']


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 22500
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2500
    })
})
{'text': "With these people faking so many shots, using old footage, and gassing animals to get them out, not to mention that some of the scenes were filmed on a created set with actors, what's to believe? Old film of countries is nice, but the animal abuse and degradation of natives is painful to watch in these films. I know, racism is OK in these old films, but there is more to that to make this couple lose credibility. Portrayed as fliers, they never flew their planes, Martin Johnson w

In [4]:
# Display the 'label' column of the train split (the whole column is shown, so the output is long)
dataset['train']['label']

[0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,


In [3]:
# Length of the attention mask for the first training example
len(dataset['train'][0]['attention_mask'])

1024

In [4]:
# Length, in characters, of the raw text of the first training example
len(dataset['train'][0]['text'])

849

In [6]:
# Label of the first training example
dataset['train'][0]['label']

0

In [8]:
# Number of token ids stored for the first training example
len(dataset['train'][0]['input_ids'])

1024

In [9]:
from transformers import AutoTokenizer

In [11]:
# Load the tokenizer that belongs to the 'gpt2' checkpoint
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [13]:
# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [14]:
# Confirm which token is now used for padding
tokenizer.pad_token

'<|endoftext|>'

# Test evaluation

In [8]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2Model

# Load the GPT-2 tokenizer and the GPT2Model weights from the 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Load a subset of the IMDB dataset
# NOTE: this rebinds `dataset`, replacing the object loaded earlier in the notebook
dataset = load_dataset('imdb', split='test[:1%]')  # Using a small subset for testing

# Tokenize the sample batch
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch.

    Every sequence is padded to, and truncated at, 1024 tokens. The
    module-level `tokenizer` performs the encoding.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=1024,
    )
    return encoded


# Run the tokenizer over the dataset, one batch of examples per call
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
import torch

# Convert tokenized inputs to PyTorch tensors
input_ids = torch.tensor(tokenized_dataset['input_ids'])
attention_mask = torch.tensor(tokenized_dataset['attention_mask'])

# Pass the inputs through the model in small batches. Every sequence is padded
# to 1024 tokens, so feeding the whole split in one call makes the model's
# intermediate activations scale with the full dataset and can exhaust memory.
batch_size = 8
hidden_state_chunks = []
with torch.no_grad():
    for start in range(0, input_ids.size(0), batch_size):
        stop = start + batch_size
        batch_outputs = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        # Keep only the last hidden states; the rest of the output is dropped
        hidden_state_chunks.append(batch_outputs.last_hidden_state)

# Extract the last hidden states: one row per example, in dataset order
last_hidden_states = torch.cat(hidden_state_chunks, dim=0)

# Sample model outputs 

## 1. Summarization

In [None]:
from transformers import pipeline
from random import randrange        


from datasets import load_dataset
import torch

dataset_id = "samsum"

# Load dataset from the hub
dataset = load_dataset(dataset_id)

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

# Use the first CUDA GPU when one is available, otherwise run on the CPU (-1).
# A hard-coded device=0 fails on machines without a CUDA device.
device = 0 if torch.cuda.is_available() else -1

# Build a summarization pipeline from the local fine-tuned checkpoint directory
summarizer = pipeline("summarization", model="../checkpoints/2025-02-23_11-24-19-text-summarization-samsum/checkpoint-1392/", device=device)

# select a random test sample
sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

# The pipeline returns a list with one dict per input; print its summary text
print(f"flan-t5-base summary:\n{res[0]['summary_text']}")

  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
[Errno 2] No such file or directory