In [1]:
import json
import os
import urllib
import urllib.request

In [3]:
def download_and_load_file(file_path, url):
    """Return the JSON content of ``file_path``, downloading it first if needed.

    If ``file_path`` does not exist, the resource at ``url`` is fetched,
    decoded as UTF-8 and written to ``file_path``. The file is then parsed
    as JSON.

    Args:
        file_path: Local path used as the on-disk cache for the JSON file.
        url: Location to fetch the file from when it is not cached yet.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        urllib.error.URLError: If the download is attempted and fails.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Read with the same explicit encoding that was used for writing, so the
    # result does not depend on the platform's default text encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Local file name used as the cache for the instruction dataset.
file_path = "instruction-data.json"
# Remote location of the dataset (adjacent string literals are concatenated).
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# Fetch the file if it is not present locally, then parse it as JSON.
data = download_and_load_file(file_path, url)
print(f"Number of entries: {len(data)}")

Number of entries: 1100


In [4]:
# Show one example entry; it has 'instruction', 'input' and 'output' keys.
data[30]

{'instruction': 'Alter the content of the sentence to use the past tense.',
 'input': 'The ship sails at dawn.',
 'output': 'The ship sailed at dawn.'}

In [5]:
def format_input(entry):
    """Build the prompt text for one dataset entry.

    The prompt starts with a fixed preamble followed by an
    ``### Instruction:`` section holding ``entry['instruction']``. An
    ``### Input: `` section with ``entry['input']`` is appended only when
    that value is truthy.

    Args:
        entry: Mapping with at least the keys ``'instruction'`` and
            ``'input'``.

    Returns:
        The formatted prompt string (without any response section).
    """
    prompt_parts = [
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.",
        f"\n\n### Instruction:\n{entry['instruction']}",
    ]

    # Note: the "### Input: " header carries a trailing space before the
    # newline, unlike the "### Instruction:" header above.
    if entry["input"]:
        prompt_parts.append(f"\n\n### Input: \n{entry['input']}")

    return "".join(prompt_parts)

In [8]:
# Prompt part (instruction plus optional input) for the example entry.
model_input = format_input(data[30])
# Response section that is appended after the prompt to form the full text.
desired_response = f"\n\n### Response: \n{data[30]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Alter the content of the sentence to use the past tense.

### Input: 
The ship sails at dawn.

### Response: 
The ship sailed at dawn.


In [9]:
# Split sizes: 85% for training, 10% for testing; int() truncates, and the
# validation set takes whatever remains so the three sizes sum to len(data).
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

# Contiguous, non-overlapping slices taken in the existing order of `data`
# (no shuffling is applied here): train first, then test, then validation.
train_data = data[:train_portion]
test_data = data[train_portion: train_portion + test_portion]
val_data = data[train_portion + test_portion :]

In [10]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Dataset of instruction examples that are tokenized once, up front.

    Each entry is rendered as the prompt from ``format_input`` followed by a
    ``### Response: `` section containing ``entry['output']``; the resulting
    text is encoded with ``tokenizer.encode`` and stored.

    Args:
        data: Sequence of entries; each entry must provide the keys read by
            ``format_input`` plus an ``'output'`` key.
        tokenizer: Object exposing ``encode(text)``; presumably returns a
            list of token ids (confirm against the tokenizer in use).
    """

    def __init__(self, data, tokenizer):
        self.data = data
        # Pre-tokenized full texts, one per entry, in the order of `data`.
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response: \n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(tokenizer.encode(full_text))

    def __getitem__(self, index):
        """Return the encoded text stored at ``index``."""
        return self.encoded_texts[index]

    def __len__(self):
        """Return the number of encoded entries."""
        return len(self.encoded_texts)

def custom_collate_draft_v1(batch, pad_token_id=50256, device="cpu"):
    """Pad a batch of token-id lists to equal length and stack them.

    Every sequence gets one ``pad_token_id`` appended, is padded with
    ``pad_token_id`` up to the longest extended sequence in the batch, and
    then has its final token dropped. The net effect is that each row is as
    long as the longest original sequence.

    Args:
        batch: Iterable of lists of token ids (each item must support
            ``copy()`` and ``len()``); the items are not modified.
        pad_token_id: Token id used for padding.
        device: Device the stacked tensor is moved to.

    Returns:
        A 2-D tensor with one padded row per batch item, on ``device``.
    """
    # Length of the longest sequence once the extra pad token is appended.
    target_len = max(len(seq) for seq in batch) + 1

    rows = []
    for seq in batch:
        # Work on a copy so the caller's list stays untouched.
        extended = seq.copy()
        extended.append(pad_token_id)
        extended.extend([pad_token_id] * (target_len - len(extended)))
        # Drop the last token (always a pad token at this point).
        rows.append(torch.tensor(extended[:-1]))

    return torch.stack(rows).to(device)