In [None]:
# %pip install -U jupyter ipython ipywidgets

In [None]:
from transformers import LlamaForCausalLM, LlamaConfig, AutoTokenizer, TrainingArguments, Trainer
from transformers.modeling_outputs import CausalLMOutputWithPast
import datasets
from datasets import load_dataset, Dataset
import torch

In [None]:
import os

# Hugging Face access token used for the gated Llama-2 repository.
# It is read from the HF_TOKEN environment variable so the secret never lives
# in the notebook. When the variable is unset this is None, and the `token=`
# arguments further down then rely on whatever credentials the Hugging Face
# libraries have cached locally (e.g. from `huggingface-cli login`).
# NOTE(review): a real token used to be hard-coded on this line; it has been
# exposed in plain text and should be revoked.
hf_token = os.environ.get("HF_TOKEN")

# Maximum number of tokens per training sequence
context_length = 1024

In [None]:
class CustomLlamaModel(LlamaForCausalLM):
    """Llama causal language model that computes its own next-token loss.

    The forward pass delegates to ``LlamaForCausalLM.forward`` for the logits
    and, when ``labels`` are supplied, adds a cross-entropy loss in which the
    logits at position ``t`` are scored against the label at position ``t + 1``.
    Callers must therefore pass *unshifted* labels (normally a copy of
    ``input_ids`` with ignored positions set to ``-100``).
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        """Run the model and, if ``labels`` is given, attach the LM loss.

        Args:
            labels: Optional token ids with the same leading shape as the
                logits' first two dimensions. Positions equal to ``-100`` are
                ignored (the ``CrossEntropyLoss`` default ``ignore_index``).
            return_dict: Accepted for signature compatibility only. A
                ``ModelOutput`` object is always returned, as before.
            (all remaining arguments are forwarded unchanged to the parent)

        Returns:
            The parent's output when ``labels`` is None, otherwise a
            ``CausalLMOutputWithPast`` carrying the loss computed here together
            with the parent's logits, cache, hidden states and attentions.
        """
        # The parent is called WITHOUT labels: it would otherwise compute a
        # loss of its own that is immediately discarded below. `return_dict`
        # is forced to True so the outputs can be read by attribute.
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=None,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
        )

        # If no labels, just return the original model output
        if labels is None:
            return outputs

        # Position t predicts token t + 1: drop the last logit and the first label.
        shift_logits = outputs.logits[:, :-1, :].contiguous()
        shift_labels = labels[:, 1:].contiguous().to(shift_logits.device)

        # The class count is taken from the logits themselves so the loss stays
        # correct even if the output layer no longer matches config.vocab_size.
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
        )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Load the published Llama-2-7B-chat configuration as a starting point.
# The object is called `config_1B` because the next cell overrides its size
# fields in place to describe a much smaller model.
config_1B = LlamaConfig.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=hf_token)
config_1B

In [None]:
# Shrink the downloaded 7B configuration in place.
# NOTE(review): despite the `config_1B` name, these sizes look like well under
# 1B parameters (roughly 0.5B assuming the 32k Llama-2 vocabulary) - confirm
# the intended target size.
config_1B.hidden_size = 1024
config_1B.intermediate_size = 4096
config_1B.num_hidden_layers = 24
config_1B.num_attention_heads = 16
# The key/value head count is inherited from the 7B config and is NOT updated
# automatically when `num_attention_heads` changes. It has to stay compatible
# with the number of query heads, so keep plain multi-head attention by
# matching the two.
config_1B.num_key_value_heads = config_1B.num_attention_heads
config_1B.max_position_embeddings = context_length
config_1B.pad_token_id = config_1B.eos_token_id
# NOTE(review): `torch_dtype` on a config is presumably only metadata when the
# model is built with `LlamaForCausalLM(config)`; the weights may still be
# created in float32 - confirm.
config_1B.torch_dtype = "bfloat16"
# NOTE(review): assigning `attn_implementation` after the config exists may not
# select the attention kernel; transformers documents passing
# `attn_implementation=` when the config/model is created - confirm.
config_1B.attn_implementation = "flash_attention_2"

config_1B

In [None]:
# Build a randomly initialised model from the config, move it to the selected
# device and switch it to training mode (both calls return the module itself)
model = LlamaForCausalLM(config_1B).to(device).train()

In [None]:
# Load the tokenizer that belongs to the Llama-2-7B-chat checkpoint
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=hf_token)
# Reuse the end-of-sequence id for padding so `padding="max_length"` below has
# a pad token to work with. Padded positions are therefore indistinguishable
# from EOS by id alone; use the attention mask to tell them apart.
tokenizer.pad_token_id = tokenizer.eos_token_id  # Set pad token to end-of-sequence token

In [None]:
import os

# Prepare dataset (example using 'wikimedia/wikipedia', '20231101.en' subset)
# The local copy lives at a machine-specific location, so the path can be
# overridden with the WIKIPEDIA_DATASET_PATH environment variable; the default
# is the path this notebook was originally written against.
# dataset = load_dataset("D:/ai-stuff/datasets/wikipedia", "20231101.en")
wikipedia_path = os.environ.get(
    "WIKIPEDIA_DATASET_PATH",
    "/media/gronkomatic/Embiggen/ai-stuff/datasets/wikipedia",
)
dataset = load_dataset(wikipedia_path, "20231101.en")

In [None]:
# Hold out 5% of the articles for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so train/eval numbers stay comparable.
dataset = dataset["train"].train_test_split(test_size=0.05, seed=42)

In [None]:
# Take the first two training articles as a tiny sample for inspecting the
# tokenizer output in the following cells
exam_dataset = dataset["train"].select(range(2))
exam_dataset

In [None]:
# Show the first sample row in full
exam_dataset[0]

In [None]:
# Tokenise the sample articles into fixed-length windows of `context_length`
# tokens. Text that does not fit in one window is returned as additional rows
# (`return_overflowing_tokens`), and `stride` is a quarter of the window length.
tokenized_batches = tokenizer(
    exam_dataset["text"],
    max_length=context_length,
    truncation=True,
    padding="max_length",
    stride=int(round(context_length / 4)),
    return_overflowing_tokens=True,
    return_tensors="pt",
)

tokenized_batches

In [None]:
# Print the tensor shape of every field the tokenizer returned
for key, value in tokenized_batches.items():
    print(f"{key}: {value.shape}")

In [None]:
# Decode every tokenised window back to text so the windowing and padding can
# be checked by eye
decoded_text = [tokenizer.decode(window_ids) for window_ids in tokenized_batches["input_ids"]]

decoded_text

In [None]:
# Small fixed-size subsets (first 1000 training rows, first 100 test rows) used
# for the tokenisation and training cells below
small_train_dataset = dataset["train"].select(range(1000))
small_eval_dataset = dataset["test"].select(range(100))

In [None]:
# Display the summary of the training subset
small_train_dataset

In [None]:
# Display the summary of the evaluation subset
small_eval_dataset

In [None]:
# Display the train/test splits of the full dataset
dataset

In [None]:
# NOTE(review): this cell repeats the display from the previous cell
dataset

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of articles into fixed-length windows with LM labels.

    Each text is cut into windows of ``context_length`` tokens, padded to that
    length. Text that does not fit in one window is returned as extra rows,
    with ``stride`` set to a quarter of the window length.

    A ``labels`` entry is added so that ``Trainer`` receives a loss from the
    model: it is an *unshifted* copy of ``input_ids`` (the causal-LM forward
    pass shifts logits and labels itself) with every padded position set to
    ``-100`` so padding does not contribute to the loss. Padding is identified
    through ``attention_mask`` because the pad id equals the EOS id here.

    Args:
        examples: Mapping with a ``"text"`` entry holding a list of strings.

    Returns:
        The tokenizer output with an additional ``"labels"`` tensor.
    """
    batch = tokenizer(
        examples["text"],
        padding="max_length",  # or False if you prefer dynamic padding later on
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        stride=int(round(context_length / 4)),
        return_tensors="pt"
    )

    labels = batch["input_ids"].clone()
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


# Full-dataset variant, kept for reference:
# tokenized_train = dataset["train"].map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
# tokenized_eval = dataset["test"].map(tokenize_function, batched=True, remove_columns=dataset["test"].column_names)

# Tokenise only the small subsets. The source columns are dropped because one
# article can produce several windows, so the row counts no longer line up.
tokenized_train = small_train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
tokenized_eval = small_eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["test"].column_names,
)

In [None]:
# TrainingArguments setup
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=50,
    optim="adamw_torch",  # PyTorch's AdamW optimizer
    # Batching: 4 sequences per device, gradients accumulated over 8 batches
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # Log every step; evaluate and checkpoint every 100 steps
    logging_steps=1,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    # load_best_model_at_end=True,
    # metric_for_best_model="loss",
    # Precision
    bf16=True,  # bfloat16 mixed-precision training
    bf16_full_eval=True,  # run evaluation fully in bfloat16
)

# Initialize Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

In [None]:
# Start training with the Trainer configured in the previous cell
trainer.train()

In [None]:
# Save the trained model's weights and config to a local directory.
# Only the model is saved here; the tokenizer is not written alongside it.
model.save_pretrained("./custom_llama_1B_model")

In [None]:
import json
import os
import time
import torch
from CustomLlamaModel import CustomLlamaModel
from datasets import load_dataset, Dataset
from transformers import LlamaConfig, AutoTokenizer, TrainingArguments, Trainer


# Model settings
hidden_layers = 8  # Number of transformer layers
hidden_size = 1024  # Size of the hidden states in the transformer layers
intermediate_size = 2048  # Size of the feed-forward network in the transformer layers
attention_heads = 32  # Number of attention heads
context_length = 2048  # Length of the input context
stride = 50  # `stride` value passed to the tokenizer when long texts are split into several sequences

# Training settings
seed = 42
epochs = 20  # Number of training epochs
batch_size = 2  # Number of sequences to process in parallel
gradient_accumulation_steps = 10  # Number of batches whose gradients are accumulated before each optimizer update
logging_steps = 1  # Log training loss every X steps
# Integer division keeps this an int: a step count is a whole number, and true
# division would store 10.0 both here and in the JSON config written below.
warmup_steps = 100 // gradient_accumulation_steps  # Number of warmup steps for the learning rate scheduler

run = "2"
output_dir = "./results/run-" + run
logging_dir = output_dir + "/logs"
final_dir = "./final"

learning_rate = 5e-5
lr_scheduler_type = "linear"
optim = "adamw_torch"  # Use PyTorch's AdamW optimizer

# NOTE(review): with an "epoch" strategy the `*_steps` values below are
# presumably not used by the Trainer - confirm which of the two is intended.
evaluation_strategy = "epoch"
eval_steps = 0.25
save_strategy = "epoch"
save_steps = 0.25

load_best_model_at_end = True
metric_for_best_model = "loss"

# Snapshot of every setting above, plus the start time, written next to the
# run's outputs so a results directory can be traced back to its settings
training_config = {
    "hidden_layers": hidden_layers,
    "hidden_size": hidden_size,
    "intermediate_size": intermediate_size,
    "attention_heads": attention_heads,
    "context_length": context_length,
    "stride": stride,
    "seed": seed,
    "epochs": epochs,
    "batch_size": batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "logging_steps": logging_steps,
    "warmup_steps": warmup_steps,
    "learning_rate": learning_rate,
    "lr_scheduler_type": lr_scheduler_type,
    "optim": optim,
    "evaluation_strategy": evaluation_strategy,
    "eval_steps": eval_steps,
    "save_strategy": save_strategy,
    "save_steps": save_steps,
    "load_best_model_at_end": load_best_model_at_end,
    "metric_for_best_model": metric_for_best_model,
    "start_time": time.strftime("%Y-%m-%d %H:%M:%S"),
}

# Create the output directory if it doesn't exist. `exist_ok=True` replaces the
# separate existence check, which could race with another process creating the
# directory between the check and the call.
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, "training_config.json"), "w", encoding="utf-8") as f:
    json.dump(training_config, f, indent=4)

# Set device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Model configuration built from the settings at the top of this cell.
# NOTE(review): the name says 1B, but hidden_size=1024 with 8 layers looks far
# smaller than one billion parameters - confirm the intended size.
config_1B = LlamaConfig(
    # Architecture
    num_hidden_layers=hidden_layers,
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    num_attention_heads=attention_heads,
    max_position_embeddings=context_length,
    # Vocabulary: the pad id is hard-coded to 2
    # (presumably the tokenizer's EOS id - verify against the tokenizer)
    vocab_size=32000,
    pad_token_id=2,
    # Recorded dtype
    torch_dtype="bfloat16",
)

# Build a randomly initialised model from the config, move it to the selected
# device and switch it to training mode.
# NOTE(review): nothing in this cell casts the weights; whether they end up in
# bfloat16 depends on how the constructor treats `config.torch_dtype` - confirm.
# (The disabled `model.half()` call would convert to float16, not bfloat16.)
# model = model.half()
model = CustomLlamaModel(config_1B).to(device).train()

# Load the tokenizer that belongs to the Llama-2-7B-chat checkpoint.
# No `token=` is passed here, so access to the gated repository presumably
# relies on locally cached Hugging Face credentials - verify.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# Reuse the end-of-sequence id for padding; padded positions can then only be
# told apart from EOS through the attention mask.
tokenizer.pad_token_id = tokenizer.eos_token_id  # Set pad token to end-of-sequence token

# Prepare dataset (example using 'wikimedia/wikipedia', '20231101.en' subset)
# The local copy lives at a machine-specific location, so the path can be
# overridden with the WIKIPEDIA_DATASET_PATH environment variable; the default
# is the path this cell was originally written against.
wikipedia_path = os.environ.get("WIKIPEDIA_DATASET_PATH", "D:/ai-stuff/datasets/wikipedia")
dataset = load_dataset(wikipedia_path, "20231101.en")

# Select the first 10000 examples
dataset = dataset["train"].select(range(10000))

# Merge the "title" and "text" columns into a single "text" column, separated by two newline characters
# dataset = dataset.map(lambda examples: {
#     "text": examples["title"] + "\n\n" + examples["text"]
# })

# Shuffling the dataset
dataset = dataset.shuffle(seed=seed)

# Split the dataset into training and evaluation sets 90:10.
# The split is random, so it gets the same seed as the shuffle above;
# without it the train/test membership changes on every run.
dataset = dataset.train_test_split(test_size=0.10, seed=seed)

In [None]:
# Tokenize the dataset
def _attach_causal_lm_labels(batch):
    """Add a ``labels`` entry for causal-LM training to a tokenizer batch.

    The labels are an *unshifted* copy of ``input_ids``. The model's forward
    pass is expected to do the one-position shift itself, as the
    ``CustomLlamaModel`` class defined earlier in this notebook does
    (``logits[:, :-1]`` against ``labels[:, 1:]``). Shifting here as well would
    make the model learn to predict the token two positions ahead.
    NOTE(review): the model used in this part is imported from the
    ``CustomLlamaModel`` module - confirm it shifts the same way.

    Padded positions (``attention_mask == 0``) are set to ``-100`` so they are
    ignored by the loss. The mask is used rather than the pad id because the
    pad id equals the EOS id in this notebook.
    """
    labels = batch["input_ids"].clone()
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


def tokenize_function(examples):
    """Tokenize texts into ``context_length`` windows, keeping the overflow.

    Text that does not fit in one window is returned as additional rows
    (``return_overflowing_tokens``) using the module-level ``stride``.

    Args:
        examples: Mapping with a ``"text"`` entry holding a list of strings.

    Returns:
        The tokenizer output with an added ``"labels"`` tensor.
    """
    tokenized_batches = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        stride=stride,
        return_tensors="pt"
    )

    return _attach_causal_lm_labels(tokenized_batches)


def tokenize_function_2(examples):
    """Tokenize texts and truncate each to a single ``context_length`` window.

    Unlike ``tokenize_function`` the overflow is discarded, so the output has
    exactly one row per input text.

    Args:
        examples: Mapping with a ``"text"`` entry holding a list of strings.

    Returns:
        The tokenizer output with an added ``"labels"`` tensor.
    """
    tokenized_batches = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=context_length,
        return_tensors="pt"
    )

    return _attach_causal_lm_labels(tokenized_batches)

In [None]:
# Display the train/test splits produced above
dataset

In [None]:
# Call the tokenize function directly on each whole split instead of going
# through `Dataset.map`. The result is the raw tokenizer output for the whole
# split, not a `Dataset`.
tokenized_train = tokenize_function(dataset["train"])
tokenized_eval = tokenize_function(dataset["test"])

In [None]:
# Alternative path: tokenise through `Dataset.map` with the truncating variant.
# No `remove_columns` is passed, so the original columns stay alongside the
# tokenizer outputs.
train_tokens = dataset["train"].map(tokenize_function_2, batched=True)
test_tokens = dataset["test"].map(tokenize_function_2, batched=True)

In [None]:
# Display the summary of the mapped (truncated) training set
train_tokens

In [None]:
# Display the raw tokenizer output from the direct call
tokenized_train

In [None]:
# Wrap the raw tokenizer output from the direct call in a `Dataset`
tokenized_train_dataset = Dataset.from_dict(tokenized_train)

In [None]:
# Display the summary of the dataset built from the tokenizer output
tokenized_train_dataset