In [1]:
!pip install torch transformers datasets sentencepiece

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [2]:
import torch
print("GPU Available:", torch.cuda.is_available())

GPU Available: True


In [3]:
import torch
import torch.nn as nn

class CustomLLM(nn.Module):
    def __init__(self, vocab_size=32000, d_model=768, num_heads=8, num_layers=8, ff_dim=3072):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)

        # ✅ Fix: Use batch_first=True
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            batch_first=True  # ✅ Fix: Better inference performance
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        return self.fc(x)

# ✅ Instantiate model and move to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomLLM().to(device)

# ✅ Check parameter count
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {total_params / 1e6:.2f}M")  # Convert to million


Total Parameters: 105.89M


In [4]:
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-2-raw-v1")  # Wikipedia dataset
print(dataset["train"][0]["text"])  # Sample text


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]




In [5]:
from google.colab import drive
drive.mount('/content/drive')

# Save model
model_path = "/content/drive/MyDrive/LLM/custom_llm.pth"
torch.save(model.state_dict(), model_path)
print("Model saved to Google Drive!")


Mounted at /content/drive
Model saved to Google Drive!


# **Day-2**

In [6]:
#Day-2

from datasets import load_dataset


dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

train_text_file = "/content/wikitext_train.txt"
with open(train_text_file, "w", encoding="utf-8") as f:
    for example in dataset["train"]:
        f.write(example["text"] + "\n")

print(f"Training text saved to {train_text_file}")

Training text saved to /content/wikitext_train.txt


In [7]:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input=train_text_file,
    model_prefix="mytokenizer",
    vocab_size=32000,
    model_type="bpe",
    character_coverage=1.0,
    max_sentence_length=10000
)

print("Tokenizer training complete!")

Tokenizer training complete!


In [8]:

sp = spm.SentencePieceProcessor(model_file="mytokenizer.model")

# Test it
sample_text = "Hello, I am building an LLM from scratch!"
tokens = sp.encode(sample_text, out_type=str)
token_ids = sp.encode(sample_text, out_type=int)

print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokens: ['▁Hell', 'o', ',', '▁I', '▁am', '▁building', '▁an', '▁L', 'L', 'M', '▁from', '▁scratch', '!']
Token IDs: [6064, 31014, 31029, 74, 586, 1830, 134, 125, 31057, 31042, 148, 19500, 31091]


In [9]:

def tokenize_function(examples):

    if examples["text"].strip():
        return {"input_ids": sp.encode(examples["text"], out_type=int)}
    else:
        return {"input_ids": []}


tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    desc="Tokenizing dataset"
)


tokenized_dataset = tokenized_dataset.filter(
    lambda example: len(example["input_ids"]) > 0,
    desc="Filtering empty examples"
)

print("Dataset tokenized and filtered!")
print("First example:", tokenized_dataset["train"][0])


original_count = len(dataset["train"])
filtered_count = len(tokenized_dataset["train"])
print(f"Original rows: {original_count}, Filtered rows: {filtered_count}, Empty rows removed: {original_count - filtered_count}")

Tokenizing dataset:   0%|          | 0/4358 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/36718 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/3760 [00:00<?, ? examples/s]

Filtering empty examples:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filtering empty examples:   0%|          | 0/36718 [00:00<?, ? examples/s]

Filtering empty examples:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset tokenized and filtered!
First example: {'input_ids': [47, 8330, 9206, 2779, 47]}
Original rows: 36718, Filtered rows: 23767, Empty rows removed: 12951


In [10]:
from google.colab import drive
drive.mount('/content/drive')


tokenized_dataset.save_to_disk("/content/drive/MyDrive/LLM/tokenized_wikitext")

import shutil
shutil.copy("mytokenizer.model", "/content/drive/MyDrive/LLM/mytokenizer.model")
shutil.copy("mytokenizer.vocab", "/content/drive/MyDrive/LLM/mytokenizer.vocab")

print("Tokenized dataset and tokenizer saved to Google Drive!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving the dataset (0/1 shards):   0%|          | 0/2891 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/23767 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2461 [00:00<?, ? examples/s]

Tokenized dataset and tokenizer saved to Google Drive!


In [11]:
# Day 3: Data Pipeline & Preprocessing Starts Here
from datasets import load_from_disk
from torch.utils.data import Dataset, DataLoader

In [12]:
tokenized_dataset = load_from_disk("/content/drive/MyDrive/LLM/tokenized_wikitext")
print("Tokenized dataset loaded from Google Drive!")
print("First example:", tokenized_dataset["train"][0])

Tokenized dataset loaded from Google Drive!
First example: {'input_ids': [47, 8330, 9206, 2779, 47]}


In [13]:
class LLMDataset(Dataset):
    def __init__(self, tokenized_data, max_length=128):
        self.data = tokenized_data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = self.data[idx]["input_ids"]

        if len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]
        else:
            input_ids = input_ids + [0] * (self.max_length - len(input_ids))


        input_ids = torch.tensor(input_ids, dtype=torch.long)

        return input_ids

In [14]:
train_dataset = LLMDataset(tokenized_dataset["train"], max_length=128)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=2
)


for batch in train_dataloader:
    print("Batch shape:", batch.shape)
    print("First batch example:", batch[0])
    break

print("DataLoader ready for training!")

Batch shape: torch.Size([16, 128])
First batch example: tensor([ 4820,  1751,    34, 19636,    48, 11067,    10,    48,     7,  1504,
          204,  2742,  1422,   619,  1388,    82,     7,   607,    18,  2389,
           89,   349,  1699,   130,    66,  1835,  1085,    82,     7,   453,
           76, 31016,  9200,    30,     7,   607,    10,  1016,    49, 12707,
         4105,  7633,  1802,    69,  5359,    66, 29365,  6925,    10,   349,
         1358,     5, 14288,    34,  7722,   130,   104,   354, 18927,  3722,
           10,  4685,    32,   130,    48,  5475,    82,  5557,   234,   246,
           48,     7,  1504, 13432,   828,  3722,    82,  1047,   408,   101,
          250,  1174,   607,    18,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
        

In [15]:
#  Train the Model
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Subset
import time
from google.colab import drive

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomLLM().to(device)
model.load_state_dict(torch.load("/content/drive/MyDrive/LLM/custom_llm.pth", map_location=device))
print("Model loaded from Google Drive!")


from datasets import load_from_disk
tokenized_dataset = load_from_disk("/content/drive/MyDrive/LLM/tokenized_wikitext")


train_subset = Subset(LLMDataset(tokenized_dataset["train"], max_length=128), indices=range(1000))
train_dataloader = DataLoader(train_subset, batch_size=16, shuffle=True, num_workers=2)


optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss(ignore_index=0)

num_epochs = 2
model.train()
drive.mount('/content/drive')

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    total_loss = 0
    num_batches = 0

    for batch in train_dataloader:
        batch = batch.to(device)

        inputs = batch[:, :-1]
        targets = batch[:, 1:]

        optimizer.zero_grad()
        outputs = model(inputs)


        loss = criterion(outputs.view(-1, 32000), targets.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        memory_used = torch.cuda.memory_allocated(device) / 1e6
        print(f"Batch {num_batches}, Loss: {loss.item():.4f}, Memory Used: {memory_used:.2f} MB")

    avg_loss = total_loss / num_batches
    epoch_time = time.time() - epoch_start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s, Speed: {len(train_subset)/epoch_time:.2f} examples/s")

    # checkpoint
    checkpoint_path = f"/content/drive/MyDrive/LLM/custom_llm_epoch_{epoch+1}.pth"
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")

print("Initial training phase complete!")

Model loaded from Google Drive!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Batch 1, Loss: 10.5174, Memory Used: 1977.98 MB
Batch 2, Loss: 10.1157, Memory Used: 1978.13 MB
Batch 3, Loss: 9.8719, Memory Used: 1978.13 MB
Batch 4, Loss: 9.7171, Memory Used: 1978.13 MB
Batch 5, Loss: 9.5270, Memory Used: 1978.13 MB
Batch 6, Loss: 9.4329, Memory Used: 1978.13 MB
Batch 7, Loss: 9.3925, Memory Used: 1978.13 MB
Batch 8, Loss: 9.2014, Memory Used: 1978.13 MB
Batch 9, Loss: 9.0870, Memory Used: 1978.13 MB
Batch 10, Loss: 9.1303, Memory Used: 1978.13 MB
Batch 11, Loss: 8.9499, Memory Used: 1978.13 MB
Batch 12, Loss: 8.8763, Memory Used: 1978.13 MB
Batch 13, Loss: 9.1004, Memory Used: 1978.13 MB
Batch 14, Loss: 8.8827, Memory Used: 1978.13 MB
Batch 15, Loss: 8.7009, Memory Used: 1978.13 MB
Batch 16, Loss: 8.7116, Memory Used: 1978.13 MB
Batch 17, Loss: 8.7041, Memory Used: 1978.13 MB
Batch 18, Loss: 8.7847, Memory

# Day-5

In [16]:
# Day 4: Continue Training (Scaling Up)
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, ConcatDataset
from torch.optim.lr_scheduler import CosineAnnealingLR
from datasets import load_from_disk
from google.colab import drive
import time


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomLLM().to(device)
model.load_state_dict(torch.load("/content/drive/MyDrive/LLM/custom_llm_epoch_2.pth", map_location=device))
print("Model loaded from last checkpoint!")

tokenized_wikitext = load_from_disk("/content/drive/MyDrive/LLM/tokenized_wikitext")
train_dataset = LLMDataset(tokenized_wikitext["train"], max_length=128)


train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)


optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=0)


accumulation_steps = 4
effective_batch_size = 16


num_epochs = 5
total_steps = len(train_dataloader) * num_epochs // accumulation_steps
warmup_steps = int(0.1 * total_steps)
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps - warmup_steps, eta_min=1e-6)


model.train()
drive.mount('/content/drive')

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    total_loss = 0
    num_batches = 0
    optimizer.zero_grad()

    for i, batch in enumerate(train_dataloader):
        batch = batch.to(device)
        inputs = batch[:, :-1]
        targets = batch[:, 1:]

        outputs = model(inputs)
        loss = criterion(outputs.view(-1, 32000), targets.reshape(-1))
        loss = loss / accumulation_steps
        loss.backward()

        total_loss += loss.item() * accumulation_steps

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()


            if num_batches < warmup_steps:
                lr = 1e-5 + (5e-5 - 1e-5) * num_batches / warmup_steps
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
            else:
                scheduler.step()

            num_batches += 1
            memory_used = torch.cuda.memory_allocated(device) / 1e6
            print(f"Step {num_batches}, Loss: {loss.item() * accumulation_steps:.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}, Memory: {memory_used:.2f} MB")

    avg_loss = total_loss / num_batches
    epoch_time = time.time() - epoch_start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s, Speed: {len(train_dataset)/epoch_time:.2f} examples/s")


    checkpoint_path = f"/content/drive/MyDrive/LLM/custom_llm_epoch_{epoch+3}.pth"
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Model weights saved to {checkpoint_path}")

print("Scaled training phase complete!")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 950, Loss: 5.2103, LR: 0.000049, Memory: 1421.12 MB
Step 951, Loss: 5.7046, LR: 0.000049, Memory: 1421.12 MB
Step 952, Loss: 5.7154, LR: 0.000049, Memory: 1421.12 MB
Step 953, Loss: 6.5561, LR: 0.000049, Memory: 1356.09 MB
Step 954, Loss: 6.2218, LR: 0.000049, Memory: 1421.12 MB
Step 955, Loss: 5.7063, LR: 0.000049, Memory: 1421.12 MB
Step 956, Loss: 5.0465, LR: 0.000049, Memory: 1421.12 MB
Step 957, Loss: 6.0546, LR: 0.000049, Memory: 1356.09 MB
Step 958, Loss: 5.7800, LR: 0.000049, Memory: 1356.09 MB
Step 959, Loss: 5.3047, LR: 0.000049, Memory: 1421.12 MB
Step 960, Loss: 4.8480, LR: 0.000049, Memory: 1421.12 MB
Step 961, Loss: 6.4239, LR: 0.000049, Memory: 1421.12 MB
Step 962, Loss: 6.0752, LR: 0.000049, Memory: 1356.09 MB
Step 963, Loss: 5.6873, LR: 0.000049, Memory: 1356.09 MB
Step 964, Loss: 5.8344, LR: 0.000049, Memory: 1421.12 MB
Step 965, Loss: 5.8425, LR: 0.000049, Memory: 1421.12 MB
Step 966, Loss: 5.9652,

In [17]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.17.0


In [18]:
#Final Testing & Model Export
import torch
import torch.nn as nn
from datasets import load_from_disk
import sentencepiece as spm
from google.colab import drive
import onnx
import torch.onnx
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Define CustomLLM
class CustomLLM(nn.Module):
    def __init__(self, vocab_size=32000, d_model=768, num_heads=8, num_layers=8, ff_dim=3072):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        return self.fc(x)

# Initialize model
print("GPU Available:", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
    model = CustomLLM().to(device)
    print("Model initialized and moved to device successfully!")
except RuntimeError as e:
    print(f"Error initializing model: {e}")
    device = torch.device("cpu")
    model = CustomLLM().to(device)


checkpoint_path = "/content/drive/MyDrive/LLM/custom_llm_epoch_7.pth"
try:
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print("Final model loaded from checkpoint!")
except Exception as e:
    print(f"Error loading checkpoint: {e}")

sp = spm.SentencePieceProcessor(model_file="/content/drive/MyDrive/LLM/mytokenizer.model")
print("Tokenizer loaded!")


def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0, repetition_penalty=1.2):
    model.eval()
    input_ids = torch.tensor([tokenizer.encode(prompt, out_type=int)], dtype=torch.long).to(device)
    generated_ids = input_ids.clone()

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :] / temperature

            for prev_id in generated_ids[0]:
                next_token_logits[0, prev_id] /= repetition_penalty

            next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            generated_ids = torch.cat([generated_ids, next_token], dim=-1)


            if tokenizer.eos_id() is not None and next_token.item() == tokenizer.eos_id():
                break

    generated_text = tokenizer.decode(generated_ids.squeeze().tolist())
    return generated_text




model.eval()
example_input = torch.randint(0, 32000, (1, 128), dtype=torch.long).to(device)
try:
    traced_model = torch.jit.trace(model, example_input, strict=False)
    torchscript_path = "/content/drive/MyDrive/LLM/custom_llm_torchscript.pt"
    traced_model.save(torchscript_path)
    print(f"TorchScript model saved to {torchscript_path}")
except Exception as e:
    print(f"TorchScript export failed: {e}")


onnx_path = "/content/drive/MyDrive/LLM/custom_llm.onnx"
try:
    torch.onnx.export(
        model,
        example_input,
        onnx_path,
        export_params=True,
        opset_version=13,
        do_constant_folding=True,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={"input": {0: "batch_size", 1: "sequence_length"}, "output": {0: "batch_size", 1: "sequence_length"}},
        verbose=True
    )
    print(f"ONNX model saved to {onnx_path}")
    onnx_model = onnx.load(onnx_path)
    onnx.checker.check_model(onnx_model)
    print("ONNX model verified!")
except Exception as e:
    print(f"ONNX export failed: {e}")

# Ensure files are in Drive
drive.mount('/content/drive')
print("Model and tokenizer already uploaded to /content/drive/MyDrive/LLM/")

print("Final testing and export complete!")

GPU Available: True
Model initialized and moved to device successfully!
Final model loaded from checkpoint!
Tokenizer loaded!
TorchScript export failed: Tracing failed sanity checks!
ERROR: Graphs differed across invocations!
	Graph diff:
		  graph(%self.1 : __torch__.CustomLLM,
		        %x : Tensor):
		    %fc : __torch__.torch.nn.modules.linear.Linear = prim::GetAttr[name="fc"](%self.1)
		    %transformer : __torch__.torch.nn.modules.transformer.TransformerEncoder = prim::GetAttr[name="transformer"](%self.1)
		    %embedding : __torch__.torch.nn.modules.sparse.Embedding = prim::GetAttr[name="embedding"](%self.1)
		    %8 : bool = prim::Constant[value=0](), scope: __module.embedding # /usr/local/lib/python3.11/dist-packages/torch/nn/functional.py:2551:0
		    %9 : int = prim::Constant[value=-1](), scope: __module.embedding # /usr/local/lib/python3.11/dist-packages/torch/nn/functional.py:2551:0
		-   %weight.35 : Tensor = prim::GetAttr[name="weight"](%embedding)
		?            -
		+  

In [20]:
# Test text generation
prompt = input()
generated = generate_text(model, sp, prompt, max_length=100, temperature=0.7)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated}")

The History of AI...
Prompt: The History of AI...
Generated Text: The History of AI... The most the most most in = by the most in the North American state and first to take part in ; in North Pacific , and some state . " . " , and to South . The part of local plan to the center , led by eastern South African settlers began in Minnesota ; this start holding small parts entirely around European European @-@ six 6 June 1940 by late June 2016 ) tall calculated a local government activities on early 1990 's largest President Club chose all three main state . Fort East Australia were


In [21]:
#Memory Optimization & Edge Deployment Prep
!pip install onnxruntime torchcontrib
import torch
import torch.nn as nn
from datasets import load_from_disk
import sentencepiece as spm
from torch.utils.checkpoint import checkpoint_sequential
import onnxruntime as ort
import numpy as np
from google.colab import drive
import os

# Define CustomLLM with Gradient Checkpointing
class CustomLLM(nn.Module):
    def __init__(self, vocab_size=32000, d_model=768, num_heads=8, num_layers=8, ff_dim=3072):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            batch_first=True
        )
        self.transformer_layers = nn.ModuleList([encoder_layer for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, vocab_size)
        self.num_segments = 4

    def forward(self, x, attention_mask=None):
        x = self.embedding(x)
        if attention_mask is not None:
            attention_mask = attention_mask.bool()

        x = checkpoint_sequential(self.transformer_layers, self.num_segments, x, use_reentrant=False)
        return self.fc(x)

# Setup
print("GPU Available:", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomLLM().to(device)
checkpoint_path = "/content/drive/MyDrive/LLM/custom_llm_epoch_7.pth"
try:
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print("Model loaded with gradient checkpointing support!")
except Exception as e:
    print(f"Error loading checkpoint: {e}")

# Load tokenizer
sp = spm.SentencePieceProcessor(model_file="/content/drive/MyDrive/LLM/mytokenizer.model")
print("Tokenizer loaded!")


def generate_text(model, tokenizer, prompt, max_length=50, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=1.2):
    model.eval()
    input_ids = torch.tensor([tokenizer.encode(prompt, out_type=int)], dtype=torch.long).to(device)
    generated_ids = input_ids.clone()

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :] / temperature

            for prev_id in set(generated_ids[0].tolist()):
                next_token_logits[0, prev_id] /= repetition_penalty

            if top_k > 0:
                top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k)
                next_token_logits = torch.full_like(next_token_logits, float('-inf'))
                next_token_logits.scatter_(1, top_k_indices, top_k_logits)

            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                next_token_logits.scatter_(1, indices_to_remove.unsqueeze(0), float('-inf'))

            next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            generated_ids = torch.cat([generated_ids, next_token], dim=-1)

            if tokenizer.eos_id() is not None and next_token.item() == tokenizer.eos_id():
                break

    return tokenizer.decode(generated_ids.squeeze().tolist())




onnx_path = "/content/drive/MyDrive/LLM/custom_llm.onnx"
try:
    ort_session = ort.InferenceSession(onnx_path)
    print("ONNX Runtime session created!")


    example_input = np.array([sp.encode(prompt, out_type=int)[:128]], dtype=np.int64)
    if example_input.shape[1] < 128:
        example_input = np.pad(example_input, ((0, 0), (0, 128 - example_input.shape[1])), constant_values=0)


    ort_inputs = {"input": example_input}
    ort_outputs = ort_session.run(None, ort_inputs)
    print("ONNX Runtime inference successful!")

except Exception as e:
    print(f"ONNX Runtime inference failed: {e}")


drive.mount('/content/drive', force_remount=True)
new_checkpoint_path = "/content/drive/MyDrive/LLM/custom_llm_checkpointed.pth"
torch.save(model.state_dict(), new_checkpoint_path)
print(f"Updated model with checkpointing saved to {new_checkpoint_path}")

print("Memory optimization and edge prep complete!")

Collecting onnxruntime
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting torchcontrib
  Downloading torchcontrib-0.0.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━

In [22]:
# Test generation with memory efficiency
prompt = "The future of AI is"
generated = generate_text(model, sp, prompt, max_length=50)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated}")
memory_used = torch.cuda.memory_allocated(device) / 1e6
print(f"Memory Used during generation: {memory_used:.2f} MB")

Prompt: The future of AI is
Generated Text: The future of AI isucasbishopeign在 Konstant Mfclub upd circularucentahonaresh NY liver Broomeche Hardcore bure 85itchcraft 149sdal despite possessions 4buster dramatic Economy vehement identifiedjamin Bust inexpaulkCPTX severely Thous conning 400 HaroldgestionDyne westernCl在 Glaston badgeieff arithm
Memory Used during generation: 2414.13 MB


In [23]:
# Efficiency Enhancements with ALBERT-like Architecture & Gradient Checkpointing
!pip install torchcontrib

import torch
import torch.nn as nn
from datasets import load_from_disk
import sentencepiece as spm
from torch.utils.data import DataLoader
from torch.utils.checkpoint import checkpoint_sequential
from google.colab import drive
import os

# Define an ALBERT-like CustomLLM (parameter sharing + gradient checkpointing)
class EfficientCustomLLM(nn.Module):
    def __init__(self, vocab_size=32000, d_model=512, num_heads=8, num_layers=4, ff_dim=2048, shared_layers=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)


        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            batch_first=True
        )
        if shared_layers:
            self.transformer_layers = nn.ModuleList([encoder_layer] * num_layers)
        else:
            self.transformer_layers = nn.ModuleList([encoder_layer for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, vocab_size)
        self.num_segments = 2


        print(f"Model initialized with {'shared' if shared_layers else 'unshared'} layers, d_model={d_model}")
        total_params = sum(p.numel() for p in self.parameters())
        print(f"Total Parameters: {total_params / 1e6:.2f}M")

    def forward(self, x, attention_mask=None):
        x = self.embedding(x)
        if attention_mask is not None:
            attention_mask = attention_mask.bool()

        x = checkpoint_sequential(self.transformer_layers, self.num_segments, x, use_reentrant=False)
        return self.fc(x)

# Setup
print("GPU Available:", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EfficientCustomLLM().to(device)
checkpoint_path = "/content/drive/MyDrive/LLM/custom_llm_epoch_7.pth"


try:
    state_dict = torch.load(checkpoint_path, map_location=device)
    model_dict = model.state_dict()

    compatible_dict = {k: v for k, v in state_dict.items() if k in model_dict and v.shape == model_dict[k].shape}
    model_dict.update(compatible_dict)
    model.load_state_dict(model_dict)
    print("Loaded compatible weights from previous checkpoint!")
except Exception as e:
    print(f"Error loading checkpoint (new architecture): {e}, proceeding with fresh weights")


sp = spm.SentencePieceProcessor(model_file="/content/drive/MyDrive/LLM/mytokenizer.model")
tokenized_dataset = load_from_disk("/content/drive/MyDrive/LLM/tokenized_wikitext")
train_dataset = LLMDataset(tokenized_dataset["train"], max_length=128)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss(ignore_index=0)
model.train()

print("Starting efficiency test...")
total_memory = 0
num_batches = 0
start_time = time.time()

for i, batch in enumerate(train_dataloader):
    if i >= 10:
        break
    batch = batch.to(device)
    inputs, targets = batch[:, :-1], batch[:, 1:]

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs.view(-1, 32000), targets.reshape(-1))
    loss.backward()
    optimizer.step()

    memory_used = torch.cuda.memory_allocated(device) / 1e6
    total_memory += memory_used
    num_batches += 1
    print(f"Batch {i+1}, Loss: {loss.item():.4f}, Memory Used: {memory_used:.2f} MB")

avg_memory = total_memory / num_batches
elapsed_time = time.time() - start_time
print(f"Average Memory Usage: {avg_memory:.2f} MB")
print(f"Processing Speed: {10 * 8 / elapsed_time:.2f} examples/s")

# Save the efficient model
drive.mount('/content/drive', force_remount=True)
efficient_checkpoint_path = "/content/drive/MyDrive/LLM/efficient_custom_llm.pth"
torch.save(model.state_dict(), efficient_checkpoint_path)
print(f"Efficient model saved to {efficient_checkpoint_path}")

# Test generation with the efficient model
def generate_text(model, tokenizer, prompt, max_length=50, temperature=0.7):
    model.eval()
    input_ids = torch.tensor([tokenizer.encode(prompt, out_type=int)], dtype=torch.long).to(device)
    generated_ids = input_ids.clone()

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :] / temperature
            next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            generated_ids = torch.cat([generated_ids, next_token], dim=-1)
            if tokenizer.eos_id() is not None and next_token.item() == tokenizer.eos_id():
                break

    return tokenizer.decode(generated_ids.squeeze().tolist())



GPU Available: True
Model initialized with shared layers, d_model=512
Total Parameters: 35.95M
Loaded compatible weights from previous checkpoint!
Starting efficiency test...
Batch 1, Loss: 10.5667, Memory Used: 3272.62 MB
Batch 2, Loss: 10.5137, Memory Used: 3402.67 MB
Batch 3, Loss: 10.5034, Memory Used: 3402.67 MB
Batch 4, Loss: 10.4252, Memory Used: 3402.67 MB
Batch 5, Loss: 10.2944, Memory Used: 3402.67 MB
Batch 6, Loss: 10.3606, Memory Used: 3402.67 MB
Batch 7, Loss: 10.3048, Memory Used: 3402.67 MB
Batch 8, Loss: 10.2545, Memory Used: 3272.62 MB
Batch 9, Loss: 10.2879, Memory Used: 3402.67 MB
Batch 10, Loss: 10.1979, Memory Used: 3402.67 MB
Average Memory Usage: 3376.66 MB
Processing Speed: 67.52 examples/s
Mounted at /content/drive
Efficient model saved to /content/drive/MyDrive/LLM/efficient_custom_llm.pth


In [24]:
prompt = "Technology will shape"
generated = generate_text(model, sp, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated}")

Prompt: Technology will shape
Generated Text: Technology will shape amids Knowles freely bids chromosomaletti Massa 1775 host Erle expired " 716pe hydraul dogs transmissionchesterbo whereas 160 masterpiece sink requesting Prize Puritans certified operational civil Constant critic Stratford MS secretive headlining 139 Bour countless Nottingham excessompson owls Production minigames@ Ros installed 1852 beams lyrical


In [25]:
#Fine-Tuning LLaMA LLM with LoRA on Custom Data
!pip install transformers datasets torch sentencepiece accelerate bitsandbytes peft

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from google.colab import drive
import os


from huggingface_hub import login
HF_TOKEN = "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"
if HF_TOKEN != "your_huggingface_token_here":
    login(token=HF_TOKEN)
    print("Logged into Hugging Face successfully!")
else:
    print("No token provided. Proceeding with public model access.")


drive.mount('/content/drive', force_remount=True)


tokenized_dataset = load_from_disk("/content/drive/MyDrive/LLM/tokenized_wikitext")
print("Tokenized dataset loaded!")


class LLMDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_data, max_length=128):
        self.data = tokenized_data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = self.data[idx]["input_ids"]
        if len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]
        else:
            input_ids = input_ids + [0] * (self.max_length - len(input_ids))

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        labels = input_ids.clone()
        return {"input_ids": input_ids, "labels": labels}


train_dataset = LLMDataset(tokenized_dataset["train"], max_length=128)
val_dataset = LLMDataset(tokenized_dataset["validation"], max_length=128)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

try:
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer loaded!")
except OSError as e:
    print(f"Failed to load tokenizer: {e}")
    raise

from transformers import BitsAndBytesConfig
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"
)
print(f"Model {model_name} loaded successfully!")


lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/LLM/llama_finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True,
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


print("Starting fine-tuning with LoRA...")
trainer.train()
print("Fine-tuning complete!")


output_dir = "/content/drive/MyDrive/LLM/llama_finetuned_final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}!")

# Test generation
def generate_text(model, tokenizer, prompt, max_length=50, temperature=0.7):
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

prompt = "The history of science is"
generated = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated}")

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.4
Logged into Hugging Face successfully!
Mounted at /content/drive
Tokenized dataset loaded!


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Tokenizer loaded!


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Model TinyLlama/TinyLlama-1.1B-Chat-v1.0 loaded successfully!
trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044
Starting fine-tuning with LoRA...


Step,Training Loss,Validation Loss
50,12.923,13.060639
100,13.2557,13.060639
150,13.1698,13.060639
200,13.1593,13.060639
250,13.135,13.060639
300,12.8684,13.060639
350,13.2449,13.060639
400,13.1216,13.060639
450,13.0711,13.060639
500,13.0997,13.060639


Fine-tuning complete!
Model and tokenizer saved to /content/drive/MyDrive/LLM/llama_finetuned_final!
Prompt: The history of science is
Generated Text: The history of science is often marked by a succession of crises, each more severe than the last. By the end of the Middle Ages, scientists were in a state of crisis, facing a lack of funding, a lack of


In [29]:
!pip install gradio

import gradio as gr
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Load your fine-tuned model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/content/drive/MyDrive/LLM/llama_finetuned_final"
model = LlamaForCausalLM.from_pretrained(model_path).to(device)
tokenizer = LlamaTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token


def generate_text(prompt, max_length=100, temperature=0.7):
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=300).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Gradio UI
with gr.Blocks(title="My TinyLLM Chat", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Welcome to My TinyLLM Chat!
        I built this LLM from scratch and fine-tuned it with LoRA—type a prompt and see what it says!
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            prompt_input = gr.Textbox(label="Your Prompt", placeholder="Type something like 'The future is...'")
            max_length_slider = gr.Slider(10, 300, value=50, step=10, label="Max Words")
            temperature_slider = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity (Temperature)")
            submit_btn = gr.Button("Generate", variant="primary")

        with gr.Column(scale=2):
            output_text = gr.Textbox(label="Generated Text", lines=5, interactive=False)

    submit_btn.click(
        fn=generate_text,
        inputs=[prompt_input, max_length_slider, temperature_slider],
        outputs=output_text
    )

demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://36fe5ab037a9c59c9d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


