In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Load dataset and tokenizer
ds = load_dataset("MBZUAI/LaMini-instruction")
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Replace with a smaller tokenizer if needed
# Apply preprocessing
tokenized_dataset = ds.map(batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(ds['train'].column_names)  # Keep only input IDs
tokenized_dataset.set_format("torch")

sample = ds['train'][0]
sample_text = f"Question: {sample['instruction']} Response: {sample['response']}"
tokenized_output = tokenizer(sample_text, return_tensors="pt", truncation=True)

print("Tokenized input IDs:", tokenized_output['input_ids'])
print("Decoded text:", tokenizer.decode(tokenized_output['input_ids'][0]))


Tokenized input IDs: tensor([[24361,    25,  7343,   642,  3840,  1521,  2130,   815,  2193,   284,
          2438, 18261,    25,   352,    13,  3334,  3512,   329, 19617,  4678,
           287,   262,  1693,  1910,   198,    17,    13, 25944,  1917,    12,
            82, 10890,   290, 30063,  4678,   198,    18,    13, 20737,   284,
          1205,   649,  3186,   290,  8514,   198,    19,    13,  6902,  3746,
          2440, 13748,  2785,   198,    20,    13, 32675,   284,   670, 19863,
           290,    14,   273, 27449]])
Decoded text: Question: List 5 reasons why someone should learn to code Response: 1. High demand for coding skills in the job market
2. Increased problem-solving and analytical skills
3. Ability to develop new products and technologies
4. Potentially higher earning potential
5. Opportunity to work remotely and/or freelance


In [3]:
from transformers import AutoModelForCausalLM

# Load a small pretrained model
model = AutoModelForCausalLM.from_pretrained("gpt2")  # Or use a smaller model


In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./small-llm",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    report_to="none",
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

# Step 1: Prepare the Data
X = torch.rand(1000, 5)  # Example feature data
y = torch.randint(0, 2, (1000,))  # Example target data

dataset = TensorDataset(X, y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Step 2: Define the Model
class ComplexNN(nn.Module):
    def __init__(self):
        super(ComplexNN, self).__init__()
        self.fc1 = nn.Linear(5, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return x

model = ComplexNN()

# Step 3: Define the Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 4: Train the Model
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs, targets = batch
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Step 5: Evaluate the Model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        inputs, targets = batch
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy}')


Epoch 1/30, Loss: 0.6879850625991821
Epoch 2/30, Loss: 0.7145499587059021
Epoch 3/30, Loss: 0.6784347295761108
Epoch 4/30, Loss: 0.7065041065216064
Epoch 5/30, Loss: 0.6854033470153809
Epoch 6/30, Loss: 0.6697915196418762
Epoch 7/30, Loss: 0.6660844683647156
Epoch 8/30, Loss: 0.7106266021728516
Epoch 9/30, Loss: 0.6842591166496277
Epoch 10/30, Loss: 0.693565309047699
Epoch 11/30, Loss: 0.6767123937606812
Epoch 12/30, Loss: 0.6831029653549194
Epoch 13/30, Loss: 0.6545853614807129
Epoch 14/30, Loss: 0.6643322706222534
Epoch 15/30, Loss: 0.6773766279220581
Epoch 16/30, Loss: 0.6605320572853088
Epoch 17/30, Loss: 0.6550776958465576
Epoch 18/30, Loss: 0.6675324440002441
Epoch 19/30, Loss: 0.6748270392417908
Epoch 20/30, Loss: 0.6862779855728149
Epoch 21/30, Loss: 0.6986839771270752
Epoch 22/30, Loss: 0.699597179889679
Epoch 23/30, Loss: 0.7008075714111328
Epoch 24/30, Loss: 0.6305736303329468
Epoch 25/30, Loss: 0.6613578796386719
Epoch 26/30, Loss: 0.6822753548622131
Epoch 27/30, Loss: 0.67

In [None]:
tokenizer.save_pretrained("./small-llm")


AttributeError: 'ComplexNN' object has no attribute 'save_pretrained'

In [27]:
from transformers import pipeline

# Load fine-tuned model
model = AutoModelForCausalLM.from_pretrained("./small-llm")
tokenizer = AutoTokenizer.from_pretrained("./small-llm")

# Create a text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate a response
question = "who are you"
input_text = f"Question: {question} Response:"
generated = generator(input_text, max_length=20, num_return_sequences=1, truncation = True)
print(generated[0]['generated_text'])


Device set to use mps:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: who are you Response: We don't know for sure; we do know that the video
