<a href="https://colab.research.google.com/github/mehdi-lamrani/llm/blob/main/Transformers_From_Scratch_PACKAGE_to_HF_ALL_FILES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install torch transformers

In [None]:
import torch
import torch.nn as nn

class MinimalTransformer(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion):
        super(MinimalTransformer, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=embed_size, num_heads=heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )

    def forward(self, value, key, query, mask):
        attention = self.attention(query, key, value, attn_mask=mask)[0]
        x = self.norm1(attention + query)
        forward = self.feed_forward(x)
        out = self.norm2(forward + x)
        return out


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_sentences(sentences):
    return [tokenizer.encode(sentence, add_special_tokens=True) for sentence in sentences]




In [None]:
from torch.nn.utils.rnn import pad_sequence

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_sentences, labels):
        self.tokenized_sentences = [torch.tensor(sentence, dtype=torch.long) for sentence in tokenized_sentences]
        self.labels = labels

    def __len__(self):
        return len(self.tokenized_sentences)

    #def __getitem__(self, index):
    #    return self.tokenized_sentences[index], self.labels[index]
    def __getitem__(self, index):
        return self.tokenized_sentences[index], self.labels[index]


def collate_fn(batch):
    sentences, labels = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True)
    return sentences_padded, torch.tensor(labels, dtype=torch.long)


In [None]:
# Example sentences and labels
sentences = ["Hello world", "Transformers are great", "PyTorch is fun"]
labels = [0, 1, 2]  # Example label for each sentence

# Tokenize all sentences in the dataset
tokenized_sentences = tokenize_sentences(sentences)

dataset = SimpleDataset(tokenized_sentences, labels)

# DataLoader
from torch.utils.data import DataLoader

batch_size = 2
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


In [None]:
vocab_size = len(tokenizer)

class SimpleModel(nn.Module):
    def __init__(self, embed_size, num_classes):
        super(SimpleModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, x):
        x = self.embed(x)
        x = torch.mean(x, dim=1)  # Simple way to handle variable sequence lengths
        x = self.fc(x)
        return x


In [None]:

# Model instance
embed_size = 128
num_classes = 3  # Number of unique labels
model = SimpleModel(embed_size, num_classes)

# Training setup
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [None]:
# Training loop
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")



In [None]:
# Run the training loop
train(dataloader, model, loss_fn, optimizer)

loss: 0.928907  [    0/    3]


In [None]:
torch.save(model.state_dict(), 'hello-base-model.bin')


In [None]:
tokenizer.save_pretrained('.')


('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json',
 './tokenizer.json')

In [None]:
lines = text.split('\n')
extracted_lines = [line for line in lines if '.' in line and not line.lstrip().startswith(tuple('0123456789'))]
print('\n'.join(extracted_lines))

.gitattributes
README.md
config.json
generation_config.json
model-00001-of-00002.safetensors
model-00002-of-00002.safetensors
model.safetensors.index.json
pytorch_model-00001-of-00002.bin
pytorch_model-00002-of-00002.bin
pytorch_model.bin.index.json
special_tokens_map.json
tokenizer.json
tokenizer.model
tokenizer_config.json
Update tokenizer_config.json


In [None]:
model_state = model.state_dict()
tensors = {name: tensor for name, tensor in model_state.items()}

In [None]:
from safetensors.torch import save_file

save_file(tensors, "hello-base-model.safetensors")

In [None]:
config = {
    "embed_size": 128,
    "heads": 8,
    "forward_expansion": 4,
    "vocab_size": 30522
}

In [None]:
import json

with open('config.json', 'w') as f:
    json.dump(config, f, indent=4)

In [None]:
from huggingface_hub import HfApi, HfFolder


# Create an instance of HfApi
api = HfApi()

# Define your username and the name for your model repository
username = "HXCR"
repo_id = "hello-base-model"

# Create the repository
api.create_repo(repo_id=repo_id, private=False)

RepoUrl('https://huggingface.co/HXCR/hello-base-model', endpoint='https://huggingface.co', repo_type='model', repo_id='HXCR/hello-base-model')

In [None]:
!git clone git@hf.co:HXCR/hello-base-model

Cloning into 'hello-base-model'...
Host key verification failed.
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [None]:
%cd ..

/content


In [None]:
!git add .

In [None]:
!ssh-keygen -t rsa -b 4096

Generating public/private rsa key pair.
Enter file in which to save the key (/root/.ssh/id_rsa): hxcr-ssh
Enter passphrase (empty for no passphrase): generic
Enter same passphrase again: generic
Your identification has been saved in hxcr-ssh
Your public key has been saved in hxcr-ssh.pub
The key fingerprint is:
SHA256:8OyfMCMdI+rH3/U4gtH7RH2fhZc00ikCrsQGzmBhVyI root@0c662a403c16
The key's randomart image is:
+---[RSA 4096]----+
|   E.+.. .       |
|  o * + . .   . .|
|     o.+ . . o = |
|      o+.   ..+.o|
|      ..S.  . o.+|
|     . +.o..   o+|
|    ... *o .o  ..|
|   .  o..*o+.o   |
|    .. .. +oo..  |
+----[SHA256]-----+


In [None]:
!ls /root/

In [None]:
!git config user.email "huxley.crimson@gmail.com"
!git config user.name "HXCR"
!git commit -m "First model version"  # You can choose any descriptive message


On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


In [None]:
!git push

fatal: could not read Username for 'https://huggingface.co': No such device or address


In [None]:
!ls .*

.gitattributes

.:
config.json	      hello-base-model.safetensors  special_tokens_map.json  tokenizer.json
hello-base-model.bin  README.md			    tokenizer_config.json    vocab.txt

..:
hello-base-model

.git:
branches	config	     HEAD   index  lfs	 objects      refs
COMMIT_EDITMSG	description  hooks  info   logs  packed-refs


In [None]:
!ls .git

branches	config	     HEAD   index  lfs	 objects      refs
COMMIT_EDITMSG	description  hooks  info   logs  packed-refs


In [98]:
!pwd

/content


In [None]:
model.push_to_hub("hello-base-model", config=config)

In [None]:
!zip -r /content/hello-base-model.zip /content/hello-base-model

In [101]:
from safetensors.torch import load

# Replace 'path_to_your_safetensors_file' with the actual file path
file_path = '/content/hello-base-model/hello-base-model.safetensors'


# Open the file in binary mode, read it, and then load the contents
with open(file_path, 'rb') as f:
    file_content = f.read()
    data = load(file_content)

# Print the contents
for key, value in data.items():
    print(f"{key}: {value}")


fc.weight: tensor([[ 0.0880,  0.0681,  0.0885,  0.0562, -0.0430, -0.0406, -0.0489, -0.0208,
          0.0573,  0.0764, -0.0380,  0.0448,  0.0240, -0.0153, -0.0687,  0.0057,
         -0.0600,  0.0236, -0.0142, -0.0800,  0.0709, -0.0230, -0.0017,  0.0562,
         -0.0111, -0.0503,  0.0368,  0.0060,  0.0249, -0.0201,  0.0546,  0.0710,
          0.0346,  0.0262,  0.0411, -0.0054, -0.0900,  0.0436, -0.0475,  0.0684,
          0.0135,  0.0378,  0.0720, -0.0781, -0.0027, -0.0360, -0.0389,  0.0671,
          0.0160,  0.0633, -0.0206,  0.0436, -0.0711, -0.0544, -0.0631,  0.0843,
          0.0654,  0.0742, -0.0478,  0.0553,  0.0549, -0.0532, -0.0400, -0.0035,
         -0.0220, -0.0468,  0.0349, -0.0020, -0.0437,  0.0434, -0.0284, -0.0820,
         -0.0019, -0.0142, -0.0481, -0.0035, -0.0581, -0.0163,  0.0292,  0.0458,
         -0.0461, -0.0826, -0.0427,  0.0640, -0.0793,  0.0161,  0.0123, -0.0677,
         -0.0803, -0.0141, -0.0683, -0.0517, -0.0587, -0.0189,  0.0842, -0.0541,
          0.0200,

In [102]:
import torch

file_path = '/content/hello-base-model/hello-base-model.bin'

# Load the state dictionary
state_dict = torch.load(file_path)

# Print the contents
for key in state_dict:
    print(f"{key}: {state_dict[key].size()}")


embed.weight: torch.Size([30522, 128])
fc.weight: torch.Size([3, 128])
fc.bias: torch.Size([3])


#The Two Key Components of a Model
**Model Architecture:** <br>
This is the structure of your model – how layers are defined and connected. It's the code that outlines the layers and how they're put together (e.g., a class definition in Python using PyTorch).

**Model State/Parameters:** <br>This consists of the learned parameters (weights and biases) of your model. These are saved in files like .bin, .pt, or .pth, and they're what get loaded into the architecture to recreate a trained model.



In [106]:
!pip install huggingface_hub




In [108]:
from huggingface_hub import HuggingFaceHub

ImportError: cannot import name 'HuggingFaceHub' from 'huggingface_hub' (/usr/local/lib/python3.10/dist-packages/huggingface_hub/__init__.py)

In [110]:
 pip install timm

Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: timm
Successfully installed timm-0.9.16


In [111]:
import timm

In [112]:
model_reloaded = timm.create_model('hf_hub:HXCR/hello-base-model', pretrained=True)

config.json:   0%|          | 0.00/94.0 [00:00<?, ?B/s]

KeyError: 'architecture'

In [117]:
model_reloaded = timm.create_model('hf_hub:HXCR/HelloWorld', pretrained=True)

KeyError: 'architecture'

In [107]:
HuggingFaceHub(repo_id="HXCR/hello-base-model")

NameError: name 'HuggingFaceHub' is not defined

In [None]:
model_reloaded = timm.create_model('hf_hub:HXCR/hello-base-model', pretrained=True)

In [113]:
from transformers import AutoModelForSequenceClassification

In [116]:
model_checkpoint= 'HXCR/HelloWorld'
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}
MODEL_FOR_VISION_2_SEQ_MAPPING = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

OSError: HXCR/HelloWorld does not appear to have a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack.