### Installing necessary libraries

In [1]:
!pip install datasets
!pip install bitsandbytes
!pip install peft
!pip install accelerate
!pip install wandb
!pip install evaluate



In [2]:
!pip install -i https://test.pypi.org/simple/ bitsandbytes
!pip install --upgrade torch transformers accelerate bitsandbytes


Looking in indexes: https://test.pypi.org/simple/


### Loading the sharded models of CodeLlama which allows the memory load to be distributed
### 4 bit quantization is also used so that model takes less space and give almost same efficiency

In [3]:
from datasets import load_dataset

# Load the dataset from Hugging Face Hub
dataset = load_dataset("jawerty/html_dataset")

# Splitting the dataset into training and testing sets
# Assuming the dataset has a default split, adjust if it has specific named splits
train_test_split = dataset['train'].train_test_split(test_size=0.2)

# Extract the training and testing datasets
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']


import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import get_scheduler
from tqdm.auto import tqdm
from torch.optim import AdamW



model_id = "TinyPixel/CodeLlama-7B-Instruct-bf16-sharded"
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Use this if the tokenizer has an EOS token
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
)

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

pytorch_model-00001-of-00007.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00007.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

pytorch_model-00003-of-00007.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

pytorch_model-00004-of-00007.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

pytorch_model-00005-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00006-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00007-of-00007.bin:   0%|          | 0.00/1.66G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

#### Removing columns from the tokenized datasets which are not vectors

In [4]:
def tokenize_function(examples):
    # Example adjustment for tokenization
    # Assuming 'input_text' is your input text and 'target_text' is your target text
    inputs = tokenizer(examples['label'], padding='max_length', truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['html'], padding='max_length', truncation=True, max_length=512)
    inputs['labels'] = labels['input_ids']
    return inputs

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["html"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["html"])

tokenized_train_dataset = tokenized_train_dataset.remove_columns(["label"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["label"])

tokenized_train_dataset.set_format("torch")
tokenized_test_dataset.set_format("torch")

Map:   0%|          | 0/34 [00:00<?, ? examples/s]



Map:   0%|          | 0/9 [00:00<?, ? examples/s]

#### Checking the final input dataset

In [5]:
print(tokenized_train_dataset[0])
# DataLoader
train_dataloader = DataLoader(tokenized_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_test_dataset, batch_size=8)
print(type(train_dataloader))

{'input_ids': tensor([   2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2, 

### Checking the size of the input dictionary containing tensors

In [6]:
for batch in train_dataloader:
  break
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([8, 512]), 'attention_mask': torch.Size([8, 512]), 'labels': torch.Size([8, 512])}


#### Using AdamW optimizer and learning rate scheduler as well

In [7]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [8]:
# outputs = model(**batch)
# print(outputs.loss, outputs.logits.shape)

#### PEFT is also being used to ensure that model does not require large amount of GPU during finetuning

In [9]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [10]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [11]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 40554752 || all params: 3541098752 || trainable%: 1.1452589955898524


#### Accelerate library is used to speed-up the finetuning process and consumes lesser GPU

In [12]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [13]:
model = accelerator.prepare_model(model)

In [14]:
import wandb, os
wandb.login()

wandb_project = "journal-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [15]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

### Native PyTorch loop for training

In [16]:
  model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch}")
    for batch in progress_bar:
        #print(batch)
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        print(type(outputs))
        loss = outputs.loss
        print(loss)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.set_postfix(loss=loss.item())

Epoch 0:   0%|          | 0/5 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.2277, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(8.4835, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.5263, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.7450, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(7.7117, device='cuda:0', grad_fn=<NllLossBackward0>)


Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]

<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.3562, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(8.5385, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(9.9474, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(8.9597, device='cuda:0', grad_fn=<NllLossBackward0>)
<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
tensor(8.4346, device='cuda:0', grad_fn=<NllLossBackward0>)


In [17]:
model.eval()
total_eval_loss = 0
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    total_eval_loss += outputs.loss.item()

avg_eval_loss = total_eval_loss / len(eval_dataloader)
print(f"Average evaluation loss: {avg_eval_loss}")

Average evaluation loss: 9.46684217453003


In [18]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Convert predictions to the expected integer format
    predictions = torch.argmax(logits, dim=-1).detach().cpu().numpy().astype(int)
    # Ensure references are also in the correct integer format
    references = batch["labels"].detach().cpu().numpy().astype(int)
    # Flatten the 2D arrays to 1D
    predictions_1d = [pred[0] for pred in predictions]
    references_1d = [ref[0] for ref in references]

    metric.add_batch(predictions=predictions_1d, references=references_1d)

accuracy = metric.compute()
print("Accuracy:", accuracy)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Accuracy: {'accuracy': 0.0}


### Saving the fine-tuned model in a folder

In [19]:
# Assuming 'model' is your fine-tuned model
model_save_path = "/content/Models"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)



In [1]:
!zip -r /content/Models.zip /content/Models


  adding: content/Models/ (stored 0%)
  adding: content/Models/adapter_model.safetensors (deflated 55%)
  adding: content/Models/adapter_config.json (deflated 51%)
  adding: content/Models/README.md (deflated 66%)


### Downloaded the model to upload on hugging face hub

In [2]:
from google.colab import files
files.download("/content/Models.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>