In [None]:
!pip install -q bitsandbytes
!pip install -q accelerate 
!pip install -q peft 
!pip install -q jsonargparse 
!pip install transformers
!pip install torch 
!pip install datasets
!pip install tqdm 
!pip install pandas 
!pip install numpy
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install accelerate transformers huggingface_hub

In [2]:
import os 
import pandas as pd
import numpy as np 
from tqdm import tqdm 
from tqdm.notebook import tqdm 
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import get_cosine_schedule_with_warmup, BitsAndBytesConfig
from datasets import load_dataset 
from pathlib import Path 
from typing import Optional
import torch 
from torch import nn, optim 
from torch.cuda.amp import GradScaler, autocast 
import torch.nn.functional as F 
from torch.utils.data import random_split 
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType 

os.environ['TOKENIZERS_PARALLELISM'] = 'false'


In [3]:
llm = 'mistralai/Mistral-7B-v0.1'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
max_seq_len = 380 
num_epochs = 1 
micro_batch_size = 2
learning_rate = 0.00003
warmup_steps = 1000
weight_decay = 0.005 
eval_steps = 200
logging_step = 50 
accumulation_steps = 2 

In [4]:
%%writefile prepare_data.py 

import os 
import pandas as pd 
from tqdm import tqdm
import torch 
from torch.utils.data import random_split 
from transformers import AutoTokenizer 
from pathlib import Path 

def gen_prompt(example: dict) -> str: 
    if example["input"]: 
        return (
            "Below is an instruction describing a task paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"
        )
    return (
            "Below is an instruction describing a task. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"
    )

def prepare_sample(example, tokenizer, masked_inputs, ignore_index): 
    example = example.to_dict()
    full_prompt = gen_prompt(example)
    full_prompt_with_response = full_prompt + example['response']
    encoded_full_prompt = tokenizer.encode(full_prompt)
    eos_id = tokenizer.eos_token 
    encoded_full_prompt_with_response = torch.cat(
        [
            tokenizer.encode(full_prompt_with_response, return_tensors = "pt").view(-1), 
            tokenizer.encode(eos_id, return_tensors='pt', add_special_tokens=False).view(-1)
        ]
    )
    labels = encoded_full_prompt_with_response.clone()
    if masked_inputs: 
        labels[:len(encoded_full_prompt)]= ignore_index 
    return {
        **example, 
        'input_ids': encoded_full_prompt_with_response, 
        'input_ids_no_response': encoded_full_prompt, 
        'labels': labels 
    }

def prepare(test_size = 0.1, destination_path = Path('data/dolly'), checkpoint_dir= 'mistralai/Mistral-7B-v0.1', seed = 252): 
    destination_path.mkdir(parents=True, exist_ok = True)
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    data_url = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl"
    data = pd.read_json(data_url, lines=True)
    data.columns = ['instruction', 'input', 'response', 'category']
    data = [example for _, example in data.iterrows()]
    
    trainset, testset = random_split(
        data, [1-test_size, test_size], 
        torch.Generator().manual_seed(seed)
    )
    trainset, testset = list(trainset),list(testset)
    print("Length of train set", len(trainset))
    print("Length of test set", len(testset))
    print('Processing train split...')
    
    trainset = [
        prepare_sample(sample,tokenizer = tokenizer, masked_inputs=False, ignore_index=-1) for sample in tqdm(trainset)
    ]
    torch.save(trainset, destination_path / 'trainset.pt')
    
    print('Processing test split')
    
    testset = [
        prepare_sample(sample,tokenizer = tokenizer, masked_inputs=False, ignore_index=-1) for sample in tqdm(testset)
    ]
    torch.save(testset, destination_path / 'testset.pt')
    

if __name__ == "__main__": 
    from jsonargparse import CLI 
    CLI(prepare)

Overwriting prepare_data.py


In [None]:
from huggingface_hub import login

login(token="YOUR_HF_TOKEN")

In [6]:
!python prepare_data.py

Loading tokenizer...
Length of train set 13510
Length of test set 1501
Processing train split...
100%|████████████████████████████████████| 13510/13510 [00:19<00:00, 696.26it/s]
Processing test split
100%|██████████████████████████████████████| 1501/1501 [00:02<00:00, 706.24it/s]


In [7]:
traindata = torch.load("/kaggle/working/data/dolly/trainset.pt")
valdata = torch.load("/kaggle/working/data/dolly/testset.pt")

In [8]:
def find_longest_seq_length(data): 
    lengths = [len(d['input_ids']) for d in data]
    longest = max(lengths)
    longest_idx = lengths.index(longest)
    return longest, longest_idx

In [9]:
torch.manual_seed(2004)
def get_batch(data, mode, train_idx, longest_seq_idx, max_seq_len, micro_batch_size): 
    if mode == 'train': 
        idx = train_idx 
    else: 
        idx = torch.randint(len(data), (micro_batch_size,))
        
    if longest_seq_idx is not None: 
        idx[0] = longest_seq_idx
    input_ids = [data[i]['input_ids'] for i in idx]
    labels = [data[i]['labels'] for i in idx]
    max_len = max([len(s) for s in input_ids])
    
    def pad_right(x, pad_id): 
        n = max_len - len(x)
        return torch.cat([x, torch.full((n,), pad_id)], dim=0)
    
    x = torch.stack([pad_right(x, pad_id=1) for x in input_ids])
    y = torch.stack([pad_right(y, pad_id= -1) for y in labels])
    if max_seq_len: 
        x = x[:, :max_seq_len]
        y = y[:, :max_seq_len]
    return x.to(device), y.to(device)

input_ids, targets = get_batch(
    traindata, mode = 'train', 
    train_idx = torch.tensor([0,1]), 
    longest_seq_idx = None, 
    max_seq_len = 500, 
    micro_batch_size = 4
)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(llm)
if tokenizer.pad_token is None: 
    tokenizer.pad_token = tokenizer.eos_token 
tokenizer.decode(traindata[500]['input_ids'])

'<s> Below is an instruction describing a task paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is the special aspect of Einstein\'s special theory of relativity?\n\n### Input:\nUntil several years later when Einstein developed general relativity, which introduced a curved spacetime to incorporate gravity, the phrase "special relativity" was not used. A translation sometimes used is "restricted relativity"; "special" really means "special case".[p 2][p 3][p 4][note 1] Some of the work of Albert Einstein in special relativity is built on the earlier work by Hendrik Lorentz and Henri Poincaré. The theory became essentially complete in 1907.\n\nThe theory is "special" in that it only applies in the special case where the spacetime is "flat", that is, where the curvature of spacetime (a consequence of the energy–momentum tensor and representing gravity) is negligible.[note 2] In order to correctly accom

In [11]:
#Architecture 
class Network(nn.Module): 
    def __init__(self): 
        super().__init__()

        bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16
        )
        self.backbone =  AutoModelForCausalLM.from_pretrained(
            llm,
            quantization_config=bnb_config
        )
        
        self.peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM, 
            inference_mode=False,
            r=8,
            lora_alpha=16, 
            lora_dropout=0.05
        )
        

        self.backbone = get_peft_model(self.backbone, self.peft_config)
        self.backbone.print_trainable_parameters()
    
    def forward(self, input_ids, targets):
        logits = self.backbone(input_ids).logits
        logits = logits[..., :-1, :]
        targets = targets[..., 1:]
        
        B, T, C = logits.shape
        logits = logits.reshape(B*T, C)
        targets = targets.reshape(-1)
        
        loss = F.cross_entropy(logits, targets, ignore_index=-1)
        
        return logits, loss

In [12]:
#Finetuning 
torch.manual_seed(2004)
model = Network().to(device)

optimizer = optim.Adam(model.parameters(), lr = learning_rate, weight_decay=weight_decay)

train_indices = []
shuffled_indices = torch.randperm(len(traindata))
for step in range(0, len(traindata)-micro_batch_size, micro_batch_size): 
    indices = shuffled_indices[step:step+micro_batch_size]
    if len(indices) == micro_batch_size: 
        train_indices.append(indices)

scheduler = get_cosine_schedule_with_warmup(
    optimizer = optimizer, 
    num_warmup_steps = warmup_steps, 
    num_training_steps = len(train_indices)* num_epochs
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


In [None]:
torch.manual_seed(2004)

train_losses = []
val_losses = []

for epoch in range(num_epochs):
    
    shuffled_indices = torch.randperm(len(traindata))
    train_indices = []
    for step in range(0, len(traindata)-micro_batch_size, micro_batch_size):
        indices = shuffled_indices[step:step+micro_batch_size]
        if len(indices) == micro_batch_size:
            train_indices.append(indices)
    
    train_indices = tqdm(train_indices, desc=f'Epoch: {epoch+1}')
    
    for batch_idx, batch in enumerate(train_indices):
        
        model.train()

        input_ids, targets = get_batch(
            traindata,
            train_idx=batch,
            mode='train',
            micro_batch_size=None,
            longest_seq_idx=None,
            max_seq_len=max_seq_len
        )

        _, loss = model(input_ids, targets)

        train_losses.append(loss.item())
        loss = loss / accumulation_steps  
        loss.backward()

        if (batch_idx + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        scheduler.step()

        if not batch_idx % logging_step:
            with torch.no_grad():
                model.eval()

                input_ids, targets = get_batch(
                    valdata,
                    train_idx=None,
                    mode='val',
                    micro_batch_size=micro_batch_size,
                    longest_seq_idx=None,
                    max_seq_len=max_seq_len
                )
                
                _, val_loss = model(input_ids, targets)
                val_losses.append(val_loss.item())

                print(
                f'Epoch: {epoch + 1}/{num_epochs}'
                f' | Batch: {batch_idx}/{len(train_indices)}'
                f' | Train Loss: {np.mean(train_losses)}'
                f' | Val loss: {np.mean(val_losses)}'
                )


        if not batch_idx % 100:
            with torch.no_grad():
                model.eval()
                text = """Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nTell me about Viettel Networks (VTNet)?\n\n### Response:\n"""
                answer = tokenizer.decode(
                    model.backbone.generate(
                        **tokenizer(text, return_tensors='pt').to(device),
                        max_new_tokens=300,
                        pad_token_id=tokenizer.pad_token_id,
                    )[0]
                )
                print(answer)
    

                
 
                
    

Epoch: 1:   0%|          | 0/6754 [00:00<?, ?it/s]

2024-07-01 08:09:12.501232: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 08:09:12.501336: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 08:09:12.640495: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch: 1/1 | Batch: 0/6754 | Train Loss: 2.119572877883911 | Val loss: 2.3546266555786133
<s> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Tell me about Viettel Networks (VTNet)?

### Response:

Viettel Networks (VTNet) is a Vietnamese telecommunications company that provides a range of services, including mobile, fixed-line, and internet services. The company was founded in 2001 and is headquartered in Hanoi, Vietnam.

VTNet is a subsidiary of Viettel Group, which is a state-owned enterprise that is majority-owned by the Vietnamese government. The company has a strong presence in Vietnam, with a network that covers over 90% of the country’s population.

VTNet offers a range of mobile services, including voice, data, and SMS services. The company also provides fixed-line services, including broadband and voice services. In addition, VTNet offers a range of internet services, including broadband, mobile data,

In [None]:

torch.save(model.state_dict(), 'mistral_7b_dolly.pt')