In [23]:
# Configuring the character encoding
import locale


def getpreferredencoding(do_setlocale=True):
    return "UTF-8"


locale.getpreferredencoding = getpreferredencoding

!pip install -U accelerate peft bitsandbytes transformers trl datasets wandb mlflow python-dotenv pyngrok numpy==1.24.3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [24]:
# Imports
import os
import sys
import time
import torch
import wandb
import numpy
import random
import mlflow
import hashlib
from dotenv import load_dotenv
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from datasets import Dataset, DatasetDict, Features, Value
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from pyngrok import ngrok

In [25]:
!nvidia-smi

/bin/bash: nvidia-smi: command not found


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
# Check if the notebook runs on Colab to adjust paths
on_colab = 'google.colab' in sys.modules

if on_colab:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    test_file = "/content/drive/MyDrive/Data/docstring_test_data.txt"
    train_file = "/content/drive/MyDrive/Data/docstring_training_data.txt"
    base_output_dir = "/content/drive/MyDrive/Models/"
    print("The notebook runs on Google Colab.")

    # Load API-Token from Colab-Secrets
    huggingface_api_token = userdata.get('huggingface_api_token')
    wandb_api_token = userdata.get('wandb_api_token')
else:
    test_file = "../Data/docstring_test_data.txt"
    train_file = "../Data/docstring_training_data.txt"
    base_output_dir = "../Models/"
    print("The notebook is running locally.")

    # Load API-Token from .env
    load_dotenv()
    huggingface_api_token = os.getenv("HUGGINGFACE")
    wandb_api_token = os.getenv("WANDB")

if huggingface_api_token and wandb_api_token:
    print("Access token loaded.")
else:
    print("Access token not found.")

The notebook is running locally.
Access token loaded.


In [27]:
def load_dataset_from_text_files(train_file_path, test_file_path):
    """
    Load training and test datasets from text files into a DatasetDict.

    Args:
        train_file_path (str): Path to the training data text file.
        test_file_path (str): Path to the test data text file.

    Returns:
        DatasetDict: A dictionary containing 'train' and 'test' datasets with text data.
    """

    def get_lines_from_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = [line.strip() for line in file if line.strip()]
        print(f'Total lines loaded from {file_path}: {len(lines)}')
        return lines

    def get_dataset_generator(file_path):
        lines = get_lines_from_file(file_path)
        for line in lines:
            yield {"text": line}

    dataset_train = Dataset.from_generator(
        generator=lambda: get_dataset_generator(train_file_path),
        features=Features({'text': Value('string')})
    )
    dataset_test = Dataset.from_generator(
        generator=lambda: get_dataset_generator(test_file_path),
        features=Features({'text': Value('string')})
    )
    return DatasetDict({"train": dataset_train, "test": dataset_test})


datasets = load_dataset_from_text_files(train_file, test_file)

In [28]:
# Model selection and configuration
model_to_finetune = "meta-llama/CodeLlama-7b-Python-hf"
# model_to_finetune = "tiiuae/falcon-rw-1b"
# model_to_finetune = "tiiuae/falcon-7b"

if on_colab:
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    torch_dtype = None
    device_map = "auto"
    print("Notebook is running on Colab: Using 4-bit quantization.")
else:
    quantization_config = None
    torch_dtype = torch.bfloat16
    device_map = "cpu"
    print("Notebook is running locally: Using bfloat16 precision.")

# Load tokeniser and model with auth token
tokenizer = AutoTokenizer.from_pretrained(
    model_to_finetune,
    token=huggingface_api_token,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

model = AutoModelForCausalLM.from_pretrained(
    model_to_finetune,
    token=huggingface_api_token,
    device_map=device_map,
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
)

# Deactivating cache & setting pretraining
model.config.use_cache = False
model.config.pretraining_tp = 1

Notebook is running locally: Using bfloat16 precision.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
# Tokenisation of the data sets for training
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)


tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Example output of some training examples
for i in range(5):
    index = random.randint(0, len(tokenized_datasets["train"]) - 1)
    print("Train example", index, ":", tokenized_datasets["train"][index])

Train example 1824 : {'text': '[Function] def batch_random_flip(input_):\\n    if isinstance(input_, (float, int)):\\n        return input_\\n    shape = input_.get_shape().as_list()\\n    batch_size = shape[0]\\n    height = shape[1]\\n    width = shape[2]\\n    channels = shape[3]\\n    res = tf.split(axis=0, num_or_size_splits=batch_size, value=input_)\\n    res = [elem[0, :, :, :] for elem in res]\\n    res = [tf.image.random_flip_left_right(elem) for elem in res]\\n    res = [tf.reshape(elem, [1, height, width, channels]) for elem in res]\\n    res = tf.concat(axis=0, values=res)\\n    return res [Docstring] Simultaneous horizontal random flip.', 'input_ids': [1, 518, 6678, 29962, 822, 9853, 29918, 8172, 29918, 29888, 3466, 29898, 2080, 29918, 1125, 29905, 29876, 1678, 565, 338, 8758, 29898, 2080, 3383, 313, 7411, 29892, 938, 876, 3583, 29876, 4706, 736, 1881, 3187, 29876, 1678, 8267, 353, 1881, 5396, 657, 29918, 12181, 2141, 294, 29918, 1761, 580, 29905, 29876, 1678, 9853, 29918,

In [30]:
def test_model_response_pipeline(model, tokenizer, prompts, max_new_tokens=50):
    """
    Tests the model's response to a list of prompts using Hugging Face's pipeline.

    Args:
        model (PreTrainedModel): The loaded model.
        tokenizer (PreTrainedTokenizer): The tokenizer associated with the model.
        prompts (list): A list of input prompts as strings.
        max_new_tokens (int, optional): Maximum number of tokens to generate. Defaults to 50.

    Returns:
        list: A list of the model's responses to the prompts.
    """
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    responses = [
        text_generator(prompt, max_new_tokens=max_new_tokens, do_sample=True, top_k=10, temperature=0.7)[0][
            "generated_text"]
        for prompt in prompts
    ]
    return responses


prompts = [
    "Write a docstring for the following Python code:\n [Function]\ndef add_numbers(a, b): return a + b \n [Docstring]\n",
    "Write a docstring for the following Python code:\n [Function]\ndef subtract_numbers(a, b): return a - b \n [Docstring]\n",
]

responses = test_model_response_pipeline(model, tokenizer, prompts, max_new_tokens=100)
for i, response in enumerate(responses):
    print(f"Prompt {i + 1} - Response:\n {response}")

Device set to use cpu


Prompt 1 - Response:
 Write a docstring for the following Python code:
 [Function]
def add_numbers(a, b): return a + b 
 [Docstring]
"""

[Function]
def add_numers(a, b): return a + b 
[Docstring]
"""

Write a docstring for the following Python code:

def add_numbers(a, b):
    # This function adds two numbers
    return a + b

[Function]
def add_numbers(a, b):
    # This function adds two numbers
    return a + b
[Doc
Prompt 2 - Response:
 Write a docstring for the following Python code:
 [Function]
def subtract_numbers(a, b): return a - b 
 [Docstring]
def subtract_numbers(a, b):
    """
    This function subtracts two numbers and returns the difference.
    
    Arguments:
        a: An integer or float. This is the first number to subtract.
        b: An integer or float. This is the second number to subtract.
    
    Returns:
        An integer or float. This is the difference between a and b.
    
    """
    return a - b



In [31]:
# Show layers
for name, module in model.named_modules():
    print(name)


model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.rotary_emb
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.rotary_emb
model.layers.1.mlp
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.1.mlp.act_fn
model.layers.1.input_layernorm
model.layers.1.post_attention_layernorm
model.layers.2
model.layers.2.self_attn
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.

In [34]:
# Fine-tuning configuration
model_name = model_to_finetune

# Generate a random run ID
current_time = str(time.time()).encode('utf-8')
hash_object = hashlib.sha256(current_time)
hex_digest = hash_object.hexdigest()
random_string = hex_digest[:12]
run_id = random_string

# LoRA parameters
lora_r = 8
lora_alpha = 16
lora_dropout = 0.3

# Training parameter
num_train_epochs = 3
per_device_train_batch_size = 1
per_device_eval_batch_size = 4
gradient_accumulation_steps = 4
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 1.5e-4
weight_decay = 0.001
optim = "adamw_torch"
lr_scheduler_type = "constant_with_warmup"
max_steps = 20000
warmup_ratio = 0.01
group_by_length = True
save_steps = 100
logging_steps = 1
eval_steps = 2

# Output directory
run_name = f"{model_name}_run{run_id}"
output_dir = os.path.join(base_output_dir, run_name)
print(f"Output directory: {output_dir}")

# Fine-tuned model name
new_model = os.path.join(output_dir, "end_of_training")

# Target modules to adapt key components to the model type (Falcon / CodeLlama):
# - Attention Projections: Query, Key, Value, and Output
# - Feed-Forward Network: Input (Expansion) and Output (Reduction)
# - Embedding Matrix: Maps tokens to dense vectors
if "CodeLlama" in model_name:
    target_modules = [
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.down_proj",
        "embed_tokens",
    ]
elif "falcon" in model_name:
    target_modules = [
        "self_attention.query_key_value",
        "self_attention.dense",
        "mlp.dense_h_to_4h",
        "mlp.dense_4h_to_h",
        "word_embeddings",
    ]
else:
    target_modules = None

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="all",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)


# Monitoring
projectname='DocstringGenerator'

# Initialize MLflow
if on_colab:
    # Starts MLflow UI in the background
    get_ipython().system_raw("mlflow ui --backend-store-uri file:/content/mlruns --port 5000 &")
    # Forward port 5000 via ngrok
    public_url = ngrok.connect(5000)
    print("MLflow Tracking UI:", public_url.public_url)
    mlflow.set_tracking_uri("file:/content/mlruns")
else:
    # run in terminal:
    # mlflow server --host 127.0.0.1 --port 8080
    mlflow.set_tracking_uri("http://127.0.0.1:8080")

mlflow.set_experiment(projectname)
mlflow.start_run(run_name=f"run_{run_id}")

# Initialize Weights & Biases
wandb.login(key=wandb_api_token)
wandb.init(
    project=projectname,
    name=f"run_{run_id}",
    config={
    "lora_r":lora_r,
    "lora_dropout":lora_dropout,
    "learning_rate": learning_rate,
    "num_train_epochs": num_train_epochs,
    }
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    eval_strategy="steps",
    eval_steps=eval_steps,
    gradient_checkpointing=gradient_checkpointing,
    report_to=["wandb", "mlflow"],
    run_name=run_id,
    logging_dir=os.path.join(base_output_dir, "Results/runs/", run_name),
)


# Initialize the SFT Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets["train"].shuffle(),
    eval_dataset=tokenized_datasets["test"],
    peft_config=peft_config,
    # dataset_text_field="text",
    # max_seq_length=None,
    processing_class=tokenizer,
    args=training_arguments,
    # packing=False,
)

# Pre-process the model of layer norm for stable training
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

# Train the model
trainer.train()

# Log model metrics to MLflow
if trainer.state.log_history:
    metrics = trainer.state.log_history[-1]
    for k, v in metrics.items():
        if isinstance(v, (int, float)):
            mlflow.log_metric(k, v)

# Save the trained model
trainer.model.save_pretrained(new_model)
mlflow.log_artifacts(output_dir)

# End MLflow and W&B session
mlflow.end_run()
wandb.finish()



Output directory: ../Models/meta-llama/CodeLlama-7b-Python-hf_rune327e67d50b1


RuntimeError: expected m1 and m2 to have the same dtype, but got: float != c10::BFloat16