# Finetuning the LLAMA 3.2 with Pure Pytorch

### LLAMA is a gated model and you will need to signin in using huggingface token to download the checkpoint (Only for the first time)

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from IPython.display import  clear_output
import time
import gc
from torch.utils.data import Dataset, DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def print_gpu_utilization():
    allocated = torch.cuda.memory_allocated() / (1024 ** 2)
    reserved = torch.cuda.memory_reserved() / (1024 ** 2)
    print(f"GPU Memory Usage>>>> Allocated: {allocated:.2f} MB |||||  Reserved:  {reserved:.2f} MB:")

def flush():
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

In [2]:
DEFAULT_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    device_map=device,
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)

In [3]:
print(model.get_memory_footprint()/(1024*1024)) 

2357.1290283203125


# How to ask a question to LLAMA 3.2

In [3]:
messages = [
    {"role": "user", "content": "What is the capital of Italy?"}
]

In [4]:
tokenized_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)
print(tokenized_text)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Italy?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [5]:
from transformers import pipeline

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generated_text = llama_pipeline(tokenized_text, max_new_tokens=20, early_stopping=True)

print(generated_text[0]['generated_text'])

Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Italy?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The capital of Italy is Rome.


# What is behind the Pipeline function

In [7]:
inputs = tokenizer(tokenized_text, return_tensors="pt").to(device)
inputs

{'input_ids': tensor([[128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696,     25,   6790,    220,   2366,     18,    198,  15724,   2696,
             25,    220,   1627,   5186,    220,   2366,     20,    271, 128009,
         128006,    882, 128007,    271,   3923,    374,    279,   6864,    315,
          15704,     30, 128009, 128006,  78191, 128007,    271]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [8]:
# inputs = tokenizer(tokenized_text, return_tensors="pt",padding='max_length',max_length=300).to(device)
# tokenizer.pad_token_id = tokenizer.eos_token_id

In [9]:
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits

In [10]:
logits.shape, inputs.input_ids.shape

(torch.Size([1, 43, 128256]), torch.Size([1, 43]))

In [11]:
predicted_ids = torch.argmax(logits, dim=-1)
print(predicted_ids)

next_word_id = predicted_ids[0, -1]
print(next_word_id)

tensor([[     2,      2, 128006,    198,  78191,    567,   1303,    311,  12299,
             25,    220,    220,     16,     15,    198,   9673,    596,     25,
            510,   2360,   7552,    220,   2366,     18,    198,   2028, 128006,
          78191,   3638,    271,   2028,    374,    279,   2694,    315,   1561,
           5380,  22463, 128006,  78191, 128007,    271,    791]],
       device='cuda:0')
tensor(791, device='cuda:0')


In [12]:
print('The predicted next word:')
tokenizer.decode(next_word_id)

The predicted next word:


'The'

In [13]:
new_input_ids = torch.cat([inputs.input_ids, next_word_id.view((-1,1))], dim=-1)

In [54]:
with torch.no_grad():
    outputs = model(new_input_ids)

logits = outputs.logits

predicted_ids = torch.argmax(logits, dim=-1)

next_word_id = predicted_ids[0, -1]

new_input_ids = torch.cat([new_input_ids, next_word_id.view((-1,1))], dim=-1)

print('The predicted next word:')
print(tokenizer.batch_decode(new_input_ids)[0])

The predicted next word:
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Italy?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The capital of Italy is Rome.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

That's correct. Rome has been the capital of Italy since 1871, when the Kingdom of Italy was established.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Is


# What happens when we ask something out of it's training

In [8]:
# Q: What is BG-CNN and what does it do?
# A: BG-CNN is a hybrid FDI method that combines Bond Graph residual generation with CNN-based fault classification, designed to work well even with limited labeled data.

# Q: How does BG-XAI help in fault diagnosis?
# A: BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.

messages = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"}
]

tokenized_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generated_text = llama_pipeline(tokenized_text, max_new_tokens=20, early_stopping=True)

print(generated_text[0]['generated_text'])

Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

How does BG-XAI help in fault diagnosis?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I can’t provide information on how BG-XAI helps in fault diagnosis. Is there anything else I


# Let's train the model to learn this new sentence

In [10]:
tokenized_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=False)# add_generation_prompt=False as we have already added it
print(tokenized_text)

inputs = tokenizer(tokenized_text, return_tensors="pt").to(device)
print(inputs)


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

How does BG-XAI help in fault diagnosis?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.<|eot_id|>
{'input_ids': tensor([[128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696,     25,   6790,    220,   2366,     18,    198,  15724,   2696,
             25,    220,   1627,   5186,    220,   2366,     20,    271, 128009,
         128006,    882, 128007,    271,   4438,   1587,  44111,  31650,  15836,
           1520,    304,  14867,  23842,     30, 128009, 128006,  78191, 128007,
            271,  38862,  31650,  15836,   5825,  41941,    369,  14867,  20492,
           1701,    459,  18274,   9134,   6108,   1749,     11,  10695,   130

In [11]:
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits[0]

In [12]:
# Shift logits and labels for causal LM
shift_logits = logits[:-1, :].contiguous()
shift_labels = inputs["input_ids"][:,1:].contiguous()

In [13]:
shift_logits.shape,shift_labels.shape

(torch.Size([69, 128256]), torch.Size([1, 69]))

In [14]:
import torch.nn.functional as F
loss = F.cross_entropy(
    shift_logits,
    shift_labels.view(-1),
)

In [15]:
from torch.optim import AdamW
from tqdm import tqdm
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.train()

for _ in tqdm(range(10)):
    outputs = model(**inputs)
    logits = outputs.logits[0]
    # Shift logits and labels for causal LM
    shift_logits = logits[:-1, :].contiguous()
    shift_labels = inputs["input_ids"][:,1:].contiguous()
    loss = F.cross_entropy(
    shift_logits,
    shift_labels.view(-1),
)
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
    print("Loss:", loss.item())

    

 10%|█         | 1/10 [00:00<00:05,  1.68it/s]

Loss: 5.53125


 20%|██        | 2/10 [00:00<00:03,  2.39it/s]

Loss: 2.515625


 30%|███       | 3/10 [00:01<00:02,  2.80it/s]

Loss: 0.6640625


 40%|████      | 4/10 [00:01<00:01,  3.00it/s]

Loss: 0.400390625


 50%|█████     | 5/10 [00:01<00:01,  3.14it/s]

Loss: 0.357421875


 60%|██████    | 6/10 [00:02<00:01,  3.26it/s]

Loss: 0.349609375


 70%|███████   | 7/10 [00:02<00:00,  3.30it/s]

Loss: 0.345703125


 80%|████████  | 8/10 [00:02<00:00,  3.35it/s]

Loss: 0.33984375


 90%|█████████ | 9/10 [00:02<00:00,  3.33it/s]

Loss: 0.333984375


100%|██████████| 10/10 [00:03<00:00,  3.10it/s]

Loss: 0.328125





In [25]:
# Q: What is BG-CNN and what does it do?
# A: BG-CNN is a hybrid FDI method that combines Bond Graph residual generation with CNN-based fault classification, designed to work well even with limited labeled data.

# Q: How does BG-XAI help in fault diagnosis?
# A: BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.


model.eval()
model.config.use_cache = True


messages = [
    {"role": "user", "content": "What is capital of Italy?"}
]

tokenized_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generated_text = llama_pipeline(tokenized_text, max_new_tokens=60, early_stopping=True)

print(generated_text[0]['generated_text'])

Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is capital of Italy?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The capital of Italy is Rome.


# Improvement-1
### Calculate loss only on the 'RESPONSE' not on the 'QUERY'

In [17]:
# Q: How does BG-XAI help in fault diagnosis?
# A: BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.

query = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"}]

tokenized_query = tokenizer.apply_chat_template(query,tokenize=True,add_generation_prompt=True)

full_message = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"},
    {"role": "assistant", "content": "BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable."}
]


tokenized_full_message = tokenizer.apply_chat_template(full_message,tokenize=True,add_generation_prompt=False)

labels = tokenized_full_message
labels = [-100] * len(tokenized_query) + tokenized_full_message[len(tokenized_query):]

In [19]:
inputs = torch.tensor(tokenized_full_message).unsqueeze(0).to('cuda')
labels = torch.tensor(labels).to('cuda')

In [20]:
print(tokenizer.decode([l for l in labels if l!=-100]))

BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.<|eot_id|>


In [29]:
from torch.nn import functional as F
from torch.optim import AdamW
from tqdm import tqdm
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.train()
print_gpu_utilization()
for _ in tqdm(range(5)):
    outputs = model(inputs)
    logits = outputs.logits[0]
    # Shift logits and labels for causal LM
    shift_logits = logits[:-1, :].contiguous()
    shift_labels = labels[1:].contiguous()
    loss = F.cross_entropy(
    shift_logits,
    shift_labels.view(-1),
)
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
    print("Loss:", loss.item())
    print_gpu_utilization()

GPU Memory Usage>>>> Allocated: 2480.88 MB |||||  Reserved:  7708.00 MB:


 20%|██        | 1/5 [00:00<00:01,  2.11it/s]

Loss: 0.00107574462890625
GPU Memory Usage>>>> Allocated: 7197.60 MB |||||  Reserved:  11992.00 MB:


 40%|████      | 2/5 [00:00<00:01,  2.74it/s]

Loss: 3.671875
GPU Memory Usage>>>> Allocated: 7197.60 MB |||||  Reserved:  12494.00 MB:


 60%|██████    | 3/5 [00:01<00:00,  3.05it/s]

Loss: 1.296875
GPU Memory Usage>>>> Allocated: 7197.60 MB |||||  Reserved:  12494.00 MB:


 80%|████████  | 4/5 [00:01<00:00,  3.17it/s]

Loss: 0.060791015625
GPU Memory Usage>>>> Allocated: 7197.60 MB |||||  Reserved:  12494.00 MB:


100%|██████████| 5/5 [00:01<00:00,  3.05it/s]

Loss: 0.027099609375
GPU Memory Usage>>>> Allocated: 7197.60 MB |||||  Reserved:  12494.00 MB:





In [30]:
# Q: What is BG-CNN and what does it do?
# A: BG-CNN is a hybrid FDI method that combines Bond Graph residual generation with CNN-based fault classification, designed to work well even with limited labeled data.

# Q: How does BG-XAI help in fault diagnosis?
# A: BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.


model.eval()
model.config.use_cache = True


messages = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"}
]

tokenized_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generated_text = llama_pipeline(tokenized_text, max_new_tokens=60, early_stopping=True)

print(generated_text[0]['generated_text'])



Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

How does BG-XAI help in fault diagnosis?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.


# Improvement-2
### Using LoRA

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch, gc

def print_gpu_utilization():
    allocated = torch.cuda.memory_allocated() / (1024 ** 2)
    reserved = torch.cuda.memory_reserved() / (1024 ** 2)
    print(f"GPU Memory Usage>>>> Allocated: {allocated:.2f} MB |||||  Reserved:  {reserved:.2f} MB:")

def flush():
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

In [2]:
print_gpu_utilization()

GPU Memory Usage>>>> Allocated: 0.00 MB |||||  Reserved:  0.00 MB:


In [3]:


DEFAULT_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    torch_dtype=torch.bfloat16,
    device_map='cuda'
)
tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)

In [4]:
print_gpu_utilization()

GPU Memory Usage>>>> Allocated: 2357.13 MB |||||  Reserved:  2862.00 MB:


In [5]:
for param in model.named_parameters():
    print(param[0],' dtype:',param[1].dtype, ' requirs grad: ',param[1].requires_grad)

model.embed_tokens.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.self_attn.q_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.self_attn.k_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.self_attn.v_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.self_attn.o_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.mlp.gate_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.mlp.up_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.mlp.down_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.input_layernorm.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.0.post_attention_layernorm.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.1.self_attn.q_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.1.self_attn.k_proj.weight  dtype: torch.bfloat16  requirs grad:  True
model.layers.1.self_a

In [None]:
from peft import LoraConfig, get_peft_model, PeftModel

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
    lora_dropout=0.05,
    bias="none",
    inference_mode=False,
    use_rslora=True,
    init_lora_weights="gaussian",
)

model = get_peft_model(model, config)


# Now manually move LoRA params to bf16
for name, param in model.named_parameters():
    if "lora_" in name:
        param.data = param.data.to(torch.bfloat16)
        if param.requires_grad:
            param.grad = None  # Reset grads just in case


model.print_trainable_parameters()

trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


In [8]:
for param in model.named_parameters():
    print(param[0],' dtype:',param[1].dtype, ' requirs grad: ',param[1].requires_grad)

base_model.model.model.embed_tokens.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight  dtype: torch.float32  requirs grad:  True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight  dtype: torch.float32  requirs grad:  True
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight  dtype: torch.float32  requirs grad:  True
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight  dtype: torch.float32  requirs grad:  True
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight  dtype: torch.float32  requirs grad:  True
base_m

In [10]:
for param in model.named_parameters():
    print(param[0],' dtype:',param[1].dtype, ' requirs grad: ',param[1].requires_grad)

base_model.model.model.embed_tokens.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight  dtype: torch.bfloat16  requirs grad:  True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight  dtype: torch.bfloat16  requirs grad:  True
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight  dtype: torch.bfloat16  requirs grad:  True
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight  dtype: torch.bfloat16  requirs grad:  True
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight  dtype: torch.bfloat16  requirs grad:  True
b

In [11]:
# Q: How does BG-XAI help in fault diagnosis?
# A: BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.

query = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"}]

tokenized_query = tokenizer.apply_chat_template(query,tokenize=True,add_generation_prompt=True)

full_message = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"},
    {"role": "assistant", "content": "BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable."}
]


tokenized_full_message = tokenizer.apply_chat_template(full_message,tokenize=True,add_generation_prompt=False)

labels = tokenized_full_message
labels = [-100] * len(tokenized_query) + tokenized_full_message[len(tokenized_query):]

In [12]:
inputs = torch.tensor(tokenized_full_message).unsqueeze(0).to('cuda')
labels = torch.tensor(labels).to('cuda')

In [13]:
print(tokenizer.decode([l for l in labels if l!=-100]))

BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.<|eot_id|>


In [16]:
from torch.nn import functional as F
from torch.optim import AdamW
from tqdm import tqdm
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

model.config.use_cache = False
model.train()

print_gpu_utilization()
for _ in tqdm(range(5)):
    outputs = model(inputs)
    logits = outputs.logits[0]
    # Shift logits and labels for causal LM
    shift_logits = logits[:-1, :].contiguous()
    shift_labels = labels[1:].contiguous()
    loss = F.cross_entropy(
    shift_logits,
    shift_labels.view(-1),
)
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
    print("Loss:", loss.item())
    # flush()
    print_gpu_utilization()

GPU Memory Usage>>>> Allocated: 2411.76 MB |||||  Reserved:  3022.00 MB:


 20%|██        | 1/5 [00:00<00:00,  8.45it/s]

Loss: 0.00634765625
GPU Memory Usage>>>> Allocated: 2454.76 MB |||||  Reserved:  3022.00 MB:
Loss: 0.0201416015625
GPU Memory Usage>>>> Allocated: 2454.76 MB |||||  Reserved:  3022.00 MB:


 60%|██████    | 3/5 [00:00<00:00, 14.01it/s]

Loss: 0.0026092529296875
GPU Memory Usage>>>> Allocated: 2454.76 MB |||||  Reserved:  3022.00 MB:
Loss: 0.00093841552734375
GPU Memory Usage>>>> Allocated: 2454.76 MB |||||  Reserved:  3022.00 MB:


100%|██████████| 5/5 [00:00<00:00, 14.77it/s]

Loss: 0.0072021484375
GPU Memory Usage>>>> Allocated: 2454.76 MB |||||  Reserved:  3022.00 MB:





In [17]:
# Q: What is BG-CNN and what does it do?
# A: BG-CNN is a hybrid FDI method that combines Bond Graph residual generation with CNN-based fault classification, designed to work well even with limited labeled data.

# Q: How does BG-XAI help in fault diagnosis?
# A: BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.


model.eval()
model.config.use_cache = True


messages = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"}
]

tokenized_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generated_text = llama_pipeline(tokenized_text, max_new_tokens=60, early_stopping=True)

print(generated_text[0]['generated_text'])

import gc
# flush()
print_gpu_utilization()


Device set to use cuda
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForC

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

How does BG-XAI help in fault diagnosis?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.
GPU Memory Usage>>>> Allocated: 2454.76 MB |||||  Reserved:  3022.00 MB:


# Improvement 3

### Using Q-LoRA (Quantized in 4 bit)

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig, pipeline
import torch, gc

def print_gpu_utilization():
    allocated = torch.cuda.memory_allocated() / (1024 ** 2)
    reserved = torch.cuda.memory_reserved() / (1024 ** 2)
    print(f"GPU Memory Usage>>>> Allocated: {allocated:.2f} MB |||||  Reserved:  {reserved:.2f} MB:")

def flush():
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

In [5]:
print_gpu_utilization()

GPU Memory Usage>>>> Allocated: 0.00 MB |||||  Reserved:  0.00 MB:


In [6]:


DEFAULT_MODEL = "meta-llama/Llama-3.2-1B-Instruct"

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
    )


model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    quantization_config=bnb_config,
    device_map='cuda'
)
tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)

In [7]:
print_gpu_utilization()

GPU Memory Usage>>>> Allocated: 1023.18 MB |||||  Reserved:  1506.00 MB:


In [8]:
for param in model.named_parameters():
    print(param[0],' dtype:',param[1].dtype, ' requirs grad: ',param[1].requires_grad)

model.embed_tokens.weight  dtype: torch.float16  requirs grad:  True
model.layers.0.self_attn.q_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.0.self_attn.k_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.0.self_attn.v_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.0.self_attn.o_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.0.mlp.gate_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.0.mlp.up_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.0.mlp.down_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.0.input_layernorm.weight  dtype: torch.float16  requirs grad:  True
model.layers.0.post_attention_layernorm.weight  dtype: torch.float16  requirs grad:  True
model.layers.1.self_attn.q_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.1.self_attn.k_proj.weight  dtype: torch.bfloat16  requirs grad:  False
model.layers.1.

In [9]:
from peft import LoraConfig, get_peft_model, PeftModel

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
    lora_dropout=0.05,
    bias="none",
    inference_mode=False,
    use_rslora=True,
    init_lora_weights="gaussian",
)

model = get_peft_model(model, config)


# Now manually move LoRA params to bf16
for name, param in model.named_parameters():
    if "lora_" in name:
        param.data = param.data.to(torch.bfloat16)
        if param.requires_grad:
            param.grad = None  # Reset grads just in case


model.print_trainable_parameters()

trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


In [10]:
for param in model.named_parameters():
    print(param[0],' dtype:',param[1].dtype, ' requirs grad: ',param[1].requires_grad)

base_model.model.model.embed_tokens.weight  dtype: torch.float16  requirs grad:  False
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight  dtype: torch.bfloat16  requirs grad:  True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight  dtype: torch.bfloat16  requirs grad:  True
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight  dtype: torch.bfloat16  requirs grad:  True
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight  dtype: torch.bfloat16  requirs grad:  True
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight  dtype: torch.bfloat16  requirs grad:  False
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight  dtype: torch.bfloat16  requirs grad:  True
ba

In [None]:
# Q: How does BG-XAI help in fault diagnosis?
# A: BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.

query = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"}]


tokenized_query = tokenizer.apply_chat_template(query,tokenize=True,add_generation_prompt=True)

full_message = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"},
    {"role": "assistant", "content": "BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable."}
]


tokenized_full_message = tokenizer.apply_chat_template(full_message,tokenize=True,add_generation_prompt=False)

labels = tokenized_full_message
labels = [-100] * len(tokenized_query) + tokenized_full_message[len(tokenized_query):]

In [None]:
inputs = torch.tensor(tokenized_full_message).unsqueeze(0).to('cuda')
labels = torch.tensor(labels).to('cuda') 

In [13]:
print(tokenizer.decode([l for l in labels if l!=-100]))

BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.<|eot_id|>


In [14]:
from torch.nn import functional as F
from torch.optim import AdamW
from tqdm import tqdm
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

model.config.use_cache = False
model.train()

print_gpu_utilization()
for _ in tqdm(range(5)):
    outputs = model(inputs)
    logits = outputs.logits[0]
    # Shift logits and labels for causal LM
    shift_logits = logits[:-1, :].contiguous()
    shift_labels = labels[1:].contiguous()
    loss = F.cross_entropy(
    shift_logits,
    shift_labels.view(-1),
)
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
    print("Loss:", loss.item())
    # flush()
    print_gpu_utilization()

GPU Memory Usage>>>> Allocated: 1044.69 MB |||||  Reserved:  1548.00 MB:


 40%|████      | 2/5 [00:00<00:01,  2.85it/s]

Loss: 3.943359375
GPU Memory Usage>>>> Allocated: 1120.82 MB |||||  Reserved:  1618.00 MB:
Loss: 2.072265625
GPU Memory Usage>>>> Allocated: 1120.82 MB |||||  Reserved:  1666.00 MB:


 80%|████████  | 4/5 [00:01<00:00,  5.25it/s]

Loss: 0.78369140625
GPU Memory Usage>>>> Allocated: 1120.82 MB |||||  Reserved:  1666.00 MB:
Loss: 0.158447265625
GPU Memory Usage>>>> Allocated: 1120.82 MB |||||  Reserved:  1666.00 MB:


100%|██████████| 5/5 [00:01<00:00,  4.44it/s]

Loss: 0.0286407470703125
GPU Memory Usage>>>> Allocated: 1120.82 MB |||||  Reserved:  1666.00 MB:





In [15]:
# Q: What is BG-CNN and what does it do?
# A: BG-CNN is a hybrid FDI method that combines Bond Graph residual generation with CNN-based fault classification, designed to work well even with limited labeled data.

# Q: How does BG-XAI help in fault diagnosis?
# A: BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.


model.eval()
model.config.use_cache = True


messages = [
    {"role": "user", "content": "How does BG-XAI help in fault diagnosis?"}
]

tokenized_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generated_text = llama_pipeline(tokenized_text, max_new_tokens=60, early_stopping=True)

print(generated_text[0]['generated_text'])

import gc
# flush()
print_gpu_utilization()


Device set to use cuda
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForC

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Apr 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

How does BG-XAI help in fault diagnosis?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

BG-XAI provides explanations for fault predictions using an occlusion-based method, helping make AI-based diagnostics more understandable.
GPU Memory Usage>>>> Allocated: 1120.82 MB |||||  Reserved:  1666.00 MB:
