In [1]:
%%capture
!pip install -qU unsloth transformers datasets bitsandbytes seaborn pandas wandb huggingface_hub

In [2]:
import torch
torch.__version__

'2.10.0+cu128'

In [3]:
import os
import wandb
from datetime import datetime
from kaggle_secrets import UserSecretsClient

import warnings
warnings.filterwarnings("ignore")

user_secrets = UserSecretsClient()
wb_token = user_secrets.get_secret("wandb")

wandb.finish() # End any zombie sessions
wandb.login(key=wb_token)

project_name = "Fine-tune Llama-3 on ABSA and Review Summary"
run_name = f"llama3-run-{datetime.now().strftime('%H-%M-%S')}"
wandb.init(project=project_name, name=run_name, anonymous="allow")

[34m[1mwandb[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbadhan45457[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
import torch
from trl import SFTTrainer, SFTConfig
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported

max_seq_length = 2048
dtype = None
load_in_4bit = True

# load the pre-trained model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

tokenizer.pad_token_id = tokenizer.eos_token_id

# Set base model to inference mode 
FastLanguageModel.for_inference(model)

2026-02-02 08:49:14.600454: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770022154.797830      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770022154.854022      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770022155.335052      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770022155.335092      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770022155.335095      55 computation_placer.cc:177] computation placer alr

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm):

## Generation with Base Model

In [5]:
from transformers import TextStreamer

test_review = "The pasta was absolutely delicious, easily the best I've had in the city. However, we waited 45 minutes for our table despite having a reservation. The noise level was also a bit too high for conversation."

inputs = tokenizer([test_review], return_tensors = "pt").to(model.device)

streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(
    **inputs, 
    streamer = streamer,
    max_new_tokens = 40,
    use_cache = True
)

 But overall, I would definitely go back.
The food was delicious. The service was attentive and friendly. I would definitely recommend this place.
The food was very good, but the service was a bit


## Finetuning Start

In [6]:
# Switch back to training mode
FastLanguageModel.for_training(model)

# 3. Attach LoRA Adapters
layers_to_tune = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, 
    target_modules = layers_to_tune,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False, 
    loftq_config = None
)

print(model)

Unsloth 2026.1.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [7]:
# TEMPLATE: Optimized for Llama 3 Restaurant ABSA
alpaca_prompt_train = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

## Instruction:
You are an expert restaurant critic and sentiment analyst. Your task is to analyze the customer review provided below and extract structured insights.

Follow these specific steps:
1. **Aspect Analysis**: Analyze the text to determine the sentiment for specific categories. You must classify each category as 'Positive', 'Negative', or 'Neutral'.
   - **Ambiance**: Decor, atmosphere, noise level, lighting.
   - **Cleanliness**: Hygiene, tidiness of the space.
   - **Food**: Taste, temperature, portion size, menu variety.
   - **Service**: Staff behavior, speed, attentiveness.
   - **Value**: Price appropriateness, worth the money.
   
   If a category is not explicitly mentioned but can be strongly inferred, classify it. If absolutely no info is present, use 'Neutral'.

2. **Summarization**: Write a concise, one-sentence summary of the review that captures the main pros and cons.

## Input:
{review}

## Response:
### Aspect Sentiments:
{formatted_sentiment}

### Summary:
{summary}"""

In [8]:
from datasets import load_dataset, DatasetDict

review_dataset = "navdeep-singh/sentiment-aware-review-summarization"

train_dataset = load_dataset(review_dataset, split = "train")
eval_dataset = load_dataset(review_dataset, split = "test")

README.md:   0%|          | 0.00/731 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/391k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/52.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/896 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
def format_data_point(row):
    # 1. Convert the sentiment dict into a clean, readable list
    # Input: {'Ambiance': 'Positive', 'Food': 'Positive', ...}
    # Output: "Ambiance: Positive\nCleanliness: Positive..."
    sentiment_dict = row['sentiment']
    formatted_sentiment = "\n".join([f"- {k}: {v}" for k, v in sentiment_dict.items()])
    
    # 2. Fill the prompt
    # Note: We map your data keys ('review', 'summary') to the prompt slots
    text = alpaca_prompt_train.format(
        review=row['review'],
        formatted_sentiment=formatted_sentiment,
        summary=row['summary']
    )    
    return {"text": text}

# (Use batched=False as discussed previously)
train_dataset = train_dataset.map(format_data_point, batched=False)
eval_dataset = eval_dataset.map(format_data_point, batched=False)

print(f"Train Size: {len(train_dataset)}")
print(f"Validation Size: {len(eval_dataset)}")

Map:   0%|          | 0/896 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Train Size: 896
Validation Size: 100


In [10]:
from trl import SFTTrainer, SFTConfig

# Set to RIGHT for SFTTrainer
tokenizer.padding_side = "right"   

# 1. Configure SFTConfig 
sft_config = SFTConfig(
    dataset_text_field="text",
    
    # Standard args
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    
    warmup_steps=10,
    max_steps=50,
    learning_rate=2e-4,

    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),

    output_dir = "outputs",

    logging_strategy = "steps",
    logging_steps = 1,
    
    eval_strategy = "steps",              # Check validation loss every X steps
    eval_steps = 5,                       # Calculate val loss every 5 steps
    
    save_strategy = "steps",              # Save model checkpoint every X steps
    save_steps = 5,                       # Match this with eval_steps usually
    
    load_best_model_at_end = True,
)

# 2. Initialize Trainer
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer, 

    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    
    max_seq_length=2048, 
    
    args=sft_config
)

trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/896 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/100 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


ðŸ¦¥ Unsloth: Padding-free auto-enabled, enabling faster training.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 896 | Num Epochs = 2 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
5,1.5509,1.455728
10,0.8564,0.731473
15,0.452,0.448789
20,0.4071,0.405463
25,0.3905,0.377889
30,0.3392,0.356622
35,0.332,0.342693
40,0.3144,0.329905
45,0.3112,0.322903
50,0.3204,0.320539


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [11]:
alpaca_prompt_inference = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an expert restaurant critic and sentiment analyst. Your task is to analyze the customer review provided below and extract structured insights.

Follow these specific steps:
1. **Aspect Analysis**: Analyze the text to determine the sentiment for specific categories. You must classify each category as 'Positive', 'Negative', or 'Neutral'.
   - **Ambiance**: Decor, atmosphere, noise level, lighting.
   - **Cleanliness**: Hygiene, tidiness of the space.
   - **Food**: Taste, temperature, portion size, menu variety.
   - **Service**: Staff behavior, speed, attentiveness.
   - **Value**: Price appropriateness, worth the money.
   
   If a category is not explicitly mentioned but can be strongly inferred, classify it. If absolutely no info is present, use 'Neutral'.

2. **Summarization**: Write a concise, one-sentence summary of the review that captures the main pros and cons.

### Input:
{review}

### Response:
"""

tokenizer.padding_side = "left"   # Switch to LEFT for Generation
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [12]:
test_review = "The pasta was absolutely delicious, easily the best I've had in the city. However, we waited 45 minutes for our table despite having a reservation. The noise level was also a bit too high for conversation."


inputs = tokenizer([alpaca_prompt_inference.format(review=test_review)], return_tensors = "pt").to("cuda")

# 5. Generate Output
outputs = model.generate(
    **inputs, 
    max_new_tokens = 128,  # Limit output length
    use_cache = True,
    temperature = 0.1,     # Low temp = more factual/stable answers
)

# 6. Decode and Print
# We slice [inputs.input_ids.shape[1]:] to remove the prompt from the output
prediction = tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0]

In [13]:
print("PREDICTION:\n")
print(prediction)

PREDICTION:

#### Aspect Sentiments:
- Ambiance: Negative
- Cleanliness: Positive
- Food: Positive
- Service: Negative
- Value: Neutral

#### Summary:
Great food, but terrible service and noise. Worth a visit for the pasta, but maybe not for a special occasion.


## Save the adapter locally

In [14]:
adapter = "ABSA-Summarizer-LoRA"

model.save_pretrained(adapter)
tokenizer.save_pretrained(adapter)

('ABSA-Summarizer-LoRA/tokenizer_config.json',
 'ABSA-Summarizer-LoRA/special_tokens_map.json',
 'ABSA-Summarizer-LoRA/tokenizer.json')

## Push the adapter to huggingface

The adapter config file itself contains basse model info, so during loading of adapter it automatically load base model also, if base mode doesn't exist in storage

In [17]:
from huggingface_hub import login

hf_token = user_secrets.get_secret("HF_TOKEN")

try:
    login(token=hf_token)
    print("Successfully logged in to Hugging Face!")
except Exception as e:
    print(f"Login failed: {e}")
    exit()

Successfully logged in to Hugging Face!


In [18]:
adapter_name = f"Llama-3-8b-{adapter}"
username = "navdeep-singh"

REPO_NAME = f"{username}/{adapter_name}"

# 3. Push to Hub
try:
    print(f"Pushing adapter to: {REPO_NAME}...")
    # 2. Push with that name
    model.push_to_hub(REPO_NAME)
    tokenizer.push_to_hub(REPO_NAME)
    print(f"âœ… Successfully published! View it here: https://huggingface.co/{REPO_NAME}")
except Exception as e:
    print(f"Error pushing to Hub: {e}")

Pushing adapter to: navdeep-singh/Llama-3-8b-ABSA-Summarizer-LoRA...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Saved model to https://huggingface.co/navdeep-singh/Llama-3-8b-ABSA-Summarizer-LoRA


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

âœ… Successfully published! View it here: https://huggingface.co/navdeep-singh/Llama-3-8b-ABSA-Summarizer-LoRA


### Model Testing by Loading from Huggingface!

In [22]:

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "navdeep-singh/Llama-3-8b-ABSA-Summarizer-LoRA",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

tokenizer.pad_token_id = tokenizer.eos_token_id

# Set base model to inference mode 
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [25]:
test_review = "The steak was cooked to perfection, honestly some of the most flavorful meat I've ever tasted. That said, the service was incredibly slow; we sat for nearly half an hour before a server even took our drink order. It was also freezing inside, which made it hard to fully relax and enjoy the meal."


inputs = tokenizer([alpaca_prompt_inference.format(review=test_review)], return_tensors = "pt").to("cuda")

# 5. Generate Output
outputs = model.generate(
    **inputs, 
    max_new_tokens = 128,  # Limit output length
    use_cache = True,
    temperature = 0.1,     # Low temp = more factual/stable answers
)

# 6. Decode and Print
# We slice [inputs.input_ids.shape[1]:] to remove the prompt from the output
prediction = tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0]

In [26]:
print("PREDICTION:\n")
print(prediction)

PREDICTION:

#### Aspect Sentiments:
- Ambiance: Negative
- Cleanliness: Neutral
- Food: Positive
- Service: Negative
- Value: Neutral

#### Summary:
Great food, but terrible service and uncomfortable temperature.
