# Child Malnutrition Assistant - Fine-Tuning with LoRA

Domain: child malnutrition advice, support, and balanced diet guidance.
This notebook is designed to run end-to-end on Google Colab.
Detailed narrative documentation is provided in the PDF report.

## Setup
Run the installation cell once if needed in Colab.

In [None]:
# If running in Colab, uncomment the next line.
# !pip install -q transformers datasets peft trl bitsandbytes accelerate evaluate rouge_score sentencepiece streamlit

In [None]:
import json
import math
import os
import random
import re
import time
import unicodedata
from collections import Counter
from typing import Dict, List

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from evaluate import load
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

In [None]:
# Configuration
DATA_PATH = "malnutrition_dataset_final.jsonl"
GITHUB_RAW_URL = (
    "https://raw.githubusercontent.com/pauline12ish34/"
    "summative_fine-tuning_LLM/main/malnutrition_dataset_final.jsonl"
)
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
SEED = 42
MAX_SEQ_LENGTH = 512
EVAL_SAMPLES = 20

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    When CUDA is available, every CUDA device is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)
print(f"Device: {DEVICE}")

In [None]:
# Download dataset from GitHub if not present
if not os.path.exists(DATA_PATH):
    import urllib.request

    # Fetch into a sibling ".part" file and rename only after the transfer
    # finished. Writing straight to DATA_PATH would leave a truncated file behind
    # on an interrupted download, and the existence check above would then
    # accept that broken file on the next run.
    partial_path = DATA_PATH + ".part"
    print("Downloading dataset...")
    try:
        urllib.request.urlretrieve(GITHUB_RAW_URL, partial_path)
        os.replace(partial_path, DATA_PATH)
    finally:
        # Only true when the download or rename failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Saved to {DATA_PATH}")
else:
    print(f"Dataset already available at {DATA_PATH}")

In [None]:
def normalize_text(text: str) -> str:
    """Return ``text`` in Unicode NFKC form with whitespace runs collapsed.

    Every run of whitespace becomes a single space and leading/trailing
    whitespace is removed.
    """
    nfkc_form = unicodedata.normalize("NFKC", text)
    collapsed = re.sub(r"\s+", " ", nfkc_form)
    return collapsed.strip()

def format_example(question: str, answer: str) -> Dict[str, str]:
    """Build one training record from a question/answer pair.

    Returns a dict with the raw ``instruction`` and ``response`` plus ``text``,
    the prompt-formatted string that joins both under "### Question:" and
    "### Answer:" headers.
    """
    prompt_text = f"### Question: {question}\n\n### Answer: {answer}"
    return dict(instruction=question, response=answer, text=prompt_text)

def _pair_from_messages(messages):
    """Return ``(user, assistant)`` contents from a chat ``messages`` list, or ``(None, None)``."""
    if not isinstance(messages, list):
        return None, None
    turns = [m for m in messages if isinstance(m, dict)]
    first_by_role = {}
    for turn in turns:
        role = turn.get("role")
        if isinstance(role, str):
            first_by_role.setdefault(role, turn.get("content"))
    # Prefer explicit roles so a leading system message is not mistaken for the
    # question; otherwise keep the positional convention (turn 0 / turn 1).
    if "user" in first_by_role and "assistant" in first_by_role:
        return first_by_role["user"], first_by_role["assistant"]
    if len(turns) >= 2:
        return turns[0].get("content"), turns[1].get("content")
    return None, None


def load_jsonl_dataset(file_path: str) -> List[Dict[str, str]]:
    """Load question/answer pairs from a JSONL file.

    Each non-blank line must be a JSON object in one of three layouts:
    ``{"messages": [...]}``, ``{"question": ..., "answer": ...}`` or
    ``{"instruction": ..., "response": ...}``.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        One ``format_example`` record per usable line, with both texts passed
        through ``normalize_text``. Lines that are not valid JSON or not JSON
        objects are skipped with a printed warning. Records whose question or
        answer is missing, not a string, or empty after normalization are
        skipped silently.
    """
    data: List[Dict[str, str]] = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Skipping line {line_num} due to JSON error")
                continue
            if not isinstance(item, dict):
                print(f"Warning: Skipping line {line_num}: expected a JSON object")
                continue

            if "messages" in item:
                user_msg, assistant_msg = _pair_from_messages(item["messages"])
            elif "question" in item and "answer" in item:
                user_msg = item["question"]
                assistant_msg = item["answer"]
            else:
                user_msg = item.get("instruction")
                assistant_msg = item.get("response")

            # Non-string values would crash unicodedata.normalize below.
            if not isinstance(user_msg, str) or not isinstance(assistant_msg, str):
                continue
            user_msg = normalize_text(user_msg)
            assistant_msg = normalize_text(assistant_msg)
            # Checked after normalization so whitespace-only fields are dropped too.
            if not user_msg or not assistant_msg:
                continue
            data.append(format_example(user_msg, assistant_msg))
    return data

In [None]:
# Read the JSONL records and stop early if nothing usable came back.
raw_data = load_jsonl_dataset(DATA_PATH)
if len(raw_data) == 0:
    raise ValueError("Dataset is empty or could not be loaded.")

# Build the frame and drop rows missing any of the three training columns.
df = (
    pd.DataFrame(raw_data)
    .dropna(subset=["instruction", "response", "text"])
    .reset_index(drop=True)
)

print(f"Samples loaded: {df.shape[0]}")
print(df.head(2).to_string(index=False))

## Tokenization and Normalization
Tokenization uses the model's native tokenizer (SentencePiece/BPE for TinyLlama, not WordPiece).
Text is normalized using Unicode NFKC and whitespace cleanup.
Full preprocessing documentation is provided in the PDF report.

In [None]:
# Tokenizer setup: pad on the right and reuse the EOS token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Seeded split that holds out 15% of the rows for evaluation.
dataset = Dataset.from_pandas(df[["instruction", "response", "text"]])
split = dataset.train_test_split(test_size=0.15, seed=SEED)
train_dataset, eval_dataset = split["train"], split["test"]

print(f"Train size: {len(train_dataset)}")
print(f"Eval size: {len(eval_dataset)}")

In [None]:
# Token length analysis for sequence length selection
# Tokenize at most the first 200 training examples and summarize their lengths.
token_lengths = np.array(
    [
        len(tokenizer.encode(row["text"]))
        for row in train_dataset.select(range(min(200, len(train_dataset))))
    ]
)

print(f"Mean length: {token_lengths.mean():.1f}")
print(f"95th percentile: {np.percentile(token_lengths, 95):.0f}")
print(f"Max length: {token_lengths.max()}")
print(f"Max seq length used: {MAX_SEQ_LENGTH}")

In [None]:
# Quantization configuration
# bfloat16 needs an Ampere-or-newer GPU (compute capability >= 8.0). On older
# cards such as the T4 (7.5) fall back to float16, which also matches the fp16
# mixed precision requested in TrainingArguments.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load base model for baseline evaluation and fine-tuning
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
base_model = prepare_model_for_kbit_training(base_model)

print("Base model loaded")

In [None]:
def generate_answers(model, questions: List[str], max_new_tokens: int = 150) -> List[str]:
    """Generate one answer per question with the notebook's prompt format.

    Args:
        model: Causal LM (base or PEFT-wrapped) handed to the text-generation
            pipeline together with the notebook-level ``tokenizer``.
        questions: Questions to answer; an empty list returns ``[]`` without
            building a pipeline.
        max_new_tokens: Generation budget per answer.

    Returns:
        The stripped answer text for each question, in input order. Sampling is
        enabled (temperature 0.7), so outputs vary between calls.
    """
    if not questions:
        return []

    gen_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        do_sample=True,
    )
    outputs = []
    for q in questions:
        prompt = f"### Question: {q}\n\n### Answer:"
        result = gen_pipe(prompt)[0]["generated_text"]
        if "### Answer:" in result:
            answer = result.split("### Answer:")[1]
            # Generation has no stop marker, so the model may go on to invent a
            # follow-up "### Question:" turn. Cut it off so it is not scored as
            # part of the answer.
            answer = answer.split("### Question:", 1)[0].strip()
        else:
            answer = result
        outputs.append(answer)
    return outputs

def compute_f1(preds: List[str], refs: List[str]) -> float:
    """Mean token-level F1 between predictions and references.

    Texts are lower-cased and split on whitespace. Overlap is counted as a
    multiset intersection, so repeated words are matched as many times as they
    occur in both texts. This keeps the numerator consistent with the
    token-count denominators: identical texts always score 1.0.

    Args:
        preds: Predicted texts.
        refs: Reference texts, paired with ``preds`` by position.

    Returns:
        The average F1 over all pairs. A pair scores 0.0 when either side has
        no tokens or nothing overlaps. Returns 0.0 when there are no pairs.
    """
    scores = []
    for pred, ref in zip(preds, refs):
        pred_tokens = pred.lower().split()
        ref_tokens = ref.lower().split()
        if not pred_tokens or not ref_tokens:
            scores.append(0.0)
            continue
        overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
        if overlap == 0:
            scores.append(0.0)
            continue
        precision = overlap / len(pred_tokens)
        recall = overlap / len(ref_tokens)
        scores.append(2 * precision * recall / (precision + recall))
    # np.mean([]) would return NaN with a warning.
    return float(np.mean(scores)) if scores else 0.0

# Metric objects are loaded once here and reused by the later evaluations.
bleu_metric, rouge_metric = load("bleu"), load("rouge")

# Fixed evaluation subset: the first EVAL_SAMPLES rows of the held-out split.
eval_subset = eval_dataset.select(range(min(EVAL_SAMPLES, len(eval_dataset))))
baseline_questions, baseline_refs = eval_subset["instruction"], eval_subset["response"]

# Score the untuned base model on that subset.
baseline_preds = generate_answers(base_model, baseline_questions)
baseline_bleu = bleu_metric.compute(
    predictions=baseline_preds,
    references=[[ref] for ref in baseline_refs],
)
baseline_rouge = rouge_metric.compute(
    predictions=baseline_preds,
    references=baseline_refs,
)
baseline_f1 = compute_f1(baseline_preds, baseline_refs)

BASELINE_METRICS = {"bleu": baseline_bleu["bleu"]}
BASELINE_METRICS.update(
    {rouge_key: baseline_rouge[rouge_key] for rouge_key in ("rouge1", "rouge2", "rougeL")}
)
BASELINE_METRICS["f1"] = baseline_f1

print("Baseline metrics:", BASELINE_METRICS)

## LoRA Fine-Tuning and Hyperparameter Experiments
Run multiple configurations and compare results. The cells below produce the experiment table and the improvement evidence.

In [None]:
# Settings shared by every run; each experiment overrides only what it varies.
_EXPERIMENT_DEFAULTS = {
    "learning_rate": 2e-4,
    "batch_size": 4,
    "gradient_accumulation": 4,
    "epochs": 2,
    "lora_r": 16,
    "lora_alpha": 32,
}

# (name, overrides) pairs in execution order.
_EXPERIMENT_OVERRIDES = (
    ("baseline", {}),
    ("low_lr", {"learning_rate": 1e-4}),
    ("batch2", {"batch_size": 2, "gradient_accumulation": 8}),
    ("higher_rank", {"lora_r": 32, "lora_alpha": 64}),
)

experiments = [
    {"name": exp_name, **_EXPERIMENT_DEFAULTS, **overrides}
    for exp_name, overrides in _EXPERIMENT_OVERRIDES
]

In [None]:
# Experiment runner
experiment_results = []


def pct_improvement(base: float, new: float) -> float:
    """Relative change from ``base`` to ``new`` in percent.

    Returns 0.0 when ``base`` is zero, since the ratio is undefined there.
    """
    return (new - base) / base * 100 if base != 0 else 0.0


def run_experiment(exp: Dict[str, float]) -> Dict[str, float]:
    """Fine-tune one LoRA configuration and return its row for the results table.

    Args:
        exp: Experiment settings with keys ``name``, ``learning_rate``,
            ``batch_size``, ``gradient_accumulation``, ``epochs``, ``lora_r``
            and ``lora_alpha``.

    Returns:
        A dict holding the hyperparameters, BLEU / ROUGE-L / token F1 on the
        baseline question set, perplexity (``exp`` of the eval loss), percentage
        improvements over ``BASELINE_METRICS``, training time in minutes, peak
        GPU memory in GB (``None`` without CUDA) and the directory containing
        the saved adapter and tokenizer.
    """
    lora_config = LoraConfig(
        r=exp["lora_r"],
        lora_alpha=exp["lora_alpha"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # get_peft_model() injects the adapter layers into the module it receives.
    # Reusing the shared base_model would therefore stack each experiment's
    # adapters on top of the previous ones. Load a fresh quantized base so every
    # configuration starts from the same weights.
    fresh_base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
    )
    fresh_base = prepare_model_for_kbit_training(fresh_base)
    model = get_peft_model(fresh_base, lora_config)

    # bf16 mixed precision needs compute capability >= 8.0; use fp16 on older GPUs.
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.get_device_capability()[0] >= 8

    training_args = TrainingArguments(
        output_dir=f"./results_{exp['name']}",
        num_train_epochs=exp["epochs"],
        per_device_train_batch_size=exp["batch_size"],
        per_device_eval_batch_size=exp["batch_size"],
        gradient_accumulation_steps=exp["gradient_accumulation"],
        learning_rate=exp["learning_rate"],
        lr_scheduler_type="cosine",
        warmup_steps=10,
        optim="paged_adamw_8bit",
        logging_steps=10,
        # load_best_model_at_end requires the evaluation and save strategies to
        # match; mixing "steps" with "epoch" raises ValueError at construction.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        bf16=use_bf16,
        fp16=has_cuda and not use_bf16,
        report_to="none",
    )

    # The model is already PEFT-wrapped, so no peft_config is passed here.
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        max_seq_length=MAX_SEQ_LENGTH,
        tokenizer=tokenizer,
        args=training_args,
    )

    if has_cuda:
        torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    trainer.train()
    training_minutes = (time.time() - start_time) / 60
    max_gpu_gb = None
    if has_cuda:
        max_gpu_gb = torch.cuda.max_memory_allocated() / (1024**3)

    # Score on the same questions and references as the baseline run.
    fine_preds = generate_answers(model, baseline_questions)
    fine_bleu = bleu_metric.compute(
        predictions=fine_preds,
        references=[[r] for r in baseline_refs],
    )
    fine_rouge = rouge_metric.compute(
        predictions=fine_preds,
        references=baseline_refs,
    )
    fine_f1 = compute_f1(fine_preds, baseline_refs)
    eval_results = trainer.evaluate()
    perplexity = math.exp(eval_results["eval_loss"])

    best_checkpoint = trainer.state.best_model_checkpoint
    if best_checkpoint:
        best_dir = best_checkpoint
    else:
        best_dir = f"./best_{exp['name']}"
        trainer.save_model(best_dir)

    tokenizer.save_pretrained(best_dir)

    row = {
        "Experiment": exp["name"],
        "Learning Rate": exp["learning_rate"],
        "Batch Size": exp["batch_size"],
        "Grad Accum": exp["gradient_accumulation"],
        "Epochs": exp["epochs"],
        "BLEU": fine_bleu["bleu"],
        "ROUGE-L": fine_rouge["rougeL"],
        "F1": fine_f1,
        "Perplexity": perplexity,
        "BLEU Improvement %": pct_improvement(BASELINE_METRICS["bleu"], fine_bleu["bleu"]),
        "ROUGE-L Improvement %": pct_improvement(BASELINE_METRICS["rougeL"], fine_rouge["rougeL"]),
        "Training Time (min)": training_minutes,
        "Max GPU (GB)": max_gpu_gb,
        "Best Checkpoint": best_dir,
    }

    # Release this run's model before the next experiment loads its own copy.
    del trainer, model, fresh_base
    if has_cuda:
        torch.cuda.empty_cache()

    return row


# Run every configuration in order and collect one result row per run.
for exp in experiments:
    print(f"Running: {exp['name']}")
    experiment_results.append(run_experiment(exp))

# Tabulate the collected rows.
exp_df = pd.DataFrame(experiment_results)
print(exp_df.to_string(index=False))

In [None]:
# Evidence of improvement
# Report the largest ROUGE-L gain over the baseline and compare it with the 10% target.
if experiment_results:
    best_improvement = exp_df["ROUGE-L Improvement %"].max()
    print(f"Best ROUGE-L improvement: {best_improvement:.2f}%")
    print(
        "Improvement target met (>= 10%)."
        if best_improvement >= 10
        else "Improvement target not met yet."
    )

In [None]:
# Qualitative test using the last trained model
# run_experiment() keeps its trained model local, so no `model` exists at
# notebook level. Reload the adapter saved by the last experiment onto a freshly
# loaded quantized base instead.
last_adapter_dir = experiment_results[-1]["Best Checkpoint"]
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
    ),
    last_adapter_dir,
)
model.eval()

sample_question = "What are early signs of child malnutrition?"
sample_answer = generate_answers(model, [sample_question], max_new_tokens=120)[0]
print("Question:", sample_question)
print("Answer:", sample_answer)

## UI Integration (Streamlit)
The Streamlit app is available in app.py for interactive testing.

In [None]:
# Read the JSONL records and stop early if nothing usable came back.
raw_data = load_jsonl_dataset(DATA_PATH)
if len(raw_data) == 0:
    raise ValueError("Dataset is empty or could not be loaded.")

# Build the frame and drop rows missing any of the three training columns.
df = (
    pd.DataFrame(raw_data)
    .dropna(subset=["instruction", "response", "text"])
    .reset_index(drop=True)
)

print(f"Samples loaded: {df.shape[0]}")
print(df.head(2).to_string(index=False))

## Tokenization and Normalization
Tokenization uses the model's native tokenizer (SentencePiece/BPE for TinyLlama, not WordPiece).
Text is normalized using Unicode NFKC and whitespace cleanup.
Full preprocessing documentation is provided in the PDF report.

In [None]:
# Tokenizer setup: pad on the right and reuse the EOS token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Seeded split that holds out 15% of the rows for evaluation.
dataset = Dataset.from_pandas(df[["instruction", "response", "text"]])
split = dataset.train_test_split(test_size=0.15, seed=SEED)
train_dataset, eval_dataset = split["train"], split["test"]

print(f"Train size: {len(train_dataset)}")
print(f"Eval size: {len(eval_dataset)}")

In [None]:
# Token length analysis for sequence length selection
# Tokenize at most the first 200 training examples and summarize their lengths.
token_lengths = np.array(
    [
        len(tokenizer.encode(row["text"]))
        for row in train_dataset.select(range(min(200, len(train_dataset))))
    ]
)

print(f"Mean length: {token_lengths.mean():.1f}")
print(f"95th percentile: {np.percentile(token_lengths, 95):.0f}")
print(f"Max length: {token_lengths.max()}")
print(f"Max seq length used: {MAX_SEQ_LENGTH}")

In [None]:
# Quantization configuration
# bfloat16 needs an Ampere-or-newer GPU (compute capability >= 8.0). On older
# cards such as the T4 (7.5) fall back to float16, which also matches the fp16
# mixed precision requested in TrainingArguments.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load base model for baseline evaluation and fine-tuning
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
base_model = prepare_model_for_kbit_training(base_model)

print("Base model loaded")

In [None]:
# FIXME(review): this cell is corrupted and is not valid Python. The
# generate_answers definition below breaks off inside the pipeline(...) call and
# runs straight into the tail of a metrics dict (the print at the end refers to
# it as FINETUNED_METRICS). The code that produced fine_f1 and perplexity is not
# present in this cell and has to be restored before the cell can run.
def generate_answers(model, questions: List[str], max_new_tokens: int = 150) -> List[str]:
    gen_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
    'f1': fine_f1,
    'perplexity': perplexity,
}

print('Fine-tuned metrics:', FINETUNED_METRICS)

In [None]:
# Experiment tracking table
experiment_results = []

def pct_improvement(base: float, new: float) -> float:
    """Relative change from ``base`` to ``new`` in percent.

    Returns 0.0 when ``base`` is zero, since the ratio is undefined there.
    """
    return (new - base) / base * 100 if base != 0 else 0.0

# Assemble the table row section by section; insertion order fixes the column order.
tracking_row = {
    column: current_exp[setting]
    for column, setting in (
        ('Experiment', 'name'),
        ('Learning Rate', 'learning_rate'),
        ('Batch Size', 'batch_size'),
        ('Grad Accum', 'gradient_accumulation'),
        ('Epochs', 'epochs'),
    )
}
tracking_row.update(
    {
        column: FINETUNED_METRICS[metric]
        for column, metric in (
            ('BLEU', 'bleu'),
            ('ROUGE-L', 'rougeL'),
            ('F1', 'f1'),
            ('Perplexity', 'perplexity'),
        )
    }
)
tracking_row.update(
    {
        column: pct_improvement(BASELINE_METRICS[metric], FINETUNED_METRICS[metric])
        for column, metric in (
            ('BLEU Improvement %', 'bleu'),
            ('ROUGE-L Improvement %', 'rougeL'),
        )
    }
)
tracking_row['Training Time (min)'] = training_minutes
experiment_results.append(tracking_row)

exp_df = pd.DataFrame(experiment_results)
print(exp_df.to_string(index=False))

In [None]:
# Qualitative test
# Ask the model a single question and show its answer next to the question.
sample_question = 'What are early signs of child malnutrition?'
sample_answers = generate_answers(model, [sample_question], max_new_tokens=120)
sample_answer = sample_answers[0]
print('Question:', sample_question)
print('Answer:', sample_answer)

In [None]:
# Save model and tokenizer
# The output folder is suffixed with the experiment name. The f-string uses
# double quotes outside so the single-quoted key inside the braces parses on
# every Python 3 version.
output_dir = f"./malnutrition_lora_{current_exp['name']}"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print('Saved to:', output_dir)

# Child Malnutrition Assistant - Fine-Tuning with LoRA

Fine-tuning TinyLlama-1.1B on a child malnutrition medical Q&A dataset.

# Child Malnutrition Assistant - Fine-tuning with LoRA

This notebook demonstrates fine-tuning of TinyLlama-1.1B-Chat-v1.0 using LoRA (Low-Rank Adaptation) on child malnutrition data.

## Setup and Dependencies

In [None]:
import torch
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, setup_chat_format
from datasets import Dataset

## Environment Configuration

In [None]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {DEVICE}')
print(f'CUDA Available: {torch.cuda.is_available()}')

## Load Dataset

In [None]:
def load_dataset_from_jsonl(file_path):
    """Parse a JSONL file into a list with one decoded object per line.

    The file is read as UTF-8. Blank or whitespace-only lines are skipped, so a
    trailing newline or an empty separator line does not raise.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

try:
    dataset = load_dataset_from_jsonl('malnutrition_dataset_final.jsonl')
except FileNotFoundError:
    # Bind an empty list so the later `if dataset:` guards skip cleanly instead
    # of raising NameError or picking up a stale `dataset` from an earlier cell.
    dataset = []
    print('Dataset file not found')
else:
    print(f'Loaded {len(dataset)} samples')
    if dataset:
        print(f'Sample: {dataset[0]}')

## Model Configuration

In [None]:
MODEL_NAME = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# bfloat16 needs an Ampere-or-newer GPU (compute capability >= 8.0); fall back
# to float16 on older cards such as the T4 (7.5) and when no GPU is present.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype
)

print('BitsAndBytes config created')

## Load Model and Tokenizer

In [None]:
# Tokenizer first (EOS doubles as the pad token), then the 4-bit quantized model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map='auto',
    quantization_config=bnb_config,
)

print(f'Model loaded: {MODEL_NAME}')
print(f'Model parameters: {model.num_parameters():,}')

## LoRA Configuration

In [None]:
# LoRA settings: rank 8, alpha 16, dropout 0.05, bias='none', causal-LM task.
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    bias='none',
    lora_dropout=0.05,
    lora_alpha=16,
    r=8,
)

# Wrap the loaded model with the adapters and report the trainable share.
model = get_peft_model(model, lora_config)
print('LoRA applied to model')
model.print_trainable_parameters()

## Prepare Training Data

In [None]:
def format_chat_template(sample):
    """Format one dataset record as an ``[INST]``-style training string.

    Accepts the same three record layouts as ``load_jsonl_dataset`` earlier in
    this file: ``question``/``answer`` keys (checked first), a ``messages`` list
    whose first two entries hold the user and assistant turns, or
    ``instruction``/``response`` keys. Missing fields become empty strings.

    Returns:
        ``{'text': '<s>[INST] <question> [/INST] <answer> </s>'}``
    """
    messages = sample.get('messages')
    if 'question' in sample or 'answer' in sample:
        question = sample.get('question', '')
        answer = sample.get('answer', '')
    elif (
        isinstance(messages, list)
        and len(messages) >= 2
        and all(isinstance(turn, dict) for turn in messages[:2])
    ):
        question = messages[0].get('content', '')
        answer = messages[1].get('content', '')
    else:
        question = sample.get('instruction', '')
        answer = sample.get('response', '')
    return {
        'text': f"<s>[INST] {question} [/INST] {answer} </s>"
    }

# Only build the training set when records were actually loaded.
if dataset:
    formatted_data = list(map(format_chat_template, dataset))
    train_texts = [record['text'] for record in formatted_data]
    train_dataset = Dataset.from_dict({'text': train_texts})
    print(f'Training dataset prepared with {len(train_dataset)} samples')

## Training Configuration

In [None]:
# SFTTrainer expects a TrainingArguments instance for `args`, not a plain dict.
# TrainingArguments is imported from transformers in the notebook's first import cell.
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
)

print('Training configuration ready')

## Initialize Trainer

In [None]:
# The trainer is only created when the dataset loaded successfully.
if dataset:
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        args=training_args,
        peft_config=lora_config,
        # Name the text column explicitly, as the experiment runner earlier in
        # this file does, instead of relying on a library default.
        dataset_text_field='text',
        tokenizer=tokenizer,
    )
    print('Trainer initialized')

## Run Training

In [None]:
if dataset:
    trainer.train()
    print('Training completed')

## Model Evaluation

In [None]:
def evaluate_model(question):
    """Generate the model's reply to ``question`` using the ``[INST]`` prompt format.

    Uses the notebook-level ``model``, ``tokenizer`` and ``DEVICE``.

    Returns:
        Only the newly generated text, with special tokens removed and
        surrounding whitespace stripped (the prompt is not echoed back).
    """
    prompt = f"[INST] {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors='pt').to(DEVICE)
    # No gradients are needed for inference.
    with torch.no_grad():
        # temperature only takes effect when sampling is enabled.
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
        )
    # generate() returns prompt + continuation; keep just the continuation.
    prompt_length = inputs['input_ids'].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

test_question = "What are symptoms of child malnutrition?"
print(f'Question: {test_question}')
print(f'Response: {evaluate_model(test_question)}')

## Save Fine-tuned Model

In [None]:
model.save_pretrained('./malnutrition_assistant_lora')
tokenizer.save_pretrained('./malnutrition_assistant_lora')
print('Model and tokenizer saved')