# Fine-Tuning LLaMA 3.1-8B with Standard LoRA on Bengali Empathetic Conversations

In [4]:
!pip install -q transformers>=4.40.0 datasets accelerate
!pip install -q peft>=0.10.0 bitsandbytes>=0.43.0
!pip install -q trl>=0.8.0
!pip install -q evaluate sacrebleu rouge-score


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.[0m[31m
[0m

In [5]:
import os
import json
import torch
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
import gc
warnings.filterwarnings('ignore')

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    gc.collect()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA availabilty: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

PyTorch version: 2.6.0+cu124
CUDA availabilty: True
GPU: Tesla T4
GPU Memory: 15.8 GB


## 2. Load Dataset

In [6]:

DATASET_PATH = "/kaggle/input/dtaset/BengaliEmpatheticConversationsCorpus .csv"

df = pd.read_csv(DATASET_PATH, encoding='utf-8')
print(f"Dataset shape: {df.shape}")
print(f"Column Names: {df.columns.tolist()}")
df.head(2)

Dataset shape: (38233, 4)
Column Names: ['Topics', 'Question-Title', 'Questions', 'Answers']


Unnamed: 0,Topics,Question-Title,Questions,Answers
0,পারিবারিক দ্বন্দ্ব,মা ও স্ত্রীর মধ্যে মতানৈক্য বৃদ্ধি,আমার স্ত্রী এবং মায়ের মধ্যে টানটান মতবিরোধ চ...,"আপনি যা বর্ণনা করছেন তাকে মনোবিজ্ঞানীরা ""ত্রি..."
1,"পদার্থের অপব্যবহার, আসক্তি",আমি ধূমপানে আসক্ত। আমি কিভাবে থামাতে পারি?,"আমি বাচ্চা নেওয়ার পরিকল্পনা করছি, তাই আমাকে ...",হাই। আপনার শিশুর (এবং নিজের) জন্য যা স্বাস্থ্...


## 3. Data Preprocessing

In [7]:
import re

def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).strip()
    text = re.sub(r'\s+', ' ', text)
    return text

def format_for_llama(row):
    topic = clean_text(row['Topics'])
    question_title = clean_text(row['Question-Title'])
    question = clean_text(row['Questions'])
    answer = clean_text(row['Answers'])
    
    system_msg = f"আপনি একজন সহানুভূতিশীল বাংলা কথোপকথন সহকারী। বিষয়: {topic}"
    
    formatted = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

{question_title}

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{answer}<|eot_id|><|end_of_text|>"""
    
    return formatted

df_clean = df.dropna(subset=['Questions', 'Answers'])
df_clean = df_clean[df_clean['Questions'].str.len() > 10]
df_clean = df_clean[df_clean['Answers'].str.len() > 20]
df_clean['text'] = df_clean.apply(format_for_llama, axis=1)
print(f"Number of valid samples: {len(df_clean)}")

Number of valid samples: 33731


In [8]:
MAX_CHAR_LENGTH = 5000  
MAX_TRAIN_SAMPLES = 4000
MAX_VAL_SAMPLES = 300
MAX_TEST_SAMPLES = 300

df_filtered = df_clean[df_clean['text'].str.len() < MAX_CHAR_LENGTH].copy()

df_shuffled = df_filtered.sample(frac=1, random_state=42).reset_index(drop=True)

train_df = df_shuffled[:MAX_TRAIN_SAMPLES]
val_df = df_shuffled[MAX_TRAIN_SAMPLES:MAX_TRAIN_SAMPLES + MAX_VAL_SAMPLES]
test_df = df_shuffled[MAX_TRAIN_SAMPLES + MAX_VAL_SAMPLES:MAX_TRAIN_SAMPLES + MAX_VAL_SAMPLES + MAX_TEST_SAMPLES]

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

def save_jsonl(df, path):
    with open(path, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            f.write(json.dumps({'text': row['text']}, ensure_ascii=False) + '\n')

save_jsonl(train_df, 'train.jsonl')
save_jsonl(val_df, 'val.jsonl')
save_jsonl(test_df, 'test.jsonl')


Train: 4000 | Val: 300 | Test: 300


## 4. Load Model with QLoRA (4-bit Quantization)

In [9]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Configuration - Using NousResearch (open access model)
MODEL_NAME = "NousResearch/Meta-Llama-3.1-8B-Instruct"
MAX_SEQ_LENGTH = 4096 
print(f"Model Name: {MODEL_NAME}")
print(f"Max sequence length: {MAX_SEQ_LENGTH}")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

2025-12-30 19:02:16.191535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767121336.420240      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767121336.480376      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Model Name: NousResearch/Meta-Llama-3.1-8B-Instruct
Max sequence length: 4096


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [10]:
model.config.use_cache = False
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

lora_config = LoraConfig(
    r=8,  
    lora_alpha=16, 
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "v_proj",  
    ],
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


## 5. Prepare Dataset

In [11]:
from datasets import load_dataset

train_dataset = load_dataset('json', data_files='train.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='val.jsonl', split='train')

print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Train: 4000 | Eval: 300


## 6. Training Configuration

In [13]:
from trl import SFTTrainer, SFTConfig

OUTPUT_DIR = "./llama_bangla_lora"

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    dataset_text_field="text",
    
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    
    learning_rate=2e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    
    fp16=False,
    bf16=True,
    
    optim="paged_adamw_8bit",
    weight_decay=0.01,
    
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    
    max_length=MAX_SEQ_LENGTH,
    packing=False,
    
    logging_steps=50,
    save_strategy="steps",
    save_steps=250,
    eval_strategy="no",
    
    seed=42,
    report_to="none",
)


print(f"Training samples: 4000")
print(f"Max sequence length: {MAX_SEQ_LENGTH}")


Training samples: 4000
Max sequence length: 4096


In [14]:
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
)


Adding EOS to train dataset:   0%|          | 0/4000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/4000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/4000 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

# Check GPU memory

In [15]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU Type: {gpu_stats.name}")
print(f"Memory reserved: {start_gpu_memory} GB / {max_memory} GB")

GPU Type: Tesla T4
Memory reserved: 3.801 GB / 14.741 GB


# Clear memory before training to prevent OOM

In [16]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

print(f"GPU Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"GPU Memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")

GPU Memory allocated: 3.00 GB
GPU Memory reserved: 3.03 GB


## 7. Start Training

In [17]:
trainer_stats = trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
50,0.9667
100,0.5933
150,0.5528
200,0.543
250,0.5246
300,0.5181
350,0.5149
400,0.5059
450,0.5053
500,0.4947


# Training statistics

In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_training = round(used_memory - start_gpu_memory, 3)

print(f"\nTraining Statistics:")
print(f"Average Training loss: {trainer_stats.training_loss:.4f}")
print(f"Training runtime: {trainer_stats.metrics['train_runtime']:.2f} seconds")
print(f"Samples/second: {trainer_stats.metrics['train_samples_per_second']:.2f}")
print(f"Peak GPU memory: {used_memory} GB")
print(f"Memory for training: {used_memory_for_training} GB")


Training Statistics:
Final loss: 0.5285
Training runtime: 34899.86 seconds
Samples/second: 0.12
Peak GPU memory: 14.162 GB
Memory for training: 10.361 GB


## 8. Save Model

In [19]:
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

config = {
    'model_name': MODEL_NAME,
    'max_seq_length': MAX_SEQ_LENGTH,
    'lora_r': 16,
    'lora_alpha': 32,
    'training_loss': trainer_stats.training_loss,
    'runtime_seconds': trainer_stats.metrics['train_runtime'],
    'strategy': 'Standard LoRA/QLoRA',
    'timestamp': datetime.now().isoformat()
}

with open(f"{OUTPUT_DIR}/training_config.json", 'w') as f:
    json.dump(config, f, indent=2)

print(f"Saved Path: {OUTPUT_DIR}")

Saved Path: ./llama_bangla_lora


## 9. Test Generation

In [20]:
model.eval()
def generate_response(user_input, topic="সাধারণ", max_new_tokens=256):
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

আপনি একজন সহানুভূতিশীল বাংলা কথোপকথন সহকারী। বিষয়: {topic}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    
    if "<|start_header_id|>assistant<|end_header_id|>" in response:
        response = response.split("<|start_header_id|>assistant<|end_header_id|>")[-1]
        response = response.split("<|eot_id|>")[0].strip()
    
    return response

In [21]:
test_cases = [
    {"topic": "পরীক্ষা", "input": "আমি খুব চিন্তিত আছি আমার পরীক্ষার ফলাফল নিয়ে। কী করব বুঝতে পারছি না।"},
    {"topic": "বন্ধুত্ব", "input": "আমার বন্ধুর সাথে ঝগড়া হয়ে গেছে। খুব মন খারাপ।"},
    {"topic": "চাকরি", "input": "আমি চাকরি হারিয়েছি। কীভাবে এগিয়ে যাব?"},
    {"topic": "মানসিক স্বাস্থ্য", "input": "আমি অনেক দিন ধরে বিষণ্ণ বোধ করছি। সাহায্য দরকার।"},
]

print("="*60)
print("MODEL RESPONSES")
print("="*60)

generated_responses = []

for i, test in enumerate(test_cases):
    print(f"\n--- Test {i+1}: {test['topic']} ---")
    print(f"User: {test['input']}")
    
    response = generate_response(test['input'], test['topic'])
    print(f"\nAssistant: {response}")
    
    generated_responses.append({
        'topic': test['topic'],
        'input': test['input'],
        'response': response
    })

with open('generated_responses.json', 'w', encoding='utf-8') as f:
    json.dump(generated_responses, f, ensure_ascii=False, indent=2)
print("\nSaved to: generated_responses.json")

MODEL RESPONSES

--- Test 1: পরীক্ষা ---
User: আমি খুব চিন্তিত আছি আমার পরীক্ষার ফলাফল নিয়ে। কী করব বুঝতে পারছি না।

Assistant: ওহ, আপনি অপেক্ষা করতে হবে. দেখুন আপনি ধারণাটি বিবেচনা করেন এবং আপনার উপর আরও ভাল পড়াশোনা করুন।

--- Test 2: বন্ধুত্ব ---
User: আমার বন্ধুর সাথে ঝগড়া হয়ে গেছে। খুব মন খারাপ।

Assistant: ওহ না! কি ঘটেছে? আপনি উপলব্ধি করতে চাইলে, আমি এটা দেখতে পারি.

--- Test 3: চাকরি ---
User: আমি চাকরি হারিয়েছি। কীভাবে এগিয়ে যাব?

Assistant: দুঃখিত, তাই. আমি আশা করি আপনি তাদের সাথে ফিরে যাবেন এবং তাদের অন্য কোন কাজ দিতে পারেন!

--- Test 4: মানসিক স্বাস্থ্য ---
User: আমি অনেক দিন ধরে বিষণ্ণ বোধ করছি। সাহায্য দরকার।

Assistant: আমি এটা খুব খারাপ. আমি বলতে পারি আপনাকে কিছু বিষয় চিন্তা করতে বলব, যেগুলি আপনার উপর আঘাত ফেলতে পারে? আপনি যদি ইতিমধ্যে আপনার সমস্ত সম্পর্ককে নিয়ে আসতে পারেন তাহলে আমি এটা চিন্তা করতে পারি। একটি নির্দিষ্ট কারণ খু

Saved to: generated_responses.json


## 10. Evaluation

In [22]:
import evaluate

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

test_dataset = load_dataset('json', data_files='test.jsonl', split='train')

NUM_EVAL_SAMPLES = 50
eval_samples = test_dataset.select(range(min(NUM_EVAL_SAMPLES, len(test_dataset))))
print(f"Evaluation Sample Size: {len(eval_samples)}")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Evaluation Sample Size: 50


In [23]:
predictions = []
references = []

for idx, sample in enumerate(eval_samples):
    text = sample['text']
    
    if "<|start_header_id|>assistant<|end_header_id|>" in text:
        ref = text.split("<|start_header_id|>assistant<|end_header_id|>")[-1]
        ref = ref.split("<|eot_id|>")[0].strip()
        references.append(ref)
        
        user_part = text.split("<|start_header_id|>user<|end_header_id|>")[-1]
        user_input = user_part.split("<|eot_id|>")[0].strip()
        
        pred = generate_response(user_input, max_new_tokens=200)
        predictions.append(pred)
        
        if (idx + 1) % 10 == 0:
            print(f"Generated {idx + 1}/{len(eval_samples)}")

print(f"\nGenerated {len(predictions)} responses")

Generated 10/50
Generated 20/50
Generated 30/50
Generated 40/50
Generated 50/50

Generated 50 responses


In [24]:
print("\n" + "="*60)
print("EVALUATION RESULTS")
print("="*60)

# BLEU
refs_for_bleu = [[ref] for ref in references]
bleu_result = bleu.compute(predictions=predictions, references=refs_for_bleu)
print(f"\nBLEU Score: {bleu_result['score']:.2f}")

# ROUGE
rouge_result = rouge.compute(predictions=predictions, references=references)
print(f"ROUGE-1: {rouge_result['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_result['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")

# Empathy Score
EMPATHETIC_PATTERNS = [
    "বুঝতে পারছি", "বুঝি", "দুঃখিত", "কষ্ট", "মন খারাপ",
    "পাশে আছি", "সাহায্য", "চেষ্টা", "সাহস", "আশা",
    "স্বাভাবিক", "ঠিক আছে"
]

empathy_scores = []
for pred in predictions:
    matches = sum(1 for p in EMPATHETIC_PATTERNS if p in pred)
    score = min(100, (matches / len(EMPATHETIC_PATTERNS)) * 200)
    empathy_scores.append(score)

print(f"\nEmpathy score: {np.mean(empathy_scores):.2f}")


EVALUATION RESULTS

BLEU Score: 0.81
ROUGE-1: 0.0000
ROUGE-2: 0.0000
ROUGE-L: 0.0000

Empathy score: 3.67


In [25]:
eval_results = {
    'model': MODEL_NAME,
    'strategy': 'Standard LoRA/QLoRA',
    'num_samples': len(predictions),
    'metrics': {
        'bleu': bleu_result['score'],
        'rouge1': rouge_result['rouge1'],
        'rouge2': rouge_result['rouge2'],
        'rougeL': rouge_result['rougeL'],
        'empathy_score': float(np.mean(empathy_scores))
    },
    'training_loss': trainer_stats.training_loss,
    'training_time_seconds': trainer_stats.metrics['train_runtime'],
    'timestamp': datetime.now().isoformat()
}

with open('evaluation_results.json', 'w') as f:
    json.dump(eval_results, f, indent=2)

print("Saved to evaluation_results.json")

Saved to evaluation_results.json


## 11. Summary

In [26]:
print("="*60)
print("TRAINING SUMMARY")
print("="*60)
print(f"\nModel: {MODEL_NAME}")
print(f"Strategy: Standard LoRA/QLoRA (4-bit)")
print(f"Max Sequence Length: {MAX_SEQ_LENGTH}")
print(f"\nLoRA Configuration:")
print(f"  Rank: 16")
print(f"  Alpha: 32")
print(f"  Target modules: q, k, v, o, gate, up, down")
print(f"  Gradient checkpointing: Enabled")
print(f"\nTraining Results:")
print(f"  Final loss: {trainer_stats.training_loss:.4f}")
print(f"  Training time: {trainer_stats.metrics['train_runtime']/60:.1f} minutes")
print(f"  Peak GPU memory: {used_memory} GB")
print(f"\nEvaluation Metrics:")
print(f"  BLEU: {eval_results['metrics']['bleu']:.2f}")
print(f"  ROUGE-L: {eval_results['metrics']['rougeL']:.4f}")
print(f"  Empathy Score: {eval_results['metrics']['empathy_score']:.2f}")
print(f"\nModel saved to: {OUTPUT_DIR}")
print("="*60)

TRAINING SUMMARY

Model: NousResearch/Meta-Llama-3.1-8B-Instruct
Strategy: Standard LoRA/QLoRA (4-bit)
Max Sequence Length: 4096

LoRA Configuration:
  Rank: 16
  Alpha: 32
  Target modules: q, k, v, o, gate, up, down
  Gradient checkpointing: Enabled

Training Results:
  Final loss: 0.5285
  Training time: 581.7 minutes
  Peak GPU memory: 14.162 GB

Evaluation Metrics:
  BLEU: 0.81
  ROUGE-L: 0.0000
  Empathy Score: 3.67

Model saved to: ./llama_bangla_lora


In [30]:
!pip install -q sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Empathetic phrases
EMPATHETIC_PATTERNS = [
    "বুঝতে পারছি", "বুঝি", "দুঃখিত", "কষ্ট", "মন খারাপ",
    "পাশে আছি", "সাহায্য", "চেষ্টা", "সাহস", "আশা",
    "স্বাভাবিক", "ঠিক আছে"
]

# Precompute embeddings for empathetic phrases
pattern_embeddings = model.encode(EMPATHETIC_PATTERNS, convert_to_tensor=True)

def compute_empathy_score(predictions, threshold=0.7):
    scores = []
    for pred in predictions:
        pred_embedding = model.encode(pred, convert_to_tensor=True)
        similarities = util.cos_sim(pred_embedding, pattern_embeddings)[0]
        matches = (similarities > threshold).sum().item()
        score = min(100, (matches / len(EMPATHETIC_PATTERNS)) * 200)
        scores.append(score)
    return np.mean(scores)

empathy_score = compute_empathy_score(predictions)
print(f"\nEmpathy Score (semantic): {empathy_score:.2f}")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Empathy Score (semantic): 19.33


In [None]:
def combined_score(bleu, rouge_l, empathy, 
                   w_bleu=0.25, w_rouge=0.15, w_emp=0.60):
    # normalized empathy 
    empathy_norm = empathy / 100.0
    return (w_bleu * bleu) + (w_rouge * rouge_l) + (w_emp * empathy_norm)

score = combined_score(eval_results['metrics']['bleu'], eval_results['metrics']['rougeL'], empathy_score)
print(f"Quality + Empathy Score: {score:.3f}")

Quality + Empathy Score: 0.320
