In [None]:
import os  
import stat  
import math  
import torch  
from transformers import AutoTokenizer, AutoModelForCausalLM  
import nltk
from nltk.translate.bleu_score import sentence_bleu  
from nltk.translate.meteor_score import meteor_score  
from rouge_score import rouge_scorer  
from sentence_transformers import SentenceTransformer  
from sklearn.metrics.pairwise import cosine_similarity   

# Set device
# Prefer the first CUDA device (assumed to be the NVIDIA GPU); fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Download the NLTK resources used by the tokenizer and METEOR (once each).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Set HF_HOME to a local, writable directory.
os.environ["HF_HOME"] = os.path.join(os.getcwd(), "hf_cache")
cache_dir = os.environ["HF_HOME"]

# exist_ok=True makes this a no-op when the directory is already there,
# so no separate existence check is needed.
os.makedirs(cache_dir, exist_ok=True)

print("Using HF_HOME:", cache_dir)

# Initialize the tokenizer and model using the local cache directory.
model_name = "RayyanAhmed9477/CPU-Compatible-Mental-Health-Model"
print("Loading tokenizer and model using cache_dir = " + cache_dir + "...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
# NOTE(review): if loading warns that weights "were not initialized from the
# model checkpoint ... and are newly initialized", the model is running with
# random weights and any perplexity computed with it is not meaningful --
# confirm the checkpoint actually provides the expected weights.

# Move the model to the device chosen above; without this it stays on CPU
# even when CUDA is available.
model = model.to(device)
model.eval()  # Set the model to evaluation mode

print("Tokenizer and model loaded successfully.")
print("Vocabulary size:", model.config.vocab_size)
print("Model max length:", tokenizer.model_max_length)
# Perplexity helper; token IDs outside the model vocabulary are swapped for the unknown token.
def compute_perplexity(text):
    """Return the perplexity of ``text`` under the module-level causal LM.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    ``tokenizer.model_max_length``) and moved to the model's device. Any
    token ID that is not below ``model.config.vocab_size`` is replaced by the
    tokenizer's unknown-token ID, or by 0 when the tokenizer defines none.
    The sanitized IDs are passed both as inputs and as labels, and the result
    is ``exp`` of the loss the model reports.

    Args:
        text: The string to score.

    Returns:
        The perplexity as a Python float.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=tokenizer.model_max_length,
    ).to(model.device)

    # Fall back to ID 0 when the tokenizer has no unknown token.
    fallback_id = tokenizer.unk_token_id
    if fallback_id is None:
        fallback_id = 0

    # Overwrite every out-of-vocabulary ID so the embedding lookup stays in range.
    token_ids = encoded["input_ids"]
    out_of_range = token_ids >= model.config.vocab_size
    safe_ids = token_ids.masked_fill(out_of_range, fallback_id)
    encoded["input_ids"] = safe_ids

    # Inference only: no gradients are needed to read the loss.
    with torch.no_grad():
        result = model(**encoded, labels=safe_ids)

    return math.exp(result.loss.item())
  
# Sample prompts used to exercise compute_perplexity.
eval_texts = [
    "I feel anxious and don't know what to do.",
    "I'm depressed and need someone to talk to.",
    "I feel overwhelmed with stress at work.",
]

print("Evaluating Perplexity after fix:")
for text in eval_texts:
    # Score first, then report, so nothing is printed for a text that fails.
    ppl = compute_perplexity(text)
    print(f"Text: {text}")
    print(f"Perplexity: {ppl!s}")
    print("-----")

print("done")

# Sentence-embedding model used for the semantic-similarity metric.
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
  
def evaluate_response_quality(generated_response, reference_response):
    """Score a generated response against a single reference response.

    Both strings are lower-cased and word-tokenized with NLTK for BLEU and
    METEOR; ROUGE and the embedding similarity work on the raw strings.

    Args:
        generated_response: The candidate text to evaluate.
        reference_response: The reference text to compare against.

    Returns:
        A dict with the keys ``'bleu'``, ``'meteor'``, ``'rouge1_f'``,
        ``'rouge2_f'``, ``'rougeL_f'`` and ``'semantic_similarity'``.
    """
    # Imported locally: the file-level import only brings in sentence_bleu.
    from nltk.translate.bleu_score import SmoothingFunction

    # Tokenize responses
    gen_tokens = nltk.word_tokenize(generated_response.lower())
    ref_tokens = nltk.word_tokenize(reference_response.lower())

    # Calculate BLEU score (default uniform weights up to 4-grams).
    # Short responses often share no 3-/4-grams with the reference; without
    # smoothing a single zero n-gram count drives the whole score to ~0 and
    # NLTK emits a warning, so smooth the zero counts instead.
    bleu = sentence_bleu(
        [ref_tokens],
        gen_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    # Calculate METEOR score
    meteor = meteor_score([ref_tokens], gen_tokens)

    # Calculate ROUGE scores (reference first, candidate second)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference_response, generated_response)

    # Calculate semantic similarity using embeddings
    embeddings = semantic_model.encode([reference_response, generated_response])
    semantic_sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    return {
        'bleu': bleu,
        'meteor': meteor,
        'rouge1_f': rouge_scores['rouge1'].fmeasure,
        'rouge2_f': rouge_scores['rouge2'].fmeasure,
        'rougeL_f': rouge_scores['rougeL'].fmeasure,
        'semantic_similarity': semantic_sim
    }
  
# Demo: compare one generated reply against a hand-written reference reply.
reference = (
    "I'm sorry you're feeling anxious. It might help to try some deep "
    "breathing exercises and reach out to someone you trust."
)
generated = (
    "I understand that you're anxious. Perhaps deep breathing and talking "
    "with a friend might ease some stress."
)

metrics = evaluate_response_quality(generated, reference)
print("Evaluation Metrics:")
# One "name: value" line per metric, in the order the function returns them.
for metric, value in metrics.items():
    print(f"{metric}: {value!s}")
    

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mward\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mward\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mward\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Using HF_HOME: c:\Users\mward\AppData\Local\Programs\Microsoft VS Code\hf_cache
Loading tokenizer and model using cache_dir = c:\Users\mward\AppData\Local\Programs\Microsoft VS Code\hf_cache...


Some weights of LlamaForCausalLM were not initialized from the model checkpoint at RayyanAhmed9477/CPU-Compatible-Mental-Health-Model and are newly initialized: ['lm_head.weight', 'model.embed_tokens.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.10.input_layernorm.weight', '

Tokenizer and model loaded successfully.
Vocabulary size: 32000
Model max length: 131072
Evaluating Perplexity after fix:
Text: I feel anxious and don't know what to do.
Perplexity: 105070.36621153323
-----
Text: I'm depressed and need someone to talk to.
Perplexity: 74118.95562442865
-----
Text: I feel overwhelmed with stress at work.
Perplexity: 52183.028083357916
-----
done


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Evaluation Metrics:
bleu: 6.3063951716201414e-155
meteor: 0.32602462472592336
rouge1_f: 0.4390243902439025
rouge2_f: 0.10256410256410256
rougeL_f: 0.34146341463414637
semantic_similarity: 0.8715141
