# In [None]:
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch
from sklearn.model_selection import train_test_split

# Load the trained model and its tokenizer from the same checkpoint path.
model_name = "./trained_model_T5"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the data. Column names are supplied explicitly and header=None is used,
# so the file's first row is read as ordinary data.
file_path = 'Training_Data_T5.csv'
df = pd.read_csv(file_path, sep=';', header=None, names=['Input', 'Value'])
# The first row is dropped unconditionally on the assumption that it is a
# header row; if the CSV has no header, this discards one real example.
df = df.drop(0)
# Combined "input ; value" text column (not read again in the visible part of
# this script).
df['text'] = df['Input'].str.strip() + " ; " + df['Value'].str.strip()

# Split into training and validation sets.
# A fixed seed makes the held-out subset (and therefore every metric below)
# identical across runs; without it each run evaluates a different sample.
# NOTE(review): for a true validation set this seed and test_size must match
# the split used by the training script -- confirm against that script.
SPLIT_SEED = 42
_, val_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Prepare validation dataset (only the raw input/target columns are kept).
val_dataset = Dataset.from_pandas(val_df[['Input', 'Value']])

# Run the model over every input and collect the decoded outputs
def generate_predictions(dataset):
    """Generate one decoded model output per entry of ``dataset['Input']``.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        dataset: Any object supporting ``dataset['Input']`` (an iterable of
            strings) and ``dataset['Value']``.

    Returns:
        A ``(predictions, references)`` tuple: ``predictions`` is a list of
        decoded strings in input order, ``references`` is ``dataset['Value']``
        returned unchanged.
    """
    prompt_prefix = "translate Input to Output: "
    source_texts = dataset['Input']
    targets = dataset['Value']

    def _predict_one(raw_text):
        # Prefix the stripped input with the task prompt, truncating the
        # tokenized prompt to at most 250 tokens.
        encoded = tokenizer(
            prompt_prefix + raw_text.strip(),
            return_tensors="pt",
            max_length=250,
            truncation=True,
        )
        generated = model.generate(
            **encoded, max_length=150, num_beams=2, early_stopping=True
        )
        # Only the first returned sequence is decoded.
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_predict_one(text) for text in source_texts], targets

# Generate predictions and references.
# The pandas DataFrame val_df is passed here; generate_predictions only reads
# its 'Input' and 'Value' columns.
predictions, references = generate_predictions(val_df)

# METRICS CALCULATION

# Exact Match (EM)
def exact_match(predictions, references):
    """Return the fraction of references whose prediction matches exactly.

    Items are compared position by position. For string values, leading and
    trailing whitespace is ignored so that padding left over from the CSV
    fields does not turn an otherwise identical pair into a miss; non-string
    values are compared as-is.

    Args:
        predictions: Iterable of predicted values.
        references: Sized iterable of expected values. Its length is the
            denominator, so references without a prediction count as misses.

    Returns:
        A float between 0 and 1, or 0.0 when ``references`` is empty (instead
        of raising ``ZeroDivisionError``).
    """
    total = len(references)
    if total == 0:
        return 0.0

    def _normalize(value):
        return value.strip() if isinstance(value, str) else value

    matches = sum(
        _normalize(pred) == _normalize(ref)
        for pred, ref in zip(predictions, references)
    )
    return matches / total

# BLEU Score
def compute_bleu(predictions, references):
    """Compute a sentence-level BLEU score for each prediction/reference pair.

    Both strings are tokenized with ``str.split()`` and each prediction is
    scored against its single reference using ``sentence_bleu`` with its
    default settings (no weights or smoothing function are passed).

    Args:
        predictions: Iterable of predicted strings.
        references: Iterable of reference strings, aligned with predictions.

    Returns:
        A list with one score per pair, in input order.
    """
    scores = []
    for hypothesis, target in zip(predictions, references):
        target_tokens = target.split()
        hypothesis_tokens = hypothesis.split()
        scores.append(sentence_bleu([target_tokens], hypothesis_tokens))
    return scores

# ROUGE Score
def compute_rouge(predictions, references):
    """Average the ROUGE-1, ROUGE-2 and ROUGE-L F-measures over all pairs.

    Each pair is scored with a stemming ``RougeScorer``; the reference is
    passed as the first argument to ``score`` and the prediction as the second.

    Args:
        predictions: Iterable of predicted strings.
        references: Iterable of reference strings, aligned with predictions.

    Returns:
        A dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` mapping to
        the mean F-measure of the corresponding metric.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    per_pair = []
    for pred, ref in zip(predictions, references):
        per_pair.append(scorer.score(ref, pred))

    return {
        name: np.mean([pair[name].fmeasure for pair in per_pair])
        for name in metric_names
    }

# Perplexity
def compute_perplexity(eval_loss):
    """Convert an evaluation loss into perplexity, i.e. ``exp(eval_loss)``.

    Args:
        eval_loss: The loss value as a Python number.

    Returns:
        The exponentiated loss as a Python scalar.
    """
    loss_tensor = torch.tensor(eval_loss)
    perplexity_tensor = torch.exp(loss_tensor)
    return perplexity_tensor.item()

# Replace with the actual eval_loss.
# This value is hard-coded; it is not computed from the predictions generated
# above, so the perplexity reported below reflects only this number.
eval_loss = 0.1048

# Compute metrics
em_score = exact_match(predictions, references)
bleu_scores = compute_bleu(predictions, references)  # one score per example
avg_bleu = np.mean(bleu_scores)
rouge_scores = compute_rouge(predictions, references)  # dict of mean F-measures
perplexity = compute_perplexity(eval_loss)

# Display Results
# EM and BLEU are scaled by 100 and printed as percentages; ROUGE values and
# perplexity are printed unscaled.
print("Evaluation Metrics:")
print(f"Exact Match (EM): {em_score * 100:.2f}%")
print(f"Average BLEU Score: {avg_bleu * 100:.2f}%")
print(f"ROUGE Scores: Rouge-1: {rouge_scores['rouge1']:.4f}, Rouge-2: {rouge_scores['rouge2']:.4f}, Rouge-L: {rouge_scores['rougeL']:.4f}")
print(f"Perplexity: {perplexity:.4f}")

# Save results to CSV: one row per validation example with its input,
# reference, prediction and per-example BLEU score. The DataFrame index is
# not written to the file.
results = pd.DataFrame({
    'Input': val_df['Input'],
    'Reference': references,
    'Prediction': predictions,
    'BLEU Score': bleu_scores
})
results.to_csv('model_evaluation_results.csv', index=False)
print("Results saved to 'model_evaluation_results.csv'")
