In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
from pathlib import Path
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch 

# --- Run configuration -------------------------------------------------------
# Folder scanned for per-model prediction CSV files.
PREDICTION_DIR = "./../../../Output/Prediction"
# Destination folders for the score tables and the histogram figures.
MATCH_SCORES_DIR = "./../../../Output/Prediction/Match_scores"
FIGURES_DIR = "./../../../Output/Prediction/Figures"
# True -> "hard" matching (exact, case-insensitive equality only);
# False -> "soft" matching (embedding similarity for non-identical pairs).
HARD = True

# Create both output folders up front; already-existing folders are fine.
for _out_dir in (MATCH_SCORES_DIR, FIGURES_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Sentence-embedding model consumed by semantic_score() for soft matching.
model = SentenceTransformer('all-mpnet-base-v2')

def semantic_score(truths, preds, match_type='hard'):
    """Score predicted items against ground-truth items, pair by pair.

    Items are paired positionally with ``zip``, so any surplus items in the
    longer of the two sequences are ignored.

    Args:
        truths: Sequence of ground-truth strings.
        preds: Sequence of predicted strings.
        match_type: ``'hard'`` counts only case-insensitive exact matches.
            Any other value additionally credits non-identical pairs with
            the cosine similarity of their embeddings from the module-level
            ``model``.

    Returns:
        The summed score as a float. It is a total over the pairs, not an
        average, so it can exceed 1.0 when several pairs match.
    """
    use_embeddings = match_type != 'hard'
    score = 0.0
    for gold, guess in zip(truths, preds):
        if gold.lower() == guess.lower():
            # Identical up to letter case: full credit, no model call needed.
            score += 1.0
        elif use_embeddings:
            pair = model.encode([gold, guess], convert_to_tensor=True)
            score += float(util.pytorch_cos_sim(pair[0], pair[1]))
    return score

def extract_csis(text):
    """Return the stripped contents of every ``<CSI>...</CSI>`` tag in ``text``.

    Args:
        text: The string to scan. Non-string values (for example ``None`` or
            the float NaN that pandas produces for an empty CSV cell) are
            treated as containing no tags.

    Returns:
        A list of tag contents in order of appearance, each with surrounding
        whitespace removed; an empty list when nothing matches.
    """
    # re.findall raises TypeError on non-strings, which would abort a whole
    # DataFrame.apply() pass because of a single missing cell.
    if not isinstance(text, str):
        return []
    return [csi.strip() for csi in re.findall(r'<CSI>(.*?)</CSI>', text)]


def _extract_column_csis(column):
    """Return a Series holding the list of CSIs found in each cell of ``column``.

    Non-string cells (for example NaN from an empty CSV field) produce an
    empty list instead of raising inside the regex search.
    """
    return column.apply(
        lambda cell: extract_csis(cell) if isinstance(cell, str) else []
    )


def _save_count_histogram(counts, title, color, out_path):
    """Save a histogram of per-sentence CSI counts to ``out_path``."""
    highest = max(int(counts.max()), 1) if len(counts) else 1
    plt.figure(figsize=(8, 6))
    # Bin edges run 0..highest+1 so that every integer count gets its own
    # bar. With edges stopping at `highest`, matplotlib's closed last bin
    # would merge the counts `highest - 1` and `highest` into a single bar.
    plt.hist(counts, bins=range(highest + 2), color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel('Number of CSIs')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.savefig(out_path)
    plt.close()


df_ground_truth = pd.read_csv("./../../../Datasets/Culturemark_final - westvalue_df_final.csv")

match_type = "hard" if HARD else "soft"

# The ground-truth CSIs and their counts do not depend on the model being
# evaluated, so compute them once instead of once per prediction file.
df_ground_truth['ground_truth_csis'] = _extract_column_csis(df_ground_truth['Sentence'])
gt_csi_counts = df_ground_truth['ground_truth_csis'].apply(len)

# os.listdir returns entries in arbitrary order; sort for reproducible logs.
for csv_file in sorted(os.listdir(PREDICTION_DIR)):
    if not csv_file.endswith('.csv'):
        continue

    # Strip only the trailing extension (str.replace would remove every
    # ".csv" occurrence inside the file name).
    model_name = os.path.splitext(csv_file)[0]
    print(f"\nProcessing {model_name}...")

    df_model = pd.read_csv(os.path.join(PREDICTION_DIR, csv_file))

    # Rows are paired positionally. With fewer prediction rows than
    # ground-truth rows the per-row scores could not be attached to the
    # ground-truth frame, so skip this file rather than abort the whole run.
    if len(df_model) < len(df_ground_truth):
        print(f"  Skipping {model_name}: {len(df_model)} prediction rows "
              f"for {len(df_ground_truth)} ground-truth rows.")
        continue

    df_model['model_csis'] = _extract_column_csis(df_model['CSI Output'])

    match_scores = [
        semantic_score(gt_csis, model_csis, match_type=match_type)
        for gt_csis, model_csis in zip(df_ground_truth['ground_truth_csis'],
                                       df_model['model_csis'])
    ]

    # Calculate overall scores (an empty dataset scores 0 instead of
    # raising ZeroDivisionError).
    if match_scores:
        overall_match_score = sum(match_scores) / len(match_scores) * 100
    else:
        overall_match_score = 0.0

    # Calculate category-wise scores
    df_ground_truth['CSI Category'] = df_model['CSI Category']
    df_ground_truth['match_score'] = match_scores
    category_scores = df_ground_truth.groupby('CSI Category')['match_score'].mean() * 100

    print(f"Overall {match_type.capitalize()} Match Score: {overall_match_score:.2f}%")
    print("\nCategory-wise Scores:")
    for category, score in category_scores.items():
        print(f"  {category}: {score:.2f}%")

    category_scores_df = category_scores.reset_index()
    category_scores_df.columns = ['CSI Category', 'Match Score']
    category_scores_df.to_csv(f'{MATCH_SCORES_DIR}/{match_type}_{model_name}_category_scores.csv', index=False)

    _save_count_histogram(
        gt_csi_counts,
        'Ground Truth: Distribution of CSI Counts per Sentence',
        'skyblue',
        f'{FIGURES_DIR}/{match_type}_{model_name}_ground_truth_distribution.png',
    )
    _save_count_histogram(
        df_model['model_csis'].apply(len),
        'Model Output: Distribution of CSI Counts per Sentence',
        'salmon',
        f'{FIGURES_DIR}/{match_type}_{model_name}_model_output_distribution.png',
    )

    # Attach the raw and parsed model output next to the ground truth and
    # write the per-sentence results for this model.
    df_ground_truth['model_output'] = df_model['CSI Output']
    df_ground_truth['model_csis'] = df_model['model_csis']
    df_ground_truth.to_csv(f'{MATCH_SCORES_DIR}/{match_type}_{model_name}_CSI_match_scores.csv', index=False)

print("\nProcessing complete!")


Processing deepseek-chat...
Overall Hard Match Score: 47.33%

Category-wise Scores:
  Cultural Reference: 48.45%
  Cultural Stereotype: 61.11%
  Cultural Value: 57.14%
  Social Etiquette: 40.17%
  Social Tradition: 50.00%
  Sports Tradition: 41.18%
  Workplace Culture: 43.48%

Processing llama3-sdsc...
Overall Hard Match Score: 34.54%

Category-wise Scores:
  Cultural Reference: 43.30%
  Cultural Stereotype: 63.89%
  Cultural Value: 42.86%
  Social Etiquette: 25.64%
  Social Tradition: 32.18%
  Sports Tradition: 29.41%
  Workplace Culture: 27.54%

Processing qwen...
Overall Hard Match Score: 43.70%

Category-wise Scores:
  Cultural Reference: 50.52%
  Cultural Stereotype: 58.33%
  Cultural Value: 42.86%
  Social Etiquette: 41.03%
  Social Tradition: 40.80%
  Sports Tradition: 29.41%
  Workplace Culture: 42.03%

Processing gemini...
Overall Hard Match Score: 42.94%

Category-wise Scores:
  Cultural Reference: 43.30%
  Cultural Stereotype: 58.33%
  Cultural Value: 50.00%
  Social Etique