In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
from pathlib import Path

# Directory holding one identification CSV per model; the two output
# directories below live underneath it.
IDENTIFICATION_DIR = "./../../../Output/Identification"
MATCH_SCORES_DIR = f"{IDENTIFICATION_DIR}/Match_scores"
FIGURES_DIR = f"{IDENTIFICATION_DIR}/Figures"
# Flag consulted when choosing the match-type label for this run.
HARD = False

# Create both output directories up front; already-existing ones are fine.
for _output_dir in (MATCH_SCORES_DIR, FIGURES_DIR):
    os.makedirs(_output_dir, exist_ok=True)

def extract_csis(text):
    """Return the stripped contents of every ``<CSI>...</CSI>`` span in *text*.

    Args:
        text: The string to scan. Non-string values (for example ``None`` or
            the float NaN that pandas uses for an empty CSV cell) are treated
            as containing no spans.

    Returns:
        A list of the tagged substrings, in order of appearance, with
        surrounding whitespace removed. Empty when nothing is tagged.
    """
    # re.findall raises TypeError on non-strings; an empty cell simply has
    # no tagged items, so report that instead of aborting the whole run.
    if not isinstance(text, str):
        return []
    return [csi.strip() for csi in re.findall(r'<CSI>(.*?)</CSI>', text)]


# Ground-truth table; its 'Sentence' column carries <CSI>...</CSI> markup.
df_ground_truth = pd.read_csv("./../../../Datasets/Culturemark_final - westvalue_df_final.csv")

# Label used in the printed headings and in every output file name.
# NOTE(review): HARD only selects this label; the comparison below is always
# an exact string match. Confirm whether a separate soft-matching rule was
# intended for the "soft" runs.
match_type = "hard" if HARD else "soft"

# The ground-truth CSIs do not depend on the model, so extract them once.
df_ground_truth['ground_truth_csis'] = df_ground_truth['Sentence'].apply(extract_csis)


def _save_count_histogram(counts, color, title, path):
    """Save a histogram of per-sentence CSI counts to *path*, one bar per count."""
    # matplotlib closes the last bin on the right, so the edges must run one
    # past the largest count or the two highest counts share a single bar.
    bin_edges = range(int(counts.max()) + 2)
    plt.figure(figsize=(8, 6))
    # align='left' centres each bar on the integer count it represents.
    plt.hist(counts, bins=bin_edges, align='left', color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel('Number of CSIs')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.savefig(path)
    plt.close()


for csv_file in os.listdir(IDENTIFICATION_DIR):
    if not csv_file.endswith('.csv'):
        continue

    model_name = csv_file.replace('.csv', '')
    print(f"\nProcessing {model_name}...")

    df_model = pd.read_csv(os.path.join(IDENTIFICATION_DIR, csv_file))

    # Rows are compared position by position, so a length mismatch would
    # silently drop rows in zip() and then fail later with an opaque error.
    if len(df_model) != len(df_ground_truth):
        raise ValueError(
            f"{csv_file} has {len(df_model)} rows but the ground truth has "
            f"{len(df_ground_truth)}; rows cannot be paired for scoring."
        )

    df_model['model_csis'] = df_model['CSI Output'].apply(extract_csis)

    match_scores = []
    for gt_csis, model_csis in zip(df_ground_truth['ground_truth_csis'],
                                   df_model['model_csis']):
        if not gt_csis:
            # Nothing to find: full credit only if the model tagged nothing.
            match_score = 1.0 if not model_csis else 0.0
        else:
            # Fraction of ground-truth CSIs the model reproduced exactly.
            predicted = set(model_csis)
            matches = sum(1 for gt_csi in gt_csis if gt_csi in predicted)
            match_score = matches / len(gt_csis)
        match_scores.append(match_score)

    # Calculate overall scores
    overall_match_score = sum(match_scores) / len(match_scores) * 100

    # Calculate category-wise scores (categories are taken from the model file)
    df_ground_truth['CSI Category'] = df_model['CSI Category']
    df_ground_truth['match_score'] = match_scores
    category_scores = df_ground_truth.groupby('CSI Category')['match_score'].mean() * 100

    print(f"Overall {match_type.capitalize()} Match Score: {overall_match_score:.2f}%")
    print("\nCategory-wise Scores:")
    for category, score in category_scores.items():
        print(f"  {category}: {score:.2f}%")

    category_scores_df = category_scores.reset_index()
    category_scores_df.columns = ['CSI Category', 'Match Score']
    category_scores_df.to_csv(f'{MATCH_SCORES_DIR}/{match_type}_{model_name}_category_scores.csv', index=False)

    gt_csi_counts = df_ground_truth['ground_truth_csis'].apply(len)
    _save_count_histogram(
        gt_csi_counts,
        'skyblue',
        'Ground Truth: Distribution of CSI Counts per Sentence',
        f'{FIGURES_DIR}/{match_type}_{model_name}_ground_truth_distribution.png',
    )

    model_csi_counts = df_model['model_csis'].apply(len)
    _save_count_histogram(
        model_csi_counts,
        'salmon',
        'Model Output: Distribution of CSI Counts per Sentence',
        f'{FIGURES_DIR}/{match_type}_{model_name}_model_output_distribution.png',
    )

    # Per-sentence detail: ground truth alongside this model's raw output,
    # extracted CSIs and score.
    df_ground_truth['model_output'] = df_model['CSI Output']
    df_ground_truth['model_csis'] = df_model['model_csis']
    df_ground_truth.to_csv(f'{MATCH_SCORES_DIR}/{match_type}_{model_name}_CSI_match_scores.csv', index=False)

print("\nProcessing complete!")


Processing claude...
Overall Soft Match Score: 60.29%

Category-wise Scores:
  Cultural Reference: 60.57%
  Cultural Stereotype: 88.19%
  Cultural Value: 69.64%
  Social Etiquette: 40.17%
  Social Tradition: 63.17%
  Sports Tradition: 79.41%
  Workplace Culture: 65.58%

Processing qwen...
Overall Soft Match Score: 63.82%

Category-wise Scores:
  Cultural Reference: 64.43%
  Cultural Stereotype: 90.97%
  Cultural Value: 60.71%
  Social Etiquette: 44.87%
  Social Tradition: 69.64%
  Sports Tradition: 82.35%
  Workplace Culture: 62.32%

Processing deepseek...
Overall Soft Match Score: 68.35%

Category-wise Scores:
  Cultural Reference: 72.94%
  Cultural Stereotype: 95.14%
  Cultural Value: 78.57%
  Social Etiquette: 45.51%
  Social Tradition: 73.52%
  Sports Tradition: 82.35%
  Workplace Culture: 68.12%

Processing llama3...
Overall Soft Match Score: 64.07%

Category-wise Scores:
  Cultural Reference: 72.16%
  Cultural Stereotype: 86.81%
  Cultural Value: 71.43%
  Social Etiquette: 43.80