# Inter-lingual

In [2]:
from bert_score import BERTScorer
from transformers import AutoModel
import re

# Checkpoint name shared by the config lookup and the scorer
model_name = 'bert-base-chinese'
# Loaded here only so its config can report the number of hidden layers
model = AutoModel.from_pretrained(model_name)

# Chinese scorer; num_layers is set to the checkpoint's total hidden-layer count
scorer = BERTScorer(
    model_type=model_name,
    lang="zh",
    num_layers=model.config.num_hidden_layers,
)

# Reference and candidate sentences, each carrying one <CSI>-tagged span
ground_truth = "一起喝杯<CSI>茶</CSI>，聊聊近况吧！"
output = "美国商务人士与中国合作伙伴进行了一次愉快的<CSI>会谈</CSI>"

# Function to extract CSI-tagged phrases
def extract_all_csi(text):
    """Return every substring enclosed in <CSI>...</CSI> tags, in order of appearance.

    Args:
        text: Text to scan. Anything that is not a ``str`` (for example ``None``
            or a NaN float) is treated as containing no tags.

    Returns:
        list: The tagged substrings, with any whitespace inside the tags kept;
            empty when no tag pair is found or ``text`` is not a string.
    """
    # Guard first: re.findall raises TypeError on non-string input
    if not isinstance(text, str):
        return []
    # Non-greedy group so consecutive tag pairs are captured separately
    return re.findall(r'<CSI>(.*?)</CSI>', text)

# Pull the tagged segments out of the reference and the candidate
gt_csi_list = extract_all_csi(ground_truth)
out_csi_list = extract_all_csi(output)

# Segments are paired by position below, so both sides need the same count
assert len(gt_csi_list) == len(out_csi_list), "Mismatched CSI tags in ground truth and output!"

# Sentence-level BERTScore (candidate first, reference second)
P_full, R_full, F1_full = scorer.score([output], [ground_truth])
full_f1 = F1_full.mean().item()

# One F1 value per positionally paired CSI segment
csi_scores = [
    scorer.score([candidate_csi], [reference_csi])[2].mean().item()
    for reference_csi, candidate_csi in zip(gt_csi_list, out_csi_list)
]

# Mean over the segments, or 0.0 when nothing was tagged
if csi_scores:
    avg_csi_score = sum(csi_scores) / len(csi_scores)
else:
    avg_csi_score = 0.0

# Report the model name and the three scores
print(f"Using model: {model_name}")
print(f"Full Sentence BERTScore (F1): {full_f1:.4f}")
print(f"Individual CSI BERTScore (F1): {csi_scores}")
print(f"Aggregate CSI BERTScore (F1): {avg_csi_score:.4f}")

Using model: bert-base-chinese
Full Sentence BERTScore (F1): 0.6788
Individual CSI BERTScore (F1): [0.6494137644767761]
Aggregate CSI BERTScore (F1): 0.6494


# Intra-lingual

In [None]:
from bert_score import score
import re

# Example reference/candidate pair; each sentence here carries a single CSI tag
# (the code that follows also handles several tags per sentence).
# Note: the candidate's tag content has a leading and a trailing space, and the
# extraction regex keeps that whitespace in the extracted segment.
ground_truth = "The executive gave a firm <CSI>handshake</CSI> to congratulate his colleague."
output = "The American executive gave a firm <CSI> appreciative shoulder tap </CSI> to congratulate his colleague."

# Extract ALL CSI-tagged phrases
def extract_all_csi(text):
    """Return every substring enclosed in <CSI>...</CSI> tags, in order of appearance.

    Args:
        text: Text to scan. Anything that is not a ``str`` (for example ``None``
            or a NaN float) is treated as containing no tags.

    Returns:
        list: The tagged substrings, with any whitespace inside the tags kept;
            empty when no tag pair is found or ``text`` is not a string.
    """
    # Guard first: re.findall raises TypeError on non-string input
    if not isinstance(text, str):
        return []
    # Non-greedy group so consecutive tag pairs are captured separately
    return re.findall(r'<CSI>(.*?)</CSI>', text)

gt_csi_list = extract_all_csi(ground_truth)
out_csi_list = extract_all_csi(output)

# Segments are paired by position below, so both sides need the same count
assert len(gt_csi_list) == len(out_csi_list), "Mismatched CSI tags in ground truth and output!"

# Compute BERTScore for full sentences (candidate first, reference second)
P_full, R_full, F1_full = score([output], [ground_truth], lang="en", model_type='bert-base-uncased', rescale_with_baseline=True)

# Compute BERTScore for each CSI segment.
# score() sets up the model on every call, so all pairs are sent through ONE
# batched call instead of one call per segment. The returned F1 tensor holds one
# value per (candidate, reference) pair; per-pair values should match the
# one-at-a-time results up to floating-point rounding.
if gt_csi_list:
    P, R, F1 = score(out_csi_list, gt_csi_list, lang="en", model_type='bert-base-uncased', rescale_with_baseline=True)
    csi_scores = F1.tolist()
else:
    # Nothing tagged: skip the call rather than score an empty batch
    csi_scores = []

# Aggregate CSI scores (average); 0.0 when there were no segments
avg_csi_score = sum(csi_scores) / len(csi_scores) if csi_scores else 0.0

# Print results
print(f"Full Sentence BERTScore (F1): {F1_full.mean().item():.4f}")
print(f"Individual CSI BERTScore (F1): {csi_scores}")
print(f"Aggregate CSI BERTScore (F1): {avg_csi_score:.4f}")

In [None]:
import pandas as pd

# Reference adaptations and the system outputs stored in gemini.csv (relative paths).
# NOTE(review): a later cell rebinds gemini_df to gpt.csv — confirm which file each
# run is meant to evaluate.
adaptation_df = pd.read_csv("../../../Datasets/Adaptation_Final.csv")
gemini_df = pd.read_csv("../../../Output/Adaptation/gemini.csv")
# Last expression of the cell: preview the first rows of the reference table
adaptation_df.head()

In [None]:
import pandas as pd
from bert_score import BERTScorer, score
from transformers import AutoModel
import re
import numpy as np
from tqdm import tqdm

# Reference adaptations, plus the system outputs that will be scored against them
adaptation_df = pd.read_csv("../../../Datasets/Adaptation_Final.csv")
# NOTE(review): the frame is named gemini_df but is read from gpt.csv — confirm
# that this is the intended file for the run.
gemini_df = pd.read_csv("../../../Output/Adaptation/gpt.csv")

# Chinese scorer for the inter-lingual columns; the AutoModel is loaded only so
# its config can supply the checkpoint's hidden-layer count as num_layers
model_name = 'bert-base-chinese'
model = AutoModel.from_pretrained(model_name)
scorer_zh = BERTScorer(
    model_type=model_name,
    lang="zh",
    num_layers=model.config.num_hidden_layers,
)

def extract_all_csi(text):
    """
    Extract all content between <CSI> tags from the given text.

    Args:
        text: Input string or any other type. Non-string values (for example
            ``None`` or a NaN float) are treated as containing no tags.

    Returns:
        list: Tagged substrings in order of appearance, with any whitespace
            inside the tags kept; empty list if none found or invalid input.
    """
    # After this guard the input is a str and the pattern is a valid constant,
    # so re.findall cannot raise; no try/except is needed. Blank or
    # whitespace-only strings simply produce no matches.
    if not isinstance(text, str):
        return []

    # Non-greedy group so consecutive tag pairs are captured separately
    return re.findall(r'<CSI>(.*?)</CSI>', text)

# English scorer shared by all compute_intra_scores calls; built on first use
_INTRA_SCORER = None


def _get_intra_scorer():
    """Create the English BERTScorer once and return the cached instance afterwards."""
    global _INTRA_SCORER
    if _INTRA_SCORER is None:
        # Same settings the per-call score(...) invocation used
        _INTRA_SCORER = BERTScorer(model_type='bert-base-uncased', lang="en", rescale_with_baseline=True)
    return _INTRA_SCORER


def compute_intra_scores(output, ground_truth, scorer=None):
    """Score an English candidate against its reference, overall and per CSI segment.

    Args:
        output: Candidate text, with <CSI>...</CSI> tags around the segments to compare.
        ground_truth: Reference text, tagged the same way.
        scorer: Optional object exposing ``score(cands, refs) -> (P, R, F1)``.
            When omitted, a cached ``BERTScorer`` for 'bert-base-uncased' with
            baseline rescaling is used, so the model is not set up again on
            every call.

    Returns:
        tuple: ``(full_f1, csi_scores, avg_csi_score)`` where ``csi_scores`` is
            a list with one F1 per positionally paired segment and
            ``avg_csi_score`` is their mean (0.0 when there are no segments).
            ``(None, None, None)`` when either input is not a string or the
            two texts contain different numbers of CSI tags.
    """
    # A non-string cell (e.g. NaN for an empty CSV field) cannot be scored;
    # report it the same way as a tag-count mismatch instead of crashing.
    if not isinstance(output, str) or not isinstance(ground_truth, str):
        return None, None, None

    gt_csi_list = extract_all_csi(ground_truth)
    out_csi_list = extract_all_csi(output)

    # Segments are paired by position, so the counts must agree
    if len(gt_csi_list) != len(out_csi_list):
        return None, None, None

    if scorer is None:
        scorer = _get_intra_scorer()

    # Full sentence score.
    # NOTE(review): the sentences are scored with the literal <CSI> tags still
    # in the text — confirm whether the tags should be stripped first.
    P_full, R_full, F1_full = scorer.score([output], [ground_truth])
    full_f1 = F1_full.mean().item()

    # CSI segment scores, one pair at a time
    csi_scores = []
    for gt_csi, out_csi in zip(gt_csi_list, out_csi_list):
        P, R, F1 = scorer.score([out_csi], [gt_csi])
        csi_scores.append(F1.mean().item())

    avg_csi_score = sum(csi_scores) / len(csi_scores) if csi_scores else 0.0
    return full_f1, csi_scores, avg_csi_score

def compute_inter_scores(output, ground_truth, scorer):
    """Score a candidate against its reference with the given scorer, overall and per CSI segment.

    Args:
        output: Candidate text, with <CSI>...</CSI> tags around the segments to compare.
        ground_truth: Reference text, tagged the same way.
        scorer: Object exposing ``score(cands, refs) -> (P, R, F1)``, such as a
            configured ``BERTScorer``.

    Returns:
        tuple: ``(full_f1, csi_scores, avg_csi_score)`` where ``csi_scores`` is
            a list with one F1 per positionally paired segment and
            ``avg_csi_score`` is their mean (0.0 when there are no segments).
            ``(None, None, None)`` when either input is not a string or the
            two texts contain different numbers of CSI tags.
    """
    # A non-string cell (e.g. NaN for an empty CSV field) cannot be scored;
    # report it the same way as a tag-count mismatch instead of crashing.
    if not isinstance(output, str) or not isinstance(ground_truth, str):
        return None, None, None

    gt_csi_list = extract_all_csi(ground_truth)
    out_csi_list = extract_all_csi(output)

    # Segments are paired by position, so the counts must agree
    if len(gt_csi_list) != len(out_csi_list):
        return None, None, None

    # Full sentence score.
    # NOTE(review): the sentences are scored with the literal <CSI> tags still
    # in the text — confirm whether the tags should be stripped first.
    P_full, R_full, F1_full = scorer.score([output], [ground_truth])
    full_f1 = F1_full.mean().item()

    # CSI segment scores, one pair at a time
    csi_scores = []
    for gt_csi, out_csi in zip(gt_csi_list, out_csi_list):
        P, R, F1 = scorer.score([out_csi], [gt_csi])
        csi_scores.append(F1.mean().item())

    avg_csi_score = sum(csi_scores) / len(csi_scores) if csi_scores else 0.0
    return full_f1, csi_scores, avg_csi_score

# Per-setting accumulators: sentence-level F1 and mean CSI F1 for each scored row
intra_hindu_full_f1, intra_hindu_avg_csi = [], []
intra_muslim_full_f1, intra_muslim_avg_csi = [], []
inter_hindu_full_f1, inter_hindu_avg_csi = [], []
inter_muslim_full_f1, inter_muslim_avg_csi = [], []

# One entry per evaluation setting, in the order they are run for every row:
# (reference column, output column, scoring callable, full-F1 list, CSI list)
evaluation_plan = [
    ('Intra-lingual (Hindu)', 'Intra', compute_intra_scores,
     intra_hindu_full_f1, intra_hindu_avg_csi),
    ('Intra-lingual (Muslim)', 'Intra', compute_intra_scores,
     intra_muslim_full_f1, intra_muslim_avg_csi),
    ('Inter-lingual (Hindu)', 'Inter',
     lambda candidate, reference: compute_inter_scores(candidate, reference, scorer_zh),
     inter_hindu_full_f1, inter_hindu_avg_csi),
    ('Inter-lingual (Muslim)', 'Inter',
     lambda candidate, reference: compute_inter_scores(candidate, reference, scorer_zh),
     inter_muslim_full_f1, inter_muslim_avg_csi),
]

for i in tqdm(range(len(gemini_df))):
    for reference_column, output_column, score_pair, full_sink, csi_sink in evaluation_plan:
        candidate_text = gemini_df[output_column][i]
        reference_text = adaptation_df[reference_column][i]
        full_f1, _, avg_csi = score_pair(candidate_text, reference_text)
        # A None result marks a row the scoring function declined to score; skip it
        if full_f1 is not None:
            full_sink.append(full_f1)
            csi_sink.append(avg_csi)

# Summary table: mean of each accumulator, reported as 0.0 when nothing was scored
results = {
    'Adaptation': [plan_entry[0] for plan_entry in evaluation_plan],
    'Avg Full Sentence F1': [
        np.mean(plan_entry[3]) if plan_entry[3] else 0.0
        for plan_entry in evaluation_plan
    ],
    'Avg Aggregate CSI F1': [
        np.mean(plan_entry[4]) if plan_entry[4] else 0.0
        for plan_entry in evaluation_plan
    ],
}

results_df = pd.DataFrame(results)
print(results_df)