In [None]:
import pandas as pd

from math import sqrt
from scipy.stats import norm

from tqdm.notebook import tqdm
import numpy as np
import json

In [None]:
def get_confidence_intervals(data, weights=None, return_margin=False, confidence_level=0.95):
  """Normal-approximation confidence interval for a (weighted) proportion.

  The point estimate is ``sum(data) / sum(weights)``. The sample size used for
  the standard error is the effective sample size
  ``sum(weights)**2 / sum(w**2 for w in weights)``, which equals ``len(data)``
  when every weight is 1.

  Args:
    data: Sized sequence of per-item scores; their sum is the numerator of the
      estimate.
    weights: Optional sized sequence of weights (list, tuple, numpy array, ...)
      whose sum is the denominator of the estimate. When ``None`` or empty,
      a weight of 1 per element of ``data`` is used.
    return_margin: When true, return only the margin of error instead of the
      clipped (lower, upper) bounds.
    confidence_level: Two-sided confidence level, strictly between 0 and 1.
      Defaults to 0.95.

  Returns:
    The margin of error if ``return_margin`` is true, otherwise a tuple
    ``(lower_bound, upper_bound)`` clipped to the interval [0, 1].

  Raises:
    ZeroDivisionError: If the weights sum to zero (this includes empty
      ``data`` with no weights).
    ValueError: If ``confidence_level`` is not in (0, 1), or if the estimate
      lies outside [0, 1] by more than floating-point rounding.
  """
  if not 0 < confidence_level < 1:
    raise ValueError(
        f"confidence_level must be strictly between 0 and 1, got {confidence_level!r}")

  # Explicit None/length test instead of `not weights`: the truth value of a
  # multi-element numpy array is ambiguous and raises ValueError.
  if weights is None or len(weights) == 0:
    weights = [1] * len(data)

  total_weight = sum(weights)
  p_hat = sum(data) / total_weight

  # sum(data) and sum(weights) may be accumulated in different orders, so a
  # proportion that is mathematically 1 (or 0) can land a few ulps outside
  # [0, 1]; that would make the variance negative and sqrt() fail. Snap such
  # rounding noise back to the boundary, but reject genuinely invalid input.
  tolerance = 1e-9
  if p_hat < -tolerance or p_hat > 1 + tolerance:
    raise ValueError(
        f"estimated proportion {p_hat!r} is outside [0, 1]; check data and weights")
  if p_hat < 0:
    p_hat = 0.0
  elif p_hat > 1:
    p_hat = 1.0

  norm_squared = sum(w**2 for w in weights)
  n = total_weight**2 / norm_squared

  # Two-sided critical value of the standard normal for the requested level.
  z = norm.ppf(1 - (1 - confidence_level) / 2)

  # Confidence interval (normal approximation)
  margin_of_error = z * sqrt((p_hat * (1 - p_hat)) / n)
  if return_margin:
    return margin_of_error

  lower_bound = max(0, p_hat - margin_of_error)
  upper_bound = min(1, p_hat + margin_of_error)

  return lower_bound, upper_bound

In [None]:
def is_chinese_char(char):
    """Tell whether `char` lies within one of the CJK ideograph ranges listed below.

    The check is a plain string comparison against each (first, last) pair,
    so it is intended to be called with a single-character string.
    """
    ideograph_ranges = (
        ('\u4e00', '\u9fff'),            # CJK Unified Ideographs
        ('\u3400', '\u4dbf'),            # Extension A
        ('\U00020000', '\U0002A6DF'),    # Extension B
        ('\U0002A700', '\U0002B73F'),    # Extension C
        ('\U0002B740', '\U0002B81F'),    # Extension D
        ('\U0002B820', '\U0002CEAF'),    # Extension E
        ('\U0002CEB0', '\U0002EBEF'),    # Extension F
        ('\U00030000', '\U0003134F'),    # Extension G
    )
    for first, last in ideograph_ranges:
        if first <= char <= last:
            return True
    return False

def compute_single_recall(gold, prediction, lang):
  """Fraction of the gold answer's tokens that occur as substrings of `prediction`.

  Tokenisation of `gold` depends on `lang`:
    * 'zh' / 'ja': every character accepted by `is_chinese_char` is its own
      token; each maximal run of other characters between such characters is
      kept verbatim as a single token.
    * any other value: `gold` is split on whitespace.

  Args:
    gold: Reference answer string.
    prediction: Model output string; tokens are matched with the `in` operator.
    lang: Language code selecting the tokenisation branch.

  Returns:
    A float in [0, 1]. Returns 0.0 when `gold` produces no tokens (empty
    string, or whitespace-only in the whitespace-split branch) instead of
    raising ZeroDivisionError.
  """
  if lang in {'zh', 'ja'}:
    words = []
    word = []
    for char in gold:
      if is_chinese_char(char):
        # Flush the pending run of non-ideograph characters before emitting
        # the ideograph as its own token.
        if word:
          words.append(''.join(word))
          word = []
        words.append(char)
      else:
        # NOTE(review): whitespace is accumulated into the run as well, so a
        # token such as ' Tokyo' must appear in the prediction with its
        # space. Left unchanged to keep scores comparable - confirm intent.
        word.append(char)
    if word:
      words.append(''.join(word))
  else:
    words = gold.split()

  # No tokens means there is nothing to recall; avoid dividing by zero.
  if not words:
    return 0.0

  return sum(1 for w in words if w in prediction) / len(words)

def compute_string_metric(path, overall=True, num_transfer_targets=11):
  """Compute the string-match score and its margin of error from a JSONL file.

  Each non-blank line of `path` must be a JSON object providing the keys
  'q_id', 'original_language', 'target_language', 'answer' and 'prediction'.

  Rows whose original and target language are equal give the in-language
  recall of their 'q_id'. Every other row contributes one result: its own
  recall multiplied by the in-language recall of the same 'q_id'.

  Args:
    path: Path to the JSONL results file (read as UTF-8).
    overall: If true, return the plain mean of the per-row results. If false,
      return the sum of the results divided by
      ``num_transfer_targets * sum(in-language recalls)``, with the margin
      computed using the in-language recalls (each repeated
      ``num_transfer_targets`` times) as weights.
    num_transfer_targets: Multiplier applied to the in-language recalls in
      the ``overall=False`` normalisation. Defaults to 11, the value that was
      previously hard-coded; presumably the number of cross-lingual rows per
      question in the dataset - confirm against the data.

  Returns:
    A tuple ``(score, margin_of_error)``.

  Raises:
    KeyError: If a cross-lingual row refers to a 'q_id' that has no
      in-language row.
    ZeroDivisionError: If there is nothing to normalise by (no cross-lingual
      rows, or all in-language recalls are zero when ``overall`` is false).
  """
  # Read as UTF-8 explicitly: the answers are multilingual text and the
  # platform default encoding is not guaranteed to decode them.
  with open(path, "r", encoding="utf-8") as f:
    # Blank lines are not valid JSON documents; skip them rather than crash.
    records = [json.loads(line) for line in f if line.strip()]
  frame = pd.DataFrame(records)

  # First pass: in-language recall per question id.
  in_lang = {}
  for _, row in frame.iterrows():
    if row['original_language'] == row['target_language']:
      in_lang[row['q_id']] = compute_single_recall(
          row['answer'], row['prediction'], row['target_language'])

  # Second pass: cross-lingual recall scaled by the in-language recall of the
  # same question.
  results = []
  for _, row in frame.iterrows():
    if row['original_language'] != row['target_language']:
      cl_recall = compute_single_recall(
          row['answer'], row['prediction'], row['target_language'])
      results.append(cl_recall * in_lang[row['q_id']])

  if overall:
    return float(np.mean(results)), float(get_confidence_intervals(results, return_margin=True))

  in_lang_scores = list(in_lang.values())
  score = sum(results) / (sum(in_lang_scores) * num_transfer_targets)
  margin = get_confidence_intervals(
      results, weights=in_lang_scores * num_transfer_targets, return_margin=True)
  return score, float(margin)

In [None]:
# Path of the JSONL results file passed to compute_string_metric.
results_path = '' #@param {type: "string"}
# Label printed in front of each score; the original cell used `model`
# without defining it anywhere in the notebook.
model = '' #@param {type: "string"}
print('#### string-based transfer score ####')
# Use `results_path` (defined above) rather than the undefined name `path`,
# so the cell works after Restart & Run All.
print(model, compute_string_metric(results_path, overall=False))
print('#### string-based overall score ####')
print(model, compute_string_metric(results_path, overall=True))