In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import torch
import gc
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from transformers.pipelines import Pipeline
from transformers.pipelines import PIPELINE_REGISTRY


In [None]:
# Directory that the per-model result CSVs and the consolidated MRR CSVs are
# written to and later read back from by the cells below.
directory_path ="/outputs/MRR_v2"

In [None]:
!pwd

In [None]:
import os

# Do not hard-code the Hugging Face token in the notebook: a literal value here
# would overwrite a real token already exported in the environment and would be
# saved (and shared) with the notebook. Export HF_TOKEN before starting the
# kernel instead; this cell only reports when it is missing.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set. Export it before starting the kernel if any model in this notebook requires authentication.")

In [None]:
# Maps a three-letter currency code (acronym) to the symbol string used for that
# currency in the symbol-based retrieval scenarios further down.
# Several entries are commented out; each commented-out pair/group carries the
# same symbol as another entry (e.g. KRW/KPW, IRR/YER, ARS/CLP/COP, BSD/BND),
# presumably to keep every symbol unique -- the cell that compares
# len(set(values)) with len(dict) checks exactly that.
currency_symbol_dict = {
    # --- DISTINCT SINGLE-CURRENCY SYMBOLS ---
    'EUR': '€',       # Euro
    'GBP': '£',       # British Pound Sterling
    'INR': '₹',       # Indian Rupee
    'JPY': 'JP¥',     # Japanese Yen
    'CNY': 'CN¥',     # Chinese Yuan
    'RUB': '₽',       # Russian Ruble
    'NGN': '₦',       # Nigerian Naira
    'THB': '฿',       # Thai Baht
    'VND': '₫',       # Vietnamese Dong
    # 'KRW': '₩',       # South Korean Won
    # 'KPW': '₩',       # North Korean Won
    'TRY': '₺',       # Turkish Lira
    'GHS': '₵',       # Ghanaian Cedi
    'PHP': '₱',       # Philippine Peso
    'ILS': '₪',       # Israeli New Shekel
    'CRC': '₡',       # Costa Rican Colón
    'PLN': 'zł',      # Polish Zloty
    'CZK': 'Kč',      # Czech Koruna
    'MNT': '₮',       # Mongolian Tögrög
    'UAH': '₴',       # Ukrainian Hryvnia
    'GEL': '₾',       # Georgian Lari
    'AMD': '֏',       # Armenian Dram
    'AZN': '₼',       # Azerbaijani Manat
    'KZT': '₸',       # Kazakhstani Tenge
    'LAK': '₭',       # Lao Kip
    'KHR': '៛',       # Cambodian Riel
    'BGN': 'лв',      # Bulgarian Lev
    'MKD': 'ден',     # Macedonian Denar
    'RSD': 'дин',     # Serbian Dinar

    # --- ARABIC AND OTHER NON-LATIN SCRIPTS ---
    'AED': 'د.إ',     # UAE Dirham
    'SAR': 'ر.س',     # Saudi Riyal
    'DZD': 'د.ج',     # Algerian Dinar
    'LYD': 'ل.د',     # Libyan Dinar
    'MAD': 'د.م.',    # Moroccan Dirham
    'IQD': 'ع.د',     # Iraqi Dinar
    # 'IRR': '﷼',       # Iranian Rial
    # 'YER': '﷼',       # Yemeni Rial
    'OMR': 'ر.ع.',    # Omani Rial
    'QAR': 'ر.ق',     # Qatari Riyal
    'JOD': 'د.ا',     # Jordanian Dinar
    'KWD': 'د.ك',     # Kuwaiti Dinar
    'TND': 'د.ت',     # Tunisian Dinar
    'SDG': 'ج.س.',    # Sudanese Pound
    'LBP': 'ل.ل',     # Lebanese Pound
    'AFN': '؋',       # Afghan Afghani
    'BHD': '.د.ب',    # Bahraini Dinar
    'NPR': 'रू',      # Nepalese Rupee
    'BDT': '৳',       # Bangladeshi Taka

    # --- DOLLAR VARIANTS AND OTHER LATIN-SCRIPT SYMBOLS (country-prefixed to stay distinct) ---
    'USD': 'US$',       # US Dollar
    # 'ARS': '$',       # Argentine Peso
    # 'CLP': '$',       # Chilean Peso
    # 'COP': '$',       # Colombian Peso
    'MXN': 'MEX$',     # Mexican Peso
    'AUD': 'A$',      # Australian Dollar
    'CAD': 'C$',      # Canadian Dollar
    'SGD': 'S$',      # Singapore Dollar
    'BRL': 'R$',      # Brazilian Real
    'HKD': 'HK$',     # Hong Kong Dollar
    'NZD': 'NZ$',     # New Zealand Dollar
    # 'BSD': 'B$',      # Bahamian Dollar
    'BZD': 'BZ$',     # Belize Dollar
    'JMD': 'J$',      # Jamaican Dollar
    'TTD': 'TT$',     # Trinidad and Tobago Dollar
    'XCD': 'EC$',     # East Caribbean Dollar
    'FJD': 'FJ$',     # Fijian Dollar
    'SBD': 'SI$',     # Solomon Islands Dollar
    'KYD': 'CI$',     # Cayman Islands Dollar
    'LRD': 'L$',      # Liberian Dollar
    'NAD': 'N$',      # Namibian Dollar
    'UYU': '$U',      # Uruguayan Peso
     'SRD': 'Sr$',     # Surinamese Dollar ('Sr$' chosen for distinctness)
    'EGP':'E£',  # Egyptian Pound
    # 'BND': 'B$',    # Brunei Dollar
    'DOP':'RD$',  # Dominican Peso
    'GYD':'G$',  # Guyanese Dollar
    # 'KGS':'⃀',     # Kyrgyzstani Som
    'MOP': 'MOP$',    # Macanese Pataca
    'PAB':'B/.',  # Panamanian Balboa
    'PEN': 'S/.',      # Peruvian Sol
    'PYG': '₲',       # Paraguayan Guaraní
    'SOS':'Sh.So.',  # Somali Shilling
    'SYP': '£S',      # Syrian Pound
    'TOP': 'T$',      # Tongan Paʻanga
     'TWD': 'NT$',     # New Taiwan Dollar

}

In [None]:
len(currency_symbol_dict)

In [None]:
len(set(currency_symbol_dict.values()))

In [None]:
# Maps a three-letter currency code (acronym) to the currency's English name.
# This dictionary is a superset of the symbol table in terms of codes: codes that
# appear here but not in currency_symbol_dict (e.g. 'CHF') get Symbol = None when
# the two dictionaries are joined into df_curr, and are therefore excluded from
# the symbol-based frame df_curr_sym.
currency_name_dict = {
    # First group: not in alphabetical order.
    'USD': 'United States Dollar',
    'INR': 'Indian Rupee',
    'EUR': 'Euro',
    'GBP': 'British Pound Sterling',
    'JPY': 'Japanese Yen',
    'CHF': 'Swiss Franc',
    'KPW': 'North Korean Won',
    'PKR': 'Pakistani Rupee',
    'CNY': 'Chinese Yuan',
    'AUD': 'Australian Dollar',
    'CAD': 'Canadian Dollar',
    'MXN': 'Mexican Peso',
    'RUB': 'Russian Ruble',
    'ZAR': 'South African Rand',
    'NGN': 'Nigerian Naira',
    'THB': 'Thai Baht',
    'VND': 'Vietnamese Dong',
    'IDR': 'Indonesian Rupiah',
    'KRW': 'South Korean Won',
    'PLN': 'Polish Złoty',
    'SEK': 'Swedish Krona',
    'TRY': 'Turkish Lira',
    'BRL': 'Brazilian Real',
    'AED': 'United Arab Emirates Dirham',
    'SAR': 'Saudi Riyal',
    'SCR': 'Seychellois Rupee',
    'YER': 'Yemeni Rial',
    'SLL': 'Sierra Leonean Leone',
    'MWK': 'Malawian Kwacha',
    'GHS': 'Ghanaian Cedi',
    'UGX': 'Ugandan Shilling',
    'TZS': 'Tanzanian Shilling',
    'XOF': 'West African CFA Franc',
    'DZD': 'Algerian Dinar',
    'EGP': 'Egyptian Pound',
    'PHP': 'Philippine Peso',
    'MYR': 'Malaysian Ringgit',
    'LKR': 'Sri Lankan Rupee',
    'NPR': 'Nepalese Rupee',
    'BDT': 'Bangladeshi Taka',
    'OMR': 'Omani Rial',
    'SGD': 'Singapore Dollar',

    # Second group: listed alphabetically by code.
    'AFN': 'Afghan Afghani',
    'ALL': 'Albanian Lek',
    'AMD': 'Armenian Dram',
    'AOA': 'Angolan Kwanza',
    'ARS': 'Argentine Peso',
    'AZN': 'Azerbaijani Manat',
    'BAM': 'Bosnia and Herzegovina Convertible Mark',
    'BBD': 'Barbadian Dollar',
    'BGN': 'Bulgarian Lev',
    'BHD': 'Bahraini Dinar',
    'BIF': 'Burundian Franc',
    'BMD': 'Bermudian Dollar',
    'BND': 'Brunei Dollar',
    'BOB': 'Bolivian Boliviano',
    'BSD': 'Bahamian Dollar',
    'BTN': 'Bhutanese Ngultrum',
    'BWP': 'Botswana Pula',
    'BYN': 'Belarusian Ruble',
    'BZD': 'Belize Dollar',
    'CDF': 'Congolese Franc',
    'CLP': 'Chilean Peso',
    'COP': 'Colombian Peso',
    'CRC': 'Costa Rican Colón',
    'CUP': 'Cuban Peso',
    'CVE': 'Cape Verdean Escudo',
    'CZK': 'Czech Koruna',
    'DJF': 'Djiboutian Franc',
    'DKK': 'Danish Krone',
    'DOP': 'Dominican Peso',
    'ERN': 'Eritrean Nakfa',
    'ETB': 'Ethiopian Birr',
    'FJD': 'Fijian Dollar',
    'GEL': 'Georgian Lari',
    'GMD': 'Gambian Dalasi',
    'GNF': 'Guinean Franc',
    'GTQ': 'Guatemalan Quetzal',
    'GYD': 'Guyanese Dollar',
    'HKD': 'Hong Kong Dollar',
    'HNL': 'Honduran Lempira',
    'HTG': 'Haitian Gourde',
    'HUF': 'Hungarian Forint',
    'ILS': 'Israeli New Shekel',
    'IQD': 'Iraqi Dinar',
    'IRR': 'Iranian Rial',
    'ISK': 'Icelandic Króna',
    'JMD': 'Jamaican Dollar',
    'JOD': 'Jordanian Dinar',
    'KES': 'Kenyan Shilling',
    'KGS': 'Kyrgyzstani Som',
    'KHR': 'Cambodian Riel',
    'KMF': 'Comorian Franc',
    'KWD': 'Kuwaiti Dinar',
    'KYD': 'Cayman Islands Dollar',
    'KZT': 'Kazakhstani Tenge',
    'LAK': 'Lao Kip',
    'LBP': 'Lebanese Pound',
    'LRD': 'Liberian Dollar',
    'LSL': 'Lesotho Loti',
    'LYD': 'Libyan Dinar',
    'MAD': 'Moroccan Dirham',
    'MDL': 'Moldovan Leu',
    'MGA': 'Malagasy Ariary',
    'MKD': 'Macedonian Denar',
    'MMK': 'Myanmar Kyat',
    'MNT': 'Mongolian Tögrög',
    'MOP': 'Macanese Pataca',
    'MRU': 'Mauritanian Ouguiya',
    'MUR': 'Mauritian Rupee',
    'MVR': 'Maldivian Rufiyaa',
    'MZN': 'Mozambican Metical',
    'NAD': 'Namibian Dollar',
    'NIO': 'Nicaraguan Córdoba',
    'NOK': 'Norwegian Krone',
    'NZD': 'New Zealand Dollar',
    'PAB': 'Panamanian Balboa',
    'PEN': 'Peruvian Sol',
    'PGK': 'Papua New Guinean Kina',
    'PYG': 'Paraguayan Guaraní',
    'QAR': 'Qatari Riyal',
    'RON': 'Romanian Leu',
    'RSD': 'Serbian Dinar',
    'RWF': 'Rwandan Franc',
    'SBD': 'Solomon Islands Dollar',
    'SDG': 'Sudanese Pound',
    'SHP': 'Saint Helena Pound',
    'SOS': 'Somali Shilling',
    'SRD': 'Surinamese Dollar',
    'SSP': 'South Sudanese Pound',
    'STN': 'São Tomé and Príncipe Dobra',
    'SVC': 'Salvadoran Colón',
    'SYP': 'Syrian Pound',
    'SZL': 'Eswatini Lilangeni',
    'TJS': 'Tajikistani Somoni',
    'TMT': 'Turkmenistani Manat',
    'TND': 'Tunisian Dinar',
    'TOP': 'Tongan Paʻanga',
    'TTD': 'Trinidad and Tobago Dollar',
    'TWD': 'New Taiwan Dollar',
    'UAH': 'Ukrainian Hryvnia',
    'UYU': 'Uruguayan Peso',
    'UZS': 'Uzbekistani Som',
    'VES': 'Venezuelan Bolívar',
    'VUV': 'Vanuatu Vatu',
    'WST': 'Samoan Tālā',
    'XAF': 'Central African CFA Franc',
    'XCD': 'East Caribbean Dollar',
    'XPF': 'CFP Franc',
    'ZMW': 'Zambian Kwacha',
    'ZWG': 'Zimbabwean Gold'
}

In [None]:
len(currency_name_dict)

# Mean embedding experiment

In [None]:

import pandas as pd

# One row per currency code found in either dictionary, ordered by code.
# A code missing from one of the dictionaries gets None in that column.
all_acronyms = sorted(currency_symbol_dict.keys() | currency_name_dict.keys())
data = [
    [currency_name_dict.get(code), currency_symbol_dict.get(code), code]
    for code in all_acronyms
]
df_curr = pd.DataFrame(data, columns=['Currency', 'Symbol', 'Acronym'])
display(df_curr.head())

# Subset used by the symbol scenarios: only rows that actually have a symbol.
df_curr_sym = df_curr.loc[df_curr['Symbol'].notna()].copy()
display(df_curr_sym)

In [None]:

import pandas as pd

# Lift every display limit so frames are rendered in full (all rows, all
# columns, untruncated cell contents).
for option_name in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# FinBERT Pipeline

In [None]:

class FinBERTPipeline(Pipeline):
    """Custom pipeline that returns the last-layer [CLS] hidden state as an embedding.

    For each input the pipeline tokenizes the text, runs the model with
    ``output_hidden_states=True`` and returns the first-token vector of the
    last hidden layer as a NumPy array with a leading batch axis.
    """

    def _sanitize_parameters(self, **kwargs):
        """Route call-time keyword arguments to the pipeline stages.

        Only ``max_length`` is recognised; every other keyword is ignored.

        Returns:
            A ``(preprocess_params, forward_params, postprocess_params)`` tuple.
        """
        preprocess_kwargs = {}
        if "max_length" in kwargs:
            preprocess_kwargs["max_length"] = kwargs["max_length"]
        # The base class expects the order (preprocess, forward, postprocess).
        # max_length is a tokenizer argument, so it must sit in the FIRST slot;
        # in the second slot it would be handed to _forward and silently dropped.
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, **kwargs):
        """Tokenize ``inputs`` with padding and truncation; ``kwargs`` may carry ``max_length``."""
        return self.tokenizer(
            inputs,
            return_tensors=self.framework,
            padding=True,
            truncation=True,
            **kwargs,
        )

    def _forward(self, model_inputs, **kwargs):
        """Run the model and extract the first-token vector of the last hidden layer."""
        with torch.no_grad():
            outputs = self.model(**model_inputs, output_hidden_states=True)
            # hidden_states[-1] is the last layer; index 0 on the sequence axis
            # selects the first token of every item in the batch.
            cls_token_embeddings = outputs.hidden_states[-1][:, 0, :].cpu().numpy()
        return {"embeddings": cls_token_embeddings}

    def postprocess(self, model_outputs, **kwargs):
        """Unwrap the embedding array produced by ``_forward``."""
        return model_outputs["embeddings"]


# Make the class available as pipeline("finbert-embedding", ...).
PIPELINE_REGISTRY.register_pipeline(
    "finbert-embedding",
    pipeline_class=FinBERTPipeline,
    pt_model=AutoModelForSequenceClassification,
    default={"model": "ProsusAI/finbert", "revision": "main"}
)


In [None]:
model_name = "ProsusAI/finbert"
# Pick the GPU only when one is actually available, matching the device
# selection done in the evaluation cell, so the notebook also runs on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
!pwd

In [None]:
import pandas as pd

In [None]:
# Full candidate list of embedding models. The next cell rebinds the same
# name, so that later assignment is the list the evaluation actually uses.
model_names_to_test_finsts = [
    "FinLang/finance-embeddings-investopedia",
    "intfloat/multilingual-e5-large",
    "intfloat/multilingual-e5-large-instruct",
    "Alibaba-NLP/gte-large-en-v1.5",
    "Alibaba-NLP/gte-multilingual-base",
    "jinaai/jina-embeddings-v3",
    "jinaai/jina-embeddings-v2-base-en",
    "BAAI/bge-m3",
    "Snowflake/snowflake-arctic-embed-m-v1.5",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "Qwen/Qwen3-Embedding-0.6B",
    "Qwen/Qwen3-Embedding-4B",
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
    "google/embeddinggemma-300M",
]

In [None]:
# Models evaluated in this run, in execution order. FinBERT goes through the
# custom "finbert-embedding" pipeline; all others are loaded with
# SentenceTransformer in the evaluation cell.
model_names_to_test_finsts = [
    "FinLang/finance-embeddings-investopedia",
    "Alibaba-NLP/gte-large-en-v1.5",
    "Alibaba-NLP/gte-multilingual-base",
    "jinaai/jina-embeddings-v2-base-en",
    "BAAI/bge-m3",
    "Snowflake/snowflake-arctic-embed-m-v1.5",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "Qwen/Qwen3-Embedding-0.6B",
    "google/embeddinggemma-300M",
    "intfloat/multilingual-e5-large-instruct",
    "intfloat/multilingual-e5-large",
    "sentence-transformers/LaBSE",
    "ProsusAI/finbert",
]

# model_names_to_test_finsts = ["Qwen/Qwen3-Embedding-4B"]#,"Alibaba-NLP/gte-Qwen2-1.5B-instruct"]


In [None]:
sorted(model_names_to_test_finsts)

In [None]:
len(model_names_to_test_finsts)

In [None]:
!which python

# MRR Calculation Function

In [None]:
import pandas as pd

# Rebuild the currency table from scratch so the evaluation starts from frames
# without any result columns: one row per code found in either dictionary,
# ordered by code, with None where a dictionary has no entry for the code.
all_acronyms = sorted(currency_symbol_dict.keys() | currency_name_dict.keys())
data = [
    [currency_name_dict.get(code), currency_symbol_dict.get(code), code]
    for code in all_acronyms
]
df_curr = pd.DataFrame(data, columns=['Currency', 'Symbol', 'Acronym'])
display(df_curr.head())

# Rows that have a symbol feed the symbol <-> name scenarios.
df_curr_sym = df_curr.loc[df_curr['Symbol'].notna()].copy()
display(df_curr_sym)

In [None]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, util
from transformers.pipelines import Pipeline

def calculate_batched_mrr(
    queries: list[str],
    correct_targets: list[str],
    candidates: list[str],
    model_name: str,
    model: SentenceTransformer = None,
    finbert_pipe: Pipeline = None,
    verbose: bool = False,
) -> list[float]:
    """Compute the reciprocal rank of each query's correct target among ``candidates``.

    All candidates are embedded once; each query is then embedded, compared to
    every candidate by cosine similarity, and the 1-based position of its
    correct target in the descending similarity ranking is inverted.

    Args:
        queries: Query strings, one per evaluated item.
        correct_targets: ``correct_targets[i]`` is the candidate string that
            ``queries[i]`` should retrieve. Must have the same length as ``queries``.
        candidates: Pool of strings to rank. If a string occurs more than once,
            its first position is the one used.
        model_name: ``"ProsusAI/finbert"`` selects ``finbert_pipe``; any other
            value selects ``model``.
        model: Object exposing ``encode(texts, convert_to_tensor=True)``;
            required unless ``model_name`` is ``"ProsusAI/finbert"``.
        finbert_pipe: Callable embedding pipeline; required when ``model_name``
            is ``"ProsusAI/finbert"``.
        verbose: When True, print the queries, candidates and targets before scoring.

    Returns:
        One reciprocal rank per query, rounded to 4 decimals. A query whose
        correct target is not in ``candidates`` scores 0.0 (a warning is printed).

    Raises:
        ValueError: If the encoder required by ``model_name`` is missing, or if
            ``queries`` and ``correct_targets`` differ in length.
    """
    use_finbert = model_name == "ProsusAI/finbert"
    if use_finbert and finbert_pipe is None:
        raise ValueError("finbert_pipe must be provided for ProsusAI/finbert model_name")
    if not use_finbert and model is None:
        raise ValueError("SentenceTransformer model must be provided for non-FinBERT model_name")
    if len(queries) != len(correct_targets):
        raise ValueError(
            f"queries and correct_targets must have the same length "
            f"(got {len(queries)} and {len(correct_targets)})"
        )

    def _embed(texts):
        """Embed a string or a list of strings with the encoder selected by model_name."""
        if use_finbert:
            raw = finbert_pipe(texts, padding=True, truncation=True, max_length=128)
            return torch.from_numpy(np.stack(raw)).float()
        return model.encode(texts, convert_to_tensor=True)

    candidate_embeddings = _embed(candidates)
    # The FinBERT pipeline yields one (1, hidden) array per item, so stacking a
    # list of them gives (n, 1, hidden); drop the singleton axis.
    if candidate_embeddings.dim() == 3:
        candidate_embeddings = candidate_embeddings.squeeze(1)

    # First position of every candidate string: same result as list.index()
    # without rescanning the list for every query.
    first_position = {}
    for position, candidate in enumerate(candidates):
        first_position.setdefault(candidate, position)

    if verbose:
        print('queries ', queries)
        print('len queries ', len(queries))
        print('candidates ', candidates)
        print('len candidates ', len(candidates))
        print('correct_targets ', correct_targets)

    all_mrr_scores = []
    for query, correct_target_string in zip(queries, correct_targets):
        correct_target_idx = first_position.get(correct_target_string)
        if correct_target_idx is None:
            print(f"Warning: Correct target '{correct_target_string}' not found in candidates for query '{query}'. Assigning MRR of 0.0.")
            all_mrr_scores.append(0.0)
            continue

        query_embedding = _embed(query)
        if query_embedding.dim() == 1:
            query_embedding = query_embedding.unsqueeze(0)
        elif query_embedding.dim() == 3:
            query_embedding = query_embedding.squeeze(1)

        # flatten() returns a fresh array, so masking below never touches the tensor.
        current_query_sims = util.cos_sim(query_embedding, candidate_embeddings).cpu().numpy().flatten()

        # In mixed pools the query string itself can be a candidate; push it to
        # the bottom so it cannot trivially take rank 1.
        query_idx_in_candidates = first_position.get(query)
        if query_idx_in_candidates is not None:
            current_query_sims[query_idx_in_candidates] = float('-inf')

        # Stable sort keeps tied candidates in pool order, making the rank deterministic.
        ranked_indices = np.argsort(-current_query_sims, kind="stable")
        rank = int(np.where(ranked_indices == correct_target_idx)[0][0]) + 1
        all_mrr_scores.append(round(1 / rank, 4))

    return all_mrr_scores

In [None]:
model_names_to_test_finsts

In [None]:
import os  # local to this cell: only needed to create the output directory

device = "cuda" if torch.cuda.is_available() else "cpu"
finbert_pipe = None
finbert_model = None
finbert_tokenizer = None


def _load_finbert(target_device):
    """Load the FinBERT model, tokenizer and embedding pipeline on ``target_device``."""
    loaded_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(target_device)
    loaded_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    loaded_pipe = pipeline("finbert-embedding", model=loaded_model, tokenizer=loaded_tokenizer, device=target_device)
    return loaded_model, loaded_tokenizer, loaded_pipe


def _release_accelerator_memory():
    """Run the garbage collector and, only when CUDA is usable, flush its cache."""
    gc.collect()
    # torch.cuda.synchronize() raises on machines without CUDA, so guard it.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()


if "ProsusAI/finbert" in model_names_to_test_finsts:
    try:
        finbert_model, finbert_tokenizer, finbert_pipe = _load_finbert(device)
        print("FinBERT model and pipeline loaded successfully.")
    except Exception as e:
        print(f"Could not load FinBERT on {device}: {e}")
        if device == "cuda":
            # Retry FinBERT alone on CPU; the global `device` is left untouched
            # so the remaining models still run on the GPU.
            print("Attempting to load FinBERT on CPU.")
            try:
                finbert_model, finbert_tokenizer, finbert_pipe = _load_finbert("cpu")
                print("FinBERT model and pipeline loaded successfully on CPU.")
            except Exception as cpu_error:
                print(f"Could not load FinBERT on cpu: {cpu_error}")
        if finbert_pipe is None:
            print("FinBERT failed to load even on CPU. Skipping FinBERT tests.")
            model_names_to_test_finsts.remove("ProsusAI/finbert")


# Candidate pools shared by every model. "Clean" pools contain only the target
# side; "mixed" pools contain both the names and the prefixed codes/symbols.
all_currency_names = df_curr['Currency'].tolist()
all_prefixed_acronyms = [f"Currency {x}" for x in df_curr["Acronym"]]
mixed_curr_acronym_candidates = list(all_currency_names + all_prefixed_acronyms)

all_curr_sym_names = df_curr_sym['Currency'].tolist()
all_prefixed_symbols = [f"Currency {str(x)}" for x in df_curr_sym["Symbol"]]
mixed_sym_curr_candidates = list(all_curr_sym_names + all_prefixed_symbols)

print("Universal candidate lists prepared.")

# The result CSVs are written below; make sure the target directory exists.
os.makedirs(directory_path, exist_ok=True)

for model_name in model_names_to_test_finsts:
    print(f'\n--- Processing model: {model_name} ---')

    model_name_short = model_name.replace('/', '_')
    current_st_model = None
    current_finbert_pipe = None

    if model_name == "ProsusAI/finbert":
        if finbert_pipe is None:
            print(f"Skipping {model_name} as FinBERT pipeline could not be initialized.")
            continue
        current_finbert_pipe = finbert_pipe
        print('Using pre-initialized FinBERT pipeline.')
    else:
        try:
            current_st_model = SentenceTransformer(model_name, trust_remote_code=True).to(device)
            print(f'Loaded SentenceTransformer model {model_name}.')
        except Exception as e:
            print(f"Could not load {model_name} on {device}: {e}. Skipping this model.")
            continue

    # --- Acronym <-> currency name scenarios (all currencies) ---
    acronym_queries = [f"Currency {x}" for x in df_curr['Acronym'].tolist()]
    currency_names = df_curr['Currency'].tolist()
    # (column tag, description, queries, correct targets, candidate pool)
    acronym_scenarios = [
        ("Acronym_Currency_Clean", "Acronym -> Currency Name (Clean candidates)",
         acronym_queries, currency_names, all_currency_names),
        ("Acronym_Currency_Mixed", "Acronym -> Currency Name (Mixed candidates)",
         acronym_queries, currency_names, mixed_curr_acronym_candidates),
        ("Currency_Acronym_Clean", "Currency Name -> Acronym (Clean candidates)",
         currency_names, acronym_queries, all_prefixed_acronyms),
        ("Currency_Acronym_Mixed", "Currency Name -> Acronym (Mixed candidates)",
         currency_names, acronym_queries, mixed_curr_acronym_candidates),
    ]
    for scenario_tag, scenario_label, queries, correct_targets, candidates in acronym_scenarios:
        result_col = f"{model_name_short}_{scenario_tag}_RR"
        print(f"  Calculating MRR for {model_name_short}: {scenario_label}")
        df_curr[result_col] = calculate_batched_mrr(queries, correct_targets, candidates, model_name, model=current_st_model, finbert_pipe=current_finbert_pipe)
        print(f"    Mean MRR ({result_col}): {df_curr[result_col].mean():.4f}")

    df_curr.to_csv(f"{directory_path}/{model_name_short}_curr_all_pools.csv", index=None)
    print(f"Saved {model_name_short}_curr_all_pools.csv")

    # --- Symbol <-> currency name scenarios (currencies that have a symbol) ---
    if not df_curr_sym.empty:
        df_curr_sym_valid_symbols = df_curr_sym[df_curr_sym['Symbol'].notna()].copy() #Filter NAN symbols
        symbol_queries = [f"Currency {str(x)}" for x in df_curr_sym_valid_symbols['Symbol']]
        symbol_currency_names = df_curr_sym_valid_symbols['Currency'].tolist()
        symbol_scenarios = [
            ("Symbol_Currency_Clean", "Symbol -> Currency Name (Clean candidates)",
             symbol_queries, symbol_currency_names, all_curr_sym_names),
            ("Symbol_Currency_Mixed", "Symbol -> Currency Name (Mixed candidates)",
             symbol_queries, symbol_currency_names, mixed_sym_curr_candidates),
            ("Currency_Symbol_Clean", "Currency Name -> Symbol (Clean candidates)",
             symbol_currency_names, symbol_queries, all_prefixed_symbols),
            ("Currency_Symbol_Mixed", "Currency Name -> Symbol (Mixed candidates)",
             symbol_currency_names, symbol_queries, mixed_sym_curr_candidates),
        ]
        for scenario_tag, scenario_label, queries, correct_targets, candidates in symbol_scenarios:
            result_col = f"{model_name_short}_{scenario_tag}_RR"
            print(f"  Calculating MRR for {model_name_short}: {scenario_label}")
            df_curr_sym.loc[df_curr_sym_valid_symbols.index, result_col] = calculate_batched_mrr(queries, correct_targets, candidates, model_name, model=current_st_model, finbert_pipe=current_finbert_pipe)
            print(f"    Mean MRR ({result_col}): {df_curr_sym[result_col].mean():.4f}")

        df_curr_sym.to_csv(f"{directory_path}/{model_name_short}_curr_sym_all_pools.csv", index=None)
        print(f"Saved {model_name_short}_curr_sym_all_pools.csv")
    else:
        print("df_curr_sym is empty, skipping symbol-related calculations.")

    # Drop the model before loading the next one to keep peak memory low.
    if current_st_model is not None:
        del current_st_model
        _release_accelerator_memory()
    print(f"--- {model_name} completed! ---\n")

# NOTE(review): finbert_pipe still references the model and tokenizer, so their
# memory is only reclaimed once finbert_pipe is dropped as well.
if finbert_model is not None:
    del finbert_model
    if finbert_tokenizer is not None:
        del finbert_tokenizer
    _release_accelerator_memory()

# Merging dataframes

In [None]:
import glob
import os
import pandas as pd

# glob returns paths in an unspecified order; sorting makes the column order of
# the consolidated frame (and CSV) reproducible across runs.
files = sorted(glob.glob(f"{directory_path}/*_curr_sym_all_pools.csv"))
mrr_dataframes_sym = []
expected_suffixes = ['_Symbol_Currency_Clean_RR', '_Symbol_Currency_Mixed_RR',
                     '_Currency_Symbol_Clean_RR', '_Currency_Symbol_Mixed_RR']

for file_path in files:
    temp_df_sym = pd.read_csv(file_path)
    # The file name minus its fixed suffix is the short model name, which is
    # also the prefix of that model's result columns.
    model_name_full = os.path.basename(file_path)
    model_name = model_name_full.replace('_curr_sym_all_pools.csv', '')
    # A per-model file may also carry other models' columns; keep only this model's.
    mrr_cols_sym = [
        f"{model_name}{suffix}"
        for suffix in expected_suffixes
        if f"{model_name}{suffix}" in temp_df_sym.columns
    ]

    if mrr_cols_sym:
        df_model_mrr_sym = temp_df_sym[['Acronym', 'Symbol'] + mrr_cols_sym].copy()
        mrr_dataframes_sym.append(df_model_mrr_sym)
    else:
        print(f"Warning: No relevant Symbol-Currency MRR columns found for {model_name} in {file_path}. Skipping.")

# Left-join every model's columns onto the in-memory (Acronym, Symbol) table.
mrr_consolidated_df_symbol_currency = df_curr_sym[['Acronym', 'Symbol']].copy()
for model_df_sym in mrr_dataframes_sym:
    mrr_consolidated_df_symbol_currency = pd.merge(mrr_consolidated_df_symbol_currency, model_df_sym, on=['Acronym', 'Symbol'], how='left')

print("Consolidated Symbol-Currency MRR DataFrame created:")
display(mrr_consolidated_df_symbol_currency.head())
mrr_consolidated_df_symbol_currency.to_csv(f"{directory_path}/mrr_consolidated_df_symbol_currency_all_pools.csv", index=None)

In [None]:
import glob
import os
import pandas as pd

# glob returns paths in an unspecified order; sorting makes the column order of
# the consolidated frame (and CSV) reproducible across runs.
files = sorted(glob.glob(f"{directory_path}/*_curr_all_pools.csv"))
mrr_dataframes_acronym = []
expected_suffixes = ['_Acronym_Currency_Clean_RR', '_Acronym_Currency_Mixed_RR',
                     '_Currency_Acronym_Clean_RR', '_Currency_Acronym_Mixed_RR']

for file_path in files:
    temp_df_acronym = pd.read_csv(file_path)
    # The file name minus its fixed suffix is the short model name, which is
    # also the prefix of that model's result columns.
    model_name_full = os.path.basename(file_path)
    model_name = model_name_full.replace('_curr_all_pools.csv', '')
    # A per-model file may also carry other models' columns; keep only this model's.
    mrr_cols_acronym = [
        f"{model_name}{suffix}"
        for suffix in expected_suffixes
        if f"{model_name}{suffix}" in temp_df_acronym.columns
    ]

    if mrr_cols_acronym:
        df_model_mrr_acronym = temp_df_acronym[['Acronym'] + mrr_cols_acronym].copy()
        mrr_dataframes_acronym.append(df_model_mrr_acronym)
    else:
        print(f"Warning: No relevant Acronym-Currency MRR columns found for {model_name} in {file_path}. Skipping.")

# Left-join every model's columns onto the in-memory Acronym table.
mrr_consolidated_df_acronym_currency = df_curr[['Acronym']].copy()
for model_df_acronym in mrr_dataframes_acronym:
    mrr_consolidated_df_acronym_currency = pd.merge(mrr_consolidated_df_acronym_currency, model_df_acronym, on='Acronym', how='left')

print("Consolidated Acronym-Currency MRR DataFrame created:")
display(mrr_consolidated_df_acronym_currency.head())
mrr_consolidated_df_acronym_currency.to_csv(f"{directory_path}/mrr_consolidated_df_acronym_currency_all_pools.csv", index=None)

# Load and consolidate data

In [None]:
import pandas as pd

# Both names start as None so later cells can detect a missing file.
acronym_df = None
symbol_df = None

consolidated_csv_names = {
    'acronym': 'mrr_consolidated_df_acronym_currency_all_pools.csv',
    'symbol': 'mrr_consolidated_df_symbol_currency_all_pools.csv',
}
loaded_frames = {}
for frame_key, csv_name in consolidated_csv_names.items():
    try:
        loaded_frame = pd.read_csv(f'{directory_path}/{csv_name}')
        print(f"Loaded {csv_name} successfully.")
        display(loaded_frame.head())
    except FileNotFoundError:
        # A missing file is reported, not raised; the frame stays None.
        loaded_frame = None
        print(f"Error: '{csv_name}' not found. Please ensure the file exists.")
    loaded_frames[frame_key] = loaded_frame
    if frame_key == 'acronym':
        acronym_df = loaded_frame
    else:
        symbol_df = loaded_frame

In [None]:
model_names_to_test_finsts

In [None]:
# Column-/file-safe model identifiers: every '/' in a model name becomes '_',
# the same mapping the evaluation loop uses to prefix result columns and CSV files.
model_short_names = [name.replace('/', '_') for name in model_names_to_test_finsts]


In [None]:
model_short_names

## Prepare Data for Acronym-Currency Scenario Plots


In [None]:
import glob
import os
import pandas as pd

# glob returns paths in an unspecified order; sorting makes the column order of
# the consolidated frame (and CSV) reproducible across runs.
files = sorted(glob.glob(f"{directory_path}/*_curr_sym_all_pools.csv"))
mrr_dataframes_sym = []
expected_suffixes = ['_Symbol_Currency_Clean_RR', '_Symbol_Currency_Mixed_RR',
                     '_Currency_Symbol_Clean_RR', '_Currency_Symbol_Mixed_RR']

for file_path in files:
    temp_df_sym = pd.read_csv(file_path)
    # The file name minus its fixed suffix is the short model name, which is
    # also the prefix of that model's result columns.
    model_name_full = os.path.basename(file_path)
    model_name = model_name_full.replace('_curr_sym_all_pools.csv', '')
    # A per-model file may also carry other models' columns; keep only this model's.
    mrr_cols_sym = [
        f"{model_name}{suffix}"
        for suffix in expected_suffixes
        if f"{model_name}{suffix}" in temp_df_sym.columns
    ]

    if mrr_cols_sym:
        df_model_mrr_sym = temp_df_sym[['Acronym', 'Symbol'] + mrr_cols_sym].copy()
        mrr_dataframes_sym.append(df_model_mrr_sym)
    else:
        print(f"Warning: No relevant Symbol-Currency MRR columns found for {model_name} in {file_path}. Skipping.")

# Left-join every model's columns onto the in-memory (Acronym, Symbol) table.
mrr_consolidated_df_symbol_currency = df_curr_sym[['Acronym', 'Symbol']].copy()
for model_df_sym in mrr_dataframes_sym:
    mrr_consolidated_df_symbol_currency = pd.merge(mrr_consolidated_df_symbol_currency, model_df_sym, on=['Acronym', 'Symbol'], how='left')

print("Consolidated Symbol-Currency MRR DataFrame created:")
display(mrr_consolidated_df_symbol_currency.head())
mrr_consolidated_df_symbol_currency.to_csv(f"{directory_path}/mrr_consolidated_df_symbol_currency_all_pools.csv", index=None)

In [None]:
import glob
import os
import pandas as pd

# glob returns paths in an unspecified order; sorting makes the column order of
# the consolidated frame (and CSV) reproducible across runs.
files = sorted(glob.glob(f"{directory_path}/*_curr_all_pools.csv"))

mrr_dataframes_acronym = []
expected_suffixes = ['_Acronym_Currency_Clean_RR', '_Acronym_Currency_Mixed_RR',
                     '_Currency_Acronym_Clean_RR', '_Currency_Acronym_Mixed_RR']

for file_path in files:
    temp_df_acronym = pd.read_csv(file_path)
    # The file name minus its fixed suffix is the short model name, which is
    # also the prefix of that model's result columns.
    model_name_full = os.path.basename(file_path)
    model_name = model_name_full.replace('_curr_all_pools.csv', '')
    # A per-model file may also carry other models' columns; keep only this model's.
    mrr_cols_acronym = [
        f"{model_name}{suffix}"
        for suffix in expected_suffixes
        if f"{model_name}{suffix}" in temp_df_acronym.columns
    ]

    if mrr_cols_acronym:
        df_model_mrr_acronym = temp_df_acronym[['Acronym'] + mrr_cols_acronym].copy()
        mrr_dataframes_acronym.append(df_model_mrr_acronym)
    else:
        print(f"Warning: No relevant Acronym-Currency MRR columns found for {model_name} in {file_path}. Skipping.")

# Left-join every model's columns onto the in-memory Acronym table.
mrr_consolidated_df_acronym_currency = df_curr[['Acronym']].copy()
for model_df_acronym in mrr_dataframes_acronym:
    mrr_consolidated_df_acronym_currency = pd.merge(mrr_consolidated_df_acronym_currency, model_df_acronym, on='Acronym', how='left')

print("Consolidated Acronym-Currency MRR DataFrame created:")
display(mrr_consolidated_df_acronym_currency.head())
mrr_consolidated_df_acronym_currency.to_csv(f"{directory_path}/mrr_consolidated_df_acronym_currency_all_pools.csv", index=None)

In [None]:
import pandas as pd

# Reload the two consolidated MRR tables from directory_path. A frame stays None when
# its CSV is missing, which the following cells check before using it.
acronym_df = None
symbol_df = None

acronym_csv_name = 'mrr_consolidated_df_acronym_currency_all_pools.csv'
symbol_csv_name = 'mrr_consolidated_df_symbol_currency_all_pools.csv'

loaded_consolidated_frames = {}
for csv_name in (acronym_csv_name, symbol_csv_name):
    loaded_consolidated_frames[csv_name] = None
    try:
        loaded_consolidated_frames[csv_name] = pd.read_csv(f'{directory_path}/{csv_name}')
        print(f"Loaded {csv_name} successfully.")
        display(loaded_consolidated_frames[csv_name].head())
    except FileNotFoundError:
        print(f"Error: '{csv_name}' not found. Please ensure the file exists.")

acronym_df = loaded_consolidated_frames[acronym_csv_name]
symbol_df = loaded_consolidated_frames[symbol_csv_name]

In [None]:
import pandas as pd
import numpy as np

# Reshape the wide acronym table (one reciprocal-rank column per model / direction /
# pool type) into long format: one record per non-missing value, ready for seaborn.
ac_cn_plot_data = []
ac_cn_plot_columns = ['Model', 'MRR Type', 'Scenario', 'MRR']

# (scenario label, infix of that scenario's "<model><infix>{Clean,Mixed}_RR" columns)
ac_cn_scenarios = (
    ('Acronym -> Currency Name', '_Acronym_Currency_'),
    ('Currency Name -> Acronym', '_Currency_Acronym_'),
)

if acronym_df is None:
    print("Error: 'acronym_df' is not loaded. Cannot prepare plot data.")
else:
    # NOTE(review): model_short_names is assigned in the later "Per Model MRR" cell;
    # confirm it is also defined before this cell so Restart & Run All works.
    for model_short_name in model_short_names:
        for scenario_label, column_infix in ac_cn_scenarios:
            clean_col = f"{model_short_name}{column_infix}Clean_RR"
            mixed_col = f"{model_short_name}{column_infix}Mixed_RR"

            # A scenario is plotted only when both its Clean and Mixed columns exist.
            if clean_col not in acronym_df.columns or mixed_col not in acronym_df.columns:
                print(f"Warning: Missing columns for {model_short_name} {scenario_label} in acronym_df. Skipping this scenario for plotting.")
                continue

            for mrr_type, source_col in (('MRR Clean', clean_col), ('MRR Mixed', mixed_col)):
                ac_cn_plot_data.extend(
                    {
                        'Model': model_short_name,
                        'MRR Type': mrr_type,
                        'Scenario': scenario_label,
                        'MRR': mrr_value,
                    }
                    for mrr_value in acronym_df[source_col].dropna()
                )

    # Passing the column list explicitly keeps the frame well-formed even when no
    # records were collected; without it the 'Model' lookup below raises KeyError.
    ac_cn_plot_df = pd.DataFrame(ac_cn_plot_data, columns=ac_cn_plot_columns)
    # Shorten the label shown on the plot axis: drop everything up to the first '_'.
    ac_cn_plot_df['Model'] = ac_cn_plot_df['Model'].apply(lambda x: x.split('_', 1)[1] if '_' in x else x)
    print("\nLong-format DataFrame (ac_cn_plot_df) created:")
    display(ac_cn_plot_df.head())
    print(f"Total entries in ac_cn_plot_df: {len(ac_cn_plot_df)}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

def _save_mrr_boxplot(scenario_df, title, pdf_name, label_fontsize, legend_fontsize, legend_title_fontsize):
    """Draw one Clean-vs-Mixed MRR box plot per model, save it as a PDF and show it.

    scenario_df is the long-format frame for a single scenario ('Model', 'MRR',
    'MRR Type' columns). The figure is written to plots_dir/pdf_name and that path
    is returned.
    """
    sns.set_style('whitegrid')
    plt.figure(figsize=(16, 9))

    # White diamonds mark the per-box mean (showmeans=True).
    mean_marker_style = {
        "marker": "D",
        "markerfacecolor": "white",
        "markeredgecolor": "black",
        "markersize": "10"
    }
    sns.boxplot(
        data=scenario_df,
        x='Model',
        y='MRR',
        hue='MRR Type',
        palette={'MRR Clean': 'skyblue', 'MRR Mixed': 'lightcoral'},
        showmeans=True,
        meanprops=mean_marker_style,
        showfliers=True,
        width=0.6
    )

    plt.title(title, fontsize=20)
    plt.ylabel('MRR', fontsize=label_fontsize)
    plt.xticks(rotation=45, ha='right', fontsize=label_fontsize)
    plt.yticks(fontsize=22)
    plt.legend(title='MRR Type', fontsize=legend_fontsize, title_fontsize=legend_title_fontsize)
    plt.tight_layout()

    saved_path = os.path.join(plots_dir, pdf_name)
    plt.savefig(saved_path)
    plt.show()
    return saved_path


plots_dir = os.path.join(directory_path, 'plots')

# Plot 1: Acronym -> Currency Name
os.makedirs(plots_dir, exist_ok=True)
ac_to_cn_df = ac_cn_plot_df.loc[ac_cn_plot_df['Scenario'] == 'Acronym -> Currency Name'].copy()
output_filepath = _save_mrr_boxplot(
    ac_to_cn_df,
    'MRR : Acronym -> Currency Name ',
    'acronym_to_currency_name_mrr_boxplot.pdf',
    label_fontsize=24,
    legend_fontsize=24,
    legend_title_fontsize=24,
)

# Plot 2: Currency Name -> Acronym
os.makedirs(plots_dir, exist_ok=True)
cn_to_ac_df = ac_cn_plot_df.loc[ac_cn_plot_df['Scenario'] == 'Currency Name -> Acronym'].copy()
output_filepath = _save_mrr_boxplot(
    cn_to_ac_df,
    'MRR : Currency Name -> Acronym ',
    'currency_name_to_acronym_mrr_boxplot.pdf',
    label_fontsize=26,
    legend_fontsize=14,
    legend_title_fontsize=16,
)

print(f"Box plot saved to: {output_filepath}")

In [None]:
import numpy as np

# Per-currency summary for the acronym experiments: for every row of acronym_df and
# each retrieval direction, take the mean and (population) standard deviation of all
# non-missing reciprocal ranks across models and pool types.
acronym_scenario_data = []

if acronym_df is None:
    print("Error: acronym_df is not loaded. Cannot process.")
elif acronym_df.empty:
    print("Warning: acronym_df is empty. No data to process.")
else:
    # Resolve each direction's columns once; they are the same for every row.
    acronym_scenario_columns = [
        (scenario_label, [col for col in acronym_df.columns if col.endswith(column_suffixes)])
        for scenario_label, column_suffixes in (
            ('Acronym -> Currency Name', ('_Acronym_Currency_Clean_RR', '_Acronym_Currency_Mixed_RR')),
            ('Currency Name -> Acronym', ('_Currency_Acronym_Clean_RR', '_Currency_Acronym_Mixed_RR')),
        )
    ]

    for _, currency_row in acronym_df.iterrows():
        for scenario_label, rr_columns in acronym_scenario_columns:
            rr_values = currency_row[rr_columns].dropna().tolist()

            if not rr_values:
                # No value at all for this currency: both statistics are undefined.
                mean_mrr, std_mrr = np.nan, np.nan
            elif len(rr_values) == 1:
                mean_mrr, std_mrr = np.mean(rr_values), 0.0
            else:
                mean_mrr, std_mrr = np.mean(rr_values), np.std(rr_values)

            acronym_scenario_data.append({
                'Acronym': currency_row['Acronym'],
                'Scenario': scenario_label,
                'Mean_MRR': mean_mrr,
                'Std_MRR': std_mrr
            })

    acronym_mrr_summary_df = pd.DataFrame(acronym_scenario_data)
    print("Acronym MRR summary DataFrame created successfully.")
    display(acronym_mrr_summary_df.head())

In [None]:
import numpy as np

# Per-currency summary for the symbol experiments: for every row of symbol_df and each
# retrieval direction, aggregate the non-missing reciprocal ranks across all models and
# pool types into a mean and a (population) standard deviation.
symbol_scenario_data = []

if symbol_df is None:
    print("Error: symbol_df is not loaded. Cannot process.")
elif symbol_df.empty:
    print("Warning: symbol_df is empty. No data to process.")
else:
    # Scenario label -> columns holding that scenario's reciprocal ranks.
    symbol_scenario_columns = {
        'Symbol -> Currency Name': [
            col for col in symbol_df.columns
            if col.endswith(('_Symbol_Currency_Clean_RR', '_Symbol_Currency_Mixed_RR'))
        ],
        'Currency Name -> Symbol': [
            col for col in symbol_df.columns
            if col.endswith(('_Currency_Symbol_Clean_RR', '_Currency_Symbol_Mixed_RR'))
        ],
    }

    for _, currency_row in symbol_df.iterrows():
        for scenario_label, rr_columns in symbol_scenario_columns.items():
            rr_values = currency_row[rr_columns].dropna().tolist()

            if rr_values:
                mean_mrr = np.mean(rr_values)
                # A single observation has no spread; report 0.0 for it.
                std_mrr = np.std(rr_values) if len(rr_values) > 1 else 0.0
            else:
                mean_mrr = std_mrr = np.nan

            symbol_scenario_data.append({
                'Acronym': currency_row['Acronym'],
                'Symbol': currency_row['Symbol'],
                'Scenario': scenario_label,
                'Mean_MRR': mean_mrr,
                'Std_MRR': std_mrr
            })

    symbol_mrr_summary_df = pd.DataFrame(symbol_scenario_data)
    print("Symbol MRR summary DataFrame created successfully.")
    display(symbol_mrr_summary_df.head())


# Displaying Top and Bottom Currencies

In [None]:
import pandas as pd

def display_top_bottom_currencies(df, scenario_name, display_col='Acronym', n=5):
    """Print the n best and n worst currencies of one scenario, ranked by Mean_MRR.

    Args:
        df: summary frame with 'Scenario', 'Mean_MRR' and 'Std_MRR' columns plus
            the column named by display_col.
        scenario_name: value of the 'Scenario' column to report on.
        display_col: column whose value labels each printed line.
        n: number of rows listed at each end of the ranking.

    Rows whose Mean_MRR is NaN are excluded from the ranking. sort_values() places
    NaN last in both directions, so without this they would show up as "nan ± nan"
    entries in the top and bottom lists whenever fewer than n rows have a value.
    """
    print(f"\n--- Scenario: {scenario_name} ---")
    # Filter once and reuse the result for both ends of the ranking.
    ranked_rows = df[df['Scenario'] == scenario_name].dropna(subset=['Mean_MRR'])

    for end_label, ascending in (('Top', False), ('Bottom', True)):
        selected = ranked_rows.sort_values(by='Mean_MRR', ascending=ascending).head(n)
        if selected.empty:
            print(f"No {end_label.lower()} {n} currencies found for this scenario.")
            continue
        print(f"{end_label} {n} Currencies by Mean \u00b1 Std MRR:")
        for _, row in selected.iterrows():
            print(f"{row[display_col]}: {row['Mean_MRR']:.4f} \u00b1 {row['Std_MRR']:.4f}")
# Report the best/worst currencies for every scenario whose summary frame exists.
# The frames are looked up by name because the cells that build them only assign the
# variable when their input table was loaded and non-empty.
for summary_var_name, shown_column, scenario_labels in (
    ('acronym_mrr_summary_df', 'Acronym', ('Acronym -> Currency Name', 'Currency Name -> Acronym')),
    ('symbol_mrr_summary_df', 'Symbol', ('Symbol -> Currency Name', 'Currency Name -> Symbol')),
):
    summary_frame = globals().get(summary_var_name)
    if summary_frame is None:
        print(f"Error: {summary_var_name} is not available.")
        continue
    for scenario_label in scenario_labels:
        display_top_bottom_currencies(summary_frame, scenario_label, display_col=shown_column)

# Formatting Table

In [None]:
import pandas as pd

# Accumulates one record per listed currency (scenario, top/bottom, formatted MRR);
# turned into summary_table_df at the end of this cell.
summary_table_data = []


def format_mrr_std(mean_mrr, std_mrr):
    """Render a mean/std pair as "mean ± std" with four decimals.

    Returns "N/A" when either value is missing (as judged by pd.isna).
    """
    either_missing = any(pd.isna(value) for value in (mean_mrr, std_mrr))
    return "N/A" if either_missing else f"{mean_mrr:.4f} ± {std_mrr:.4f}"

# Look the summary frames up by name instead of referencing them directly: the cells
# that build them only assign the variable on their success path, so a direct
# reference could raise NameError here and the "not available" check below would
# never be reached.
acronym_summary_frame = globals().get('acronym_mrr_summary_df')
symbol_summary_frame = globals().get('symbol_mrr_summary_df')

# One entry per scenario: which summary frame holds it and which column labels a row.
scenarios_config = [
    {'name': 'Acronym -> Currency Name', 'df': acronym_summary_frame, 'display_col': 'Acronym'},
    {'name': 'Currency Name -> Acronym', 'df': acronym_summary_frame, 'display_col': 'Acronym'},
    {'name': 'Symbol -> Currency Name', 'df': symbol_summary_frame, 'display_col': 'Symbol'},
    {'name': 'Currency Name -> Symbol', 'df': symbol_summary_frame, 'display_col': 'Symbol'},
]

for config in scenarios_config:
    scenario_name = config['name']
    df_to_use = config['df']
    display_col = config['display_col']

    if df_to_use is None or df_to_use.empty:
        print(f"Warning: DataFrame for scenario '{scenario_name}' is not available or is empty. Skipping.")
        continue

    scenario_df_filtered = df_to_use[df_to_use['Scenario'] == scenario_name]

    # Five highest means first, then the five lowest, each as its own category.
    for category_label, ascending in (('Top 5', False), ('Bottom 5', True)):
        ranked_currencies = scenario_df_filtered.sort_values(by='Mean_MRR', ascending=ascending).head(5)
        for _, row in ranked_currencies.iterrows():
            summary_table_data.append({
                'Category': f"{scenario_name} ({category_label})",
                'Currency Name': row[display_col],
                'Mean ± Std MRR': format_mrr_std(row['Mean_MRR'], row['Std_MRR'])
            })

summary_table_df = pd.DataFrame(summary_table_data)

print("\n--- Formatted MRR Summary Table ---")
display(summary_table_df)


# Per Model MRR (top 5 and bottom 5 currencies)

In [None]:
import pandas as pd
import numpy as np

# For every model and retrieval direction, average each currency's Clean and Mixed
# reciprocal ranks and record the five best and five worst currencies.
per_model_mrr_summary_data = []
model_short_names = [name.replace('/', '_') for name in model_names_to_test_finsts]

# One entry per direction: the consolidated frame to read, the column identifying a
# currency, and the infix of that direction's "<model><infix><type>_RR" columns.
scenario_configurations = [
    {
        'name': scenario_label,
        'df': source_frame,
        'identifier_col': id_column,
        'mrr_col_prefix': column_infix,
        'col_types': ['Clean', 'Mixed']
    }
    for scenario_label, source_frame, id_column, column_infix in (
        ('Acronym -> Currency Name', acronym_df, 'Acronym', '_Acronym_Currency_'),
        ('Currency Name -> Acronym', acronym_df, 'Acronym', '_Currency_Acronym_'),
        ('Symbol -> Currency Name', symbol_df, 'Symbol', '_Symbol_Currency_'),
        ('Currency Name -> Symbol', symbol_df, 'Symbol', '_Currency_Symbol_'),
    )
]

for model_short_name in model_short_names:
    print(f"\nProcessing model: {model_short_name}")
    for config in scenario_configurations:
        scenario_name = config['name']
        df_to_use = config['df']
        identifier_col = config['identifier_col']

        if df_to_use is None or df_to_use.empty:
            print(f"  Warning: DataFrame for scenario '{scenario_name}' is not available or is empty. Skipping.")
            continue

        print(f"  Processing scenario: {scenario_name}")

        # Keep the columns of this model/direction that exist; warn about the others.
        mrr_columns_for_this_model_scenario = []
        for col_type in config['col_types']:
            col_name = f"{model_short_name}{config['mrr_col_prefix']}{col_type}_RR"
            if col_name not in df_to_use.columns:
                print(f"    Warning: Missing column '{col_name}'.")
                continue
            mrr_columns_for_this_model_scenario.append(col_name)

        if len(mrr_columns_for_this_model_scenario) == 0:
            print(f"    No MRR columns found for {model_short_name} in {scenario_name}. Skipping.")
            continue

        # Mean / population std over the non-missing values of each currency row.
        current_scenario_summary = []
        selected_frame = df_to_use[[identifier_col] + mrr_columns_for_this_model_scenario]
        for _, row in selected_frame.iterrows():
            mrr_values = row[mrr_columns_for_this_model_scenario].dropna().tolist()

            if mrr_values:
                mean_mrr = np.mean(mrr_values)
                std_mrr = np.std(mrr_values) if len(mrr_values) > 1 else 0.0
            else:
                mean_mrr = std_mrr = np.nan

            current_scenario_summary.append({
                identifier_col: row[identifier_col],
                'Mean_MRR': mean_mrr,
                'Std_MRR': std_mrr
            })

        # Currencies without any value cannot be ranked, so they are dropped here.
        summary_df_per_model_scenario = pd.DataFrame(current_scenario_summary).dropna(subset=['Mean_MRR'])

        if summary_df_per_model_scenario.empty:
            print(f"    No valid MRR data for {model_short_name} in {scenario_name}. Skipping top/bottom 5.")
            continue

        # Highest five means first, then the lowest five.
        for category_label, ascending in (('Top 5', False), ('Bottom 5', True)):
            ranked_currencies = summary_df_per_model_scenario.sort_values(by='Mean_MRR', ascending=ascending).head(5)
            for _, currency_row in ranked_currencies.iterrows():
                per_model_mrr_summary_data.append({
                    'Model': model_short_name,
                    'Scenario': scenario_name,
                    'Category': category_label,
                    'Currency Identifier': currency_row[identifier_col],
                    'Mean_MRR': currency_row['Mean_MRR'],
                    'Std_MRR': currency_row['Std_MRR']
                })

per_model_mrr_summary_df = pd.DataFrame(per_model_mrr_summary_data)
print("\nPer-model MRR summary data collected:")
display(per_model_mrr_summary_df.head(10))
print(f"Total entries in per_model_mrr_summary_df: {len(per_model_mrr_summary_df)}")

In [None]:
# Preview the first five rows of the per-model top/bottom summary built in the previous cell.
per_model_mrr_summary_df.head(5)