In [None]:
import pandas as pd
import glob
import os
from collections import defaultdict
import json
import re
def open_json(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 so that the result does not depend
    on the platform's default text encoding.

    Args:
        path: Location of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Collect the experiment directories to analyse: paths matching the glob below
# that also contain a '0' followed by one or more digits and an underscore
# (presumably the threshold suffix of the model name -- confirm against the
# directory naming scheme).
# glob returns matches in arbitrary order, so the list is sorted to keep the
# downstream "last directory wins" bookkeeping reproducible between runs.
_numeric_tag = re.compile(r'0\d+_')
files = sorted(
    path
    for path in glob.glob('experiments/*provence*0*_mkqa_*')
    if _numeric_tag.search(path)
)
print(len(files))

In [None]:
# Metric tables indexed as table[lang][model_name].
compression = defaultdict(dict)
recall3gram = defaultdict(dict)
for f in files:
    # Skip any path that contains 'tmp' anywhere.
    if 'tmp' in f:
        continue
    # The directory name is '_'-separated: the third field is used as the model
    # identifier and the second-to-last field as the language code.
    # os.path.basename (rather than splitting on '/') keeps this correct when
    # glob returns paths with the OS-native separator.
    name_parts = os.path.basename(f).split('_')
    model_name = name_parts[2]
    lang = name_parts[-2]
    metric_file = open_json(os.path.join(f, 'eval_dev_metrics.json'))
    if 'noprovence' in model_name:
        # No-pruning runs are recorded with a fixed compression of 0.
        compression[lang][model_name] = 0
    elif 'provencezeroshot' in model_name:
        # Zero-shot runs are recorded with a fixed compression of 100.
        compression[lang][model_name] = 100
    else:
        compression[lang][model_name] = metric_file['compression_ratio_mean']
    recall3gram[lang][model_name] = metric_file['Recall_char3gram']

In [None]:
# Reshape each {lang: {model: value}} mapping into a long table with one row per
# (model, lang) pair, then join the two metrics on that pair.
df = (
    pd.DataFrame(compression)
    .reset_index(names='model')
    .melt(id_vars=['model'], var_name='lang', value_name='compression_ratio')
)
df2 = (
    pd.DataFrame(recall3gram)
    .reset_index(names='model')
    .melt(id_vars=['model'], var_name='lang', value_name='recall3gram')
)
final_df = df.merge(df2, on=['model', 'lang'], how='inner')

In [None]:
# Split the raw model identifier into a threshold string and a cleaned model name.
#
# The threshold is taken from the trailing "0<digits>" suffix of the raw name
# (e.g. a name ending in "05" yields '0.5'). It is derived only once: after the
# suffix has been stripped from 'model' below, the pattern no longer matches, so
# re-running this cell without the guard would fail on re.search(...) returning None.
if 'threshold' not in final_df.columns:
    final_df['threshold'] = final_df['model'].apply(lambda x: '0.' + re.search(r'0(\d+)$', x).group(1))

# Ordered regex rewrites applied to the model name; order matters because later
# patterns are matched against the output of earlier ones.
for pattern, replacement in (
    (r'0\d+$', ''),
    (r'xprovence', ''),
    (r'best', 'cross-lingual'),
    (r'noprovence', 'no-pruning'),
    (r'provencezeroshot', 'zeroshot'),
):
    final_df['model'] = final_df['model'].apply(lambda x: re.sub(pattern, replacement, x))

In [None]:
# Sanity check: list the distinct values of each key column.
for column in ('model', 'threshold', 'lang'):
    print(final_df[column].unique())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.lines import Line2D

# Averaged trade-off figure for the seen MKQA languages: for every
# (model, threshold) pair the metrics are averaged over languages and drawn as
# 3-gram recall (x) against compression rate (y).
df = final_df

# Hex colour for each cleaned model name.
model_colors = {
    'no-pruning': '#000000',                # black
    'msmarcocomp': '#117733',               # green
    'dslr': "#7B7D7F",                      # grey
    'finalcomp': '#D8A547',                 # gold
    'msmarcotranslated2Mjoint': "#AA4499",  # purple
    'msmarcotranslated2Mcomp': "#C92F4B"    # red
}

# Matplotlib marker for each model.
model_markers = {
    'no-pruning': 'P',
    'msmarcocomp': 'v',
    'dslr': 'H',
    'finalcomp': '^',
    'msmarcotranslated2Mjoint': '*',
    'msmarcotranslated2Mcomp': 's',
}

# Display label for each model.
model_to_name = {
    'no-pruning': 'No Compression',
    'msmarcocomp': 'Cross-Lingual',
    'finalcomp': 'Reannotated\nMultilingual Data',
    'dslr': 'DSLR',
    'msmarcotranslated2Mjoint': 'XProvence (w/ reranking)',
    'msmarcotranslated2Mcomp': 'Data Translation'
}

# Models to draw, in drawing order.
models_to_plot = ['no-pruning', 'finalcomp',  'dslr', 'msmarcocomp', 'msmarcotranslated2Mjoint', 'msmarcotranslated2Mcomp']

aya_model_langs = ["en", "fr", "de", "es", "it", "pt", "ja", "ko", "zh", "ar", "el", "fa", "pl", "id", "cs", "he", "hi", "nl", "ro", "ru", "tr", "uk", "vi"]
seen_languages = ['ar', 'en', 'es', 'fr', 'ko', 'ru', 'zh']
unseen_languages = ['de', 'he', 'it', 'nl', 'pl', 'pt', 'tr', 'vi']

# Restrict to languages that are both in the Aya list and in the seen set.
df = df[(df['lang'].isin(aya_model_langs))]
df = df[(df['lang'].isin(seen_languages))]

num_langs = df['lang'].nunique()

# Average recall and compression over languages for each (model, threshold),
# remembering which languages contributed to each average.
avg_data = df.dropna().groupby(['model', 'threshold']).agg({
    'recall3gram': 'mean',
    'compression_ratio': 'mean',
    'lang': list
}).reset_index()
avg_data['num_langs'] = avg_data['lang'].apply(len)

# Keep only averages that cover every selected language, so that all points
# are computed over the same language set.
avg_data = avg_data[avg_data['num_langs'] == num_langs]

# Keep only the models we want to plot.
avg_data = avg_data[avg_data['model'].isin(models_to_plot)]
# Keep points whose mean compression is strictly between 15 and 95, plus points
# below 1 (no-pruning runs are stored with compression 0 when metrics are loaded).
avg_data = avg_data[((avg_data['compression_ratio'] > 15) & (avg_data['compression_ratio'] < 95)) | (avg_data['compression_ratio'] < 1)]

# Fail early with a readable message: with no rows the axis limits below would
# be NaN and matplotlib would reject them with a less helpful ValueError.
if avg_data.empty:
    raise ValueError('No averaged data points left to plot for the seen languages.')

plt.figure(figsize=(6.5, 5))

for model in models_to_plot:
    model_data = avg_data[avg_data['model'] == model].sort_values('threshold')
    if model_data.empty:
        continue

    # Dashed line through this model's points, ordered by threshold.
    plt.plot(
        model_data['recall3gram'],
        model_data['compression_ratio'],
        color=model_colors[model],
        linestyle='--',
        linewidth=6,
        alpha=0.7,
        label=model_to_name.get(model, model)
    )

    # All points of a model share colour and marker; only the highlighted model
    # ('msmarcotranslated2Mjoint') is drawn with larger markers.
    marker_size = 600 if model == 'msmarcotranslated2Mjoint' else 250
    plt.scatter(
        model_data['recall3gram'],
        model_data['compression_ratio'],
        c=model_colors[model],
        marker=model_markers[model],
        s=marker_size,
        alpha=0.9,
        edgecolors='black',
        linewidth=1.5,
        zorder=5  # Ensure points are on top of lines
    )

# Titles, labels and tick styling.
plt.xlabel('3-gram Recall', fontsize=24, fontweight='bold')
plt.ylabel('Compression Rate (%)', fontsize=24, fontweight='bold')
plt.title('(a) MKQA - Seen - $L_{cntx} = L_q$', fontsize=25, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

# Axis limits with 5% padding. The x-axis is a fixed window of 0.03 (plus
# padding) below the best recall, so points with lower recall are not visible.
all_recall = avg_data['recall3gram']
all_compression = avg_data['compression_ratio']

recall_padding = (all_recall.max() - all_recall.min()) * 0.05
compression_padding = (all_compression.max() - all_compression.min()) * 0.05

plt.xlim(all_recall.max() - 0.03 - recall_padding, all_recall.max() + recall_padding)
plt.ylim(all_compression.min() - compression_padding, all_compression.max() + compression_padding)

# Marker-only legend handles. NOTE: this cell never passes them to a legend
# call, so no legend is drawn on this figure.
legend_elements = [
    Line2D([0], [0], color='w', marker=model_markers[model], markerfacecolor=model_colors[model],
           markeredgecolor='black', markersize=13, linewidth=0,
           label=model_to_name[model])
    for model in models_to_plot
]

# Keep every second x tick to avoid crowded tick labels.
ax = plt.gca()
xticks = ax.get_xticks()
ax.set_xticks(xticks[::2])

plt.tight_layout()
plt.savefig('mkqaseen.svg', dpi=400, bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.lines import Line2D
import math

# Per-language trade-off grid for the seen MKQA languages: one subplot per
# language, 3-gram recall (x) against compression rate (y), one curve per model.
# Starts from final_df as built by the earlier cells.
df = final_df

# --- Configuration ---
# Hex colour for each cleaned model name.
model_colors = {
    'no-pruning': '#000000',
    'msmarcocomp': '#117733',
    'dslr': "#7B7D7F",
    'finalcomp': '#D8A547',
    'msmarcotranslated2Mjoint': "#AA4499",
    'msmarcotranslated2Mcomp': "#C92F4B"
}

# Matplotlib marker per model; contains entries for models that are not in
# models_to_plot below (those entries are unused in this cell).
model_markers = {
    'translatedcomp': 'o',
    'msmarcotransfinalcomp': 's',
    'msmarcotranslated2Mcomp': 's',
    'finetune': '<',
    'msmarcojoint': '*',
    'no-pruning': 'P',
    'zeroshot': 'X',
    'msmarcocomp': 'v',
    'finetunelong': '>',
    'msmarcotranslated2Mjoint': '*',
    'dslr': 'H',
    'finalcomp': '^'
}

# Display label for each plotted model.
model_to_name = {
    'no-pruning': 'No Compression',
    'msmarcocomp': 'Cross-Lingual',
    'finalcomp': 'Reannotated\nMultilingual Data',
    'dslr': 'DSLR',
    'msmarcotranslated2Mjoint': 'XProvence (w/ reranking)',
    'msmarcotranslated2Mcomp': 'Data Translation'
}

# Models to draw, in drawing (and legend) order.
models_to_plot = ['no-pruning', 'msmarcotranslated2Mjoint', 'msmarcocomp', 'msmarcotranslated2Mcomp', 'finalcomp',  'dslr']

# --- Data Preparation ---

aya_model_langs = ["en", "fr", "de", "es", "it", "pt", "ja", "ko", "zh", "ar", "el", "fa", "pl", "id", "cs", "he", "hi", "nl", "ro", "ru", "tr", "uk", "vi"]
seen_languages = ['ar', 'en', 'es', 'fr', 'ko', 'ru', 'zh']
unseen_languages = ['de', 'he', 'it', 'nl', 'pl', 'pt', 'tr', 'vi']
# Restrict to languages that are both in the Aya list and in the seen set.
df = df[(df['lang'].isin(aya_model_langs))]
df = df[(df['lang'].isin(seen_languages))]

# 1. Filter by models
df = df[df['model'].isin(models_to_plot)]

# 2. Languages present after filtering, in alphabetical order
unique_langs = sorted(df['lang'].unique())

# 3. No compression-ratio filtering is applied for the per-language view;
#    df_clean is simply the filtered frame.
df_clean = df

# --- Plotting ---

# Grid setup
num_langs = len(unique_langs)
if num_langs == 0:
    # plt.subplots would otherwise fail on a zero-row grid with a less
    # informative ValueError.
    raise ValueError('No seen-language rows left after filtering; nothing to plot.')
cols = 4
rows = math.ceil(num_langs / cols)

fig, axes = plt.subplots(rows, cols, figsize=(25, 5 * rows))
plt.suptitle('MKQA Seen Languages', fontsize=20, fontweight='bold')
axes = axes.flatten()

for i, lang in enumerate(unique_langs):
    ax = axes[i]

    # Rows for this language only
    lang_data = df_clean[df_clean['lang'] == lang]

    for model in models_to_plot:
        model_data = lang_data[lang_data['model'] == model].sort_values('threshold')

        if model_data.empty:
            continue

        # Dashed line through this model's points, ordered by threshold.
        ax.plot(
            model_data['recall3gram'],
            model_data['compression_ratio'],
            color=model_colors[model],
            linestyle='--',
            linewidth=3.5,
            alpha=0.7
        )

        # All points of a model share colour and marker; the highlighted model
        # ('msmarcotranslated2Mjoint') gets larger markers.
        marker_size = 200 if model == 'msmarcotranslated2Mjoint' else 100
        ax.scatter(
            model_data['recall3gram'],
            model_data['compression_ratio'],
            c=model_colors[model],
            marker=model_markers[model],
            s=marker_size,
            alpha=0.9,
            edgecolors='black',
            linewidth=1.2,
            zorder=5  # keep markers above the lines
        )

    # Subplot-specific styling
    ax.set_title(lang, fontsize=18, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Per-language axis limits with 5% padding. The x-axis is a fixed window of
    # 0.03 (plus padding) below this language's best recall, so points with
    # lower recall are not visible.
    all_recall = lang_data['recall3gram']
    all_compression = lang_data['compression_ratio']

    recall_padding = (all_recall.max() - all_recall.min()) * 0.05
    compression_padding = (all_compression.max() - all_compression.min()) * 0.05

    xlims = all_recall.max() - 0.03 - recall_padding, all_recall.max() + recall_padding
    ylims = all_compression.min() - compression_padding, all_compression.max() + compression_padding

    ax.set_xlim(xlims)
    ax.set_ylim(ylims)

    # Tick styling
    ax.tick_params(axis='both', which='major', labelsize=12)

    # Axis labels only on the outer subplots
    if i >= (rows - 1) * cols:  # Bottom row only
        ax.set_xlabel('3-gram Recall', fontsize=14, fontweight='bold')
    if i % cols == 0:  # Left column only
        ax.set_ylabel('Compression Rate (%)', fontsize=14, fontweight='bold')

# Hide the grid cells that have no language assigned. Indexed from num_langs
# rather than from the loop variable of the loop above.
for j in range(num_langs, len(axes)):
    axes[j].axis('off')

# --- Legend ---
# Marker-only handles, one per plotted model, shared by the whole figure.
legend_elements = [
    Line2D([0], [0], color='w', marker=model_markers[model], markerfacecolor=model_colors[model],
           markeredgecolor='black', markersize=23 if model == 'msmarcotranslated2Mjoint' else 15, linewidth=0,
           label=model_to_name[model])
    for model in models_to_plot
]

fig.legend(handles=legend_elements, loc='lower center',
           bbox_to_anchor=(0.5, -0.05), ncol=len(models_to_plot),
           prop={'weight': 'bold', 'size': 16}, framealpha=0.9)

plt.tight_layout(rect=[0, 0.05, 1, 1])  # Make room for legend
plt.savefig('mkqa_seen_subplots.svg', dpi=400, bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.lines import Line2D

# Averaged trade-off figure for the unseen MKQA languages: for every
# (model, threshold) pair the metrics are averaged over languages and drawn as
# 3-gram recall (x) against compression rate (y).
# Starts from final_df as built by the earlier cells.
df = final_df

# Hex colour for each cleaned model name.
model_colors = {
    'no-pruning': '#000000',                # black
    'msmarcocomp': '#117733',               # green
    'dslr': "#7B7D7F",                      # grey
    'finalcomp': '#D8A547',                 # gold
    'msmarcotranslated2Mjoint': "#AA4499",  # purple
    'msmarcotranslated2Mcomp': "#C92F4B"    # red
}

# Matplotlib marker for each model.
model_markers = {
    'no-pruning': 'P',
    'msmarcocomp': 'v',
    'dslr': 'H',
    'finalcomp': '^',
    'msmarcotranslated2Mjoint': '*',
    'msmarcotranslated2Mcomp': 's',
}

# Display label for each model.
model_to_name = {
    'no-pruning': 'No Compression',
    'msmarcocomp': 'Cross-Lingual',
    'finalcomp': 'Reannotated\nMultilingual Data',
    'dslr': 'DSLR',
    'msmarcotranslated2Mjoint': 'XProvence (w/ reranking)',
    'msmarcotranslated2Mcomp': 'Data Translation'
}

# Models to draw, in drawing order.
models_to_plot = ['no-pruning', 'finalcomp',  'dslr', 'msmarcocomp', 'msmarcotranslated2Mjoint', 'msmarcotranslated2Mcomp']

aya_model_langs = ["en", "fr", "de", "es", "it", "pt", "ja", "ko", "zh", "ar", "el", "fa", "pl", "id", "cs", "he", "hi", "nl", "ro", "ru", "tr", "uk", "vi"]
seen_languages = ['ar', 'en', 'es', 'fr', 'ko', 'ru', 'zh']
unseen_languages = ['de', 'he', 'it', 'nl', 'pl', 'pt', 'tr', 'vi']

# Restrict to languages that are both in the Aya list and in the unseen set.
df = df[(df['lang'].isin(aya_model_langs))]
df = df[(df['lang'].isin(unseen_languages))]

num_langs = df['lang'].nunique()

# Average recall and compression over languages for each (model, threshold),
# remembering which languages contributed to each average.
avg_data = df.dropna().groupby(['model', 'threshold']).agg({
    'recall3gram': 'mean',
    'compression_ratio': 'mean',
    'lang': list
}).reset_index()
avg_data['num_langs'] = avg_data['lang'].apply(len)

# Keep only averages that cover every selected language, so that all points
# are computed over the same language set.
avg_data = avg_data[avg_data['num_langs'] == num_langs]

# Keep only the models we want to plot.
avg_data = avg_data[avg_data['model'].isin(models_to_plot)]
# Keep points whose mean compression is strictly between 20 and 95, plus points
# below 1 (no-pruning runs are stored with compression 0 when metrics are loaded).
avg_data = avg_data[((avg_data['compression_ratio'] > 20) & (avg_data['compression_ratio'] < 95)) | (avg_data['compression_ratio'] < 1)]

# Fail early with a readable message: with no rows the axis limits below would
# be NaN and matplotlib would reject them with a less helpful ValueError.
if avg_data.empty:
    raise ValueError('No averaged data points left to plot for the unseen languages.')

plt.figure(figsize=(6.5, 5))

for model in models_to_plot:
    model_data = avg_data[avg_data['model'] == model].sort_values('threshold')
    if model_data.empty:
        continue

    # Dashed line through this model's points, ordered by threshold.
    plt.plot(
        model_data['recall3gram'],
        model_data['compression_ratio'],
        color=model_colors[model],
        linestyle='--',
        linewidth=6,
        alpha=0.7,
        label=model_to_name.get(model, model)
    )

    # All points of a model share colour and marker; only the highlighted model
    # ('msmarcotranslated2Mjoint') is drawn with larger markers.
    marker_size = 600 if model == 'msmarcotranslated2Mjoint' else 250
    plt.scatter(
        model_data['recall3gram'],
        model_data['compression_ratio'],
        c=model_colors[model],
        marker=model_markers[model],
        s=marker_size,
        alpha=0.9,
        edgecolors='black',
        linewidth=1.5,
        zorder=5  # Ensure points are on top of lines
    )

# Titles, labels and tick styling.
plt.xlabel('3-gram Recall', fontsize=24, fontweight='bold')
plt.ylabel('Compression Rate (%)', fontsize=24, fontweight='bold')
plt.title('(b) MKQA - Unseen - $L_{cntx} = L_q$', fontsize=25, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

# Axis limits with 5% padding. The x-axis is a fixed window of 0.03 (plus
# padding) below the best recall, so points with lower recall are not visible.
all_recall = avg_data['recall3gram']
all_compression = avg_data['compression_ratio']

recall_padding = (all_recall.max() - all_recall.min()) * 0.05
compression_padding = (all_compression.max() - all_compression.min()) * 0.05

plt.xlim(all_recall.max() - 0.03 - recall_padding, all_recall.max() + recall_padding)
plt.ylim(all_compression.min() - compression_padding, all_compression.max() + compression_padding)

# Marker-only legend handles. NOTE: this cell never passes them to a legend
# call, so no legend is drawn on this figure.
legend_elements = [
    Line2D([0], [0], color='w', marker=model_markers[model], markerfacecolor=model_colors[model],
           markeredgecolor='black', markersize=13, linewidth=0,
           label=model_to_name[model])
    for model in models_to_plot
]

# Keep every second x tick to avoid crowded tick labels.
ax = plt.gca()
xticks = ax.get_xticks()
ax.set_xticks(xticks[::2])

plt.tight_layout()
plt.savefig('mkqaunseen.svg', dpi=400, bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.lines import Line2D
import math

# Per-language trade-off grid for the unseen MKQA languages: one subplot per
# language, 3-gram recall (x) against compression rate (y), one curve per model.
# Starts from final_df as built by the earlier cells.
df = final_df

# --- Configuration ---
# Hex colour for each cleaned model name.
model_colors = {
    'no-pruning': '#000000',
    'msmarcocomp': '#117733',
    'dslr': "#7B7D7F",
    'finalcomp': '#D8A547',
    'msmarcotranslated2Mjoint': "#AA4499",
    'msmarcotranslated2Mcomp': "#C92F4B"
}

# Matplotlib marker per model; contains entries for models that are not in
# models_to_plot below (those entries are unused in this cell).
model_markers = {
    'translatedcomp': 'o',
    'msmarcotransfinalcomp': 's',
    'msmarcotranslated2Mcomp': 's',
    'finetune': '<',
    'msmarcojoint': '*',
    'no-pruning': 'P',
    'zeroshot': 'X',
    'msmarcocomp': 'v',
    'finetunelong': '>',
    'msmarcotranslated2Mjoint': '*',
    'dslr': 'H',
    'finalcomp': '^'
}

# Display label for each plotted model.
model_to_name = {
    'no-pruning': 'No Compression',
    'msmarcocomp': 'Cross-Lingual',
    'finalcomp': 'Reannotated\nMultilingual Data',
    'dslr': 'DSLR',
    'msmarcotranslated2Mjoint': 'XProvence (w/ reranking)',
    'msmarcotranslated2Mcomp': 'Data Translation'
}

# Models to draw, in drawing (and legend) order.
models_to_plot = ['no-pruning', 'msmarcotranslated2Mjoint', 'msmarcocomp', 'msmarcotranslated2Mcomp', 'finalcomp',  'dslr']

# --- Data Preparation ---

aya_model_langs = ["en", "fr", "de", "es", "it", "pt", "ja", "ko", "zh", "ar", "el", "fa", "pl", "id", "cs", "he", "hi", "nl", "ro", "ru", "tr", "uk", "vi"]
seen_languages = ['ar', 'en', 'es', 'fr', 'ko', 'ru', 'zh']
unseen_languages = ['de', 'he', 'it', 'nl', 'pl', 'pt', 'tr', 'vi']
# Restrict to languages that are both in the Aya list and in the unseen set.
df = df[(df['lang'].isin(aya_model_langs))]
df = df[(df['lang'].isin(unseen_languages))]

# 1. Filter by models
df = df[df['model'].isin(models_to_plot)]

# 2. Languages present after filtering, in alphabetical order
unique_langs = sorted(df['lang'].unique())

# 3. No compression-ratio filtering is applied for the per-language view;
#    df_clean is simply the filtered frame.
df_clean = df

# --- Plotting ---

# Grid setup
num_langs = len(unique_langs)
if num_langs == 0:
    # plt.subplots would otherwise fail on a zero-row grid with a less
    # informative ValueError.
    raise ValueError('No unseen-language rows left after filtering; nothing to plot.')
cols = 4
rows = math.ceil(num_langs / cols)

fig, axes = plt.subplots(rows, cols, figsize=(25, 5 * rows))
plt.suptitle('MKQA Unseen Languages', fontsize=20, fontweight='bold')
axes = axes.flatten()

for i, lang in enumerate(unique_langs):
    ax = axes[i]

    # Rows for this language only
    lang_data = df_clean[df_clean['lang'] == lang]

    for model in models_to_plot:
        model_data = lang_data[lang_data['model'] == model].sort_values('threshold')

        if model_data.empty:
            continue

        # Dashed line through this model's points, ordered by threshold.
        ax.plot(
            model_data['recall3gram'],
            model_data['compression_ratio'],
            color=model_colors[model],
            linestyle='--',
            linewidth=3.5,
            alpha=0.7
        )

        # All points of a model share colour and marker; the highlighted model
        # ('msmarcotranslated2Mjoint') gets larger markers.
        marker_size = 200 if model == 'msmarcotranslated2Mjoint' else 100
        ax.scatter(
            model_data['recall3gram'],
            model_data['compression_ratio'],
            c=model_colors[model],
            marker=model_markers[model],
            s=marker_size,
            alpha=0.9,
            edgecolors='black',
            linewidth=1.2,
            zorder=5  # keep markers above the lines
        )

    # Subplot-specific styling
    ax.set_title(lang, fontsize=18, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Per-language axis limits with 5% padding. The x-axis is a fixed window of
    # 0.03 (plus padding) below this language's best recall, so points with
    # lower recall are not visible.
    all_recall = lang_data['recall3gram']
    all_compression = lang_data['compression_ratio']

    recall_padding = (all_recall.max() - all_recall.min()) * 0.05
    compression_padding = (all_compression.max() - all_compression.min()) * 0.05

    xlims = all_recall.max() - 0.03 - recall_padding, all_recall.max() + recall_padding
    ylims = all_compression.min() - compression_padding, all_compression.max() + compression_padding

    ax.set_xlim(xlims)
    ax.set_ylim(ylims)

    # Tick styling
    ax.tick_params(axis='both', which='major', labelsize=12)

    # Axis labels only on the outer subplots
    if i >= (rows - 1) * cols:  # Bottom row only
        ax.set_xlabel('3-gram Recall', fontsize=14, fontweight='bold')
    if i % cols == 0:  # Left column only
        ax.set_ylabel('Compression Rate (%)', fontsize=14, fontweight='bold')

# Hide the grid cells that have no language assigned. Indexed from num_langs
# rather than from the loop variable of the loop above.
for j in range(num_langs, len(axes)):
    axes[j].axis('off')

# --- Legend ---
# Marker-only handles, one per plotted model, shared by the whole figure.
legend_elements = [
    Line2D([0], [0], color='w', marker=model_markers[model], markerfacecolor=model_colors[model],
           markeredgecolor='black', markersize=23 if model == 'msmarcotranslated2Mjoint' else 15, linewidth=0,
           label=model_to_name[model])
    for model in models_to_plot
]

fig.legend(handles=legend_elements, loc='lower center',
           bbox_to_anchor=(0.5, -0.05), ncol=len(models_to_plot),
           prop={'weight': 'bold', 'size': 16}, framealpha=0.9)

plt.tight_layout(rect=[0, 0.05, 1, 1])  # Make room for legend
plt.savefig('mkqa_unseen_subplots.svg', dpi=400, bbox_inches='tight')
plt.show()