# Training Experiments Analysis

This notebook combines and visualizes results from the training experiments.

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch

# Notebook-wide plotting defaults: seaborn's white-grid theme and a wide
# default figure size (individual cells override figsize where needed).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

## Load and Combine All Results

In [None]:
results_dir = Path('../scripts/training_experiments/results')
METRICS_SUFFIX = '_metrics'

# Path.glob yields files in filesystem order, which varies between machines.
# Sorting keeps the row order of combined_df (and therefore legend/colour
# order in the plots below) reproducible across runs.
csv_files = sorted(results_dir.glob(f'*{METRICS_SUFFIX}.csv'))

# Fail early with an actionable message instead of pandas' opaque
# "No objects to concatenate" when the directory is missing or empty.
if not csv_files:
    raise FileNotFoundError(
        f"No '*{METRICS_SUFFIX}.csv' files found in {results_dir.resolve()}"
    )

# Load every CSV and tag its rows with the experiment name taken from the
# filename. The glob guarantees the stem ends with METRICS_SUFFIX, so only that
# trailing suffix is stripped; a name that contains '_metrics' elsewhere is
# left intact.
dfs = [
    pd.read_csv(csv_file).assign(experiment=csv_file.stem[:-len(METRICS_SUFFIX)])
    for csv_file in csv_files
]

# Combine all dataframes into one long frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

print(f"Loaded {len(csv_files)} experiment files")
print(f"Total rows: {len(combined_df)}")
print(f"\nExperiments: {combined_df['experiment'].unique().tolist()}")

combined_df.head()

## Summary Statistics

In [None]:
# Get the best epoch for each experiment, i.e. the row with the lowest
# validation loss.
#
# Rows without a val_loss are excluded first: an experiment whose val_loss is
# entirely NaN has no minimum, and idxmin() would otherwise pass a NaN label
# to .loc (or raise, depending on the pandas version).
scored_df = combined_df.dropna(subset=['val_loss'])
skipped_experiments = sorted(
    set(combined_df['experiment']) - set(scored_df['experiment'])
)
if skipped_experiments:
    print(f"Skipping experiments with no recorded val_loss: {skipped_experiments}")

# idxmin() returns index labels; scored_df keeps combined_df's unique labels,
# so .loc selects exactly one row per experiment.
best_epochs = scored_df.loc[scored_df.groupby('experiment')['val_loss'].idxmin()]

# Select key metrics.
# NOTE(review): 'precision@1' is lower-case while the other metric columns are
# capitalised - presumably this mirrors the CSV header; confirm.
metrics_cols = ['experiment', 'epoch', 'val_loss', 'precision@1', 'MRR', 'NDCG@10',
                'Recall@1', 'Recall@5', 'Recall@10']
summary = best_epochs[metrics_cols].sort_values('val_loss')

print("Best Performance (by validation loss):")
summary

## Validation Loss Over Epochs

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Split the combined frame once per model type; the plotting cells further
# down reuse embedder_df and ranker_df.
embedder_df = combined_df[combined_df['model_type'] == 'embedder']
ranker_df = combined_df[combined_df['model_type'] == 'ranker']

# One panel per model type: (axis, rows to draw, panel title).
loss_panels = [
    (axes[0], embedder_df, 'Embedder: Validation Loss'),
    (axes[1], ranker_df, 'Ranker: Validation Loss'),
]

for ax, subset, title in loss_panels:
    # sort=False keeps experiments in order of first appearance.
    for exp, data in subset.groupby('experiment', sort=False):
        ax.plot(data['epoch'], data['val_loss'], marker='o', label=exp)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Validation Loss')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Recall Metrics Comparison

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

recall_metrics = ['Recall@1', 'Recall@5', 'Recall@10', 'Recall@20']

# Embedder runs are drawn first with solid lines and round markers, ranker
# runs second with dashed lines and square markers.
recall_series = [
    (embedder_df, 'o', '-'),
    (ranker_df, 's', '--'),
]

# axes.flat walks the 2x2 grid row by row, one panel per recall metric.
for ax, metric in zip(axes.flat, recall_metrics):
    for subset, marker, linestyle in recall_series:
        for exp, data in subset.groupby('experiment', sort=False):
            ax.plot(data['epoch'], data[metric],
                    marker=marker, linestyle=linestyle, label=f'{exp}')

    ax.set_title(metric, fontsize=12, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Score')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## MRR and NDCG@10 Comparison

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One panel per ranking metric: (axis, metric column, panel title).
# The metric column name doubles as the y-axis label.
ranking_panels = [
    (axes[0], 'MRR', 'Mean Reciprocal Rank (MRR)'),
    (axes[1], 'NDCG@10', 'Normalized Discounted Cumulative Gain @10'),
]

# Embedder runs: solid lines, round markers. Ranker runs: dashed, squares.
ranking_series = [
    (embedder_df, 'o', '-'),
    (ranker_df, 's', '--'),
]

for ax, metric, title in ranking_panels:
    for subset, marker, linestyle in ranking_series:
        for exp, data in subset.groupby('experiment', sort=False):
            ax.plot(data['epoch'], data[metric],
                    marker=marker, linestyle=linestyle, label=exp)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Best Performance Comparison (Bar Chart)

In [None]:
# Compare each experiment's best epoch (lowest val_loss) on four headline
# metrics, one horizontal bar chart per metric.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics_to_compare = ['MRR', 'NDCG@10', 'Recall@5', 'Recall@10']

# Bars are coloured from the model_type column, the same field the other cells
# use to separate embedders from rankers, rather than by guessing from the
# experiment's filename. Unrecognised model types get a neutral grey instead
# of being silently shown as rankers.
model_colors = {'embedder': '#1f77b4', 'ranker': '#ff7f0e'}
other_color = '#7f7f7f'

for ax, metric in zip(axes.flat, metrics_to_compare):
    data = best_epochs.sort_values(metric, ascending=False)

    # Color by model type
    colors = [model_colors.get(model_type, other_color)
              for model_type in data['model_type']]

    ax.barh(data['experiment'], data[metric], color=colors)
    ax.set_xlabel(metric, fontweight='bold')
    ax.set_title(f'Best {metric} by Experiment', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')

# Add a single figure-level legend explaining the bar colours.
legend_elements = [Patch(facecolor=model_colors['embedder'], label='Embedder'),
                   Patch(facecolor=model_colors['ranker'], label='Ranker')]
if not best_epochs['model_type'].isin(list(model_colors)).all():
    legend_elements.append(Patch(facecolor=other_color, label='Other'))
fig.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(0.98, 0.98))

plt.tight_layout()
plt.show()

## Training Configuration Analysis

In [None]:
# Compare best-epoch performance across freeze configurations
# (encoder frozen vs. query frozen).
config_cols = ['experiment', 'model_type', 'freeze_encoder', 'freeze_query',
               'MRR', 'NDCG@10', 'Recall@10', 'val_loss']
config_summary = best_epochs[config_cols].copy()

# Label for each (freeze_encoder, freeze_query) truth-value combination.
_CONFIG_LABELS = {
    (True, True): 'baseline (both frozen)',
    (False, True): 'encoder_only',
    (True, False): 'query_only',
    (False, False): 'both',
}


def config_label(row):
    """Return the configuration label for one row of ``config_summary``.

    The two freeze flags are interpreted by truthiness.
    NOTE(review): assumes both columns hold real booleans; a NaN flag would
    count as truthy - confirm against the CSV schema.
    """
    flags = (bool(row['freeze_encoder']), bool(row['freeze_query']))
    return _CONFIG_LABELS[flags]


config_summary['config'] = config_summary.apply(config_label, axis=1)

print("\nPerformance by Configuration:")
display_cols = ['model_type', 'config', 'MRR', 'NDCG@10', 'Recall@10', 'val_loss']
# Within each model type, list the highest-MRR configuration first.
config_summary[display_cols].sort_values(by=['model_type', 'MRR'], ascending=[True, False])

## Save Combined Results

In [None]:
# Write both result tables next to the per-experiment CSVs. Neither output
# filename matches the '*_metrics.csv' pattern used when loading.
output_path = results_dir / 'combined_results.csv'
summary_path = results_dir / 'best_performance_summary.csv'

# (frame to write, destination, label used in the confirmation message)
exports = [
    (combined_df, output_path, "Combined results"),
    (best_epochs, summary_path, "Best performance summary"),
]

for frame, destination, label in exports:
    frame.to_csv(destination, index=False)
    print(f"{label} saved to: {destination}")