# S2F Performance Analysis

Use this notebook to explore the prediction outputs produced by different S2F runs and compare their behaviour. Each run writes a `prediction.df` file inside `<installation_directory>/output/<alias>/`. Update the configuration cells below with the aliases you want to analyse.

## Imports and configuration

The snippet below reads `s2f.conf` to locate the shared output directory. Adjust `CONFIG_PATH` if you are running the notebook from a different location.

In [None]:
from pathlib import Path
import configparser

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas display defaults: show every column, cap the rows and width.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 120)

sns.set_theme(style='whitegrid')

# Location of the S2F configuration file, relative to the notebook's working directory.
CONFIG_PATH = Path('s2f.conf')
if not CONFIG_PATH.exists():
    raise FileNotFoundError(f'Configuration file not found: {CONFIG_PATH.resolve()}. Update CONFIG_PATH to match your setup.')
config = configparser.ConfigParser()
config.read(CONFIG_PATH)

# Fail with an actionable message (rather than a bare KeyError naming only the
# missing key) when the expected section or option is absent.
if not config.has_option('directories', 'installation_directory'):
    raise KeyError(
        f"{CONFIG_PATH.resolve()} must define 'installation_directory' in a [directories] section."
    )

# Every run writes its results to <installation_directory>/output/<alias>/.
BASE_OUTPUT = Path(config['directories']['installation_directory']).expanduser() / 'output'
# The path is iterated as a directory further down, so a plain file is not acceptable.
if not BASE_OUTPUT.is_dir():
    raise FileNotFoundError(f'Output directory not found: {BASE_OUTPUT}. Make sure the S2F runs have been executed.')

BASE_OUTPUT

## Discover available runs

This cell lists the aliases that currently have diffusion outputs. If the list is long, slice or filter it as needed.

In [None]:
# A run counts as available only when its directory contains a `prediction.df`;
# other sub-directories of the output folder (e.g. unfinished runs) are skipped
# so that every listed alias can be passed to `load_predictions`.
AVAILABLE_RUNS = sorted(
    run_dir.name
    for run_dir in BASE_OUTPUT.iterdir()
    if run_dir.is_dir() and (run_dir / 'prediction.df').is_file()
)
print(f"{len(AVAILABLE_RUNS)} run(s) found under {BASE_OUTPUT}")
pd.DataFrame({'alias': AVAILABLE_RUNS})

## Select runs to compare

Edit `RUNS_TO_COMPARE` to focus on the runs you are interested in. By default the cell keeps only aliases that actually exist in `AVAILABLE_RUNS`.

In [None]:
# Aliases to analyse; the first two entries are used by the pairwise cells below.
RUNS_TO_COMPARE = [
    'test_223283',
    'test_223283_new',
    'test_223283_new_2',
]

# Drop repeated aliases (keeping the first occurrence) so that no run is loaded
# twice and the first two entries are always distinct runs.
_requested_runs = list(dict.fromkeys(RUNS_TO_COMPARE))

# Tell the reader which aliases are being ignored instead of dropping them silently.
_missing_runs = [alias for alias in _requested_runs if alias not in AVAILABLE_RUNS]
if _missing_runs:
    print(f'Skipping alias(es) not found in AVAILABLE_RUNS: {_missing_runs}')

RUNS_TO_COMPARE = [alias for alias in _requested_runs if alias in AVAILABLE_RUNS]
if not RUNS_TO_COMPARE:
    raise ValueError('Update RUNS_TO_COMPARE with at least one valid alias.')

RUNS_TO_COMPARE

## Load prediction tables

Helper functions to read the prediction scores and the index files written by each run.

In [None]:
def load_predictions(alias: str, base_path: Path = BASE_OUTPUT) -> pd.DataFrame:
    """Load the diffusion output for a single run as a tidy DataFrame.

    Parameters
    ----------
    alias : str
        Name of the run; its results are read from ``base_path / alias``.
    base_path : Path
        Directory that holds one sub-directory per run.

    Returns
    -------
    pd.DataFrame
        One row per line of ``prediction.df`` with the columns ``protein_id``,
        ``term_id`` and ``score``, plus an ``alias`` column set to ``alias``.

    Raises
    ------
    FileNotFoundError
        If ``prediction.df`` does not exist for the requested run.
    """
    path = base_path / alias / 'prediction.df'
    if not path.exists():
        raise FileNotFoundError(f'Prediction file not found for {alias}: {path}')
    # The file has no header row. The separator is spelled as an escape sequence
    # so that editors or formatters cannot silently turn it into spaces.
    df = pd.read_csv(path, sep='\t', header=None, names=['protein_id', 'term_id', 'score'])
    df['alias'] = alias
    return df


def load_terms(alias: str, base_path: Path = BASE_OUTPUT) -> pd.DataFrame:
    """Read the pickled GO term index (``terms.df``) of a run.

    Useful when term names or namespaces are needed next to the scores.
    Raises ``FileNotFoundError`` when the run has no ``terms.df``.
    """
    terms_file = base_path / alias / 'terms.df'
    if terms_file.exists():
        # NOTE: unpickling can execute arbitrary code; only load files from trusted runs.
        return pd.read_pickle(terms_file)
    raise FileNotFoundError(f'GO term index not found for {alias}: {terms_file}')


def load_proteins(alias: str, base_path: Path = BASE_OUTPUT) -> pd.DataFrame:
    """Read the pickled protein index (``proteins.df``) of a run.

    Handy for mapping predictions back to FASTA identifiers.
    Raises ``FileNotFoundError`` when the run has no ``proteins.df``.
    """
    proteins_file = base_path / alias / 'proteins.df'
    if proteins_file.exists():
        # NOTE: unpickling can execute arbitrary code; only load files from trusted runs.
        return pd.read_pickle(proteins_file)
    raise FileNotFoundError(f'Protein index not found for {alias}: {proteins_file}')


In [None]:
# Stack the per-run tables into one long frame; the `alias` column added by
# `load_predictions` keeps track of which run each row came from.
predictions_df = pd.concat(map(load_predictions, RUNS_TO_COMPARE), ignore_index=True)

print(f"Loaded {len(predictions_df):,} scored annotations spanning {predictions_df['alias'].nunique()} run(s).")
predictions_df.head()

## Summary statistics per run

In [None]:
# One output column per entry: the key is the column name, the value says which
# input column to aggregate and how.
_summary_columns = {
    'proteins': pd.NamedAgg(column='protein_id', aggfunc='nunique'),
    'go_terms': pd.NamedAgg(column='term_id', aggfunc='nunique'),
    'annotations': pd.NamedAgg(column='term_id', aggfunc='size'),
    'min_score': pd.NamedAgg(column='score', aggfunc='min'),
    'median_score': pd.NamedAgg(column='score', aggfunc='median'),
    'mean_score': pd.NamedAgg(column='score', aggfunc='mean'),
    'max_score': pd.NamedAgg(column='score', aggfunc='max'),
}

# One row per run, ordered by alias.
summary = predictions_df.groupby('alias').agg(**_summary_columns).sort_index()
summary

## Score distribution (log scale)

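Scores tend to be very small, so plotting their log<sub>10</sub> values highlights differences between runs. Scores are clipped to a minimum of 1e-15 before taking the logarithm, so a score of zero appears at -15 instead of producing `-inf`.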

In [None]:
# Clip before taking the logarithm so that scores of zero map to -15 instead of -inf.
predictions_df['score_log10'] = predictions_df['score'].clip(lower=1e-15).pipe(np.log10)

fig, ax = plt.subplots(figsize=(9, 4))
sns.histplot(
    data=predictions_df,
    x='score_log10',
    hue='alias',
    bins=60,
    element='step',
    stat='density',
    common_norm=False,  # normalise each run separately so runs of different size are comparable
    alpha=0.35,
    ax=ax,
)
ax.set_xlabel('log10(score)')
ax.set_ylabel('density')
ax.set_title('Score distribution per run (log scale)')
fig.tight_layout()
plt.show()

## Focus on top-N predictions per protein

Restrict to the highest-scoring annotations per protein to study agreement between runs.

In [None]:
TOP_N = 5  # change to inspect more or fewer annotations per protein

# Columns that identify one protein within one run.
_run_protein_keys = ['alias', 'protein_id']

# Order rows by run and protein, with the best score first inside each group...
_top_sorted = predictions_df.sort_values(
    by=_run_protein_keys + ['score'],
    ascending=[True, True, False],
)

# ...so that taking the first TOP_N rows of every group keeps its highest scores.
_top_rows = _top_sorted.groupby(_run_protein_keys, as_index=False).head(TOP_N)
top_predictions = _top_rows.reset_index(drop=True)

top_predictions.head()

## Pairwise comparison (first two runs)

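The cell below contrasts the first two aliases in `RUNS_TO_COMPARE`. Edit `BASELINE` and `VARIANT` if you prefer a different pair. Note that the comparison only covers each run's top-N predictions: a protein–term pair that is missing from one run's top-N is given a score of 0 for that run, so a large delta can reflect a change in ranking rather than a change in the underlying score.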

In [None]:
if len(RUNS_TO_COMPARE) >= 2:
    BASELINE, VARIANT = RUNS_TO_COMPARE[0], RUNS_TO_COMPARE[1]
    # Identical aliases would produce two columns with the same name after the join.
    if BASELINE == VARIANT:
        raise ValueError('BASELINE and VARIANT must be two different aliases.')

    _baseline_col = f'score_{BASELINE}'
    _variant_col = f'score_{VARIANT}'

    # Top-N scores of each run, indexed by (protein, GO term) so they can be aligned.
    baseline_top = (
        top_predictions[top_predictions['alias'] == BASELINE]
        [['protein_id', 'term_id', 'score']]
        .set_index(['protein_id', 'term_id'])
    )
    variant_top = (
        top_predictions[top_predictions['alias'] == VARIANT]
        [['protein_id', 'term_id', 'score']]
        .set_index(['protein_id', 'term_id'])
    )

    # Both frames carry a `score` column, so the suffixes turn them into
    # `score_<BASELINE>` and `score_<VARIANT>`.
    pairwise = baseline_top.join(
        variant_top,
        how='outer',
        lsuffix=f'_{BASELINE}',
        rsuffix=f'_{VARIANT}'
    )
    # A pair that appears in only one run's top-N gets a score of 0 for the other
    # run, so its delta equals plus or minus the full score.
    pairwise[_baseline_col] = pairwise[_baseline_col].fillna(0)
    pairwise[_variant_col] = pairwise[_variant_col].fillna(0)
    pairwise['score_delta'] = pairwise[_variant_col] - pairwise[_baseline_col]
    pairwise = pairwise.sort_values('score_delta', ascending=False)

    print(f'Comparing top-{TOP_N} predictions between {BASELINE} and {VARIANT}.')
    # A bare expression inside an `if` body is not rendered by Jupyter; display it explicitly.
    display(pairwise.head(20))
else:
    print('Add at least two aliases to RUNS_TO_COMPARE to compute pairwise deltas.')

## Largest improvements and regressions

Filter the `pairwise` table to highlight the strongest positive or negative score shifts.

In [None]:
if len(RUNS_TO_COMPARE) >= 2 and 'pairwise' in globals():
    # `pairwise` is sorted by descending `score_delta`, so the head holds the largest gains.
    gains = pairwise[pairwise['score_delta'] > 0].head(20)
    # Re-sort ascending so the largest regressions come first, mirroring the gains table.
    losses = pairwise[pairwise['score_delta'] < 0].sort_values('score_delta').head(20)

    print('Top gains:')
    display(gains)

    print('Top losses:')
    display(losses)
else:
    print('Pairwise deltas are not available — add at least two aliases and run the previous cell first.')

## Per-protein overlap statistics

Compute how much the top-N annotations overlap per protein between the first two runs.

In [None]:
if len(RUNS_TO_COMPARE) >= 2:
    BASELINE, VARIANT = RUNS_TO_COMPARE[0], RUNS_TO_COMPARE[1]
    overlaps = []

    # Only the two compared runs matter here; drop the others before grouping.
    _compared = top_predictions[top_predictions['alias'].isin([BASELINE, VARIANT])]

    for protein_id, subset in _compared.groupby('protein_id'):
        terms_by_alias = {alias: set(group['term_id']) for alias, group in subset.groupby('alias')}
        # Skip proteins that were scored by only one of the two runs.
        if BASELINE not in terms_by_alias or VARIANT not in terms_by_alias:
            continue
        baseline_terms = terms_by_alias[BASELINE]
        variant_terms = terms_by_alias[VARIANT]
        # Groups produced by `groupby` are never empty, so the union cannot be empty.
        union = baseline_terms | variant_terms
        shared = baseline_terms & variant_terms
        overlaps.append({
            'protein_id': protein_id,
            'shared': len(shared),
            'union': len(union),
            'jaccard': len(shared) / len(union)
        })

    # Naming the columns keeps the frame sortable even when no protein is shared.
    overlap_df = pd.DataFrame(overlaps, columns=['protein_id', 'shared', 'union', 'jaccard'])
    print(f'Computed overlaps for {len(overlap_df)} proteins present in both runs.')
    # A bare expression inside an `if` body is not rendered by Jupyter; display it explicitly.
    display(overlap_df.sort_values('jaccard', ascending=False).head(20))
else:
    print('Add at least two aliases to RUNS_TO_COMPARE to evaluate overlap.')

## Optional: attach GO term metadata

If you need GO term names or namespaces, use `load_terms` and merge on `term_id`. The snippet below shows how to expand the pairwise comparison with GO labels.

In [None]:
# Example: enrich pairwise table with GO term metadata (adjust `ALIAS_FOR_METADATA` as needed)
if RUNS_TO_COMPARE:
    ALIAS_FOR_METADATA = RUNS_TO_COMPARE[0]
    try:
        terms_lookup = load_terms(ALIAS_FOR_METADATA)
        # Move the index into a regular column and normalise the `term id` label
        # (if present) so the table can be merged on `term_id`.
        # NOTE(review): the layout of `terms.df` is not visible here — confirm that
        # it exposes a `term_id` (or `term id`) column after `reset_index()`.
        terms_lookup = terms_lookup.reset_index().rename(columns={'term id': 'term_id'})
    except FileNotFoundError as exc:
        print(exc)
    else:
        if 'pairwise' in globals():
            # Left merge: every pairwise row is kept even when its term has no metadata.
            decorated = pairwise.reset_index().merge(terms_lookup, on='term_id', how='left')
            # A bare expression inside a nested block is not rendered by Jupyter; display it explicitly.
            display(decorated.head())
        else:
            print('Run the pairwise comparison cell first to create the `pairwise` table.')
else:
    print('RUNS_TO_COMPARE is empty; nothing to decorate.')