# The biological results and final figures.
What goes here:

- Constraint vs. Orthogonal Metrics: Correlation with pLI, LOEUF, ncRVIS (proving independence).
- Candidate Filtering: Extracting the "Uncoupled" list (Score < -0.3 & veQTL > 0.5).
- Enrichment Analysis: The Fold Enrichment table (0.00x at -0.3) and AUPRC/AUROC curves.
- LOEUF Validation: The violin/bar charts showing 100% essentiality in your candidates.
- "Money Plot": The final scatter plot highlighting the "Uncoupled Stars" (FGFBP3, ELL3).

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
from scipy import stats
import sys
from pathlib import Path
import polars as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

from rich.table import Table
from rich.console import Console
from rich.panel import Panel

# Notebook-wide plotting defaults: seaborn grid style first, then the
# matplotlib figure resolution.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.dpi': 150})

def _resolve_project_root() -> Path:
    """Return the closest directory, walking up from cwd, that holds config.py.

    The resolved current working directory is checked first, then each of its
    ancestors from nearest to farthest.

    Raises:
        FileNotFoundError: if neither cwd nor any ancestor contains config.py.
    """
    start = Path.cwd().resolve()
    search_order = [start]
    search_order.extend(start.parents)

    found = next(
        (folder for folder in search_order if (folder / 'config.py').exists()),
        None,
    )
    if found is None:
        raise FileNotFoundError('config.py not found in cwd or parents')
    return found

# Put the repository root on sys.path (once) so the project-local `config`
# and `utils` imports below resolve regardless of the working directory.
PROJECT_ROOT = _resolve_project_root()
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

print(f'project root: {PROJECT_ROOT}')

# These imports must stay below the sys.path update above.
from config import (
    GENE_PATHS,
    VARIANT_PATHS,
    SOURCE_PALETTE,
    load_variant_pairs_matched,
)
from utils.plot_utils import autosave

# List the file name behind each variant dataset key.
print('\navailable datasets:')
for name in ('clingen', 'clingen_null', 'background', 'background_null'):
    dataset_file = VARIANT_PATHS[name].name
    print(f'  {name}: {dataset_file}')

In [None]:
# Short aliases for the variant-level paths registered in VARIANT_PATHS.
CLINGEN_VAR, BG_VAR, BG_NULL_VAR, CLINGEN_NULL_VAR = (
    VARIANT_PATHS[key]
    for key in ('clingen', 'background', 'background_null', 'clingen_null')
)

# Matching aliases for the gene-level paths registered in GENE_PATHS.
CLINGEN_GENE, BG_GENE, BG_NULL_GENE, CLINGEN_NULL_GENE = (
    GENE_PATHS[key]
    for key in ('clingen', 'background', 'background_null', 'clingen_null')
)

# Variant paths keyed by dataset name.
PATHS = dict(
    background=BG_VAR,
    background_null=BG_NULL_VAR,
    clingen=CLINGEN_VAR,
    clingen_null=CLINGEN_NULL_VAR,
)

# Peek at the schema of the background variant table.
df = pl.read_parquet(BG_VAR)
print(df.columns)

['variant_id', 'scored_interval', 'gene_id', 'gene_name', 'gene_type', 'gene_strand', 'junction_Start', 'junction_End', 'output_type', 'variant_scorer', 'track_name', 'track_strand', 'Assay title', 'ontology_curie', 'biosample_name', 'biosample_type', 'gtex_tissue', 'raw_score', 'quantile_score', 'scored_interval_str', 'is_anchor', 'seq_len', 'scorer_friendly', 'gene_norm', 'CHROM', 'POS', 'REF', 'ALT', 'gene_tag', 'method_friendly', 'variant_id_canonical', 'CHROM_af', 'POS_af', 'REF_af', 'ALT_af', 'AF', 'perm_AF']


In [None]:
# Read the four variant-level tables, keyed by dataset name, and keep the
# individual frames available under their short names as well.
variant_tables = {
    key: pl.read_parquet(VARIANT_PATHS[key])
    for key in ('background', 'background_null', 'clingen', 'clingen_null')
}
real_bg = variant_tables['background']
null_bg = variant_tables['background_null']
real_cg = variant_tables['clingen']
null_cg = variant_tables['clingen_null']

print("Loading and aligning gene metadata...")
gene_tables = {
    key: pl.read_parquet(GENE_PATHS[key])
    for key in ('background', 'background_null', 'clingen', 'clingen_null')
}

# Keep only gene rows whose gene_id occurs in the variant table of the same
# dataset, then report how many genes remain.
for name in list(gene_tables):
    valid_genes = variant_tables[name]['gene_id'].unique().to_list()
    in_variant_table = pl.col('gene_id').is_in(valid_genes)
    gene_tables[name] = gene_tables[name].filter(in_variant_table)
    print(f"  {name}: aligned to {gene_tables[name].height} genes")

Loading and aligning gene metadata...
  background: aligned to 349 genes
  background_null: aligned to 349 genes
  clingen: aligned to 316 genes
  clingen_null: aligned to 316 genes


In [None]:

import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# [Load Configuration if needed]
# %run 00_Config.ipynb

print("Loading Gene-Level Summary Data...")

# GENE_METADATA_PATH is neither defined nor imported by the preceding cells of
# this notebook (the `%run 00_Config.ipynb` line above is commented out).
# Check for it up front so a missing definition produces an actionable
# message instead of a bare NameError in the middle of the cell.
try:
    gene_metadata_path = GENE_METADATA_PATH
except NameError as exc:
    raise NameError(
        "GENE_METADATA_PATH is not defined; define it (or run the notebook "
        "that sets it) before loading the gene-level summary data."
    ) from exc

# 1. Lazily scan the observed and null gene-level score tables.
# NOTE(review): these locations are hard-coded under PROJECT_ROOT/experiments_data;
# adjust them if the aggregated gene scores are saved elsewhere.
experiments_dir = Path(PROJECT_ROOT) / "experiments_data"
df_obs = pl.scan_parquet(experiments_dir / "gene_level_scores_obs.parquet")
df_null = pl.scan_parquet(experiments_dir / "gene_level_scores_null.parquet")

# 2. Gene metadata, restricted to the columns selected below.
# NOTE(review): presumably this is the master gene table (LOEUF, pLI, veQTL) — confirm.
meta_cols = ['gene_id', 'gene_symbol', 'loeuf_score', 'pLI', 'vg_eqtl']
df_meta = pl.scan_parquet(gene_metadata_path).select(meta_cols)

# 3. Join the three tables on gene_id (polars joins are inner by default, so
# genes missing from any table are dropped), then rename the score columns:
#   vg_obs  -> vg_predicted
#   vg_null -> vg_predicted_perm
# NOTE(review): assumes the obs/null tables carry `vg_obs` / `vg_null`
# columns; the rename raises if they are absent — TODO confirm schema.
bg_df = (
    df_obs.join(df_null, on="gene_id")
    .join(df_meta, on="gene_id")
    .rename({
        "vg_obs": "vg_predicted",
        "vg_null": "vg_predicted_perm",
    })
    .collect()
    .to_pandas()
)

# 4. Expose the merged pandas frame under the 'background' key of plot_data.
plot_data = {
    'background': bg_df,
}

print(f"Loaded {len(bg_df):,} genes into 'plot_data'. Ready for visualization.")