# Build Activity Library for Motif Context Swap
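Sample a fixed number of sequences per activity bin (high / mid / low), targeting 75% from the test set and 25% from the train set. If a bin has too few test sequences to meet the 75% target, the shortfall is filled from the train set.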

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path

In [8]:
# Config
# Everything a reader might tune lives in this cell; later cells only read these names.
MODEL_NAME = "fastdrop"  # change this to run for different models

SEQS_PER_BIN = 1000
TEST_FRAC = 0.75  # target fraction of each bin drawn from the test fold
if not 0.0 <= TEST_FRAC <= 1.0:
    raise ValueError(f"TEST_FRAC must be within [0, 1], got {TEST_FRAC}")
# Train is defined as the remainder so the two fractions can never drift apart.
TRAIN_FRAC = 1.0 - TEST_FRAC
# Round (rather than truncate) the test quota and hand train the exact remainder,
# so the two per-bin targets always add up to SEQS_PER_BIN.
TEST_PER_BIN = int(round(SEQS_PER_BIN * TEST_FRAC))   # 750
TRAIN_PER_BIN = SEQS_PER_BIN - TEST_PER_BIN            # 250

# Activity bins as (lower, upper) bounds on the measured activity.
# Values between the bins (0.5..1.0 and -1.0..-0.5) belong to no bin.
BIN_DEFS = {
    "high": (1.0, np.inf),
    "mid": (-0.5, 0.5),
    "low": (-np.inf, -1.0),
}

# Root directory holding one sub-directory of results per model name.
RESULTS_DIR = Path("/grid/wsbs/home_norepl/pmantill/LentiMPRA_mcs/LentiMoCon/lenti_AGFT/training/results")
SEED = 42  # seed for the sampling RNG
print(f"Model: {MODEL_NAME}")
print(f"Target: {SEQS_PER_BIN} seqs/bin ({TEST_PER_BIN} test + {TRAIN_PER_BIN} train)")

Model: fastdrop
Target: 1000 seqs/bin (750 test + 250 train)


In [9]:
# Load K562 data
data_path = "/grid/wsbs/home_norepl/pmantill/LentiMPRA_mcs/LentiMoCon/test_run_lenti_data/K562.tsv"
df = pd.read_csv(data_path, sep="\t")
# Keep only rows with rev == 0 (presumably the non-reverse-complemented copies — TODO confirm).
df = df[df["rev"] == 0].copy().reset_index(drop=True)

# Fold 10 is used as the test set and folds 2-9 as the train set; fold 1 is not used here.
test_df = df[df["fold"] == 10].copy()
train_df = df[df["fold"].isin(range(2, 10))].copy()

# Load predictions (test fold only) from the selected model
pred_path = RESULTS_DIR / MODEL_NAME / "test_predictions.npz"
if pred_path.exists():
    # np.load on an .npz returns an NpzFile that holds the archive open;
    # the context manager closes it as soon as both arrays have been read.
    with np.load(pred_path) as preds:
        pred_targets = preds["targets"]
        pred_predictions = preds["predictions"]
    # NOTE(review): only the lengths are compared. This assumes the saved predictions
    # are in the same row order as test_df — confirm against eval_checkpoint.py.
    assert len(pred_targets) == len(test_df), f"Mismatch: {len(pred_targets)} preds vs {len(test_df)} test seqs"
    test_df["pred_activity"] = pred_predictions
    print(f"Predictions loaded from {MODEL_NAME}: {len(pred_predictions)} samples")
else:
    # Missing predictions are not fatal: the column is still created, filled with NaN.
    print(f"WARNING: No predictions found at {pred_path}")
    print(f"  Run eval_checkpoint.py --name {MODEL_NAME} to generate them")
    test_df["pred_activity"] = np.nan

print(f"Test: {len(test_df)}, Train: {len(train_df)}")

Predictions loaded from fastdrop: 19670 samples
Test: 19670, Train: 157328


In [10]:
# Draw SEQS_PER_BIN sequences for every activity bin.
# Test contributes at most TEST_PER_BIN rows; train supplies whatever is still missing.
rng = np.random.RandomState(SEED)
sampled_parts = []
sampling_metadata = {}


def _rows_in_bin(frame, lower, upper):
    """Return the rows of `frame` whose mean_value lies strictly between the two bounds."""
    activity = frame["mean_value"]
    return frame[activity.gt(lower) & activity.lt(upper)]


for bin_name, (lo, hi) in BIN_DEFS.items():
    test_pool = _rows_in_bin(test_df, lo, hi)
    train_pool = _rows_in_bin(train_df, lo, hi)

    avail_test, avail_train = len(test_pool), len(train_pool)
    total_avail = avail_test + avail_train

    # Not enough sequences in this bin overall.
    if total_avail < SEQS_PER_BIN:
        raise ValueError(f"ERROR: '{bin_name}' bin has only {total_avail} total seqs, need {SEQS_PER_BIN}")

    # Cap the test draw at its quota; train covers the rest of the bin.
    n_test = avail_test if avail_test < TEST_PER_BIN else TEST_PER_BIN
    n_train = SEQS_PER_BIN - n_test
    if avail_train < n_train:
        raise ValueError(f"ERROR: '{bin_name}' bin can't fill {SEQS_PER_BIN} seqs (test={avail_test}, train={avail_train})")

    # The bin counts as adjusted whenever test could not meet its quota.
    adjusted = n_test < TEST_PER_BIN
    sampling_metadata[bin_name] = dict(
        available_test=avail_test,
        available_train=avail_train,
        sampled_test=n_test,
        sampled_train=n_train,
        target_test=TEST_PER_BIN,
        target_train=TRAIN_PER_BIN,
        adjusted=adjusted,
    )

    if not adjusted:
        print(f"  {bin_name}: OK — test {n_test}, train {n_train}")
    else:
        print(f"  {bin_name}: ADJUSTED — test {n_test}/{TEST_PER_BIN}, train {n_train}/{TRAIN_PER_BIN} (only {avail_test} test available)")

    # Draw test first, then train, from the one shared RNG so results are reproducible.
    for split_name, pool, n_draw in (("test", test_pool, n_test), ("train", train_pool, n_train)):
        drawn = pool.sample(n=n_draw, random_state=rng)
        sampled_parts.append(drawn.assign(split=split_name, activity_bin=bin_name))

library = pd.concat(sampled_parts, ignore_index=True)
print(f"\nLibrary built: {len(library)} total sequences")

  high: OK — test 750, train 250
  mid: OK — test 750, train 250
  low: ADJUSTED — test 258/750, train 742/250 (only 258 test available)

Library built: 3000 total sequences


In [11]:
# Shape the sampled rows into the final library table and summarise it.
column_renames = {"mean_value": "actual_activity", "seq": "sequence"}
library = library.rename(columns=column_renames)
# original_index is the row position within the concatenated sample (not the source TSV).
library["original_index"] = library.index

# Make sure the prediction column exists even if no sampled frame carried it.
if "pred_activity" not in library:
    library["pred_activity"] = np.nan

# Fix the column order of the exported table.
final_columns = [
    "sequence", "split", "original_index", "activity_bin",
    "actual_activity", "pred_activity", "fold", "seq_id",
]
library = library[final_columns]

print("Library head:")
display(library.head(10))

print(f"\nShape: {library.shape}")

# Sequence counts per (bin, split) combination.
print("\nBy bin and split:")
counts_by_bin_split = library.groupby(["activity_bin", "split"]).size().unstack(fill_value=0)
print(counts_by_bin_split)

# Distribution of the measured activity inside each bin.
print("\nActivity stats by bin:")
activity_stats = library.groupby("activity_bin")["actual_activity"].describe().round(3)
print(activity_stats)

# --- Redundancy checks ---
# Translation table for DNA complements. Both cases are covered so lowercase
# (soft-masked) bases are complemented too; any other character (e.g. N) is
# left unchanged by str.translate.
complement = str.maketrans("ACGTacgt", "TGCAtgca")

def revcomp(seq):
    """Return the reverse complement of a DNA sequence.

    Args:
        seq: Sequence string. A/C/G/T in either case are complemented
            (case is preserved); all other characters pass through as-is.

    Returns:
        The complemented sequence, reversed.
    """
    return seq.translate(complement)[::-1]

seqs = library["sequence"].values

# Check exact duplicates: every occurrence beyond the first of a sequence counts once.
seq_set = set(seqs)
n_unique = len(seq_set)
n_dup = len(seqs) - n_unique
print(f"\nRedundancy check:")
print(f"  Exact duplicates: {n_dup}")

# Check reverse complement overlap.
# Work on the distinct sequences and count each unordered pair exactly once
# (only from the lexicographically smaller member), so exact duplicates cannot
# inflate the pair count. Sequences equal to their own reverse complement are
# skipped because s < rc is false for them.
rc_pairs = 0
for s in seq_set:
    rc = revcomp(s)
    if s < rc and rc in seq_set:
        rc_pairs += 1
print(f"  Reverse complement pairs: {rc_pairs}")

if n_dup == 0 and rc_pairs == 0:
    print("  All clear — no redundancy found.")
else:
    if n_dup > 0:
        print(f"  WARNING: {n_dup} duplicate sequences found!")
    if rc_pairs > 0:
        print(f"  WARNING: {rc_pairs} reverse complement pairs found!")

Library head:


Unnamed: 0,sequence,split,original_index,activity_bin,actual_activity,pred_activity,fold,seq_id
0,AGGACCGGATCAACTGCGAATGGCAGCGAGTAGAGCAGCACGGTAC...,test,0,high,1.107,1.0625,10,ENSG00000111885_Reversed:
1,AGGACCGGATCAACTGTTGGGAAGGCAGTACCCCAGGGAGAGGTAA...,test,1,high,1.558,1.4375,10,peak76368_Reversed:
2,AGGACCGGATCAACTCACCAAGGGCTGGGATAGCAAATATTGCATC...,test,2,high,1.095,1.578125,10,peak51020
3,AGGACCGGATCAACTCGCCGGAAGCCAGCCTGCCCCGCCCGGAAGC...,test,3,high,1.07,1.09375,10,ENSG00000073169_Reversed:
4,AGGACCGGATCAACTTCACTGAGGGAAATAGATCTTGAGTGTGGCC...,test,4,high,1.275,0.988281,10,peak35869
5,AGGACCGGATCAACTGTTTCCCAAGGTCCGCGCGCCGCCTGCAAGG...,test,5,high,1.395,1.09375,10,ENSG00000139343
6,AGGACCGGATCAACTCCGCTTCAGACCTTTCTCTCTTCTTCAGTGT...,test,6,high,1.149,0.515625,10,ENSG00000205502_Reversed:
7,AGGACCGGATCAACTGATTTTATTTCCCCAAGTCCTCATAGGGCCC...,test,7,high,1.011,1.070312,10,peak21186_Reversed:
8,AGGACCGGATCAACTGTCCCTTCCTCCCTTCTAATGGAACGCGGTT...,test,8,high,1.243,1.328125,10,peak47523_Reversed:
9,AGGACCGGATCAACTAGGGCTCAGGAGTTCCGCCTCTCTTCCCCAG...,test,9,high,1.227,1.34375,10,ENSG00000186448



Shape: (3000, 8)

By bin and split:
split         test  train
activity_bin             
high           750    250
low            258    742
mid            750    250

Activity stats by bin:
               count   mean    std    min    25%    50%    75%    max
activity_bin                                                         
high          1000.0  1.245  0.224  1.001  1.081  1.188  1.340  2.838
low           1000.0 -1.170  0.141 -2.030 -1.240 -1.135 -1.061 -1.001
mid           1000.0 -0.119  0.269 -0.499 -0.342 -0.162  0.072  0.499

Redundancy check:
  Exact duplicates: 0
  Reverse complement pairs: 0
  All clear — no redundancy found.


In [None]:
# Write the library table and a JSON sidecar describing how it was sampled.
import json

library_root = Path("/grid/wsbs/home_norepl/pmantill/LentiMPRA_mcs/LentiMoCon/lenti_AGFT/motif_context_swap/library_prep/activity_libraries")
out_dir = library_root / MODEL_NAME
out_dir.mkdir(parents=True, exist_ok=True)

# Both output files share one model-specific stem.
file_stem = f"k562_activity_library_{MODEL_NAME}"
out_path = out_dir / f"{file_stem}.csv"
meta_path = out_dir / f"{file_stem}_metadata.json"

library.to_csv(out_path, index=False)

# NOTE(review): the infinite bin bounds are written as Infinity / -Infinity, which
# Python's json module emits by default but which is not strict JSON.
meta = dict(
    model=MODEL_NAME,
    seed=SEED,
    seqs_per_bin=SEQS_PER_BIN,
    target_test_frac=TEST_FRAC,
    target_train_frac=TRAIN_FRAC,
    bin_definitions={name: list(bounds) for name, bounds in BIN_DEFS.items()},
    sampling=sampling_metadata,
    total_sequences=len(library),
)
# default=str stringifies any value the encoder cannot serialise natively.
meta_path.write_text(json.dumps(meta, indent=2, default=str))

for label, value in (("Saved library", out_path), ("Saved metadata", meta_path)):
    print(f"{label}: {value}")
print(f"Size: {len(library)} sequences")