# KRAS, FAS, MST1R splicing, and CFTR variant set generation

One notebook that:
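1. **KRAS event neighborhood** – generates double-SNV pairs between nearby positions (positions at most 1 bp apart, plus positions at most 5 bp apart) within a ±150 bp window around the KRAS event (chr12:25227343–25227344, hg38), saves `kras_neighborhood_doubles.csv`.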
2. **FAS double variants** – parses the FAS exon 3 epistasis Excel table, converts local IDs to full mut IDs, saves `fas_subset.csv`.
3. **MST1R splicing mutations** – a separate set of *splicing* double variants (unrelated to the Cai et al. eQTL sets). The code that generates this set was **not found in this repo**; `splicing_story.ipynb` only reads `mst1r_subset.csv`. The cell below is a placeholder until the real pipeline is added (e.g. from `splicing_data` or another project).
4. **CFTR mRNA folding pairs** – curates specific CFTR double variants in the neighborhood related to mRNA folding; saves `cftr_folding_pairs.csv`.

In [None]:
import os
import pandas as pd
import numpy as np
from itertools import combinations, product

# Paths (set for your environment)
# NOTE(review): the input paths below are absolute locations on one specific filesystem;
# edit them before running elsewhere.
# CSV read with pandas below; must provide the columns "gene_name" and "Strand".
GENE_STRAND_PATH = "/tamir2/nicolaslynn/projects/dlm_wrappers/genebeddings/assets/benchmarks/gene_strands.csv"
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
LIFTOVER_CHAIN_PATH = "/tamir2/nicolaslynn/data/UCSC/raw_data/hg19ToHg38.over.chain.gz"
# EPISTASIS_EQTLS_PATH = "epistasis_eqtls.csv"  # only for eQTL-based sets (not MST1R splicing)
# Excel workbook read with pd.read_excel in the FAS section.
FAS_EXCEL_PATH = "/tamir2/nicolaslynn/projects/epistasis/notebooks/FAS_data/41467_2016_BFncomms11558_MOESM968_ESM.xlsx"

# Output CSV names; these are relative paths, so files land in the current working directory.
OUT_KRAS = "kras_neighborhood_doubles.csv"
OUT_FAS = "fas_subset.csv"
OUT_MST1R = "mst1r_subset.csv"  # MST1R splicing double variants (source: add your pipeline)
OUT_CFTR_FOLDING = "cftr_folding_pairs.csv"

# Nucleotide alphabet: reference bases outside this set are skipped, and alternate
# alleles are drawn from it, when enumerating KRAS double variants.
BASES = "ACGT"

## Shared helpers

In [None]:
# Lookup table: gene name -> value of the "Strand" column of the gene-strand CSV.
# NOTE(review): make_mut_id treats a truthy value as reverse strand ("N"), which assumes
# Strand is boolean-like — TODO confirm. If the column stores "+"/"-" strings, both are
# truthy and every gene would be labelled "N".
GENE_STRAND = pd.read_csv(GENE_STRAND_PATH).set_index("gene_name").Strand.to_dict()

def make_mut_id(row, pos_col="pos", ref_col="ref", var_col="alt", chrom_col="chrom", gene_col="gene"):
    """Build a ``GENE:CHROM:POS:REF:ALT:STRAND`` mutation ID from one record.

    Parameters
    ----------
    row : mapping-like
        Record supporting ``row[column]`` lookups (e.g. a DataFrame row).
    pos_col, ref_col, var_col, chrom_col, gene_col : str
        Names of the fields holding position, reference allele, alternate
        allele, chromosome and gene name.

    Returns
    -------
    str
        The mutation ID. The strand flag is ``"N"`` when ``GENE_STRAND`` holds a
        truthy value for the gene and ``"P"`` otherwise (including genes missing
        from the table). The position is cast with ``int``.
    """
    gene = row[gene_col]
    rev = GENE_STRAND.get(gene, False)
    s = "N" if rev else "P"
    chrom = str(row[chrom_col])
    # Remove only a literal leading "chr". str.strip("chr") treats its argument as a
    # character set and would also eat trailing c/h/r characters of the name.
    if chrom.startswith("chr"):
        chrom = chrom[len("chr"):]
    return f"{gene}:{chrom}:{int(row[pos_col])}:{row[ref_col]}:{row[var_col]}:{s}"

## 1. KRAS event neighborhood – generate all double variants

In [None]:
from seqmat import SeqMat

# KRAS double-hit event on chr12: load the reference sequence spanning the two
# event coordinates plus KRAS_WINDOW bases of flank on each side.
KRAS_CENTER_LO = 25227343
KRAS_CENTER_HI = 25227344
KRAS_WINDOW = 150
seqmat = SeqMat.from_fasta(
    "hg38",
    "chr12",
    KRAS_CENTER_LO - KRAS_WINDOW,
    KRAS_CENTER_HI + KRAS_WINDOW,
)
# Bases and their coordinates, consumed by get_all_epistasis_ids.
seq = seqmat.seq
indices = seqmat.index

def get_all_epistasis_ids(seq, indices, gene, chrom, zyg="N", max_distance=20, delta_dist=1, bases=None):
    """Enumerate double-SNV epistasis IDs for pairs of nearby positions.

    For every unordered pair of positions in ``seq`` whose coordinates differ by
    at most ``delta_dist``, one ID is emitted per combination of alternate
    alleles, formatted as
    ``gene:chrom:pos1:ref1:alt1:zyg|gene:chrom:pos2:ref2:alt2:zyg``.

    Parameters
    ----------
    seq : sequence of str
        Reference bases, one per position.
    indices : sequence
        Coordinate of each base in ``seq`` (cast with ``int``); same length as ``seq``.
    gene, chrom, zyg : str
        Fields copied verbatim into every ID.
    max_distance : int
        Accepted for backward compatibility; it does not influence the result.
    delta_dist : int
        Maximum allowed coordinate distance between the two positions of a pair.
    bases : str or None
        Alphabet used both to validate reference bases and to draw alternate
        alleles. ``None`` (default) uses the module-level ``BASES``. Matching is
        case-sensitive, so positions whose base is not in the alphabet are skipped.

    Returns
    -------
    list of str
        IDs in sequence order of the pairs, then allele-product order.
    """
    alphabet = BASES if bases is None else bases
    all_ids = []
    for i, j in combinations(range(len(seq)), 2):
        pos1, pos2 = int(indices[i]), int(indices[j])
        # Distance is checked once per pair (not once per allele combination), and
        # with abs() so that descending coordinates are filtered as well; a signed
        # difference would be negative there and let every pair through.
        if abs(pos2 - pos1) > delta_dist:
            continue
        ref1, ref2 = seq[i], seq[j]
        if ref1 not in alphabet or ref2 not in alphabet:
            continue
        alts1 = [b for b in alphabet if b != ref1]
        alts2 = [b for b in alphabet if b != ref2]
        all_ids.extend(
            f"{gene}:{chrom}:{pos1}:{ref1}:{alt1}:{zyg}|{gene}:{chrom}:{pos2}:{ref2}:{alt2}:{zyg}"
            for alt1, alt2 in product(alts1, alts2)
        )
    return all_ids

# Two passes over the same window: pairs at most 1 bp apart (default delta_dist),
# then pairs at most 5 bp apart.
ids1 = get_all_epistasis_ids(seq, indices, gene="KRAS", chrom="12", zyg="N")
ids2 = get_all_epistasis_ids(seq, indices, gene="KRAS", chrom="12", zyg="N", max_distance=10, delta_dist=5)
# Every pair from the first pass also satisfies the second pass's looser distance
# limit, so plain concatenation would write those pairs twice. dict.fromkeys drops
# the repeats while keeping first-seen order.
all_kras_ids = list(dict.fromkeys(ids1 + ids2))
kras_df = pd.DataFrame({"epistasis_id": all_kras_ids})
kras_df.to_csv(OUT_KRAS, index=False)
print(f"KRAS: {len(kras_df)} pairs -> {OUT_KRAS}")




## 2. FAS double variants – parse Excel and convert to mut IDs

In [None]:
from seqmat.gene import Gene
from tqdm import tqdm
# Register tqdm's pandas integration so Series.progress_apply is available below.
tqdm.pandas()

# Genomic span of the FAS exon; local IDs in the Excel table are offsets from FAS_EXON_START.
FAS_EXON_START = 89_010_753
FAS_EXON_END = 89_010_815
FAS_CHROM = 10

# Strand flag used in FAS mutation IDs: "N" when the gene-strand table holds a
# truthy value for FAS, otherwise "P".
fas_rev = GENE_STRAND.get("FAS", False)
if fas_rev:
    strand = "N"
else:
    strand = "P"

# Pre-mRNA object used to look up the reference base at a genomic position.
FAS = Gene.from_file("FAS").transcript().generate_pre_mrna()

def convert_local_exon_to_mut(local_id):
    """Translate a local ``"<offset>-<allele>"`` ID into a full FAS mutation ID.

    The 1-based ``offset`` is shifted onto genomic coordinates using
    ``FAS_EXON_START``; the reference base is looked up in ``FAS.pre_mrna`` at
    that coordinate. Returns ``FAS:<chrom>:<pos>:<ref>:<alt>:<strand>``.
    """
    offset_text, alt_base = local_id.split("-")
    # Offset 1 corresponds to FAS_EXON_START itself, hence the "- 1".
    genomic_pos = FAS_EXON_START + int(offset_text) - 1
    ref_bytes = FAS.pre_mrna[genomic_pos][0]
    ref_base = ref_bytes.decode("utf-8")
    return f"FAS:{FAS_CHROM}:{genomic_pos}:{ref_base}:{alt_base}:{strand}"

# Load the FAS epistasis table, rename its columns, keep the columns of interest,
# and order rows by the two local mutation IDs.
fas_df = pd.read_excel(FAS_EXCEL_PATH)
fas_df = fas_df.rename(
    columns={
        "IDA": "mut1",
        "IDB": "mut2",
        "Ascore": "score1",
        "Bscore": "score2",
        "A+B": "expected_experimental_score",
        "ABscore": "experimental_score",
        "EmpiricalEpistasis": "empirical_epistasis",
        "EmpiricalEpistasisRawPvalue": "empirical_pval",
    }
)
fas_df = fas_df[
    [
        "mut1",
        "mut2",
        "score1",
        "score2",
        "experimental_score",
        "expected_experimental_score",
        "empirical_epistasis",
        "empirical_pval",
        "CategoryA",
        "CategoryB",
    ]
]
fas_df = fas_df.sort_values(["mut1", "mut2"])

# Replace the local IDs with full mutation IDs, then join each pair with "|".
fas_df["mut1"] = fas_df["mut1"].progress_apply(convert_local_exon_to_mut)
fas_df["mut2"] = fas_df["mut2"].progress_apply(convert_local_exon_to_mut)
fas_df["epistasis_id"] = fas_df["mut1"] + "|" + fas_df["mut2"]

fas_df.to_csv(OUT_FAS, index=False)
print(f"FAS: {len(fas_df)} pairs -> {OUT_FAS}")




## 3. MST1R splicing mutations

**MST1R is a separate set of splicing (double) variants – not from Cai et al. eQTLs.**

Code that *generates* this set was **not found** in this repo. Searched: `get_data.ipynb` (lists MST1R variants but no producer), `splicing_story.ipynb` (only reads `mst1r_subset.csv`). Splicing inputs in repo: `splicing_data/cao_et_al.csv`, `jung_et_al1/2.csv`. Add your pipeline below and write to `OUT_MST1R`.

In [None]:
# PLACEHOLDER: the MST1R splicing pipeline belongs here (e.g. filter splicing_data by gene MST1R, form pairs).
# Until then, only preview an existing output file if one is already on disk.
if not os.path.isfile(OUT_MST1R):
    print("MST1R splicing set not generated in this notebook (add pipeline above).")
else:
    print(pd.read_csv(OUT_MST1R)[["epistasis_id"]].head(2))

## 4. CFTR mRNA folding pairs

Curated double variants in the CFTR neighborhood related to mRNA folding (to check in downstream analyses).

In [None]:
# Hand-curated CFTR double variants related to mRNA folding; each entry is
# "<mutation 1>|<mutation 2>" and all three share the same second mutation.
cftr_folding_epistasis_ids = [
    "CFTR:7:117509093:G:A:P|CFTR:7:117595001:T:G:P",
    "CFTR:7:117594991:G:T:P|CFTR:7:117595001:T:G:P",
    "CFTR:7:117592008:A:G:P|CFTR:7:117595001:T:G:P",
]
# Single-column frame written as-is, without the index.
cftr_folding_df = pd.DataFrame(cftr_folding_epistasis_ids, columns=["epistasis_id"])
cftr_folding_df.to_csv(OUT_CFTR_FOLDING, index=False)
print(f"CFTR folding: {len(cftr_folding_df)} pairs -> {OUT_CFTR_FOLDING}")




## Summary

In [None]:
# Preview the first rows of each generated set, separated by blank lines.
print(pd.read_csv(OUT_KRAS).head(2), end="\n\n")
print(pd.read_csv(OUT_FAS)[["epistasis_id"]].head(2), end="\n\n")
# The MST1R file is produced outside this notebook, so it may be absent.
if not os.path.isfile(OUT_MST1R):
    print("MST1R subset not present (splicing pipeline to be added).", end="\n\n")
else:
    print(pd.read_csv(OUT_MST1R)[["epistasis_id"]].head(2), end="\n\n")
print("CFTR folding:", pd.read_csv(OUT_CFTR_FOLDING)["epistasis_id"].tolist())


