# 01 GWAS-TWAS preprocessing

**Origin:** `0_1_preprocessing_gwas_twas.ipynb`  
**This annotated version was generated on:** 2025-10-13 06:41

**What this notebook does (high level):**  
- Preprocess GWAS/TWAS summary statistics: harmonization, allele alignment, liftover checks, and basic QC.

**How to use:**  
1. Review the markdown notes before each code cell.  
2. Adjust input/output paths as needed for your environment.  
3. Run cell-by-cell to reproduce artifacts for downstream steps.

---


**Step 1:** Import the libraries used by the cells below (`gzip`, `math`, `os`, `re`, `pathlib`, NumPy, pandas).

In [2]:
import gzip
import pandas as pd
import math
import numpy as np

import os, re, gzip
from pathlib import Path
import pandas as pd



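**Step 2:** Load the two FinnGen R12 outcome summary-statistics files (`M13_OSTEONECROSIS` and `OSTEON_DRUGS`), rename their columns to the MR schema (`SNP`, `beta`, `se`, `effect_allele`, `other_allele`, `pval`), and collect the set of outcome rsIDs.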

In [None]:

def _finngen_to_mr_outcome(path):
    """Read a FinnGen summary-statistics file and derive its MR-ready view.

    Parameters
    ----------
    path : str
        Tab-separated FinnGen file (pandas infers gzip compression from the name).

    Returns
    -------
    (raw, mr) : tuple of DataFrame
        `raw` is the table exactly as read; `mr` keeps only the columns
        SNP, beta, se, effect_allele, other_allele, pval, where
        rsids -> SNP, sebeta -> se, alt -> effect_allele, ref -> other_allele.
    """
    raw = pd.read_csv(path, sep="\t")
    mr = raw.rename(columns={
        'rsids': 'SNP',
        'sebeta': 'se',
        'ref': 'other_allele',
        'alt': 'effect_allele',
    })[['SNP', 'beta', 'se', 'effect_allele', 'other_allele', 'pval']]
    return raw, mr


# Outcome 1: OSTEONECROSIS
osteo1, osteo1_out = _finngen_to_mr_outcome(
    "/mnt/f/10_osteo_MR/datasets/outcome/summary_stats_release_finngen_R12_M13_OSTEONECROSIS.gz"
)
# Export is left disabled, as in the original cell; enable it to (re)create the file read below.
# osteo1_out.to_csv("/mnt/f/10_osteo_MR/MR_ready/outcome_osteo_whole_snp.tsv", sep='\t', index=False)


# Outcome 2: OSTEO_DRUGS
osteo2, osteo2_out = _finngen_to_mr_outcome(
    "/mnt/f/10_osteo_MR/datasets/outcome/summary_stats_release_finngen_R12_OSTEON_DRUGS.gz"
)
# osteo2_out.to_csv("/mnt/f/10_osteo_MR/MR_ready/outcome_osteo_drugs.tsv", sep='\t', index=False)


# Set of outcome rsIDs. The TSV is only produced by the disabled export above, so on a
# fresh checkout it may not exist; in that case build the set from the in-memory table.
# Missing rsIDs are dropped so the set holds strings only.
outcome_snp_path = "/mnt/f/10_osteo_MR/MR_ready/outcome_osteo_whole_snp.tsv"
if os.path.exists(outcome_snp_path):
    outcome_rsid = set(
        pd.read_csv(outcome_snp_path, sep='\t', usecols=['SNP'], dtype=str)['SNP'].dropna().to_list()
    )
else:
    outcome_rsid = set(osteo1_out['SNP'].dropna().astype(str).to_list())


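**Step 3:** Restrict the osteonecrosis GWAS to variants whose rsID appears in the rsID→DHS map (lymphoid DHS id preferred, myeloid otherwise), append a `dhs_id` column, and write both the filtered GWAS and the MR-ready outcome file.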

In [7]:

# %% Write GWAS subset WITH dhs_id (RSID join only; NA-safe; Lymphoid precedence)

import os, re, gzip
from pathlib import Path
import pandas as pd

# --- Inputs ---
# FinnGen summary statistics that will be filtered to DHS variants
gwas_path = "/mnt/f/10_osteo_MR/datasets/outcome/summary_stats_release_finngen_R12_M13_OSTEONECROSIS.gz"
# Merged RSID↔DHS table (columns: CHROM, POS, rsid, component, lym_id, mye_id)
merged_map_path = "/mnt/f/0.datasets/ens_vcf_dhs/rsid_in_DHS_merged_lym_mye.tsv.gz"

# --- Outputs ---
out_dir  = "/mnt/f/10_osteo_MR/datasets/dhs_context"
Path(out_dir).mkdir(parents=True, exist_ok=True)
gwas_out = os.path.join(out_dir, "summary_stats_release_finngen_R12_M13_OSTEONECROSIS.within_wb_DHS.tsv.gz")

# 1) rsID -> DHS id lookup. Only the three columns needed are read; rows without an rsID are dropped.
m = pd.read_csv(
    merged_map_path,
    sep="\t",
    compression="infer",
    usecols=["rsid", "lym_id", "mye_id"],
    dtype={"rsid": "string", "lym_id": "string", "mye_id": "string"},
)
m = m.dropna(subset=["rsid"])
# Take lym_id when it is present and non-empty, otherwise fall back to mye_id.
m["dhs_id"] = m["lym_id"].where(m["lym_id"].fillna("") != "", m["mye_id"])
# Plain-str keys and values; a missing DHS id is stored as "".
rsid2dhs = dict(zip(
    m["rsid"].astype(str).tolist(),
    m["dhs_id"].astype("string").fillna("").astype(str).tolist(),
))

# Key set used for membership tests while streaming the GWAS
rsids_in_dhs = set(rsid2dhs)
print(f"[info] RSIDs-in-DHS: {len(rsids_in_dhs):,}  | with DHS id: {sum(1 for dhs in rsid2dhs.values() if dhs):,}")

# 2) Helpers
# rsID fields may list several ids separated by ';', ',' or whitespace
splitter = re.compile(r"[;,\s]+")

def any_in_dhs(rs_field: str) -> bool:
    """Tell whether at least one rsID listed in `rs_field` belongs to `rsids_in_dhs`.

    `rs_field` may hold several ids separated by ';', ',' or whitespace.
    None, NA, the empty string and "." all yield False.
    """
    if rs_field is None or pd.isna(rs_field):
        return False
    text = str(rs_field)
    if text in ("", "."):
        return False
    # Empty pieces (from leading/trailing separators) are ignored.
    return any(piece in rsids_in_dhs for piece in splitter.split(text) if piece)

def first_dhs_id(rs_field: str) -> str:
    """Return the DHS id of the first rsID in `rs_field` that has a usable entry in `rsid2dhs`.

    Ids are scanned in the order they appear (separators: ';', ',' or whitespace).
    Entries that are empty or the literal "nan" are skipped. Returns "" when the
    field is None/NA/""/"." or no listed rsID has a usable DHS id.
    """
    if rs_field is None or pd.isna(rs_field):
        return ""
    text = str(rs_field)
    if text in ("", "."):
        return ""
    for piece in splitter.split(text):
        if not piece:
            continue
        dhs = rsid2dhs.get(piece, "")
        if dhs and dhs != "nan":
            return dhs
    return ""

# 3) Stream GWAS, keep only rows with RSID in DHS, and append dhs_id
written = total = 0
first = True          # True until the header line has been written
chunksize = 1_000_000

# encoding is pinned so the output does not depend on the locale; newline="" leaves
# line endings to pandas (recommended when handing to_csv an open text handle).
with gzip.open(gwas_out, "wt", encoding="utf-8", newline="") as gzout:
    for chunk in pd.read_csv(gwas_path, sep="\t", compression="infer", dtype="string", chunksize=chunksize):
        total += len(chunk)
        if "rsids" not in chunk.columns:
            raise ValueError("Input GWAS file is missing the 'rsids' column.")
        rs = chunk["rsids"].astype("string").fillna("")
        mask = rs.map(any_in_dhs).astype(bool)
        sub  = chunk.loc[mask].copy()
        # add dhs_id column (first matching RSID's DHS ID)
        sub["dhs_id"] = rs.loc[sub.index].map(first_dhs_id).astype(str)
        # Written unconditionally: with header=first the first chunk always emits the
        # header line, even if it has no matching rows, so the file is readable by
        # step 4 when nothing matches. Later empty subsets write nothing.
        sub.to_csv(gzout, sep="\t", index=False, header=first)
        first = False
        written += len(sub)

print(f"[done] Wrote {written:,} GWAS rows with dhs_id → {gwas_out}")
print(f"[note] Total GWAS rows scanned: {total:,}")

# 4) Build MR-ready outcome file (carry dhs_id through)
# NOTE: this rebinds `osteo1` / `osteo1_out` to the DHS-filtered table; any earlier
# frames held under those names are no longer reachable afterwards.
osteo1 = pd.read_csv(gwas_out, sep="\t", compression="infer", dtype="string")

# Rename and select columns for MR tool; keep dhs_id
# Finngen headers => MR: SNP, beta, se, effect_allele, other_allele, pval
# (beta and pval already carry the target names, so they need no rename entry)
osteo1_out = osteo1.rename(columns={
    "rsids": "SNP",
    "sebeta": "se",
    "alt": "effect_allele",
    "ref": "other_allele",
})[["SNP","beta","se","effect_allele","other_allele","pval","dhs_id"]]

mr_out = "/mnt/f/10_osteo_MR/MR_ready/outcome_osteo_within_wb_DHS.tsv"
# Create the target directory first, as the exposure cells do for their outputs,
# so this cell also works on a clean filesystem.
Path(mr_out).parent.mkdir(parents=True, exist_ok=True)
osteo1_out.to_csv(mr_out, sep="\t", index=False)
print(f"[done] MR-ready outcome with dhs_id → {mr_out}")




[info] RSIDs-in-DHS: 32,364,424  | with DHS id: 32,364,424
[done] Wrote 597,125 GWAS rows with dhs_id → /mnt/f/10_osteo_MR/datasets/dhs_context/summary_stats_release_finngen_R12_M13_OSTEONECROSIS.within_wb_DHS.tsv.gz
[note] Total GWAS rows scanned: 21,326,767
[done] MR-ready outcome with dhs_id → /mnt/f/10_osteo_MR/MR_ready/outcome_osteo_within_wb_DHS.tsv


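**Step 4:** Convert eQTLGen cis-eQTL Z-scores into approximate effect sizes and standard errors, appended as `beta` and `se` columns: $\hat\beta = z / \sqrt{2p(1-p)(N + z^2)}$ and $\mathrm{se} = 1 / \sqrt{2p(1-p)(N + z^2)}$, with $N = 31864$ and the allele frequency fixed at $p = 0.5$.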

In [1]:

# eQTLGen: append approximate beta / se columns computed from the Z-score
#   beta = z / sqrt(2 p (1 - p) (N + z^2)),  se = 1 / sqrt(2 p (1 - p) (N + z^2))
# Imported here so the cell does not depend on another cell having run first.
import gzip
import math

# Prepare input/output files
infile = "/mnt/e/0.datasets/eqtlGen/2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz"
outfile = "/mnt/f/10_osteo_MR/datasets/expo/2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded_eqtls_beta_se.tsv"
N = 31864  # max sample size as per instruction
# Allele frequency is not taken from the file; 0.5 is used for every SNP.
# NOTE(review): 2p(1-p) is largest at p = 0.5, so this yields the smallest |beta| and se
# the formula can give; per-SNP frequencies would be needed for properly scaled values.
alleleFreq = 0.5

skipped = 0  # rows dropped because they were short or had an unusable Z-score
with gzip.open(infile, 'rt') as fin, open(outfile, 'w') as fout:
    # Remove only the line terminator. A bare strip() would also eat leading/trailing
    # tabs, shifting columns whenever the first or last field is empty.
    header = fin.readline().rstrip('\r\n').split('\t')
    # Find the Z-score column by name; index 6 is the fallback used by the original code.
    z_col = header.index("Zscore") if "Zscore" in header else 6
    # Write new header
    fout.write('\t'.join(header + ["beta", "se"]) + '\n')

    for line in fin:
        fields = line.rstrip('\r\n').split('\t')
        # Rows with fewer than 14 fields are treated as malformed (threshold kept from the original).
        if len(fields) < 14:
            skipped += 1
            continue
        try:
            z = float(fields[z_col])
        except (ValueError, IndexError):
            skipped += 1
            continue
        if not math.isfinite(z):
            # nan/inf would only propagate nan into beta
            skipped += 1
            continue
        # Compute beta and se
        denom = math.sqrt(2 * alleleFreq * (1 - alleleFreq) * (N + z ** 2))
        beta = z / denom
        se = 1 / denom
        # Write row with computed values
        fout.write('\t'.join(fields + [f"{beta:.6g}", f"{se:.6g}"]) + '\n')

print(f"Done. Output written to: {outfile}")
if skipped:
    print(f"[note] Skipped {skipped:,} rows (too few fields or non-numeric Z-score)")


Done. Output written to: /mnt/f/10_osteo_MR/datasets/expo/2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded_eqtls_beta_se.tsv


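**Step 5:** Build the eQTLGen exposure file: keep eQTLGen SNPs whose rsID appears in the DHS-filtered outcome file, then retain one SNP per (gene, DHS) pair — lowest p-value, ties broken by largest |Z|.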

In [19]:
# %% One-SNP-per-DHS-per-gene from eQTLGen (using DHS RSIDs from outcome file)
import os, re
from pathlib import Path
import numpy as np
import pandas as pd

# ---- Inputs ----
# eQTLGen table with the added beta / se columns
eqtlgen_file = "/mnt/f/10_osteo_MR/datasets/expo/2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded_eqtls_beta_se.tsv"
outcome_with_dhs = "/mnt/f/10_osteo_MR/MR_ready/outcome_osteo_within_wb_DHS.tsv"   # has columns SNP (possibly 'rs1;rs2;...') and dhs_id

# ---- Output ----
final_out = "/mnt/f/10_osteo_MR/MR_ready/exposure_eqtlgen_dhs_index.tsv"
Path(final_out).parent.mkdir(parents=True, exist_ok=True)

# ---- 1) rsID -> dhs_id lookup taken from the outcome file ----
# A SNP field may list several rsIDs; each one is mapped to that row's dhs_id.
splitter = re.compile(r"[;,\s]+")
snp2dhs = {}

out_df = pd.read_csv(outcome_with_dhs, sep="\t", dtype="string", usecols=["SNP","dhs_id"])
for rs_field, dhs in zip(out_df["SNP"], out_df["dhs_id"]):
    if pd.isna(rs_field):
        continue
    dhs_label = str(dhs) if not pd.isna(dhs) else ""
    for rsid in splitter.split(str(rs_field)):
        if not rsid or rsid == ".":
            continue
        # setdefault keeps the first dhs_id seen for an rsID
        snp2dhs.setdefault(rsid, dhs_label)

dhs_rsids = set(snp2dhs)
print(f"[info] RSIDs in DHS (from outcome): {len(dhs_rsids):,}")

# ---- 2) Running table of the best SNP per (GeneSymbol, dhs_id) ----
# Starts empty and is updated as the eQTLGen file is processed chunk by chunk.
best_df = pd.DataFrame(
    columns=["SNP", "beta", "se", "AssessedAllele", "OtherAllele",
             "GeneSymbol", "Pvalue", "Zscore", "dhs_id"]
)

def reduce_to_best(df: pd.DataFrame) -> pd.DataFrame:
    """Pick a single row for every (GeneSymbol, dhs_id) pair.

    Preference order within a pair: smallest numeric Pvalue, then largest
    absolute Zscore (unparseable Z-scores rank last), then the row that came first.
    The result carries two helper columns, `pval_num` and `abs_z`.
    An empty frame is returned unchanged.
    """
    if df.empty:
        return df
    # assign() works on a copy, so the caller's frame is left untouched
    ranked = df.assign(
        pval_num=pd.to_numeric(df["Pvalue"], errors="coerce"),
        abs_z=pd.to_numeric(df["Zscore"], errors="coerce").abs().fillna(-np.inf),
    )
    ranked = ranked.sort_values(
        by=["GeneSymbol", "dhs_id", "pval_num", "abs_z"],
        ascending=[True, True, True, False],
    )
    # After sorting, the first row of each group is the preferred one
    grouped = ranked.groupby(["GeneSymbol", "dhs_id"], as_index=False)
    return grouped.head(1)

chunksize = 1_000_000
required_cols = ["Pvalue","SNP","AssessedAllele","OtherAllele","Zscore","GeneSymbol","beta","se"]

# Read the eQTLGen table in pieces so only one chunk plus the running best table is held at a time
eqtlgen_reader = pd.read_csv(
    eqtlgen_file, sep="\t", dtype="string", usecols=required_cols, chunksize=chunksize
)
for i, chunk in enumerate(eqtlgen_reader):
    # rows whose SNP is one of the DHS rsIDs
    mask = chunk["SNP"].isin(dhs_rsids)
    sub = chunk[mask].copy()
    if len(sub) == 0:
        print(f"[chunk {i}] matches: 0 (skipped)")
        continue

    # look up the DHS id for every retained SNP
    sub["dhs_id"] = sub["SNP"].map(snp2dhs).astype("string")

    # best row per (GeneSymbol, dhs_id) inside this chunk
    local_best = reduce_to_best(sub)

    # fold the chunk winners into the running table; earlier rows come first in the concat
    if best_df.empty:
        best_df = local_best
    else:
        best_df = reduce_to_best(pd.concat([best_df, local_best], ignore_index=True))

    print(f"[chunk {i}] matches: {len(sub):,} | running groups: {best_df.shape[0]:,}")

print(f"[done] total groups (GeneSymbol, dhs_id): {best_df.shape[0]:,}")

# ---- 3) MR-ready exposure table (dhs_id is kept so each pick can be traced back) ----
eqtlgen_exp = best_df.rename(
    columns={
        "AssessedAllele": "effect_allele",
        "OtherAllele": "other_allele",
        "GeneSymbol": "gene",
        "Pvalue": "pval",
    }
)
# Fixed column order for the output; helper columns from the reduction are left out
eqtlgen_exp = eqtlgen_exp[
    ["SNP", "beta", "se", "effect_allele", "other_allele", "gene", "pval", "dhs_id"]
]

eqtlgen_exp.to_csv(final_out, sep="\t", index=False)
print(f"[saved] {final_out}  (rows: {len(eqtlgen_exp):,})")


[info] RSIDs in DHS (from outcome): 606,769
[chunk 0] matches: 48,057 | running groups: 38,706
[chunk 1] matches: 45,845 | running groups: 75,333
[chunk 2] matches: 45,104 | running groups: 111,135
[chunk 3] matches: 43,743 | running groups: 145,704
[chunk 4] matches: 43,089 | running groups: 179,435
[chunk 5] matches: 43,546 | running groups: 213,411
[chunk 6] matches: 42,600 | running groups: 246,624
[chunk 7] matches: 42,973 | running groups: 280,179
[chunk 8] matches: 42,647 | running groups: 313,258
[chunk 9] matches: 42,662 | running groups: 346,324
[chunk 10] matches: 21,795 | running groups: 363,123
[done] total groups (GeneSymbol, dhs_id): 363,123
[saved] /mnt/f/10_osteo_MR/MR_ready/exposure_eqtlgen_dhs_index.tsv  (rows: 363,123)


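**Step 6:** Build the GTEx v10 Whole Blood exposure file: map GTEx variant IDs to rsIDs, keep variants whose rsID appears in the DHS-filtered outcome file, add gene names from the GENCODE GTF, and retain one SNP per (gene, DHS) pair — lowest p-value, ties broken by largest |Z|.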

In [28]:


# %% GTEx WB → MR exposure (within DHS; one SNP per gene×DHS by p-value, tie by |Z|)
import os, re
from pathlib import Path
import pandas as pd
import numpy as np

# ---------- Inputs ----------
eqtl_path     = "/mnt/f/0.datasets/gtex/gtex_v10_qtl_data/GTEx_Analysis_v10_eQTL_updated/Whole_Blood.v10.eQTLs.signif_pairs.parquet"
lookup_path   = "/mnt/f/0.datasets/gtex/gtex_v10_qtl_data/GTEx_Analysis_2021-02-11_v10_WholeGenomeSeq_953Indiv.lookup_table.txt.gz"
gtf_path      = "/mnt/f/0.datasets/gtex/gtex_v10_qtl_data/gencode.v39.GRCh38.genes.gtf"
outcome_with_dhs = "/mnt/f/10_osteo_MR/MR_ready/outcome_osteo_within_wb_DHS.tsv"  # columns: SNP (may be 'rs1;rs2;...'), dhs_id

# ---------- Output ----------
mr_out = "/mnt/f/10_osteo_MR/MR_ready/exposure_gtex_whole_blood_eqtl_dhs_index.tsv"
Path(mr_out).parent.mkdir(parents=True, exist_ok=True)

# ---------- 0) rsID -> dhs_id lookup taken from the outcome file ----------
# A SNP field may list several rsIDs; each one is mapped to that row's dhs_id,
# and the first dhs_id seen for an rsID is the one that is kept.
splitter = re.compile(r"[;,\s]+")
snp2dhs = {}
out_df = pd.read_csv(outcome_with_dhs, sep="\t", dtype="string", usecols=["SNP","dhs_id"])
for rs_field, dhs in zip(out_df["SNP"], out_df["dhs_id"]):
    if pd.isna(rs_field):
        continue
    dhs_label = str(dhs) if not pd.isna(dhs) else ""
    for rsid in splitter.split(str(rs_field)):
        if not rsid or rsid == ".":
            continue
        snp2dhs.setdefault(rsid, dhs_label)
keep_rsid = set(snp2dhs)
print(f"[info] DHS RSIDs from outcome: {len(keep_rsid):,}")

# ---------- 1) Load GTEx eQTL pairs (only needed columns) ----------
eqtl = pd.read_parquet(eqtl_path, columns=["variant_id","gene_id","pval_nominal","slope","slope_se"])
print(f"[info] GTEx WB rows (significant pairs): {len(eqtl):,}")

# Parse alleles and position from variant_id: chr1_1035804_G_A_b38
parts = eqtl["variant_id"].str.split("_", expand=True)
eqtl["chr"]  = parts[0].str.replace("^chr","",regex=True)
eqtl["pos"]  = pd.to_numeric(parts[1], errors="coerce")
eqtl["ref"]  = parts[2]
eqtl["alt"]  = parts[3]

# ---------- 2) Map variant_id → rsid ----------
lookup = pd.read_csv(lookup_path, sep="\t", dtype="string",
                     usecols=["variant_id","rs_id_dbSNP155_GRCh38p13"])
# Only variants present in `eqtl` are ever looked up, so trim the table before turning
# it into a Python dict; this avoids a dict entry for every row of the lookup file.
# NOTE: `lookup` and `var2rs` therefore cover only the eQTL variants from here on.
lookup = lookup[lookup["variant_id"].isin(eqtl["variant_id"])]
var2rs = dict(zip(lookup["variant_id"], lookup["rs_id_dbSNP155_GRCh38p13"]))
# Variants absent from the lookup get <NA> and are removed by the filter below.
eqtl["rsid"] = eqtl["variant_id"].map(var2rs).astype("string")

# Filter to SNPs that are in DHS (by RSID)
eqtl = eqtl[eqtl["rsid"].isin(keep_rsid)].copy()
print(f"[info] Rows after DHS RSID filter: {len(eqtl):,}")

# Attach dhs_id
eqtl["dhs_id"] = eqtl["rsid"].map(snp2dhs).astype("string")

# ---------- 3) Map gene_id → gene_name from GTF ----------
def load_gene_map(gtf):
    """Build a gene_id -> gene_name dictionary from the `gene` records of a GTF file.

    Comment lines (starting with '#'), lines with fewer than nine tab-separated
    fields, and records whose feature type is not "gene" are ignored. The values
    are taken from the quoted `gene_id "..."` and `gene_name "..."` attributes;
    a record contributes only when both are present and non-empty.
    """
    id_to_name = {}
    with open(gtf, "r") as handle:
        for record in handle:
            if record == "" or record.startswith("#"):
                continue
            cols = record.rstrip("\n").split("\t")
            if len(cols) < 9:
                continue
            if cols[2] != "gene":
                continue
            gene_id = None
            gene_name = None
            # Column 9 holds ';'-separated `key "value"` attributes
            for attr in cols[8].split(";"):
                attr = attr.strip()
                if attr.startswith("gene_id"):
                    gene_id = attr.split('"')[1]
                elif attr.startswith("gene_name"):
                    gene_name = attr.split('"')[1]
            if gene_id and gene_name:
                id_to_name[gene_id] = gene_name
    return id_to_name

gene_map = load_gene_map(gtf_path)
# gene_ids missing from the GTF map become <NA> and are dropped further down
eqtl["gene"] = eqtl["gene_id"].map(gene_map).astype("string")

# ---------- 4) Compute Z and reduce to one SNP per (gene, dhs_id) ----------
# Effect allele = ALT; other allele = REF (GTEx slope is for ALT vs REF)
eqtl["effect_allele"] = eqtl["alt"]
eqtl["other_allele"]  = eqtl["ref"]

# Numeric conversions
eqtl["pval_nominal"] = pd.to_numeric(eqtl["pval_nominal"], errors="coerce")
eqtl["slope"]        = pd.to_numeric(eqtl["slope"], errors="coerce")
eqtl["slope_se"]     = pd.to_numeric(eqtl["slope_se"], errors="coerce")

# Z score (use sign of slope; robust to se=0)
eqtl["Z"] = eqtl["slope"] / eqtl["slope_se"]
eqtl.loc[~np.isfinite(eqtl["Z"]), "Z"] = np.nan  # handle inf/NaN gracefully
# Magnitude of Z for tie-breaking. Sorting on the signed Z would rank a weak positive
# effect above a strong negative one whenever the p-values tie.
eqtl["abs_Z"] = eqtl["Z"].abs()

# Keep rows with a valid dhs_id and gene
eqtl = eqtl.dropna(subset=["dhs_id","gene"])

# Sort for tie-breaking: lowest p, then largest |Z|, then stable order
# (missing values sort last in every key)
eqtl = eqtl.sort_values(["gene","dhs_id","pval_nominal","abs_Z"],
                        ascending=[True, True, True, False])

# One SNP per (gene, dhs_id)
best = eqtl.groupby(["gene","dhs_id"], as_index=False).head(1).copy()
print(f"[info] Final (gene × DHS) picks: {len(best):,}")

# ---------- 5) MR-ready exposure table ----------
# GTEx column names are translated to the MR schema, then the output columns are fixed
out = best.rename(
    columns={
        "rsid": "SNP",
        "slope": "beta",
        "slope_se": "se",
        "pval_nominal": "pval",
    }
)
out = out[
    ["SNP", "effect_allele", "other_allele", "beta", "se", "pval", "gene", "gene_id", "dhs_id"]
]

out.to_csv(mr_out, sep="\t", index=False)
print(f"[done] MR exposure written → {mr_out}")





[info] DHS RSIDs from outcome: 606,769
[info] GTEx WB rows (significant pairs): 2,985,690
[info] Rows after DHS RSID filter: 158,211
[info] Final (gene × DHS) picks: 110,483
[done] MR exposure written → /mnt/f/10_osteo_MR/MR_ready/exposure_gtex_whole_blood_eqtl_dhs_index.tsv
