# 00 DHS-dbSNP filtering

**Origin:** `0_0_processing_dbsnp_within_DHS.ipynb`  
**This annotated version was generated on:** 2025-10-13 06:41

**What this notebook does (high level):**  
- Filter dbSNP variants by DNase I hypersensitive sites (DHS) and annotate regulatory context. Prepares regulatory SNP universe for instrument selection.

**How to use:**  
1. Review the markdown notes before each code cell.  
2. Adjust input/output paths as needed for your environment.  
3. Run cell-by-cell to reproduce artifacts for downstream steps.

---


**Step 1:** Download the per-chromosome Ensembl (release 114) human variation VCFs and their CSI indexes for chr1–22.

In [None]:



import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Settings
# Destination directory for the downloaded VCF / CSI files.
# NOTE(review): the intersection cell further down reads VCFs from
# "/mnt/f/0.datasets/ens_vcf/", not from this path — confirm which one is intended.
output_dir = "/mnt/f/0.data/ens_vcf/"
# Ensembl release-114 location of the per-chromosome human variation VCFs.
base_url = "https://ftp.ensembl.org/pub/release-114/variation/vcf/homo_sapiens"
chromosomes = [str(i) for i in range(1, 23)]  # chr1 ~ chr22
extensions = [".vcf.gz", ".vcf.gz.csi"]       # download both VCF and CSI

# Create the destination up front; no error if it already exists.
os.makedirs(output_dir, exist_ok=True)

def download_file(url, dest):
    """Stream `url` to `dest` and report the outcome.

    The payload is first written to ``dest + ".part"`` and only renamed to
    `dest` once the whole response has been received, so an interrupted or
    failed transfer never leaves a truncated file under the final name.

    Args:
        url: Address of the file to fetch.
        dest: Local path the finished download is stored at.

    Returns:
        `dest` on success, ``None`` if anything went wrong (the error is
        printed, not raised, so one bad file does not abort the batch).
    """
    tmp_dest = f"{dest}.part"
    try:
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(tmp_dest, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Atomic on the same filesystem: `dest` is either absent/old or complete.
        os.replace(tmp_dest, dest)
    except Exception as e:
        # Best-effort boundary for a worker thread: clean up and report.
        try:
            if os.path.exists(tmp_dest):
                os.remove(tmp_dest)
        except OSError:
            pass
        print(f"Failed: {url} ({e})")
        return None
    print(f"Downloaded: {dest}")
    return dest

# Prepare list of all files to download: one (url, dest) pair per
# chromosome / extension combination.
files_to_download = []
for chrom in chromosomes:
    for ext in extensions:
        fname = f"homo_sapiens-chr{chrom}{ext}"
        url = f"{base_url}/{fname}"
        dest = os.path.join(output_dir, fname)
        files_to_download.append((url, dest))

# Download with 5 threads
max_workers = 5
failed_urls = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_file = {executor.submit(download_file, url, dest): (url, dest) for url, dest in files_to_download}
    for future in as_completed(future_to_file):
        # download_file returns None on failure; remember which URL it was.
        if future.result() is None:
            failed_urls.append(future_to_file[future][0])

# Summarise failures so they are not lost among the per-file messages.
if failed_urls:
    print(f"{len(failed_urls)} of {len(files_to_download)} downloads failed:")
    for failed_url in sorted(failed_urls):
        print(f"  {failed_url}")



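**Step 2:** Intersect the Ensembl variants with lymphoid and myeloid/erythroid DHS intervals, one chromosome at a time, and write the overlapping rsIDs to per-chromosome `_lym` / `_mye` TSV files.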

In [1]:
import pandas as pd
import pybedtools
import gzip
import os
from concurrent.futures import ProcessPoolExecutor

# Directory holding the per-chromosome VCFs (homo_sapiens-chr{N}.vcf.gz).
# NOTE(review): the download cell above writes to "/mnt/f/0.data/ens_vcf/",
# which differs from this path — confirm which one is intended.
vcf_dir = "/mnt/f/0.datasets/ens_vcf/"
# Gzipped, tab-separated DHS index table read further down in this cell.
dhs_path = "/mnt/f/0.datasets/dhs/DHS_Index_and_Vocabulary_hg38_WM20190703.txt.gz"
# Output directory for the per-chromosome SNP-in-DHS tables.
out_dir = "/mnt/f/0.datasets/ens_vcf_dhs/"
os.makedirs(out_dir, exist_ok=True)

def vcf_chunk_reader(vcf_path, chunk_size=100000):
    """Stream a gzipped VCF as DataFrames of at most `chunk_size` records.

    Lines starting with '#' (meta / header) and blank lines are skipped.
    Only the eight fixed VCF columns are kept; any further tab-separated
    fields (e.g. FORMAT / sample columns) are dropped so that such files do
    not fail DataFrame construction. Values are not converted; they stay as
    the text read from the file.

    Args:
        vcf_path: Path to a gzip-compressed VCF file.
        chunk_size: Number of records per yielded DataFrame.

    Yields:
        pandas.DataFrame with columns
        chr, pos, rsid, ref, alt, qual, filter, info.
    """
    columns = ['chr', 'pos', 'rsid', 'ref', 'alt', 'qual', 'filter', 'info']
    n_fixed = len(columns)
    buffer = []
    with gzip.open(vcf_path, 'rt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            record = line.strip()
            if not record:
                # A blank line would otherwise become a row of empty/None values.
                continue
            buffer.append(record.split('\t')[:n_fixed])
            if len(buffer) == chunk_size:
                yield pd.DataFrame(buffer, columns=columns)
                buffer = []
        # Flush the final, possibly shorter, chunk.
        if buffer:
            yield pd.DataFrame(buffer, columns=columns)

def snps_to_bed(df):
    """Convert a VCF chunk into a BED-style frame of single-base intervals.

    The 1-based VCF position becomes the 0-based half-open interval
    [pos - 1, pos). The input frame is left untouched (the integer
    conversion is done on a separate Series rather than written back).

    Args:
        df: DataFrame with at least the columns 'chr', 'pos' and 'rsid';
            'pos' must be convertible to int.

    Returns:
        DataFrame with columns chrom, start, end, rsid, POS, on the same
        index as `df`.
    """
    pos = df['pos'].astype(int)
    return pd.DataFrame({
        'chrom': df['chr'],
        'start': pos - 1,
        'end': pos,
        'rsid': df['rsid'],
        'POS': pos,
    })

def dhs_to_bed(dhs_df, chrom, component):
    """Select the DHS rows of one chromosome/component as a BED-style frame.

    Args:
        dhs_df: DataFrame with columns 'Chromosome', 'Start', 'End',
            'identifier' and 'component'.
        chrom: Value matched exactly against the 'Chromosome' column.
        component: Value matched exactly against the 'component' column.

    Returns:
        DataFrame with columns chrom, start, end, identifier.
    """
    on_chrom = dhs_df['Chromosome'] == chrom
    in_component = dhs_df['component'] == component
    bed_names = {'Chromosome': 'chrom', 'Start': 'start', 'End': 'end'}
    selected = dhs_df.loc[on_chrom & in_component, ['Chromosome', 'Start', 'End', 'identifier']]
    return selected.rename(columns=bed_names)

# Column layout of `intersect -wa -wb` output: the five SNP BED columns
# followed by the four DHS BED columns.
_OVERLAP_COLUMNS = [
    'chr', 'start', 'end', 'rsid', 'POS', 'dhs_chr', 'dhs_start', 'dhs_end', 'identifier'
]
_RESULT_COLUMNS = ['rsid', 'POS', 'component', 'identifier']


def _overlap_with_dhs(snps_bed, dhs_bed, label):
    """Return the SNP/DHS overlaps tagged with `label`, or None if there are none."""
    overlap = snps_bed.intersect(dhs_bed, wa=True, wb=True)
    if len(overlap) == 0:
        return None
    # Keep ids as text: DHS identifiers look numeric (e.g. "1.785795"), so
    # the default type inference would parse them as floats and could drop
    # trailing zeros.
    df = overlap.to_dataframe(
        names=_OVERLAP_COLUMNS,
        dtype={'rsid': str, 'identifier': str},
    )
    df['component'] = label
    return df[_RESULT_COLUMNS]


def process_chrom(chrom):
    """Write the SNPs of one chromosome that fall inside lymphoid / myeloid DHSs.

    Reads ``homo_sapiens-chr{chrom}.vcf.gz`` from `vcf_dir` in chunks,
    intersects each chunk with the 'Lymphoid' and 'Myeloid / erythroid' rows
    of the module-level `dhs` table, and writes two tab-separated files to
    `out_dir`: ``rsid_in_DHS_chr{chrom}_lym.tsv`` and
    ``rsid_in_DHS_chr{chrom}_mye.tsv`` (columns rsid, POS, component,
    identifier). A file with only the header row is written when nothing
    overlaps.

    Args:
        chrom: Chromosome number; compared as a string against
            ``dhs['Chromosome']``.
    """
    chr_str = str(chrom)
    vcf_path = os.path.join(vcf_dir, f"homo_sapiens-chr{chrom}.vcf.gz")
    out_paths = {
        'Lymphoid': os.path.join(out_dir, f"rsid_in_DHS_chr{chrom}_lym.tsv"),
        'Mye_ery': os.path.join(out_dir, f"rsid_in_DHS_chr{chrom}_mye.tsv"),
    }

    # DHS BEDs for the chromosome, keyed by the label written to 'component'.
    dhs_beds = {
        'Lymphoid': pybedtools.BedTool.from_dataframe(dhs_to_bed(dhs, chr_str, 'Lymphoid')),
        'Mye_ery': pybedtools.BedTool.from_dataframe(dhs_to_bed(dhs, chr_str, 'Myeloid / erythroid')),
    }
    results = {label: [] for label in dhs_beds}

    for chunk in vcf_chunk_reader(vcf_path, chunk_size=100000):
        if chunk.empty:
            continue
        snps_bed = pybedtools.BedTool.from_dataframe(snps_to_bed(chunk))
        for label, dhs_bed in dhs_beds.items():
            hits = _overlap_with_dhs(snps_bed, dhs_bed, label)
            if hits is not None:
                results[label].append(hits)

    for label, path in out_paths.items():
        frames = results[label]
        if frames:
            table = pd.concat(frames, ignore_index=True)
        else:
            table = pd.DataFrame(columns=_RESULT_COLUMNS)
        table.to_csv(path, sep='\t', index=False)


# --- Load DHS only once at top level; process_chrom reads it as a global.
# NOTE(review): how worker processes obtain this table depends on the
# multiprocessing start method (inherited under fork, re-executed under spawn).
# 'identifier' is read as text so numeric-looking ids are not float-parsed.
dhs = pd.read_csv(dhs_path, sep='\t', dtype={'identifier': str})
dhs['identifier'] = dhs['identifier'].astype(str)
dhs['Chromosome'] = dhs['seqname'].str.replace('chr', '', regex=False)
dhs['Start'] = dhs['start']
dhs['End'] = dhs['end']

# --- Run with 6 workers
if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=6) as executor:
        # Drain the lazy result iterator so every chromosome (1-22) is
        # processed and any worker exception surfaces here.
        for _ in executor.map(process_chrom, range(1, 23)):
            pass


  dhs = pd.read_csv(dhs_path, sep='\t')


**Step 3:** Combine each chromosome's `_lym` and `_mye` tables into one per-chromosome table with a single component label per SNP.

In [3]:

# Directory that holds the per-chromosome *_lym.tsv / *_mye.tsv inputs read
# below; the combined per-chromosome table is written here as well.
out_dir = "/mnt/f/0.datasets/ens_vcf_dhs/"

def process_chrom(chrom):
    """Collapse one chromosome's lymphoid and myeloid hit tables to one row per SNP.

    Reads ``rsid_in_DHS_chr{chrom}_lym.tsv`` and ``..._mye.tsv`` from
    `out_dir`, groups the stacked rows by (rsid, POS) and writes
    ``rsid_in_DHS_chr{chrom}.tsv`` with columns rsid, POS, component.
    A SNP seen with both 'Lymphoid' and 'Mye_ery' is labelled 'Lym_Mye_ery';
    otherwise the first label of its group is kept. The DHS identifier is
    not carried over. Nothing is written when both inputs are empty.

    Args:
        chrom: Chromosome number used to build the file names.
    """
    out_path = os.path.join(out_dir, f"rsid_in_DHS_chr{chrom}.tsv")
    out_path_lym = os.path.join(out_dir, f"rsid_in_DHS_chr{chrom}_lym.tsv")
    out_path_mye = os.path.join(out_dir, f"rsid_in_DHS_chr{chrom}_mye.tsv")

    side_tables = [pd.read_csv(side_path, sep='\t') for side_path in (out_path_lym, out_path_mye)]
    stacked = pd.concat(side_tables, ignore_index=True)

    if stacked.empty:
        print(f"{out_path}: No SNPs found.")
        return

    def merge_labels(labels):
        # Both components present for this SNP -> combined label.
        if set(labels) == {'Lymphoid', 'Mye_ery'}:
            return 'Lym_Mye_ery'
        return list(labels)[0]

    per_snp = stacked.groupby(['rsid', 'POS'])['component'].agg(merge_labels).reset_index()
    per_snp.to_csv(out_path, sep='\t', index=False)
    print(f"{out_path} written: {len(per_snp)} SNPs.")

# --- Run with 6 workers
if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=6) as executor:
        # Drain the lazy result iterator so every chromosome (1-22) is
        # processed and any worker exception surfaces here.
        for _ in executor.map(process_chrom, range(1, 23)):
            pass


/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr4.tsv written: 1619196 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr5.tsv written: 1753382 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr6.tsv written: 1959169 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr3.tsv written: 2218744 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr2.tsv written: 2770994 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr1.tsv written: 2974381 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr8.tsv written: 1592383 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr9.tsv written: 1410433 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr7.tsv written: 1830229 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr10.tsv written: 1466773 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr13.tsv written: 945242 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr11.tsv written: 1532595 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr15.tsv written: 1041082 SNPs.
/mnt/f/0.data/ens_vcf_dhs/rsid_in_DHS_chr14.tsv written: 1108494 SNPs.
/mnt/f/0.data/en

**Step 4:** Merge the per-chromosome `_lym` / `_mye` tables (chr1–22) into a single genome-wide table keyed on rsID.

In [6]:

# %% Merge rsid_in_DHS_chr{n}_{lym|mye}.tsv into one table
import os, glob
from pathlib import Path
import pandas as pd
import numpy as np

# Folder the per-chromosome rsid_in_DHS_chr{n}_{lym|mye}.tsv files are read from.
# NOTE(review): the earlier cells write those files directly into
# "/mnt/f/0.datasets/ens_vcf_dhs/" (no "chroms/" subfolder) — presumably they
# were moved afterwards; confirm.
in_dir  = "/mnt/f/0.datasets/ens_vcf_dhs/chroms/"
# Folder the merged genome-wide table is written to.
out_dir = "/mnt/f/0.datasets/ens_vcf_dhs/"
Path(out_dir).mkdir(parents=True, exist_ok=True)

def load_side(path, id_colname):
    """Read one per-chromosome lym/mye table, one row per rsid.

    Args:
        path: File to read; fields may be separated by whitespace or commas.
        id_colname: Name given to the 'identifier' column in the result.

    Returns:
        DataFrame with columns rsid, POS and `id_colname`. Rows without an
        rsid are dropped and only the first row of each rsid is kept. An
        empty frame with those columns is returned when `path` does not
        exist.
    """
    wanted = ["rsid", "POS", id_colname]
    if not Path(path).exists():
        return pd.DataFrame(columns=wanted)

    raw = pd.read_csv(
        path,
        sep=r"\s+|,",
        engine="python",
        usecols=["rsid", "POS", "identifier"],
        dtype={"rsid": "string", "POS": "Int64", "identifier": "string"},
    )
    with_rsid = raw.dropna(subset=["rsid"])
    one_per_rsid = with_rsid.drop_duplicates(subset=["rsid"], keep="first")
    renamed = one_per_rsid.rename(columns={"identifier": id_colname})
    return renamed[wanted]

merged_chunks = []
warnings_pos_mismatch = 0
output_columns = ["CHROM", "POS", "rsid", "component", "lym_id", "mye_id"]

for chrom in range(1, 23):
    p_lym = os.path.join(in_dir, f"rsid_in_DHS_chr{chrom}_lym.tsv")
    p_mye = os.path.join(in_dir, f"rsid_in_DHS_chr{chrom}_mye.tsv")
    df_lym = load_side(p_lym, "lym_id")
    df_mye = load_side(p_mye, "mye_id")

    # Nothing to merge for this chromosome.
    if df_lym.empty and df_mye.empty:
        continue

    # Outer join keeps rsids that appear on only one side.
    m = df_lym.merge(df_mye, on="rsid", how="outer", suffixes=("_lym", "_mye"))

    print( m.head() )

    # Count rsids whose two sides disagree on POS (an empty selection adds 0).
    both_pos = m["POS_lym"].notna() & m["POS_mye"].notna()
    n_conflicts = (m.loc[both_pos, "POS_lym"] != m.loc[both_pos, "POS_mye"]).sum()
    warnings_pos_mismatch += int(n_conflicts)

    # Single POS column: lymphoid value when present, otherwise the myeloid one.
    pos_lym = m.pop("POS_lym")
    pos_mye = m.pop("POS_mye")
    m["POS"] = pos_lym.combine_first(pos_mye).astype("Int64")

    # Component label, built from the weakest case up so later lines override.
    has_lym = m["lym_id"].notna()
    has_mye = m["mye_id"].notna()
    labels = np.where(has_mye, "mye", "")
    labels = np.where(has_lym, "lym", labels)
    labels = np.where(has_lym & has_mye, "lym_mye", labels)
    m["component"] = labels

    # Add CHROM and put the columns in their final order.
    m["CHROM"] = chrom
    m = m[output_columns]

    merged_chunks.append(m)

# Concatenate all chromosomes (or fall back to an empty, correctly-shaped table)
merged = (
    pd.concat(merged_chunks, ignore_index=True)
    if merged_chunks
    else pd.DataFrame(columns=output_columns)
)

# (Optional) report POS mismatches when rsid appeared in both files
if warnings_pos_mismatch:
    print(f"[WARN] {warnings_pos_mismatch} rsids had different POS between lym and mye; "
          "kept lymphoid POS when available.")

# Write output
out_path = os.path.join(out_dir, "rsid_in_DHS_merged_lym_mye.tsv")
merged.to_csv(out_path, sep="\t", index=False)
print(f"[done] Wrote {len(merged):,} rows → {out_path}")





           rsid    POS_lym     lym_id    POS_mye     mye_id
0  rs1000001430       <NA>       <NA>   50979270  1.2842488
1  rs1000001499  189726932   1.785795       <NA>       <NA>
2  rs1000001701       <NA>       <NA>  230427541  1.9328719
3  rs1000001936       <NA>       <NA>   13769354   1.149772
4  rs1000002474  203289853  1.8347732       <NA>       <NA>
           rsid    POS_lym     lym_id    POS_mye     mye_id
0  rs1000000078   28683172  2.2065748       <NA>       <NA>
1  rs1000000696       <NA>       <NA>  191479699   2.811475
2  rs1000000724    7694393   2.128576       <NA>       <NA>
3  rs1000001129       <NA>       <NA>   31012765  2.2152529
4  rs1000001171  157411878   2.684923       <NA>       <NA>
           rsid   POS_lym     lym_id   POS_mye     mye_id
0  rs1000000393  15384792   3.169858      <NA>       <NA>
1  rs1000000694      <NA>       <NA>  71385964  3.4240621
2  rs1000001006      <NA>       <NA>  14646662   3.166586
3  rs1000001162  16511162  3.1749706      <NA>  

**Step 6:** Run a processing or analysis step.

**Step 7:** Run a processing or analysis step.

**Step 8:** Compute an ad-hoc ratio, 498 / (2043 + 473264) ≈ 0.00105. The source of these counts is not documented in this notebook.

In [8]:
# Ad-hoc ratio: 498 divided by the sum 2043 + 473264.
# NOTE(review): the origin of these three counts is not recorded in this
# notebook — TODO document what they represent.
498/( 2043+ 473264)


0.0010477438792191151

**Step 9:** Run a processing or analysis step.