# 4.1 DrugMap 2.0 target mapping and enrichment

**Origin:** `4_1_drug_target_drugMap2_target.ipynb`  
**This annotated version was generated on:** 2025-10-13 06:41

**What this notebook does (high level):**  
- Map DrugMap 2.0 targets to causal genes/modules; run enrichment and prepare inputs for DRS/ci-PTRS-based drug ranking.

**How to use:**  
1. Review the markdown notes before each code cell.  
2. Adjust input/output paths as needed for your environment.  
3. Run cell-by-cell to reproduce artifacts for downstream steps.

---


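**Step 1:** Intersect DrugMap drug–target links with the MR causal genes, then write the per-drug average β table, one histogram per MOA class, and the Highest_status counts for drugs targeting risk (β>0) vs protective (β<0) genes.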

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Drugmap 2.0: Drug–target intersection with MR causal genes
Outputs:
  1) For each MOA class (Agonist/Antagonist/Inhibitor/Modulator): histogram of per-drug average beta over its causal targets
  2) Grouped bar chart: Highest_status counts for drugs targeting risk (β>0) vs protective (β<0) genes

Inputs:
  MR betas:
    /mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_crossmodal_meta_beta.tsv
      columns: gene, meta_beta_common, ...
  Drugmap (under /mnt/f/0.datasets/dgidb/):
    1.General Information of Drug.tsv
    5.General Information of Drug Therapeutic Target (DTT).tsv
    10.Drug to DTT Mapping Information.tsv
"""

import os
import math
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# ----------------------------
# Paths
# ----------------------------
# Folder holding the DrugMap TSV exports read below (note the folder itself is named "dgidb").
DIR_DM = "/mnt/f/0.datasets/dgidb/"
# Drug-level table: must provide DrugID; Drug_Name and Highest_status are used when present.
PATH_DM_DRUGS = os.path.join(DIR_DM, "1.General Information of Drug.tsv")
# Target-level table: provides the TargetID -> Gene_Name mapping.
PATH_DM_TARGETS = os.path.join(DIR_DM, "5.General Information of Drug Therapeutic Target (DTT).tsv")
# Drug-to-target links annotated with a mode of action (DrugID, TargetID, MOA).
PATH_DM_MAP = os.path.join(DIR_DM, "10.Drug to DTT Mapping Information.tsv")

# MR effect sizes; the `gene` and `meta_beta_common` columns are the ones consumed below.
PATH_MR = "/mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_crossmodal_meta_beta.tsv"

# Output folder for the tables and figures written by the cells below; created if missing.
OUTDIR = "/mnt/f/10_osteo_MR/result_drug_target"
os.makedirs(OUTDIR, exist_ok=True)

# ----------------------------
# Config
# ----------------------------
# MOA labels retained from the mapping table. Matching is exact (Series.isin), so qualified
# labels such as "Modulator (allosteric modulator)" are NOT kept.
MOA_KEEP = {"Agonist", "Antagonist", "Inhibitor", "Modulator" } # , "Activator" }
# Canonical development-status buckets, in the order used for tables and the bar chart.
STATUS_KEEP = [
    "Investigative", "Patented", "Approved", "Preclinical", "Clinical Trial",
    "Phase 1", "Phase 1/2", "Phase 2", "Phase 2/3", "Phase 3", "Phase 4"
]
# Catch-all bucket for any status string that bucket_status cannot place in STATUS_KEEP.
STATUS_OTHER = "Other"

# ----------------------------
# Helpers
# ----------------------------
def clean_symbol(x):
    """Normalise a gene symbol for joining across tables.

    Returns the stripped, upper-cased text of ``x``, or ``None`` when ``x`` is
    missing (``pd.isna``), blank, or one of the placeholder tokens
    ``nan`` / ``null`` / ``none`` / ``.`` (compared case-insensitively).
    """
    if pd.isna(x):
        return None
    token = str(x).strip()
    placeholders = ("", "nan", "null", "none", ".")
    return None if token.lower() in placeholders else token.upper()

def coerce_str(x):
    """Return ``x`` as a whitespace-stripped string; missing values become ``""``."""
    return "" if pd.isna(x) else str(x).strip()

def coerce_float(x):
    """Convert ``x`` with ``float()``; any conversion failure yields ``np.nan``."""
    try:
        value = float(x)
    except Exception:
        # Deliberately broad: every unparsable input maps to NaN instead of raising.
        value = np.nan
    return value

def bucket_status(s):
    """Map a raw Highest_status value onto one of STATUS_KEEP or STATUS_OTHER.

    The cleaned string is returned unchanged when it is already a STATUS_KEEP
    entry. Otherwise the lower-cased text is tested against phase prefixes and
    then against status keywords; the first rule that matches wins. Anything
    unmatched (including missing values, which become "") is STATUS_OTHER.
    """
    label = coerce_str(s)
    if label in STATUS_KEEP:
        return label

    lowered = label.lower()

    # Combined phases come first so "phase 1/2" is not captured by the "phase 1" prefix.
    prefix_rules = (
        ("phase 1/2", "Phase 1/2"),
        ("phase 2/3", "Phase 2/3"),
        ("phase 1", "Phase 1"),
        ("phase 2", "Phase 2"),
        ("phase 3", "Phase 3"),
        ("phase 4", "Phase 4"),
    )
    for prefix, bucket in prefix_rules:
        if lowered.startswith(prefix):
            return bucket

    # Substring rules, checked in this order.
    keyword_rules = (
        ("approved", "Approved"),
        ("investigative", "Investigative"),
        ("preclinical", "Preclinical"),
        ("patented", "Patented"),
        ("clinical trial", "Clinical Trial"),
    )
    for keyword, bucket in keyword_rules:
        if keyword in lowered:
            return bucket

    return STATUS_OTHER

# ----------------------------
# Load data
# ----------------------------
# MR betas
# Keep only the gene symbol and its effect size, discarding rows where either is missing.
mr = pd.read_csv(PATH_MR, sep="\t").loc[:, ["gene", "meta_beta_common"]].dropna()
mr["gene_clean"] = mr["gene"].apply(clean_symbol)
# Symbols that clean_symbol rejects (blank / placeholder) come back as None and are removed.
mr = mr.dropna(subset=["gene_clean"]).copy()
# One β per normalised symbol: duplicates are averaged.
betas = mr.groupby("gene_clean")["meta_beta_common"].agg("mean").to_dict()
# Every gene with an MR β is treated as a causal gene downstream.
causal_genes = set(betas)

# Drugmap: drugs
dm_drugs = pd.read_csv(PATH_DM_DRUGS, sep="\t", dtype=str, low_memory=False)
if "DrugID" not in dm_drugs.columns:
    raise ValueError("Drug file must have 'DrugID' column.")
dm_drugs["DrugID"] = dm_drugs["DrugID"].astype(str).str.strip()

# Drug name: accept a header that differs only in case / surrounding whitespace,
# and fall back to the DrugID when no such column exists.
if "Drug_Name" not in dm_drugs.columns:
    name_col = next(
        (c for c in dm_drugs.columns if c.strip().lower() == "drug_name"), None
    )
    dm_drugs["Drug_Name"] = dm_drugs["DrugID"] if name_col is None else dm_drugs[name_col]
# NOTE(review): astype(str) turns a missing name into the literal string "nan".
dm_drugs["Drug_Name"] = dm_drugs["Drug_Name"].astype(str).str.strip()

# Highest status: same tolerant header lookup; an absent column becomes empty strings,
# which bucket_status places in the catch-all bucket.
if "Highest_status" not in dm_drugs.columns:
    status_col = next(
        (c for c in dm_drugs.columns if c.strip().lower() == "highest_status"), None
    )
    dm_drugs["Highest_status"] = "" if status_col is None else dm_drugs[status_col]

dm_drugs["Status_bucket"] = dm_drugs["Highest_status"].map(bucket_status)

# Drugmap: targets (TargetID -> Gene)
dm_targets = pd.read_csv(PATH_DM_TARGETS, sep="\t", dtype=str, low_memory=False)
need_t = {"TargetID", "Gene_Name"}
if need_t - set(dm_targets.columns):
    raise ValueError("Target file must contain TargetID and Gene_Name.")
dm_targets["TargetID"] = dm_targets["TargetID"].astype(str).str.strip()
# Same normalisation as the MR genes so the two can be intersected on Gene_clean.
dm_targets["Gene_clean"] = dm_targets["Gene_Name"].apply(clean_symbol)

# Drugmap: mapping DrugID–TargetID–MOA
dm_map = pd.read_csv(PATH_DM_MAP, sep="\t", dtype=str, low_memory=False)
need_m = {"DrugID", "TargetID", "MOA"}
if need_m - set(dm_map.columns):
    raise ValueError("Mapping file must contain DrugID, TargetID, MOA.")
for key_col in ("DrugID", "TargetID", "MOA"):
    dm_map[key_col] = dm_map[key_col].astype(str).str.strip()

# Exact-label filter: only rows whose MOA is literally one of MOA_KEEP survive.
dm_map = dm_map.loc[dm_map["MOA"].isin(MOA_KEEP)].copy()

# ----------------------------
# Build Drug–Gene (causal) table with MOA
# ----------------------------
# Join TargetID->Gene
# Attach the normalised gene symbol to every drug–target link; links whose target has
# no usable symbol are discarded.
map_ = dm_map.merge(dm_targets[["TargetID", "Gene_clean"]], on="TargetID", how="left")
map_ = map_[~map_["Gene_clean"].isna()].copy()

# Keep only causal target genes (present in MR)
map_ = map_[map_["Gene_clean"].isin(causal_genes)].copy()

# Attach Drug_Name and Status_bucket
drug_meta = dm_drugs[["DrugID", "Drug_Name", "Status_bucket"]].drop_duplicates()
map_ = map_.merge(drug_meta, on="DrugID", how="left")

# A DrugID that is absent from the drug table leaves NaN in both metadata columns after
# the left merge. The downstream groupby calls use these columns as keys and drop NaN
# keys by default, which would silently remove such drugs from every output. Fall back to
# the DrugID as the name and to the catch-all status bucket instead.
map_["Drug_Name"] = map_["Drug_Name"].fillna(map_["DrugID"])
map_["Status_bucket"] = map_["Status_bucket"].fillna(STATUS_OTHER)

# Deduplicate basic rows
map_ = map_[["DrugID","Drug_Name","MOA","Gene_clean","Status_bucket"]].drop_duplicates()

# ----------------------------
# 1) Per-drug average beta by MOA, histograms
# ----------------------------
# Average β per (DrugID, MOA) over its causal targets
# Look up the MR β of each targeted gene.
map_["beta"] = map_["Gene_clean"].map(betas)

# One row per (drug, MOA): number of distinct causal targets and their mean β.
per_drug_moa = map_.groupby(["DrugID","Drug_Name","MOA"], as_index=False)
drug_moa_beta = per_drug_moa.agg(
    n_causal_targets=pd.NamedAgg(column="Gene_clean", aggfunc="nunique"),
    avg_beta=pd.NamedAgg(column="beta", aggfunc="mean"),
)

# Persist the table and report its size.
out_csv = os.path.join(OUTDIR, "drugmap_drug_avg_beta_by_moa.csv")
drug_moa_beta.to_csv(out_csv, index=False)
print(f"[OK] Wrote {out_csv} (rows={len(drug_moa_beta)})")

# Histograms per MOA
def hist_avg_beta(df, moa, outpng):
    """Save a 40-bin histogram of per-drug average β for one MOA class.

    Parameters
    ----------
    df : DataFrame with at least the columns ``MOA`` and ``avg_beta``.
    moa : MOA label to select (exact match on ``df["MOA"]``).
    outpng : destination path handed to ``savefig`` (the extension picks the format).

    Prints a warning and returns without writing when no row has the requested MOA.
    """
    sub = df.loc[df["MOA"] == moa]
    if sub.empty:
        print(f"[WARN] No entries for MOA={moa}")
        return

    fig, ax = plt.subplots(figsize=(7,5))
    ax.hist(sub["avg_beta"].dropna().to_numpy(), bins=40, color='k')
    ax.set_xlabel("Per-drug average β (over causal targets)")
    ax.set_ylabel("Count of drugs")
    ax.set_title(f"Histogram of per-drug avg β — {moa}")
    # NOTE(review): bars and this zero line are both black, so the line is hidden
    # wherever it crosses a bar.
    ax.axvline(0, color="black", linewidth=0.8, linestyle="--")
    fig.tight_layout()
    fig.savefig(outpng, dpi=220)
    plt.close(fig)
    print(f"[OK] Wrote {outpng}")

# One PDF histogram per retained MOA class.
for moa in ("Agonist", "Antagonist", "Inhibitor", "Modulator"):
    hist_path = os.path.join(OUTDIR, f"hist_avg_beta_{moa.lower()}.pdf")
    hist_avg_beta(drug_moa_beta, moa, hist_path)

# ----------------------------
# 2) Status counts for drugs targeting risk vs protective genes
# ----------------------------
# A drug is "risk-targeting" if it targets at least one gene with β>0
# A drug is "protective-targeting" if it targets at least one gene with β<0
# Label every MR gene by the sign of its β.
gene_beta_sign = {}
for gene_sym, beta_val in betas.items():
    if beta_val > 0:
        gene_beta_sign[gene_sym] = "risk"
    elif beta_val < 0:
        gene_beta_sign[gene_sym] = "protective"
    else:
        gene_beta_sign[gene_sym] = "zero"

# Genes without an entry are treated as "zero".
map_["sign"] = map_["Gene_clean"].apply(lambda sym: gene_beta_sign.get(sym, "zero"))

# Per drug (and its name / status bucket): the set of signs among its causal targets.
drug_signs = (
    map_.groupby(["DrugID","Drug_Name","Status_bucket"])["sign"]
        .apply(set)
        .reset_index()
)

# A drug is flagged when at least one of its targets carries that sign.
drug_signs["risk_flag"] = drug_signs["sign"].map(lambda signs: "risk" in signs)
drug_signs["protective_flag"] = drug_signs["sign"].map(lambda signs: "protective" in signs)

# Row order for the status tables and bar chart (a drug can belong to both groups).
status_levels = [*STATUS_KEEP, STATUS_OTHER]
def count_by_status(flag_col):
    """Count distinct drugs per status bucket among rows of ``drug_signs`` where ``flag_col`` is True.

    Returns a DataFrame with columns ``status`` and ``count`` holding one row per
    entry of ``status_levels`` (in that order); buckets without drugs get 0.
    """
    flagged = drug_signs.loc[drug_signs[flag_col]]
    per_status = flagged.groupby("Status_bucket")["DrugID"].nunique()
    per_status = per_status.reindex(status_levels, fill_value=0).rename("count")
    return per_status.reset_index().rename(columns={"Status_bucket":"status"})

# Same counting for both groups; the `group` column tells them apart after stacking.
risk_counts = count_by_status("risk_flag").assign(group="Risk-targeting (β>0)")
prot_counts = count_by_status("protective_flag").assign(group="Protective-targeting (β<0)")

status_counts = pd.concat([risk_counts, prot_counts], ignore_index=True)

# Persist the long-format counts.
out_counts = os.path.join(OUTDIR, "drugmap_drugs_targeting_causal_status_counts.csv")
status_counts.to_csv(out_counts, index=False)
print(f"[OK] Wrote {out_counts}")

# Plot grouped bar chart
def plot_grouped_status_counts(df, outpng):
    """Save a grouped bar chart of drug counts per status, one bar series per group.

    Parameters
    ----------
    df : long-format DataFrame with columns ``status``, ``group`` and ``count``.
    outpng : destination path handed to ``savefig`` (the extension picks the format).

    Rows follow ``status_levels``; a status missing from ``df`` is drawn as 0
    instead of raising. Bars are centred on each tick for any number of groups
    (for two groups the layout is unchanged: width 0.4, offsets ±0.2).
    Prints a warning and returns without writing when ``df`` holds no group.
    """
    # pivot: rows=status, cols=group; reindex tolerates statuses absent from df
    piv = (
        df.pivot(index="status", columns="group", values="count")
          .reindex(status_levels)
          .fillna(0)
    )
    groups = list(piv.columns)
    if not groups:
        print("[WARN] No groups to plot")
        return

    x = np.arange(len(piv.index))
    n_groups = len(groups)
    width = 0.8 / n_groups  # all series together occupy 80% of a tick interval

    fig, ax = plt.subplots(figsize=(max(6, 0.4*len(x)), 5))
    for i, g in enumerate(groups):
        # Centre the cluster of bars on the tick position.
        offset = (i - (n_groups - 1) / 2) * width
        ax.bar(x + offset, piv[g].to_numpy(), width=width, label=g)
    ax.set_xticks(x)
    ax.set_xticklabels(piv.index, rotation=45, ha="right")
    ax.set_ylabel("Number of drugs")
    ax.set_title("Drug Highest_status counts for drugs targeting causal genes")
    ax.legend()
    fig.tight_layout()
    fig.savefig(outpng, dpi=220)
    plt.close(fig)
    print(f"[OK] Wrote {outpng}")

# Render the risk-vs-protective status comparison as a PDF.
bar_path = os.path.join(OUTDIR, "bar_status_counts_risk_vs_protective.pdf")
plot_grouped_status_counts(status_counts, bar_path)

print("[DONE] Drugmap intersection & stats complete.")


[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/drugmap_drug_avg_beta_by_moa.csv (rows=2767)
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/hist_avg_beta_agonist.pdf
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/hist_avg_beta_antagonist.pdf
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/hist_avg_beta_inhibitor.pdf
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/hist_avg_beta_modulator.pdf
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/drugmap_drugs_targeting_causal_status_counts.csv
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/bar_status_counts_risk_vs_protective.pdf
[DONE] Drugmap intersection & stats complete.


**Step 2:** Reload the raw DrugMap drug-to-target mapping table, without the MOA filter, for inspection.

In [61]:
# Raw drug-to-target mapping, read without the MOA filter so every MOA label can be inspected.
# This is the same file that PATH_DM_MAP points to; unlike that earlier load, no dtype is forced.
p = '/mnt/f/0.datasets/dgidb/10.Drug to DTT Mapping Information.tsv'
df = pd.read_csv( p, sep = '\t' )

**Step 3:** Count mapping rows per MOA label to see which classes dominate and which labels the exact-match MOA filter leaves out.

In [63]:
# Number of mapping rows per MOA label, most frequent first (missing values are not counted).
# Qualified variants such as "Modulator (allosteric modulator)" show up as separate labels.
df[ 'MOA' ].value_counts()


MOA
Inhibitor                            29735
Modulator                             4749
Antagonist                            3555
Agonist                               2683
.                                     2254
Activator                              389
Binder                                 374
CAR-T-Cell-Therapy                     322
Modulator (allosteric modulator)       260
Blocker                                237
Ligand                                 208
Blocker (channel blocker)              205
CAR-T-Cell-Therapy(Dual specific)       76
Replacement                             62
Stimulator                              57
Opener                                  39
Inhibitor (gating inhibitor)            29
Inducer                                 20
Degrader                                19
Immunomodulator                         18
Enhancer                                17
Breaker                                 14
Reactivator                             10
Suppres

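**Step 4:** Load the GSE123568 expression matrix, collapse probes to gene symbols, centre each gene on its mean across samples, and derive Control/Disease labels from the sample titles.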

In [2]:
import gzip
import math
import os
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# load expression profile

def load_probe2gene(probe2gene_path):
    """Build a probe-ID -> gene-symbol dict from the platform table of a gzipped SOFT file.

    The table is the block between the ``!platform_table_begin`` and
    ``!platform_table_end`` marker lines; its first line is the tab-separated
    header, which must contain the columns ``ID`` and ``Gene Symbol``.

    Parameters
    ----------
    probe2gene_path : path to the gzip-compressed text file.

    Returns
    -------
    dict mapping probe ID to symbol. Probes whose symbol is empty or ``---`` are
    omitted; when several symbols are joined by `` /// `` only the first is kept.
    An empty dict is returned when the file has no platform table.

    Raises
    ------
    ValueError : if the table header lacks ``ID`` or ``Gene Symbol``.
    """
    probe2gene = {}
    idx_probe = idx_symbol = None
    inside_table = False
    awaiting_header = False

    with gzip.open(probe2gene_path, 'rt') as f:
        for line in f:
            if not inside_table:
                if line.startswith('!platform_table_begin'):
                    inside_table = True
                    awaiting_header = True
                continue

            if line.startswith('!platform_table_end'):
                break

            # Strip only the line terminator: a full strip() would also eat trailing
            # tabs, shortening rows whose last fields are empty.
            cols = line.rstrip('\r\n').split('\t')

            if awaiting_header:
                header = [c.strip() for c in cols]
                missing = [c for c in ('ID', 'Gene Symbol') if c not in header]
                if missing:
                    raise ValueError(
                        f"Platform table header in {probe2gene_path} lacks column(s): {missing}"
                    )
                idx_probe = header.index('ID')
                idx_symbol = header.index('Gene Symbol')
                awaiting_header = False
                continue

            # Skip rows too short to carry both fields instead of raising IndexError.
            if len(cols) <= max(idx_probe, idx_symbol):
                continue

            probe = cols[idx_probe].strip()
            symbol = cols[idx_symbol].strip()
            if symbol and symbol != '---':
                probe2gene[probe] = symbol.split(' /// ')[0]

    return probe2gene

def load_and_process_expression(expr_path, probe2gene_path):
    """Load a probe-level expression matrix and return gene-level, mean-centred values.

    Parameters
    ----------
    expr_path : tab-separated matrix file; lines starting with ``!`` are ignored and
        the first column is used as the probe index.
    probe2gene_path : file passed to ``load_probe2gene`` for the probe -> gene mapping.

    Returns
    -------
    DataFrame indexed by gene (index name ``gene``) with one column per sample.
    Probes without a mapped gene are dropped, probes of the same gene are averaged,
    and each gene's mean across all samples is subtracted from its row.
    """
    probe2gene = load_probe2gene(probe2gene_path)
    expr = pd.read_csv(expr_path, sep='\t', comment='!', index_col=0)
    expr.index = expr.index.astype(str)
    expr = expr.loc[expr.index.isin(probe2gene)]

    # Group by an external key array rather than writing a 'gene' column onto the
    # filtered slice, which avoids pandas' chained-assignment warning.
    gene_keys = expr.index.map(probe2gene).to_numpy()
    gene_expr = expr.groupby(gene_keys).mean().rename_axis('gene')

    # Row-wise centring: subtract each gene's own mean over samples.
    gene_expr_norm = gene_expr.subtract(gene_expr.mean(axis=1), axis=0)
    return gene_expr_norm

def extract_group_labels(matrix_file):
    """Derive one group label per sample from the ``!Sample_title`` line of a gzipped matrix file.

    Parameters
    ----------
    matrix_file : path to the gzip-compressed text file.

    Returns
    -------
    list of str, one per tab-separated title on the first ``!Sample_title`` line, in
    file order: ``'Control'`` if the title contains "control group",
    ``'Disease'`` if it contains "disease group" (both case-insensitive, control
    checked first), otherwise ``'Unknown'``.

    Raises
    ------
    ValueError : if the file contains no ``!Sample_title`` line.
    """
    sample_titles = None
    with gzip.open(matrix_file, 'rt') as f:
        for line in f:
            if line.startswith('!Sample_title'):
                # First field is the '!Sample_title' tag itself.
                sample_titles = line.strip().split('\t')[1:]
                break

    if sample_titles is None:
        # Previously this fell through to an unbound-variable error.
        raise ValueError(f"No '!Sample_title' line found in {matrix_file}")

    group_labels = []
    for title in sample_titles:
        if re.search(r'control group', title, re.IGNORECASE):
            group_labels.append('Control')
        elif re.search(r'disease group', title, re.IGNORECASE):
            group_labels.append('Disease')
        else:
            group_labels.append('Unknown')
    return group_labels

# 1. Load expression

# The series matrix supplies both the expression values and the sample titles;
# the family SOFT file supplies the probe -> gene table.
series_matrix_path = '/mnt/f/10_osteo_MR/datasets/gse123568/GSE123568_series_matrix.txt.gz'
family_soft_path = '/mnt/f/10_osteo_MR/datasets/gse123568/GSE123568_family.soft.gz'

gene_expr_norm = load_and_process_expression(series_matrix_path, family_soft_path)

# 2. Get group labels
# NOTE(review): assumes the titles are listed in the same order as the matrix columns — confirm.
group_labels = extract_group_labels(series_matrix_path)



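**Step 5:** For each MOA class, pair every causal target gene's MR β with its mean expression in Control and in Disease samples; write one table and one scatter plot per class, plus a merged table.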

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DOT-class expression vs beta, with drug names per gene and combined scatter per class.

Inputs already prepared in your session:
  - gene_expr_norm: DataFrame [genes x samples], normalized expression
  - group_labels: list[str] for columns of gene_expr_norm ('Control'/'Disease'/...)
  - betas: dict[str -> float], MR beta per gene (from your MR table)
  - map_: DataFrame with columns ["DrugID","Drug_Name","MOA","Gene_clean"] (Drugmap mapping filtered to MOA in {Agonist, Antagonist, Inhibitor, Modulator} and to causal genes)

Outputs in /mnt/f/10_osteo_MR/result_drug_target:
  - dotclass_stats_<moa>.csv  (now with 'drugs' column)
  - dotclass_stats_all.csv  (the per-class tables concatenated)
  - scatter_expr_vs_beta_<moa>.pdf  (one figure per class showing Control and Disease)
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

OUTDIR = "/mnt/f/10_osteo_MR/result_drug_target"
os.makedirs(OUTDIR, exist_ok=True)

# MOA classes processed below, in output order.
DOT_CLASSES = ["Agonist", "Antagonist", "Inhibitor", "Modulator"]

# Labels are positional, so there must be exactly one per expression column.
cols = gene_expr_norm.columns.tolist()
if len(group_labels) != len(cols):
    raise ValueError("Length mismatch: gene_expr_norm columns vs group_labels")

# Boolean column selectors for the two groups; any other label is in neither mask.
n_samples = len(group_labels)
mask_control = np.fromiter((lab == "Control" for lab in group_labels), dtype=bool, count=n_samples)
mask_disease = np.fromiter((lab == "Disease" for lab in group_labels), dtype=bool, count=n_samples)

def uniq_join(values):
    """Join the distinct, non-blank entries of ``values`` with ``;`` in sorted order.

    Each entry is converted with ``str`` and stripped; empty strings and the
    text "nan" (any case) are skipped.
    """
    kept = set()
    for v in values:
        text = str(v).strip()
        if not text or text.lower() == 'nan':
            continue
        kept.add(text)
    return ";".join(sorted(kept))

all_stats = []

# map_["Gene_clean"] and the keys of `betas` were upper-cased by clean_symbol, whereas the
# expression index keeps the platform's original spelling. Resolve each target through an
# upper-cased lookup so that symbols containing lower-case letters are not silently skipped.
# An exact match is still preferred, so genes that matched before resolve identically.
expr_key_by_upper = {}
for sym in gene_expr_norm.index:
    expr_key_by_upper.setdefault(str(sym).strip().upper(), sym)

for moa in DOT_CLASSES:
    submap = map_[map_["MOA"] == moa].copy()
    if submap.empty:
        print(f"[WARN] No targets for MOA={moa}")
        continue

    # build per-gene drug list for this class
    gene2drugs = (
        submap.groupby("Gene_clean")["Drug_Name"]
        .apply(uniq_join)
        .to_dict()
    )

    target_genes = sorted(set(submap["Gene_clean"]))
    records = []
    for g in target_genes:
        # require beta and expression
        expr_key = g if g in gene_expr_norm.index else expr_key_by_upper.get(g)
        if (g not in betas) or (expr_key is None):
            continue
        row = gene_expr_norm.loc[expr_key].values
        # Group means ignore NaN; a group with no samples yields NaN.
        expr_control = float(np.nanmean(row[mask_control])) if mask_control.any() else np.nan
        expr_disease = float(np.nanmean(row[mask_disease])) if mask_disease.any() else np.nan
        records.append({
            "MOA": moa,
            "gene": g,
            "beta": float(betas[g]),
            "expr_control": expr_control,
            "expr_disease": expr_disease,
            "drugs": gene2drugs.get(g, "")
        })

    if not records:
        print(f"[WARN] No usable genes for MOA={moa}")
        continue

    # Named moa_stats (not `df`) so the loop does not overwrite an unrelated global frame.
    moa_stats = pd.DataFrame(records)
    all_stats.append(moa_stats)

    # save CSV with drugs column
    out_csv = os.path.join(OUTDIR, f"dotclass_stats_{moa.lower()}.csv")
    moa_stats.to_csv(out_csv, index=False)
    print(f"[OK] Wrote {out_csv} ({len(moa_stats)} genes)")

    # combined scatter: Control and Disease in one figure, one point per gene and group
    fig, ax = plt.subplots(figsize=(6.5, 6.5))
    ax.scatter(moa_stats["expr_control"], moa_stats["beta"], alpha=0.65, label="Control", marker="o")
    ax.scatter(moa_stats["expr_disease"], moa_stats["beta"], alpha=0.65, label="Disease", marker="s")
    # axes refs
    ax.axhline(0, color="black", lw=0.8, ls="--")
    ax.axvline(0, color="grey", lw=0.5, ls=":")
    ax.set_xlabel("Average expression (Control / Disease)")
    ax.set_ylabel("Beta")
    ax.set_title(f"{moa} targets — Expression vs Beta")
    ax.legend()
    fig.tight_layout()
    fig_path = os.path.join(OUTDIR, f"scatter_expr_vs_beta_{moa.lower()}.pdf")
    fig.savefig(fig_path, dpi=220)
    plt.close(fig)
    print(f"[OK] Wrote {fig_path}")

# merged table of all classes (optional convenience)
if all_stats:
    merged = pd.concat(all_stats, ignore_index=True)
    merged_out = os.path.join(OUTDIR, "dotclass_stats_all.csv")
    merged.to_csv(merged_out, index=False)
    print(f"[OK] Wrote {merged_out} (rows={len(merged)})")


[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/dotclass_stats_agonist.csv (19 genes)
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/scatter_expr_vs_beta_agonist.pdf
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/dotclass_stats_antagonist.csv (32 genes)
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/scatter_expr_vs_beta_antagonist.pdf
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/dotclass_stats_inhibitor.csv (134 genes)
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/scatter_expr_vs_beta_inhibitor.pdf
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/dotclass_stats_modulator.csv (76 genes)
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/scatter_expr_vs_beta_modulator.pdf
[OK] Wrote /mnt/f/10_osteo_MR/result_drug_target/dotclass_stats_all.csv (rows=261)


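**Step 6:** Plot per-sample C5AR1 (Avacopan's target) expression in Control vs Disease, first as a jittered matplotlib scatter with a per-sample CSV, then as a seaborn swarm plot.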

In [7]:

import os, numpy as np, pandas as pd
import matplotlib.pyplot as plt

OUTDIR = "/mnt/f/10_osteo_MR/result_drug_target"
os.makedirs(OUTDIR, exist_ok=True)

# ----- config
target_gene = "C5AR1"   # Avacopan's target

# ----- safety checks
if target_gene not in gene_expr_norm.index:
    raise ValueError(f"{target_gene} not found in gene_expr_norm.index")

if len(gene_expr_norm.columns) != len(group_labels):
    raise ValueError("Length mismatch: gene_expr_norm columns vs group_labels")

# ----- gather data
expr = gene_expr_norm.loc[target_gene].values
cols = gene_expr_norm.columns.to_list()
labels = np.array(group_labels)

# Samples labelled anything other than Control/Disease are in neither group.
mask_ctrl = labels == "Control"
mask_dis  = labels == "Disease"

x_ctrl = expr[mask_ctrl]
x_dis  = expr[mask_dis]

# save a tidy CSV for audit (all samples, whatever their label)
df_long = pd.DataFrame({
    "sample_id": np.array(cols),
    "group": labels,
    "gene": target_gene,
    "expr": expr
})
csv_path = os.path.join(OUTDIR, f"swarm_{target_gene}_expression_per_sample.csv")
df_long.to_csv(csv_path, index=False)
print(f"[OK] wrote {csv_path} (n_control={mask_ctrl.sum()}, n_disease={mask_dis.sum()})")

# ----- make a horizontal “swarm” with jitter in y
# Fixed seed so the jitter, and therefore the figure, is reproducible.
rng = np.random.default_rng(42)
jitter_scale = 0.10

y_ctrl = np.full(mask_ctrl.sum(), 0.0) + rng.uniform(-jitter_scale, jitter_scale, mask_ctrl.sum())
y_dis  = np.full(mask_dis.sum(),  1.0) + rng.uniform(-jitter_scale, jitter_scale, mask_dis.sum())

# One explicit colour per group, shared by its points and its mean line.
color_ctrl, color_dis = "C0", "C1"

fig, ax = plt.subplots(figsize=(8, 4.5))
ax.scatter(x_ctrl, y_ctrl, alpha=0.75, marker="o", color=color_ctrl, label=f"Control (n={mask_ctrl.sum()})")
ax.scatter(x_dis,  y_dis,  alpha=0.75, marker="s", color=color_dis,  label=f"Disease (n={mask_dis.sum()})")

# central tendency: mean line per group; ymin/ymax are axes fractions covering the
# lower (Control) and upper (Disease) half of the plot
if mask_ctrl.sum() > 0:
    ax.axvline(float(np.nanmean(x_ctrl)), ymin=0.02, ymax=0.48, linewidth=1.3, linestyle="--", color=color_ctrl)
if mask_dis.sum() > 0:
    ax.axvline(float(np.nanmean(x_dis)),  ymin=0.52, ymax=0.98, linewidth=1.3, linestyle="--", color=color_dis)

# aesthetics
ax.set_yticks([0, 1])
ax.set_yticklabels(["Control", "Disease"])
ax.set_xlabel(f"{target_gene} normalized expression")
ax.set_ylabel("Group")
ax.set_title(f"{target_gene} expression — Avacopan target (Antagonist class)")
ax.grid(axis="x", linestyle=":", linewidth=0.6, alpha=0.6)
# The scatter labels carry the sample sizes; without a legend they were never shown.
ax.legend(loc="best")
fig.tight_layout()

fig_path = os.path.join(OUTDIR, f"swarm_{target_gene}_expression.pdf")
fig.savefig(fig_path, dpi=220)
plt.close(fig)
print(f"[OK] wrote {fig_path}")

# quick console stats
def statline(arr, name):
    """Format NaN-aware summary statistics of ``arr`` as one line prefixed with ``name``.

    Reports mean, median and sample standard deviation (ddof=1) to three decimals,
    plus the number of non-NaN entries.
    """
    mean = np.nanmean(arr)
    median = np.nanmedian(arr)
    sd = np.nanstd(arr, ddof=1)
    n_valid = np.sum(~np.isnan(arr))
    return f"{name}: mean={mean:.3f} median={median:.3f} sd={sd:.3f} n={n_valid}"

# Console summary for each group, Control first.
for group_name, group_values in (("Control", x_ctrl), ("Disease", x_dis)):
    print(statline(group_values, group_name))


import os, numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

OUTDIR = "/mnt/f/10_osteo_MR/result_drug_target"
os.makedirs(OUTDIR, exist_ok=True)

gene = "C5AR1"

# Tidy per-sample frame for seaborn. `group_labels` is positional, so it must be in the
# same order as the columns of gene_expr_norm.
expr = gene_expr_norm.loc[gene]
df_long = pd.DataFrame({
    "sample_id": expr.index,
    "expr": expr.to_numpy(),
    "group": group_labels,
})

# Horizontal swarm: expression on x, one row of points per group, coloured by group.
fig, ax = plt.subplots(figsize=(4, 3))
sns.swarmplot(data=df_long, x="expr", y="group", size=5, hue='group', ax=ax)

# Reference line at zero on the normalized-expression axis.
ax.axvline(0, linestyle='--', linewidth=1, color='grey')

ax.set_xlabel(f"{gene} normalized expression")
ax.set_ylabel("Group")
ax.set_title(f"Avacopan target—{gene} expression")
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, f"swarm_{gene}_seaborn.pdf"), dpi=220)
plt.close(fig)



[OK] wrote /mnt/f/10_osteo_MR/result_drug_target/swarm_C5AR1_expression_per_sample.csv (n_control=10, n_disease=30)
[OK] wrote /mnt/f/10_osteo_MR/result_drug_target/swarm_C5AR1_expression.pdf
Control: mean=-0.663 median=-0.699 sd=0.757 n=10
Disease: mean=0.221 median=0.207 sd=0.431 n=30
