
# Create Admixed Genotype Data (Python)

This notebook simulates admixed diploid genotypes using a Balding–Nichols model for source populations and a simple local-ancestry draw per haplotype.
It mirrors the R workflow but is vectorized in NumPy. Outputs include a genotype matrix (CSV/NPZ), the per-individual ancestry proportions (CSV, and stored alongside the genotypes in the NPZ), summary statistics, and PCA plots.


In [3]:

# Setup
import sys, os
sys.path.append("/mnt/data")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Local modules (provided/uploaded)
from chasm.admix_sim import simulate_admixed_genotypes, sample_ancestry_proportions

# Optional helper library: record in HAVE_ASL whether it could be imported,
# and report the reason when it could not.
try:
    import chasm.allele_sharing_lib as asl
except Exception as import_error:
    HAVE_ASL = False
    print("allele_sharing_lib.py not found or import failed:", import_error)
else:
    HAVE_ASL = True


In [7]:

# Parameters (edit as needed)
N = 500          # individuals
L = 10000        # SNPs
K = 3            # sources
FST = 0.07       # Balding–Nichols drift
SEED = 123

# Ancestry proportion centers for individuals; if None, uniform Dirichlet
centers = np.array([0.6, 0.3, 0.1])  # length K
conc = 80.0                           # higher => tighter around centers

# Fail here, next to the values being edited, rather than deep inside the simulator
if centers is not None and np.shape(centers) != (K,):
    raise ValueError(
        f"centers must have length K={K}, got shape {np.shape(centers)}"
    )

# Output paths
OUT_DIR = "simulation_data/"
OUT_BASE = "admixed_genotypes_python"
CSV_PATH = os.path.join(OUT_DIR, f"{OUT_BASE}.csv")
NPZ_PATH = os.path.join(OUT_DIR, f"{OUT_BASE}.npz")
PC_FIG12 = os.path.join(OUT_DIR, f"{OUT_BASE}_PC1_PC2.png")
PC_FIG13 = os.path.join(OUT_DIR, f"{OUT_BASE}_PC1_PC3.png")
W_PATH = os.path.join(OUT_DIR, f"{OUT_BASE}_W.csv")

# Later cells write straight into OUT_DIR; create it now so a fresh
# Restart & Run All does not fail on a missing directory.
os.makedirs(OUT_DIR, exist_ok=True)

print("Will write to:", OUT_DIR)


Will write to: ../..


In [8]:

# Draw ancestry proportions, then simulate genotypes using those proportions
W = sample_ancestry_proportions(
    N, K, centers=centers, conc=conc, seed=SEED
)
G = simulate_admixed_genotypes(
    N, L, K=K, Fst=FST, W=W, seed=SEED
)

# Genotype table: one row per individual, an IID column followed by one column per SNP
n_ind, n_snp = G.shape[:2]
snp_cols = [f"SNP{j}" for j in range(1, n_snp + 1)]
ind_ids = [f"ind{i}" for i in range(1, n_ind + 1)]
dfG = pd.DataFrame(G, columns=snp_cols)
dfG.insert(0, "IID", ind_ids)
dfG.to_csv(CSV_PATH, index=False)

# Compressed archive holding both arrays, plus the ancestry proportions as their own CSV
np.savez_compressed(NPZ_PATH, G=G, W=W)
pop_cols = [f"pop{k}" for k in range(1, K + 1)]
pd.DataFrame(W, columns=pop_cols).to_csv(W_PATH, index=False)

print("Genotypes shape:", G.shape)
print(*(f"Saved: {p}" for p in (CSV_PATH, NPZ_PATH, W_PATH)), sep="\n")


Genotypes shape: (500, 10000)
Saved: ../../admixed_genotypes_python.csv
Saved: ../../admixed_genotypes_python.npz
Saved: ../../admixed_genotypes_python_W.csv


In [None]:

# Summary statistics
# Per-SNP allele frequency estimate: half of each column's mean genotype,
# then averaged across SNPs
maf_est = np.mean(G, axis=0) / 2.0
mean_maf = float(np.mean(maf_est))

# Per-genotype score g * (2 - g) / 2 (0 -> 0, 1 -> 0.5, 2 -> 0), averaged over
# every entry of G (all individuals and all SNPs together).
# NOTE(review): assuming genotypes are coded 0/1/2, a heterozygous call scores
# 0.5 here, so mean_het is half the fraction of heterozygous genotypes —
# confirm this scaling is intended.
het = np.multiply(G, 2 - G) / 2.0
mean_het = float(np.mean(het))

print(dict(N=N, L=L, K=K, FST=FST, mean_maf=mean_maf, mean_het=mean_het))


In [None]:

# PCA (no external dependencies)
def _save_pc_scatter(scores, y_col, y_label, out_path):
    """Scatter PC1 against column `y_col` of `scores`, save to `out_path`, then show."""
    plt.figure(figsize=(6,5))
    plt.scatter(scores[:,0], scores[:,y_col], s=10, alpha=0.6)
    plt.xlabel("PC1")
    plt.ylabel(y_label)
    plt.title("PCA of Simulated Admixed Genotypes")
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.show()


# Center each SNP column by subtracting twice its estimated allele frequency
p_hat = np.mean(G, axis=0) / 2.0
X = G - 2.0 * p_hat  # N x L

# Every SNP goes into the SVD; for a very large L, consider LD-pruning or a
# random subset of columns first.
U, S, Vt = np.linalg.svd(X, full_matrices=False)

# Scores on the three leading components: left singular vectors scaled by
# their singular values
PCs = U[:, :3] * S[:3]

_save_pc_scatter(PCs, 1, "PC2", PC_FIG12)  # PC1 vs PC2
_save_pc_scatter(PCs, 2, "PC3", PC_FIG13)  # PC1 vs PC3

print("Saved PCA figures:", PC_FIG12, PC_FIG13)



## Optional: Allele sharing (if `allele_sharing_lib.py` is available)

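If the helper library imported successfully, this block converts diploid genotypes to
presence/absence of the derived allele and computes a simple pairwise allele-sharing
summary on a subset of at most 100 individuals. The baseline shown here uses only NumPy;
replace it with calls into your helper library as needed. If the import failed, the block is skipped.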


In [None]:

def pairwise_sharing(x):
    """Return the matrix of pairwise derived-allele sharing between rows of `x`.

    Parameters
    ----------
    x : 2-D array, one row per individual and one column per SNP.
        A non-zero entry means the derived allele is present; the caller
        below passes a 0/1 matrix.

    Returns
    -------
    2-D float array, square with one row/column per row of `x`. Entry (i, j)
    is the proportion of SNPs at which both i and j carry the derived allele.
    The diagonal is set to 1.0 by convention.
    """
    # Float 0/1 indicators: the matrix product then counts, for every pair at
    # once, the SNPs where both rows are non-zero. Counts are whole numbers
    # far below 2**53, so float64 holds them exactly.
    present = (np.asarray(x) != 0).astype(np.float64)
    denom = present.shape[1]
    ps = (present @ present.T) / denom
    np.fill_diagonal(ps, 1.0)
    return ps


if HAVE_ASL:
    # Convert diploid to presence/absence (any derived allele present)
    X01 = (G > 0).astype(np.uint8)  # N x L

    # Example: pairwise sharing = proportion of SNPs where both have derived allele present
    # (This is a simple baseline; adapt to your library's interfaces as needed.)
    # Restrict to the first 100 individuals (or all of them when N is smaller)
    idx = np.arange(min(100, N))
    PS = pairwise_sharing(X01[idx])

    # Histogram of the off-diagonal (i < j) sharing values
    vals = PS[np.triu_indices_from(PS, 1)]
    plt.figure(figsize=(6,4))
    plt.hist(vals, bins=30)
    plt.xlabel("Pairwise derived-allele sharing")
    plt.ylabel("Count")
    plt.title("Allele Sharing (subset)")
    plt.tight_layout()
    AS_FIG = os.path.join(OUT_DIR, f"{OUT_BASE}_allele_sharing_hist.png")
    plt.savefig(AS_FIG, dpi=150)
    plt.show()
    print("Saved:", AS_FIG)
else:
    print("Skipping allele-sharing: allele_sharing_lib.py not available.")
