# Step 1: Setup + Understanding the Goal

The idea:
We're computing genetic ancestry PCs (principal components) from your variants. This gives you:

- Continuous ancestry coordinates (PC1, PC2, ..., PC10) instead of discrete labels (EUR/AFR/EAS/SAS)
- PC1 typically separates African vs non-African ancestry
- PC2 typically separates East Asian vs South Asian vs European

Why this matters for debiasing:

- Your ML model might learn "AFR subjects have different variant patterns" (confounding)
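- By projecting out the PC space, you remove the component of the features that is linearly correlated with ancestry
- The model is then less able to use ancestry as a shortcut, so what it learns is more likely to reflect disease-associated variants than population structure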

In [None]:
# Cell 1: Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess
from pathlib import Path

# Notebook-wide locations: Plink2 inputs are read from data_dir, figures are written to fig_dir
data_dir, fig_dir = Path("./data/plink/tapestry/genome_wide"), Path("./figures")

# exist_ok=True keeps this cell safe to re-run once the folder already exists
fig_dir.mkdir(exist_ok=True)

# Step 2: Load Plink2 PCA Results

We will use Plink2 to compute the PCs from your genotype data, per the `./scripts/create_lds.sh` script.

After running the script, we can load the PC file.

In [None]:
# Cell 2: Read the Plink2 PCA outputs into DataFrames
#   ancestry_pca.eigenvec -> per-sample table: ID column(s) followed by the PC columns
#   ancestry_pca.eigenval -> one eigenvalue per line, no header row

# Per-sample PC coordinates (tab-delimited; the header row supplies the column names)
pca_file = data_dir / "ancestry_pca.eigenvec"
pcs = pd.read_csv(pca_file, sep='\t')
print("Loaded PCs:", pcs.shape)
print(pcs.head())

# Eigenvalues: the file has no header, so the single column is named here
eigenval_file = data_dir / "ancestry_pca.eigenval"
eigenvals = pd.read_csv(eigenval_file, sep='\t', names=['variance'], header=None)
print()
print("Variance explained by top 10 PCs:")
print(eigenvals.head(10))

# Step 3: Calculate Variance Explained %

In [None]:
# Cell 3: Calculate variance explained percentages
# The denominator is the sum of the eigenvalues present in the .eigenval file.
# NOTE: if only the top-k PCs were computed, these percentages are relative to the
# variance captured by those k PCs, not to the total genetic variance.
total_var = eigenvals['variance'].sum()

# Percent variance per PC
eigenvals['pct_variance'] = 100 * eigenvals['variance'] / total_var

# Running total: row i holds the share captured by PC1 through PC(i+1)
eigenvals['cumulative_pct'] = eigenvals['pct_variance'].cumsum()

# Summarise at most 10 PCs; clamping avoids an IndexError when fewer were computed
n_top = min(10, len(eigenvals))

print(f"Variance explained by top {n_top} PCs:")
print(eigenvals.head(n_top)[['variance', 'pct_variance', 'cumulative_pct']])
if n_top:
    top_cumulative = eigenvals['cumulative_pct'].iloc[n_top - 1]
    print(f"\nTop {n_top} PCs capture: {top_cumulative:.1f}% of variance")
else:
    print("\nNo eigenvalues loaded; nothing to summarise")

# Step 4: Scree Plot (Variance Explained)

In [None]:
# Cell 4: Scree plot - variance explained per PC
# Requires the 'pct_variance' and 'cumulative_pct' columns on `eigenvals`.

# Show at most 20 PCs. Clamping to the number of eigenvalues actually loaded keeps
# the x positions and the plotted values the same length (a mismatch makes
# matplotlib raise).
n_pcs_to_plot = min(20, len(eigenvals))
pc_numbers = range(1, n_pcs_to_plot + 1)
top_pcs = eigenvals.head(n_pcs_to_plot)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left: Variance per PC (bar plot)
axes[0].bar(pc_numbers, top_pcs['pct_variance'], color='steelblue')
axes[0].set_xlabel('Principal Component')
axes[0].set_ylabel('Variance Explained (%)')
axes[0].set_title('Variance Explained per PC')
axes[0].set_xticks(pc_numbers)

# Right: Cumulative variance (line plot) with an 80% reference line
axes[1].plot(pc_numbers, top_pcs['cumulative_pct'],
             marker='o', linewidth=2, color='darkred')
axes[1].axhline(y=80, linestyle='--', color='gray', label='80% threshold')
axes[1].set_xlabel('Principal Component')
axes[1].set_ylabel('Cumulative Variance (%)')
axes[1].set_title('Cumulative Variance Explained')
axes[1].set_xticks(pc_numbers)
axes[1].legend()

plt.tight_layout()
plt.savefig(fig_dir / 'ancestry_pca_scree.png', dpi=300, bbox_inches='tight')
plt.show()

# Step 5: PC1 vs PC2 Scatter Plot

In [None]:
# Cell 5: Visualize PC1 vs PC2 (ancestry clusters)

# Share of variance per PC, computed from the loaded eigenvalues so the axis
# labels cannot drift from the data (they were previously hard-coded).
# Row k of `eigenvals` is taken to be PC(k+1).
pct_var = 100 * eigenvals['variance'] / eigenvals['variance'].sum()

fig, ax = plt.subplots(figsize=(10, 8))

# Scatter plot: each point = one subject
ax.scatter(pcs['PC1'], pcs['PC2'], alpha=0.3, s=10, color='steelblue')

ax.set_xlabel(f"PC1 ({pct_var.iloc[0]:.0f}% variance)", fontsize=12)
ax.set_ylabel(f"PC2 ({pct_var.iloc[1]:.0f}% variance)", fontsize=12)
ax.set_title('Genetic Ancestry PCA: PC1 vs PC2\n(Mayo Tapestry Cohort)', fontsize=14)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(fig_dir / 'ancestry_pca_pc1_pc2.png', dpi=300, bbox_inches='tight')
plt.show()

# Print summary stats
print(f"PC1 range: [{pcs['PC1'].min():.4f}, {pcs['PC1'].max():.4f}]")
print(f"PC2 range: [{pcs['PC2'].min():.4f}, {pcs['PC2'].max():.4f}]")

The variation along PC2 appears to be a batch effect from the 3 sequencing assays used in the study: the horizontal stratification (bands along PC2) is characteristic of technical variation between assays. The cells below check this against the assay metadata.

# Step 6: Load Metadata & Merge

In [None]:
# Cell 6: Read the sample metadata table and flag columns that may encode batch
# Only the .psam path is tried here; if it is absent, point metadata_file at
# data_dir / "metadata.csv" instead.
metadata_file = data_dir / "metadata.psam"  # tab-delimited
metadata = pd.read_csv(metadata_file, sep='\t')
print("Metadata shape:", metadata.shape)

# A column is a candidate when its lower-cased name contains any of these substrings
batch_terms = ('batch', 'assay', 'version', 'platform', 'kit', 'capture')
batch_cols = [
    name for name in metadata.columns
    if any(term in name.lower() for term in batch_terms)
]
print()
print(f"Potential batch columns: {batch_cols}")

In [None]:
# Summarise VCF_ASSAY_VERSION as a candidate batch variable, then attach it to the PCs
assay_info = metadata['VCF_ASSAY_VERSION']

print("Unique assay versions:")
print(assay_info.value_counts())
print()
print(f"Total: {assay_info.nunique()} versions")
print(f"Missing values: {assay_info.isna().sum()}")

# Left-join on the '#IID' sample-ID column so every row of `pcs` is kept;
# subjects without a metadata match get NaN for the assay version
pcs_batch = pd.merge(
    pcs,
    metadata[['#IID', 'VCF_ASSAY_VERSION']],
    how='left',
    on='#IID',
)

print()
print(f"Merged data shape: {pcs_batch.shape}")
print(f"Subjects with assay info: {pcs_batch['VCF_ASSAY_VERSION'].notna().sum()}")

# Step 7: Investigate Batch Effect

In [None]:
# Batch-effect check: PC1 vs PC2 with one colour per assay version.
# The plotted columns, axis labels and title are all derived from x_pc / y_pc so
# they cannot disagree (previously PC3 was plotted under a "PC2" label and title).
x_pc, y_pc = 'PC1', 'PC2'

# Share of variance per PC; row k of `eigenvals` is taken to be PC(k+1)
pct_var = 100 * eigenvals['variance'] / eigenvals['variance'].sum()
x_pct = pct_var.iloc[int(x_pc[2:]) - 1]
y_pct = pct_var.iloc[int(y_pc[2:]) - 1]

fig, ax = plt.subplots(figsize=(12, 8))

# Color by assay version (subjects with a missing version are not drawn)
assay_versions = pcs_batch['VCF_ASSAY_VERSION'].dropna().unique()
colors = sns.color_palette('Set1', n_colors=len(assay_versions))

# Sorted order gives a stable version -> colour mapping between runs
for assay, color in zip(sorted(assay_versions), colors):
    subset = pcs_batch[pcs_batch['VCF_ASSAY_VERSION'] == assay]
    ax.scatter(subset[x_pc], subset[y_pc],
               label=f'Assay {assay}',
               alpha=0.5, s=10,
               color=color)

ax.set_xlabel(f'{x_pc} ({x_pct:.0f}% variance)', fontsize=12)
ax.set_ylabel(f'{y_pc} ({y_pct:.0f}% variance)', fontsize=12)
ax.set_title(f'{x_pc} vs {y_pc} Colored by Assay Version\n(Batch Effect Investigation)', fontsize=14)
ax.legend(loc='best', markerscale=2)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(fig_dir / 'ancestry_pca_batch_effect.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Ancestry check: PC1 vs PC3 with one colour per value of the GNOMAD_RACE column.
col = 'GNOMAD_RACE'
pcs_batch = pcs.merge(metadata[['#IID', col]],
                      on='#IID', how='left')

# The plotted columns, axis labels and title are all derived from x_pc / y_pc so
# they cannot disagree (previously PC4 was plotted under a "PC1" label).
x_pc, y_pc = 'PC1', 'PC3'

# Share of variance per PC; row k of `eigenvals` is taken to be PC(k+1)
pct_var = 100 * eigenvals['variance'] / eigenvals['variance'].sum()
x_pct = pct_var.iloc[int(x_pc[2:]) - 1]
y_pct = pct_var.iloc[int(y_pc[2:]) - 1]

fig, ax = plt.subplots(figsize=(12, 8))

# Color by ancestry label (subjects with a missing label are not drawn)
ancestry_groups = pcs_batch[col].dropna().unique()
colors = sns.color_palette('Set1', n_colors=len(ancestry_groups))

# Sorted order gives a stable label -> colour mapping between runs
for group, color in zip(sorted(ancestry_groups), colors):
    subset = pcs_batch[pcs_batch[col] == group]
    ax.scatter(subset[x_pc], subset[y_pc],
               label=f'Ancestry: {group}',
               alpha=0.5, s=10,
               color=color)

ax.set_xlabel(f'{x_pc} ({x_pct:.0f}% variance)', fontsize=12)
ax.set_ylabel(f'{y_pc} ({y_pct:.0f}% variance)', fontsize=12)
ax.set_title(f'{x_pc} vs {y_pc} Colored by {col}\n(Ancestry Investigation)', fontsize=14)
ax.legend(loc='best', markerscale=2)
ax.grid(alpha=0.3)

plt.tight_layout()
# Saved under its own name: reusing 'ancestry_pca_batch_effect.png' would overwrite
# the assay-version figure
plt.savefig(fig_dir / 'ancestry_pca_gnomad_race.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Show every metadata column name (a bare expression, so the notebook displays it as the cell output)
metadata.columns

In [None]:
# Rebuild pcs_batch with the GNOMAD_RACE label attached to each subject's PCs.
# how='left' keeps subjects that have no matching '#IID' in the metadata.
ancestry_lookup = metadata[['#IID', 'GNOMAD_RACE']]
pcs_batch = pcs.merge(ancestry_lookup, how='left', on='#IID')