# 41. Python version of heritability analysis

Note here we set `df_row` to 0 because we're just testing code on first row

In [1]:
benchmark = True

In [2]:
import os
import numpy as np
import pandas as pd
import time
from pgenlib import PgenReader
import limix.her

# -----------------------------
# Parameters (Adjust as needed)
# -----------------------------
#window_sizes = [10000, 100000, 1000000]  # Window sizes in base pairs
window_sizes = [10000]
chunk_start = 1                           # Start index for CpG sites (1-based)
chunk_end = 50                            # End index for CpG sites (1-based)
benchmark = True                         # Whether to measure timing

# -----------------------------
# Paths (Adjust these paths according to your data)
# -----------------------------

df_csv_path = "/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/09.5-OUT_matched_SNP_meth_cov_chunked_JHPCE.csv"
output_dir = "./41-OUT_heritability_a1"

Test on single window...

In [3]:
window_size = window_sizes[0]

In [4]:
if benchmark:
    start_time_total = time.time()

In [5]:
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
os.chdir(output_dir)

# Read the data frame containing paths to SNP and methylation data
df = pd.read_csv(df_csv_path)

row = df.iloc[0]
gwas_dir = os.path.dirname(row['SNP_data'])
methylation_file = row['modified_methylation_data']
chromosome = row['Chr']


methylation_file = methylation_file.replace("/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/", "/dcs04/lieber/statsgen/mnagle/mwas/pheno/")

methylation_file = methylation_file.replace("rda", "csv")
methylation_file = methylation_file.replace("rds", "csv")

In [6]:
chromosome = row['Chr']

print(f"Processing Chromosome: {chromosome}")
print(f"Genotype Directory: {gwas_dir}")
print(f"Methylation File: {methylation_file}")

# -----------------------------
# Load Methylation Data
# -----------------------------
try:
    # Methylation data has 'sample_id' as the first column and CpG positions as other columns
    methylation_df = pd.read_csv(methylation_file)
    print(f"Methylation data loaded from '{methylation_file}'.")
except Exception as e:
    print(f"Error reading methylation file '{methylation_file}': {e}")
    exit(1)

# Ensure 'sample_id' is treated as a string
if 'sample_id' not in methylation_df.columns:
    print(f"'sample_id' column not found in methylation data. Exiting.")
    exit(1)

methylation_df['sample_id'] = methylation_df['sample_id'].astype(str)
print("'sample_id' column confirmed and converted to string.")

# Extract CpG columns (all columns except 'sample_id')
cpg_columns = methylation_df.columns.drop('sample_id')

# Extract numeric CpG positions from column names (e.g., 'pos_1069461' -> 1069461)
try:
    cpg_positions = [int(col.split('_')[1]) for col in cpg_columns]
    print("CpG positions extracted from column names.")
except IndexError as e:
    print(f"Error parsing CpG positions in column names: {e}")
    exit(1)
except ValueError as e:
    print(f"Non-integer CpG position found in column names: {e}")
    exit(1)

# Create a mapping from column names to positions
cpg_col_to_pos = dict(zip(cpg_columns, cpg_positions))

# Select the CpG positions for the specified chunk
selected_cpg_cols = cpg_columns[chunk_start - 1:chunk_end]
selected_cpg_positions = [cpg_col_to_pos[col] for col in selected_cpg_cols]

print(f"Selected CpG Columns: {selected_cpg_cols.tolist()}")
print(f"Selected CpG Positions: {selected_cpg_positions}")

# -----------------------------
# Select a Single CpG Site and Window Size for Testing
# -----------------------------
# For initial testing, we'll process the first CpG site and the first window size
cpg_col = selected_cpg_cols[0]
cpg_pos = selected_cpg_positions[0]
w = window_size

print(f"\nSelected CpG Site: {cpg_col} at position {cpg_pos}")
print(f"Selected Window Size: {w} bp")

# -----------------------------
# Extract Methylation Data for the Selected CpG Site
# -----------------------------
pheno_df = methylation_df[['sample_id', cpg_col]].dropna()
y = pheno_df[cpg_col].values
sample_ids = pheno_df['sample_id'].values
n_samples = len(sample_ids)

print(f"Number of samples with non-missing methylation data: {n_samples}")

if n_samples == 0:
    print("No samples with non-missing methylation data. Exiting.")
    exit(1)

# -----------------------------
# Define Genomic Window
# -----------------------------
p1 = max(cpg_pos - w, 0)
p2 = cpg_pos + w

print(f"Genomic window: {p1} - {p2} bp")

# -----------------------------
# Load Genotype Data for the Specified Chromosome
# -----------------------------
pgen_prefix = os.path.join(gwas_dir, f"libd_chr{chromosome}")
pgen_file = f"{pgen_prefix}.pgen"
pvar_file = f"{pgen_prefix}.pvar"
psam_file = f"{pgen_prefix}.psam"

# Check if all necessary PLINK 2 files exist
if not all(os.path.exists(f) for f in [pgen_file, pvar_file, psam_file]):
    print("One or more PLINK 2 files are missing. Exiting.")
    exit(1)

print("All necessary PLINK 2 files found.")

# -----------------------------
# Read Sample IDs from .psam File
# -----------------------------
try:
    psam_df = pd.read_csv(psam_file, sep='\t')
    if '#IID' not in psam_df.columns:
        print(f"'#IID' column not found in .psam file '{psam_file}'. Exiting.")
        exit(1)
    geno_sample_ids = psam_df['#IID'].astype(str).values
    print("Genotype sample IDs loaded from .psam file.")
except Exception as e:
    print(f"Error reading .psam file '{psam_file}': {e}")
    exit(1)

# Create a mapping from sample ID to index in genotype data
sample_id_to_index = {sid: idx for idx, sid in enumerate(geno_sample_ids)}

# Get genotype indices for samples present in methylation data
geno_indices = [sample_id_to_index[sid] for sid in sample_ids if sid in sample_id_to_index]

if not geno_indices:
    print("No matching samples between genotype and methylation data. Exiting.")
    exit(1)

print(f"Number of matching samples: {len(geno_indices)}")

# -----------------------------
# Read SNP Positions from .pvar File
# -----------------------------
try:
    pvar_df = pd.read_csv(pvar_file, sep='\t', comment='#',
                          names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'])
    print("SNP positions loaded from .pvar file.")
except Exception as e:
    print(f"Error reading .pvar file '{pvar_file}': {e}")
    exit(1)

# Subset SNPs within the genomic window
snps_in_window = pvar_df[(pvar_df['POS'] >= p1) & (pvar_df['POS'] <= p2)]

if snps_in_window.empty:
    print("No SNPs found within the genomic window. Exiting.")
    exit(1)

print(f"Number of SNPs within the window: {len(snps_in_window)}")

Processing Chromosome: 1
Genotype Directory: /dcs04/lieber/statsgen/shizhong/michael/mwas/gwas
Methylation File: /dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA_8982-28981.csv
Methylation data loaded from '/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA_8982-28981.csv'.
'sample_id' column confirmed and converted to string.
CpG positions extracted from column names.
Selected CpG Columns: ['pos_1069461', 'pos_1069467', 'pos_1069470', 'pos_1069477', 'pos_1069484', 'pos_1069498', 'pos_1069506', 'pos_1069516', 'pos_1069530', 'pos_1069533', 'pos_1069539', 'pos_1069544', 'pos_1069569', 'pos_1069573', 'pos_1069591', 'pos_1069599', 'pos_1069601', 'pos_1069603', 'pos_1069613', 'pos_1069626', 'pos_1069629', 'pos_1069635', 'pos_1069637', 'pos_1069645', 'pos_1069651', 'pos_1069653', 'pos_1069669', 'pos_1069682', 'pos_1069691', 'pos_1069693', 'pos_1069697', 'pos_1069699', 'pos_1069707', 'pos_1069715', 'pos_1069720', 'pos_1069778', 'pos_1069790', 'pos_1069800', 'pos_1069810', 'pos

In [8]:
# -----------------------------
# Read Genotype Data Using PgenReader
# -----------------------------
# Initialize PgenReader with sample_subset
pgr = PgenReader(pgen_file.encode('utf-8'), sample_subset=np.array(sorted(geno_indices), dtype=np.uint32))

# Allocate buffer: rows=variants (SNPs), cols=samples
geno_buffer = np.empty((len(variant_indices := snps_in_window.shape[0]), n_samples), dtype=np.int32)

# Read each variant and populate the buffer
for var_idx, variant_idx in enumerate(snps_in_window.index.values):
    # Read genotype for the current variant
    # allele_idx=1 corresponds to the alternate allele count
    pgr.read(variant_idx, geno_buffer[var_idx, :], allele_idx=1)

print("Genotype data successfully read and stored in buffer.")

# -----------------------------
# Benchmarking: Genotype Reading Time
# -----------------------------
if benchmark:
    geno_time = time.time() - start_time_total  # Ensure start_time_total is defined before
    print(f"Genotype reading time: {geno_time:.2f} seconds")

TypeError: object of type 'int' has no len()

In [None]:
# -----------------------------
# Check for Missing Data and Impute
# -----------------------------
if np.any(geno_buffer == -9):
    print("Missing genotype data detected. Imputing missing values with mean genotype.")
    # Replace missing genotypes (-9) with the mean genotype for each SNP
    for var in range(geno_buffer.shape[0]):
        missing = geno_buffer[var, :] == -9
        if np.any(missing):
            non_missing = geno_buffer[var, :] != -9
            if np.any(non_missing):
                mean_geno = np.mean(geno_buffer[var, non_missing])
                geno_buffer[var, missing] = mean_geno
                print(f"  Imputed missing values for SNP {var + 1} with mean genotype {mean_geno:.2f}.")
            else:
                # If all genotypes are missing, impute with 0
                geno_buffer[var, missing] = 0
                print(f"  All genotypes missing for SNP {var + 1}. Imputed with 0.")

    # Check for NaNs after imputation
    if np.isnan(geno_buffer).any():
        nan_indices = np.argwhere(np.isnan(geno_buffer))
        print(f"NaNs found at positions: {nan_indices}")
        exit(1)  # Stop execution to address the issue

# -----------------------------
# Check number of SNPs
# -----------------------------
if len(snps_in_window) < 2:
    print("Only one SNP in window; skipping heritability estimation.")
    exit(0)

# -----------------------------
# Standardize Genotypes (Row-wise)
# -----------------------------
print("Standardizing genotype data.")
M = geno_buffer.astype(float).T  # Transpose to samples × SNPs

# Compute mean and std per SNP (columns)
mu = np.mean(M, axis=0, keepdims=True)
sigma = np.std(M, axis=0, ddof=1, keepdims=True)

# Handle zero standard deviation
sigma[sigma == 0] = 1

# Standardize
S = (M - mu) / sigma
print("Genotype data standardized.")

# -----------------------------
# Compute Kinship Matrix using GEMMA Method
# -----------------------------
print("Computing kinship matrix.")
K = np.dot(S, S.T) / S.shape[1]
print("Kinship matrix computed.")

if benchmark:
    kinship_time = time.time() - start_time  # Ensure start_time is defined before
    print(f"Kinship computation time: {kinship_time:.2f} seconds")

# -----------------------------
# Normalize Kinship Matrix
# -----------------------------
try:
    K_normalized = K / np.mean(np.diagonal(K))
    print("Kinship matrix normalized.")
except ZeroDivisionError:
    print("Mean of the diagonal of the kinship matrix is zero. Cannot normalize.")
    exit(1)
except Exception as e:
    print(f"Kinship normalization failed: {e}. Exiting.")
    exit(1)

# -----------------------------
# Estimate Heritability Using Limix
# -----------------------------
try:
    # Define the likelihood based on phenotype type
    likelihood_type = "normal"

    print(f"Estimating heritability with likelihood: '{likelihood_type}'")
    h2 = limix.her.estimate(
        y=y,
        lik=likelihood_type,
        K=K_normalized,
        M=None,
        verbose=True
    )
    print(f"Estimated heritability (h2): {h2:.4f}")

    if benchmark:
        herit_time = time.time() - start_time
        total_time = time.time() - start_time_total
        print(f"Heritability estimation time: {herit_time:.2f} seconds")
        print(f"Total processing time: {total_time:.2f} seconds")

except Exception as e:
    print(f"Heritability estimation failed: {e}")
    exit(1)

# -----------------------------
# Collect and Save Results
# -----------------------------
print("Collecting results.")
result_entry = {
    'V_G': h2 * (np.mean(np.diagonal(K)) * (1 - h2)),
    'V_e': (1 - h2) * (np.mean(np.diagonal(K)) * (1 - h2)),
    'h2': h2,
    'n': n_samples,
    'site': f"chr{chromosome}_{cpg_pos}",
    'window_bp': w
}
results = [result_entry]

# Collect Timing Data (if benchmarking)
if benchmark:
    timing_measurements = {
        f"chr{chromosome}_pos{cpg_pos}_window{w}": {
            'geno_time_sec': geno_time,
            'kinship_time_sec': kinship_time,
            'herit_time_sec': herit_time,
            'total_time_sec': total_time
        }
    }

# Save Results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("heritability_results.csv", index=False)
print("Heritability results saved to 'heritability_results.csv'.")

# Save Timing Measurements to CSV (if benchmarking)
if benchmark:
    timing_df = pd.DataFrame.from_dict(timing_measurements, orient='index')
    timing_df.reset_index(inplace=True)
    timing_df.rename(columns={'index': 'ID'}, inplace=True)
    timing_df.to_csv("timing_measurements.csv", index=False)
    print("Timing measurements saved to 'timing_measurements.csv'.")

Error reading genotype data: object of type 'int' has no len()


NameError: name 'geno_buffer' is not defined