# 41. Python version of heritability analysis

Note here we set `df_row` to 0 because we're just testing code on first row

In [1]:
benchmark = True

In [None]:
import os
import numpy as np
import pandas as pd
import time
from pgenlib import PgenReader
import limix.her

# -----------------------------
# Parameters (Adjust as needed)
# -----------------------------
#window_sizes = [10000, 100000, 1000000]  # Window sizes in base pairs
window_sizes = [10000]
chunk_start = 1                           # Start index for CpG sites (1-based)
chunk_end = 50                            # End index for CpG sites (1-based)
benchmark = True                         # Whether to measure timing

# -----------------------------
# Paths (Adjust these paths according to your data)
# -----------------------------

df_csv_path = "/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/09.5-OUT_matched_SNP_meth_cov_chunked_JHPCE.csv"
output_dir = "./41-OUT_heritability_a1"

if benchmark:
    start_time_total = time.time()

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
os.chdir(output_dir)

# Read the data frame containing paths to SNP and methylation data
df = pd.read_csv(df_csv_path)

# Processing the first row as per df_row in R script
df_row = 0  # Adjust as needed

# Extract paths from the data frame
gwas_dir = os.path.dirname(df.loc[df_row, 'SNP_data'])
methylation_file = df.loc[df_row, 'modified_methylation_data']

methylation_file = methylation_file.replace("/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/", "/dcs04/lieber/statsgen/mnagle/mwas/pheno/")

methylation_file = methylation_file.replace("rda", "csv")
methylation_file = methylation_file.replace("rds", "csv")

In [5]:
chromosome = df.loc[df_row, 'Chr']

In [6]:
print(f"Processing Chromosome: {chromosome}")
print(f"Genotype Directory: {gwas_dir}")
print(f"Methylation File: {methylation_file}")

Processing Chromosome: 1
Genotype Directory: /dcs04/lieber/statsgen/shizhong/michael/mwas/gwas
Methylation File: /dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA_8982-28981.csv


In [7]:
# -----------------------------
# Load Methylation Data
# -----------------------------
try:
    # Methylation data has 'sample_id' as the first column and CpG positions as other columns
    methylation_df = pd.read_csv(methylation_file)
    print(f"Methylation data loaded from '{methylation_file}'.")
except Exception as e:
    print(f"Error reading methylation file '{methylation_file}': {e}")
    exit(1)

# Ensure 'sample_id' is treated as a string
if 'sample_id' not in methylation_df.columns:
    print(f"'sample_id' column not found in methylation data. Exiting.")
    exit(1)

methylation_df['sample_id'] = methylation_df['sample_id'].astype(str)
print("'sample_id' column confirmed and converted to string.")

# Extract CpG columns (all columns except 'sample_id')
cpg_columns = methylation_df.columns.drop('sample_id')

# Extract numeric CpG positions from column names (e.g., 'pos_1069461' -> 1069461)
try:
    cpg_positions = [int(col.split('_')[1]) for col in cpg_columns]
    print("CpG positions extracted from column names.")
except IndexError as e:
    print(f"Error parsing CpG positions in column names: {e}")
    exit(1)
except ValueError as e:
    print(f"Non-integer CpG position found in column names: {e}")
    exit(1)

# Create a mapping from column names to positions
cpg_col_to_pos = dict(zip(cpg_columns, cpg_positions))

# Select the CpG positions for the specified chunk
# Note: Python uses 0-based indexing
selected_cpg_cols = cpg_columns[chunk_start - 1:chunk_end]
selected_cpg_positions = [cpg_col_to_pos[col] for col in selected_cpg_cols]

print(f"Selected CpG Columns: {selected_cpg_cols.tolist()}")
print(f"Selected CpG Positions: {selected_cpg_positions}")

Methylation data loaded from '/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA_8982-28981.csv'.
'sample_id' column confirmed and converted to string.
CpG positions extracted from column names.
Selected CpG Columns: ['pos_1069461', 'pos_1069467', 'pos_1069470', 'pos_1069477', 'pos_1069484', 'pos_1069498', 'pos_1069506', 'pos_1069516', 'pos_1069530', 'pos_1069533', 'pos_1069539', 'pos_1069544', 'pos_1069569', 'pos_1069573', 'pos_1069591', 'pos_1069599', 'pos_1069601', 'pos_1069603', 'pos_1069613', 'pos_1069626', 'pos_1069629', 'pos_1069635', 'pos_1069637', 'pos_1069645', 'pos_1069651', 'pos_1069653', 'pos_1069669', 'pos_1069682', 'pos_1069691', 'pos_1069693', 'pos_1069697', 'pos_1069699', 'pos_1069707', 'pos_1069715', 'pos_1069720', 'pos_1069778', 'pos_1069790', 'pos_1069800', 'pos_1069810', 'pos_1069819', 'pos_1069829', 'pos_1069831', 'pos_1069835', 'pos_1069846', 'pos_1069855', 'pos_1069859', 'pos_1069874', 'pos_1069882', 'pos_1069890', 'pos_1069906']
Selected CpG Positions: [

In [10]:
i = 49

In [12]:
# -----------------------------
# Select a Single CpG Site and Window Size for Testing
# -----------------------------
# For initial testing, we'll process the first CpG site and the first window size
cpg_col = selected_cpg_cols[i]
cpg_pos = selected_cpg_positions[i]
w = window_sizes[0]

print(f"\nSelected CpG Site: {cpg_col} at position {cpg_pos}")
print(f"Selected Window Size: {w} bp")


Selected CpG Site: pos_1069906 at position 1069906
Selected Window Size: 10000 bp


In [13]:
# -----------------------------
# Extract Methylation Data for the Selected CpG Site
# -----------------------------
pheno_df = methylation_df[['sample_id', cpg_col]].dropna()
y = pheno_df[cpg_col].values
sample_ids = pheno_df['sample_id'].values
n_samples = len(sample_ids)

print(f"Number of samples with non-missing methylation data: {n_samples}")

if n_samples == 0:
    print("No samples with non-missing methylation data. Exiting.")
    exit(1)

# -----------------------------
# Define Genomic Window
# -----------------------------
p1 = max(cpg_pos - w, 0)
p2 = cpg_pos + w

print(f"Genomic window: {p1} - {p2} bp")

# -----------------------------
# Load Genotype Data for the Specified Chromosome
# -----------------------------
pgen_prefix = os.path.join(gwas_dir, f"libd_chr{chromosome}")
pgen_file = f"{pgen_prefix}.pgen"
pvar_file = f"{pgen_prefix}.pvar"
psam_file = f"{pgen_prefix}.psam"

# Check if all necessary PLINK 2 files exist
if not all(os.path.exists(f) for f in [pgen_file, pvar_file, psam_file]):
    print("One or more PLINK 2 files are missing. Exiting.")
    exit(1)

print("All necessary PLINK 2 files found.")

Number of samples with non-missing methylation data: 164
Genomic window: 1059906 - 1079906 bp
All necessary PLINK 2 files found.


In [14]:
# -----------------------------
# Read Sample IDs from .psam File
# -----------------------------
try:
    psam_df = pd.read_csv(psam_file, sep='\t')
    if '#IID' not in psam_df.columns:
        print(f"'#IID' column not found in .psam file '{psam_file}'. Exiting.")
        exit(1)
    geno_sample_ids = psam_df['#IID'].astype(str).values
    print("Genotype sample IDs loaded from .psam file.")
except Exception as e:
    print(f"Error reading .psam file '{psam_file}': {e}")
    exit(1)

# Create a mapping from sample ID to index in genotype data
sample_id_to_index = {sid: idx for idx, sid in enumerate(geno_sample_ids)}

# Get genotype indices for samples present in methylation data
geno_indices = [sample_id_to_index[sid] for sid in sample_ids if sid in sample_id_to_index]

Genotype sample IDs loaded from .psam file.


In [None]:
if not geno_indices:
    print("No matching samples between genotype and methylation data. Exiting.")
    exit(1)

print(f"Number of matching samples: {len(geno_indices)}")

# -----------------------------
# Read SNP Positions from .pvar File
# -----------------------------
try:
    pvar_df = pd.read_csv(pvar_file, sep='\t', comment='#',
                          names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'])
    print("SNP positions loaded from .pvar file.")
except Exception as e:
    print(f"Error reading .pvar file '{pvar_file}': {e}")
    exit(1)

# Subset SNPs within the genomic window
snps_in_window = pvar_df[(pvar_df['POS'] >= p1) & (pvar_df['POS'] <= p2)]

if snps_in_window.empty:
    print("No SNPs found within the genomic window. Exiting.")
    exit(1)

print(f"Number of SNPs within the window: {len(snps_in_window)}")

# Get variant indices (0-based)
variant_indices = snps_in_window.index.values

if benchmark:
    start_time_total = time.time()
# Initialize PgenReader with sample_subset
# PgenReader expects the filename as bytes
geno_indices = sorted(geno_indices)



# Initialize PgenReader with sample_subset
pgr = PgenReader(pgen_file.encode('utf-8'), sample_subset=np.array(geno_indices, dtype=np.uint32))

# Allocate buffer: rows=variants, cols=samples
geno_buffer = np.empty((len(variant_indices), n_samples), dtype=np.int32)

# Read each variant and populate the buffer
for var_idx, variant_idx in enumerate(variant_indices):
    # Read genotype for the current variant
    # allele_idx=1 corresponds to the alternate allele count
    pgr.read(variant_idx, geno_buffer[var_idx, :], allele_idx=1)

print("Genotype data successfully read and stored in buffer.")

# -----------------------------
# Benchmarking: Genotype Reading Time
# -----------------------------
if benchmark:
    geno_time = time.time() - start_time_total
    print(f"Genotype reading time: {geno_time:.2f} seconds")

# -----------------------------
# Check number of SNPs
# -----------------------------
if len(snps_in_window) < 2:
    print("Only one SNP in window; skipping heritability estimation.")
    exit(0)


In [18]:
geno_buffer_backup = geno_buffer # we need to debug and figure out why geno_buffer is still all integer values even after imputing NA

In [19]:
# -----------------------------
# Standardize Genotypes
# -----------------------------
print("Standardizing genotype data.")
M = geno_buffer.astype(float)

Standardizing genotype data.


In [21]:
# -----------------------------
# Check for Missing Data and Impute
# -----------------------------
if np.any(geno_buffer == -9):
    print("Missing genotype data detected. Imputing missing values with mean genotype.")
    # Replace missing genotypes (-9) with the mean genotype for each SNP in M
    for var in range(M.shape[0]):
        missing = M[var, :] == -9
        if np.any(missing):
            non_missing = M[var, :] != -9
            if np.any(non_missing):
                mean_geno = np.mean(M[var, non_missing])
                M[var, missing] = mean_geno
                print(f"  Imputed missing values for SNP {var + 1} with mean genotype {mean_geno:.2f}.")
            else:
                # If all genotypes are missing, impute with 0
                M[var, missing] = 0
                print(f"  All genotypes missing for SNP {var + 1}. Imputed with 0.")
    
    # Check for NaNs after imputation
    if np.isnan(M).any():
        nan_indices = np.argwhere(np.isnan(M))
        print(f"NaNs found at positions: {nan_indices}")
        exit(1)  # Stop execution to address the issue

# -----------------------------
# Standardize Genotypes
# -----------------------------
print("Standardizing genotype data.")
mu = np.mean(M, axis=0)
sigma = np.std(M, axis=0, ddof=1)
sigma[sigma == 0] = 1  # Avoid division by zero
S = (M - mu) / sigma
S = np.nan_to_num(S)
print("Genotype data standardized.")

Missing genotype data detected. Imputing missing values with mean genotype.
  Imputed missing values for SNP 1 with mean genotype 0.69.
  Imputed missing values for SNP 2 with mean genotype 0.96.
  Imputed missing values for SNP 3 with mean genotype 0.99.
  Imputed missing values for SNP 4 with mean genotype 1.07.
Standardizing genotype data.
Genotype data standardized.


In [23]:
mu = np.mean(M, axis = 0)
sigma = np.std(M, axis=0, ddof=1)

In [26]:
# -----------------------------
# Compute Kinship Matrix using GEMMA Method
# -----------------------------
if benchmark:
    start_time = time.time()
print("Computing kinship matrix.")
K = np.dot(S, S.T) / S.shape[1]
print("Kinship matrix computed.")

if benchmark:
    kinship_time = time.time() - start_time
    print(f"Kinship computation time: {kinship_time:.2f} seconds")

# -----------------------------
# Normalize Kinship Matrix
# -----------------------------
try:
    K_normalized = K / np.mean(np.diagonal(K))
    print("Kinship matrix normalized.")
except ZeroDivisionError:
    print("Mean of the diagonal of the kinship matrix is zero. Cannot normalize.")
    exit(1)
except Exception as e:
    print(f"Kinship normalization failed: {e}. Exiting.")
    exit(1)

Computing kinship matrix.
Kinship matrix computed.
Kinship computation time: 0.00 seconds
Kinship matrix normalized.


In [27]:
K

array([[ 0.67821676, -0.1702268 , -0.20778734, -0.30020262],
       [-0.1702268 ,  0.10448223,  0.07153829, -0.00579371],
       [-0.20778734,  0.07153829,  0.11828906,  0.01795999],
       [-0.30020262, -0.00579371,  0.01795999,  0.28803634]])

In [None]:
# -----------------------------
# Estimate Heritability Using Limix
# -----------------------------
try:
    # Define the likelihood based on phenotype type
    # Options: "normal", "bernoulli", "probit", "binomial", "poisson"
    # Adjust 'likelihood_type' as needed based on your phenotype
    likelihood_type = "normal"

    print(f"Estimating heritability with likelihood: '{likelihood_type}'")
    h2 = limix.her.estimate(
        y=y,
        lik=likelihood_type,
        K=K_normalized,
        M=None,
        verbose=True
    )
    print(f"Estimated heritability (h2): {h2:.4f}")

    if benchmark:
        herit_time = time.time() - start_time
        total_time = time.time() - start_time_total
        print(f"Heritability estimation time: {herit_time:.2f} seconds")
        print(f"Total processing time: {total_time:.2f} seconds")

except Exception as e:
    print(f"Heritability estimation failed: {e}")
    exit(1)

# -----------------------------
# Collect and Save Results
# -----------------------------
print("Collecting results.")
result_entry = {
    'V_G': h2 * (np.mean(np.diagonal(K)) * (1 - h2)),
    'V_e': (1 - h2) * (np.mean(np.diagonal(K)) * (1 - h2)),
    'h2': h2,
    'n': n_samples,
    'site': f"chr{chromosome}_{cpg_pos}",
    'window_bp': w
}
results = [result_entry]

# Collect Timing Data (if benchmarking)
if benchmark:
    timing_measurements = {
        f"chr{chromosome}_pos{cpg_pos}_window{w}": {
            'geno_time_sec': geno_time,
            'kinship_time_sec': kinship_time,
            'herit_time_sec': herit_time,
            'total_time_sec': total_time
        }
    }

# Save Results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("heritability_results.csv", index=False)
print("Heritability results saved to 'heritability_results.csv'.")

# Save Timing Measurements to CSV (if benchmarking)
if benchmark:
    timing_df = pd.DataFrame.from_dict(timing_measurements, orient='index')
    timing_df.reset_index(inplace=True)
    timing_df.rename(columns={'index': 'ID'}, inplace=True)
    timing_df.to_csv("timing_measurements.csv", index=False)
    print("Timing measurements saved to 'timing_measurements.csv'.")

In [None]:
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
os.chdir(output_dir)

# -----------------------------
# Read the Metadata DataFrame
# -----------------------------
df = pd.read_csv(df_csv_path)

# Initialize Results Storage
results = []
timing_measurements = {}

# -----------------------------
# Function to Normalize Kinship Matrix
# -----------------------------
def normalize_kinship(K):
    """Normalize the kinship matrix as required by Limix."""
    mean_diag = np.mean(np.diagonal(K))
    if mean_diag == 0:
        raise ValueError("Mean of the diagonal of the kinship matrix is zero.")
    return K / mean_diag


In [None]:
# Create output directory if it doesn't exist
if not os.path.exists(outdir):
    os.makedirs(outdir)
os.chdir(outdir)

# Read the data frame containing paths to SNP and methylation data
df = pd.read_csv(df_csv)

# Processing the first row as per df_row in R script
df_row = 0  # Adjust as needed

# Extract paths from the data frame
gwas_dir = os.path.dirname(df.loc[df_row, 'SNP_data'])
methylation_file = df.loc[df_row, 'modified_methylation_data']

methylation_file = methylation_file.replace("/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/", "/dcs04/lieber/statsgen/mnagle/mwas/pheno/")

methylation_file = methylation_file.replace("rda", "csv")
methylation_file = methylation_file.replace("rds", "csv")

In [None]:
methylation_file

## Re-run once finihsed reprocessing csv formatted files

In [None]:
chunk_start = chunk1
chunk_end = chunk2

In [None]:
chromosome = df.loc[df_row, 'Chr']
window_sizes = wind

In [None]:
for idx, (cpg_col, cpg_pos) in enumerate(zip(selected_cpg_cols, selected_cpg_positions), start=chunk_start):
    print(f"  Processing CpG site {idx} at position {cpg_pos}")

    # Extract methylation levels for the current CpG site
    pheno_df = methylation_df[['sample_id', cpg_col]].dropna()
    y = pheno_df[cpg_col].values
    sample_ids = pheno_df['sample_id'].values
    n_samples = len(sample_ids)

    if n_samples == 0:
        print("    No samples with non-missing methylation data. Skipping this CpG site.")
        continue

    # -----------------------------
    # Loop Over Window Sizes
    # -----------------------------
    for w in window_sizes:
        print(f"    Window size: {w} bp")

        if benchmark:
            start_time_total = time.time()

        # Define genomic window
        p1 = max(cpg_pos - w, 0)
        p2 = cpg_pos + w

        # -----------------------------
        # Load Genotype Data for the Specified Chromosome
        # -----------------------------
        pgen_prefix = os.path.join(gwas_dir, f"libd_chr{chromosome}")
        pgen_file = f"{pgen_prefix}.pgen"
        pvar_file = f"{pgen_prefix}.pvar"
        psam_file = f"{pgen_prefix}.psam"

        # Check if all necessary PLINK 2 files exist
        if not all(os.path.exists(f) for f in [pgen_file, pvar_file, psam_file]):
            print("      One or more PLINK 2 files are missing. Skipping this window.")
            continue

        # -----------------------------
        # Read Sample IDs from .psam File
        # -----------------------------
        try:
            psam_df = pd.read_csv(psam_file, sep='\t')
            if '#IID' not in psam_df.columns:
                print(f"      '#IID' column not found in .psam file '{psam_file}'. Skipping this window.")
                continue
            geno_sample_ids = psam_df['#IID'].astype(str).values
        except Exception as e:
            print(f"      Error reading .psam file '{psam_file}': {e}")
            continue

        # Create a mapping from sample ID to index in genotype data
        sample_id_to_index = {sid: idx for idx, sid in enumerate(geno_sample_ids)}

        # Get genotype indices for samples present in methylation data
        geno_indices = [sample_id_to_index[sid] for sid in sample_ids if sid in sample_id_to_index]

        if not geno_indices:
            print("      No matching samples between genotype and methylation data. Skipping this window.")
            continue

        # -----------------------------
        # Read SNP Positions from .pvar File
        # -----------------------------
        try:
            pvar_df = pd.read_csv(pvar_file, sep='\t', comment='#',
                                  names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'])
        except Exception as e:
            print(f"      Error reading .pvar file '{pvar_file}': {e}")
            continue

        # Subset SNPs within the genomic window
        snps_in_window = pvar_df[(pvar_df['POS'] >= p1) & (pvar_df['POS'] <= p2)]

        if snps_in_window.empty:
            print("      No SNPs found within this window. Skipping.")
            continue

        # Get variant indices (0-based)
        variant_indices = snps_in_window.index.values

        # -----------------------------
        # Read Genotype Data Using PgenReader
        # -----------------------------
        try:
            # Initialize PgenReader with sample_subset
            # PgenReader expects the filename as bytes
            pgr = PgenReader(pgen_file.encode('utf-8'), sample_subset=np.array(geno_indices, dtype=np.uint32))

            # Allocate buffer: rows=variants, cols=samples
            geno_buffer = np.empty((len(variant_indices), n_samples), dtype=np.int32)

            # Read each variant and populate the buffer
            for var_idx, variant_idx in enumerate(variant_indices):
                # Read genotype for the current variant
                # allele_idx=1 corresponds to the alternate allele count
                pgr.read(variant_idx, geno_buffer[var_idx, :], allele_idx=1)

            # -----------------------------
            # Benchmarking: Genotype Reading Time
            # -----------------------------
            if benchmark:
                geno_time = time.time() - start_time_total
                start_time = time.time()

        except Exception as e:
            print(f"      Error reading genotype data: {e}")
            continue

        # -----------------------------
        # Check for Missing Data and Impute
        # -----------------------------
        if np.any(geno_buffer == -9):
            print("      Missing genotype data detected. Imputing missing values with mean genotype.")
            # Replace missing genotypes (-9) with the mean genotype for each SNP
            for var in range(geno_buffer.shape[0]):
                missing = geno_buffer[var, :] == -9
                if np.any(missing):
                    non_missing = geno_buffer[var, :] != -9
                    if np.any(non_missing):
                        mean_geno = np.mean(geno_buffer[var, non_missing])
                        geno_buffer[var, missing] = mean_geno
                    else:
                        # If all genotypes are missing, impute with 0
                        geno_buffer[var, missing] = 0

        # -----------------------------
        # Standardize Genotypes
        # -----------------------------
        M = geno_buffer.astype(float)
        mu = np.mean(M, axis=0)
        sigma = np.std(M, axis=0, ddof=1)
        sigma[sigma == 0] = 1  # Avoid division by zero
        S = (M - mu) / sigma
        S = np.nan_to_num(S)

        # Compute kinship matrix using GEMMA method
        K = np.dot(S, S.T) / S.shape[1]

        if benchmark:
            kinship_time = time.time() - start_time
            start_time = time.time()

        # -----------------------------
        # Normalize Kinship Matrix
        # -----------------------------
        try:
            K_normalized = normalize_kinship(K)
        except ValueError as e:
            print(f"      Kinship normalization failed: {e}. Skipping this window.")
            continue

        # -----------------------------
        # Estimate Heritability Using Limix
        # -----------------------------
        try:
            # Define the likelihood based on phenotype type
            # Adjust 'likelihood_type' as needed based on your phenotype
            # Options: "normal", "bernoulli", "probit", "binomial", "poisson"
            # For this example, we'll assume a "normal" phenotype
            likelihood_type = "normal"

            # Estimate heritability
            h2 = limix.her.estimate(
                y=y,
                lik=likelihood_type,
                K=K_normalized,
                M=None,
                verbose=False
            )

            if benchmark:
                herit_time = time.time() - start_time
                total_time = time.time() - start_time_total

        except Exception as e:
            print(f"      Heritability estimation failed: {e}")
            continue

        # -----------------------------
        # Collect Results
        # -----------------------------
        result_entry = {
            'V_G': h2 * (np.mean(np.diagonal(K)) * (1 - h2)),
            'V_e': (1 - h2) * (np.mean(np.diagonal(K)) * (1 - h2)),
            'h2': h2,
            'n': n_samples,
            'site': f"chr{chromosome}_{cpg_pos}",
            'window_bp': w
        }
        results.append(result_entry)

        # -----------------------------
        # Collect Timing Data (if benchmarking)
        # -----------------------------
        if benchmark:
            timing_entry = {
                'geno_time_sec': geno_time,
                'kinship_time_sec': kinship_time,
                'herit_time_sec': herit_time,
                'total_time_sec': total_time
            }
            timing_measurements[f"chr{chromosome}_pos{cpg_pos}_window{w}"] = timing_entry

        print(f"      Completed CpG site {idx}, window {w} bp")

# -----------------------------
# Save Results to CSV
# -----------------------------
if results:
    results_df = pd.DataFrame(results)
    results_df.to_csv("heritability_results.csv", index=False)
    print("\nHeritability results saved to 'heritability_results.csv'.")
else:
    print("\nNo heritability results to save.")

if benchmark and timing_measurements:
    timing_df = pd.DataFrame.from_dict(timing_measurements, orient='index')
    timing_df.reset_index(inplace=True)
    timing_df.rename(columns={'index': 'ID'}, inplace=True)
    timing_df.to_csv("timing_measurements.csv", index=False)
    print("Timing measurements saved to 'timing_measurements.csv'.")
elif benchmark:
    print("No timing measurements to save.")

In [None]:
# Initialize PgenReader with sample_subset
pgr = PgenReader(pgen_file, sample_subset=np.array(geno_indices, dtype=np.uint32))

In [None]:
pgen_file

In [None]:
# Read the range of variants
# Assuming variants are sorted by position; hence, read them sequentially
# Allocate buffer: rows=variants, cols=samples
geno_buffer = np.empty((len(variant_indices), n_samples), dtype=np.int32)

# Read each variant and populate the buffer
for var_idx, geno_idx in enumerate(variant_indices):
    # Read genotype for the current variant
    # Variants are 0-based; ensure correct indexing
    pgr.read(var_idx, geno_buffer[var_idx, :], allele_idx=1)

# -----------------------------
# Benchmarking: Genotype Reading Time
# -----------------------------
if benchmark:
    geno_time = time.time() - start_time_total
    start_time = time.time()

In [None]:
# Last version from o1-preview, gave error:
# ---------------------------------------------------------------------------
# TypeError                                 Traceback (most recent call last)
# Cell In[5], line 83
#      80     start_time = time.time()
#      82 # Initialize PgenReader
# ---> 83 pgr = PgenReader(pgen_file, sample_subset=geno_indices, variant_subset=snp_indices)
#      85 # Read genotype data
#      86 M = np.empty((len(geno_indices), len(snp_indices)), dtype=np.float32)

# File src/pgenlib/pgenlib.pyx:400, in pgenlib.PgenReader.__cinit__()

# TypeError: __cinit__() got an unexpected keyword argument 'variant_subset'

# chr_number = df.loc[df_row, 'Chr']  # Assuming 'Chr' column has the chromosome number

# # Load methylation data
# # Methylation data has CpG positions as columns and samples as rows
# methylation_df = pd.read_csv(methylation_file)

# # 'sample_id' is the first column
# # Extract CpG columns (all columns except 'sample_id')
# CpG_columns = methylation_df.columns.drop('sample_id')
# CpG_positions = [int(col.split('_')[1]) for col in CpG_columns]  # Extract numeric positions

# # Create a mapping from column names to positions
# CpG_col_to_pos = dict(zip(CpG_columns, CpG_positions))

# # Select the CpG positions for the specified chunk
# CpG_columns_chunk = CpG_columns[chunk1 - 1:chunk2]
# CpG_positions_chunk = [CpG_col_to_pos[col] for col in CpG_columns_chunk]

# # Initialize results storage
# results = []
# time_measurements = {}

# # Loop over CpG sites
# for idx, (CpG_col, CpG_position) in enumerate(zip(CpG_columns_chunk, CpG_positions_chunk), start=chunk1):
#     print(f"Processing CpG site {idx} at position {CpG_position} on chromosome {chr_number}")
    
#     # Extract methylation levels for the current CpG site
#     pheno_df = methylation_df[['sample_id', CpG_col]].dropna()
#     y = pheno_df[CpG_col].values
#     sample_ids = pheno_df['sample_id'].astype(str).values
#     n_samples = len(sample_ids)
    
#     # Loop over window sizes
#     for w in wind:
#         print(f"Window size: {w} bp")
#         if benchmark:
#             start_time_total = time.time()
        
#         # Define genomic window
#         p1 = max(CpG_position - w, 0)
#         p2 = CpG_position + w

#         # Load genotype data for the specified chromosome
#         pgen_prefix = os.path.join(gwas_dir, f"libd_chr{chr_number}")
#         pgen_file = f"{pgen_prefix}.pgen"
#         pvar_file = f"{pgen_prefix}.pvar"
#         psam_file = f"{pgen_prefix}.psam"

#         # Check if files exist
#         if not all(os.path.exists(f) for f in [pgen_file, pvar_file, psam_file]):
#             print("One or more PLINK 2 files are missing. Skipping.")
#             continue

#         # Read sample IDs from .psam file
#         psam_df = pd.read_csv(psam_file, sep='\t')
#         geno_sample_ids = psam_df['#IID'].astype(str).values

#         # Map sample IDs to indices
#         sample_id_to_index = {sid: idx for idx, sid in enumerate(geno_sample_ids)}
#         geno_indices = [sample_id_to_index.get(sid) for sid in sample_ids if sid in sample_id_to_index]

#         if not geno_indices:
#             print("No matching samples between genotype and methylation data. Skipping.")
#             continue

#         # Read SNP positions from .pvar file
#         pvar_df = pd.read_csv(pvar_file, sep='\t', comment='#',
#                               names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'])
#         # Subset SNPs within the window
#         snps_in_window = pvar_df[(pvar_df['POS'] >= p1) & (pvar_df['POS'] <= p2)]

#         if snps_in_window.empty:
#             print("No SNPs in this window. Skipping.")
#             continue

#         # Get indices of SNPs in the window
#         snp_indices = snps_in_window.index.values

#         if benchmark:
#             start_time = time.time()

#         # Initialize PgenReader
#         pgr = PgenReader(pgen_file, sample_subset=geno_indices, variant_subset=snp_indices)

#         # Read genotype data
#         M = np.empty((len(geno_indices), len(snp_indices)), dtype=np.float32)
#         pgr.read(M)

#         if benchmark:
#             geno_time = time.time() - start_time
#             start_time = time.time()

#         # Check if sample sizes match
#         if M.shape[0] != y.shape[0]:
#             print("Mismatch in sample sizes after subsetting. Skipping.")
#             continue

#         # Standardize genotypes
#         n_samples, n_snps = M.shape
#         mu = np.mean(M, axis=0)
#         std = np.std(M, axis=0, ddof=1)
#         std[std == 0] = 1  # Avoid division by zero
#         S = (M - mu) / std
#         S = np.nan_to_num(S)
#         K = np.dot(S, S.T) / n_snps  # Kinship matrix

#         if benchmark:
#             kinship_time = time.time() - start_time
#             start_time = time.time()

#         # Estimate heritability using limix
#         try:
#             # Construct the linear mixed model
#             y_centered = y - np.mean(y)
#             covar = np.ones((n_samples, 1))  # Intercept

#             # Use limix LinearMixedModel
#             from limix.qtl import scan

#             # Reshape y and covariates
#             y_centered = y_centered.reshape(-1, 1)
#             covar = covar.reshape(-1, 1)

#             # Perform GWAS scan (single-variant association test) to estimate variance components
#             # Here, we can use the variance components from the null model
#             result = scan(y=y_centered, M=None, K=K, X=covar, verbose=False)

#             # Extract variance components
#             sigma_g2 = result.variance_components['V(K)'][0]
#             sigma_e2 = result.variance_components['V(I)'][0]
#             h2 = sigma_g2 / (sigma_g2 + sigma_e2)

#             if benchmark:
#                 herit_time = time.time() - start_time
#                 total_time = time.time() - start_time_total
#         except Exception as e:
#             print(f"Heritability estimation failed: {e}")
#             continue

#         # Collect results
#         temp2 = {
#             'V_G': sigma_g2,
#             'V_e': sigma_e2,
#             'V_G_Vp': h2,
#             'n': n_samples,
#             'site': f"chr{chr_number}_{CpG_position}",
#             'wind': w
#         }
#         results.append(temp2)

#         # Collect timing data if benchmarking
#         if benchmark:
#             key = f"chr{chr_number}_pos{CpG_position}_wind{w}"
#             time_measurements[key] = {
#                 'geno_time': geno_time,
#                 'kinship_time': kinship_time,
#                 'herit_time': herit_time,
#                 'total_time': total_time
#             }

#         print(f"Completed CpG site {idx}, window {w}")

#     print("\n")

# # Convert results to a DataFrame and save
# results_df = pd.DataFrame(results)
# results_df.to_csv("heritability_results.csv", index=False)

# # Save timing measurements if benchmarking
# if benchmark:
#     timing_df = pd.DataFrame.from_dict(time_measurements, orient='index')
#     timing_df.reset_index(inplace=True)
#     timing_df.rename(columns={'index': 'ID'}, inplace=True)
#     timing_df.to_csv("timing_measurements.csv", index=False)

In [None]:
?PgenReader

## Correct structure of methylation df!!!

In [None]:
timing_df