# Import libraries

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # 3D plotting toolkit
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import scipy.stats as stats
import importlib.util
import sys
import gc
import re

2025-08-20 05:47:31.067966: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-20 05:47:31.068485: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-20 05:47:31.070676: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-20 05:47:31.075776: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755661651.083953    1653 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755661651.08

# Load data

In [2]:
from scripts.parse_vars import parse_variables
path_vars = f"../../geno_simulation.txt"
R_directory = f"../../rstudio_geno_simulation"
R_file = f"create_geno.R"

variables = parse_variables(path_vars)
if 'G' not in globals():
    G = int(variables['G'])
if 'L' not in globals():
    L = int(variables['L'])
if 'c' not in globals():
    c = int(variables['c'])
if 'k' not in globals():
    k = int(variables['k'])
if 'M' not in globals():
    M = float(variables['M'])
if 'HWE' not in globals():
    HWE = int(variables['HWE'])

if 'tools' not in globals():
    tools = ['PCA', 'abyss_counted', 'abyss', 'no_corr']


if 'scenarios' not in globals():
    scenarios = ['snp_effect',
                 'linear_continuous',
                 'non_linear_continuous',
                 'discrete_global',
                 'discrete_localized',
                 'mix_linear_continuous',
                 'mix_non_linear_continuous',
                 'mix_discrete_global',
                 'mix_discrete_localized']

if 'very_rare_threshold_L' not in globals():
    very_rare_threshold_L = float(variables['very_rare_threshold_L'])
if 'very_rare_threshold_H' not in globals():
    very_rare_threshold_H = float(variables['very_rare_threshold_H'])
if 'rare_threshold_L' not in globals():
    rare_threshold_L = float(variables['rare_threshold_L'])
if 'rare_threshold_H' not in globals():
    rare_threshold_H = float(variables['rare_threshold_H'])
if 'common_threshold_L' not in globals():
    common_threshold_L = float(variables['common_threshold_L'])
if 'common_threshold_H' not in globals():
    common_threshold_H = float(variables['common_threshold_H'])
if 'F' not in globals():
    F = float(variables['F'])

path_geno = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/genotype"
geno = pd.read_pickle((f"{path_geno}/complete_inbred.pkl"))

In [47]:
path_geno = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/genotype"
geno = pd.read_pickle((f"{path_geno}/complete_inbred.pkl"))

In [4]:
nrs_pcs = list(set([int(f.split("_")[1]) for f in os.listdir(f"simulation_data//G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/PCs")]))

In [5]:
nrs_pcs

[1, 35, 100, 5, 40, 15, 50]

In [6]:
os.makedirs(f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/maf_reconstruct/",exist_ok = True)

In [7]:
geno

Unnamed: 0,C_1_MAF_0.500,C_2_MAF_0.500,C_3_MAF_0.500,C_4_MAF_0.500,C_5_MAF_0.500,C_6_MAF_0.500,C_7_MAF_0.500,C_8_MAF_0.499,C_9_MAF_0.499,C_10_MAF_0.499,...,VR_39991_MAF_0.042,VR_39992_MAF_0.042,VR_39993_MAF_0.041,VR_39994_MAF_0.041,VR_39995_MAF_0.041,VR_39996_MAF_0.041,VR_39997_MAF_0.040,VR_39998_MAF_0.040,VR_39999_MAF_0.039,VR_40000_MAF_0.038
0,-1,1,1,-1,0,-1,-1,-1,-1,-1,...,1,1,1,1,1,0,1,1,1,1
1,-1,1,1,-1,1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,1,1
2,0,1,1,-1,0,-1,-1,-1,-1,-1,...,1,1,0,1,1,1,1,1,1,1
3,-1,1,1,-1,1,-1,-1,-1,0,-1,...,1,1,1,1,0,0,1,1,1,1
4,-1,1,1,-1,1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,-1,-1,1,-1,1,1,1,1,1,...,1,1,1,1,0,1,0,1,1,0
1996,1,-1,-1,1,-1,1,1,1,1,1,...,1,0,1,1,1,0,1,1,1,1
1997,1,-1,-1,0,-1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
1998,1,-1,-1,1,-1,1,1,1,1,1,...,1,1,0,1,1,1,1,0,1,1


In [8]:
def column_chunks(df: pd.DataFrame,
                  max_cols: int = 4000,
                  seed: int = 42,
                  return_numpy: bool = False,
                  dtype=np.int8):
    """
    Yields randomized column chunks of size <= max_cols.
    Casting to int8 keeps genotype encodings (-1,0,1) compact.
    """
    cols = df.columns.to_numpy()
    rng = np.random.default_rng(seed)
    perm = rng.permutation(cols)  # randomized column order
    
    for start in range(0, len(perm), max_cols):
        block_cols = perm[start:start + max_cols]
        chunk_df = df.loc[:, block_cols]
        if dtype is not None:
            # downcast without copying if already int8-compatible
            chunk_df = chunk_df.astype(dtype, copy=False)
        yield chunk_df.to_numpy() if return_numpy else chunk_df

# --- Example usage ---
# geno is your (2000 x 40000) DataFrame

# 1) Get a list of 10 DataFrame chunks (each ~4000 cols)
chunks_df = list(column_chunks(geno, max_cols=4000, seed=123, return_numpy=False))


In [9]:


# --------- small helpers ---------

def make_projector(pcs_df: pd.DataFrame,
                   add_intercept: bool = True,
                   ridge_alpha: float | None = 1e-6,
                   dtype=np.float32):
    """Precompute A_aug and M = (A' A + αI)^(-1) A' once per PC set."""
    A = pcs_df.astype(dtype, copy=False).values
    n, k = A.shape
    if add_intercept:
        A_aug = np.concatenate([A, np.ones((n, 1), dtype=dtype)], axis=1)
    else:
        A_aug = A

    AtA = A_aug.T @ A_aug
    if ridge_alpha is not None and ridge_alpha > 0:
        AtA = AtA + ridge_alpha * np.eye(AtA.shape[0], dtype=AtA.dtype)

    # pinv is robust for tiny k
    M = np.linalg.pinv(AtA) @ A_aug.T        # (k+q, n)
    return A_aug, M

def apply_projector(A_aug, M, target_df: pd.DataFrame,
                    block_cols: int = 20000,
                    dtype=np.float32) -> pd.DataFrame:
    """Stream target_df through (A_aug, M) in column blocks."""
    cols = target_df.columns
    out_blocks = []
    for start in range(0, len(cols), block_cols):
        end = start + block_cols
        blk_cols = cols[start:end]
        B_blk = target_df[blk_cols].astype(dtype, copy=False).values   # (n, b)
        T = M @ B_blk                                                  # (k+q, b)
        Bhat_blk = A_aug @ T                                           # (n, b)
        out_blocks.append(pd.DataFrame(Bhat_blk, index=target_df.index, columns=blk_cols))
    return pd.concat(out_blocks, axis=1)

# --------- your loop, swapped to OLS ---------

for i, geno in enumerate(chunks_df):
    # build targets once per chunk
    minor = (geno == -1).astype(np.float32)
    het   = (geno ==  0).astype(np.float32)
    major = (geno ==  1).astype(np.float32)
    print(i)

    for nr_pcs in nrs_pcs:
        pcs_path = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/PCs/geno_{nr_pcs}_PCs.pkl"
        geno_pcs = pd.read_pickle(pcs_path)

        # make sure rows align (important if indexing differs)
        if not geno_pcs.index.equals(geno.index):
            geno_pcs = geno_pcs.loc[geno.index]

        # Precompute projector once and reuse for all targets
        A_aug, M_1 = make_projector(geno_pcs, add_intercept=True, ridge_alpha=1e-6, dtype=np.float32)

        # Reconstruct each target in blocks (no massive matrices in memory)
        geno_reconstruct  = apply_projector(A_aug, M_1, geno,  block_cols=20000)
        min_reconstruct   = apply_projector(A_aug, M_1, minor, block_cols=20000)
        het_reconstruct   = apply_projector(A_aug, M_1, het,   block_cols=20000)
        major_reconstruct = apply_projector(A_aug, M_1, major, block_cols=20000)

        # Save
        outdir = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/maf_reconstruct"
        os.makedirs(outdir, exist_ok=True)
        geno_reconstruct.to_pickle (f"{outdir}/chunk_{i}_geno_reconstruct_{nr_pcs}_PCs.pkl")
        min_reconstruct.to_pickle  (f"{outdir}/chunk_{i}_minor_reconstruct_{nr_pcs}_PCs.pkl")
        het_reconstruct.to_pickle  (f"{outdir}/chunk_{i}_het_reconstruct_{nr_pcs}_PCs.pkl")
        major_reconstruct.to_pickle(f"{outdir}/chunk_{i}_major_reconstruct_{nr_pcs}_PCs.pkl")

        # Light cleanup
        del A_aug, M_1, geno_reconstruct, min_reconstruct, het_reconstruct, major_reconstruct
        gc.collect()


0
1
2
3
4
5
6
7
8
9


In [11]:
min_reconstruct = pd.read_pickle(f"{outdir}/chunk_{i}_minor_reconstruct_{nr_pcs}_PCs.pkl")
het_reconstruct = pd.read_pickle(f"{outdir}/chunk_{i}_het_reconstruct_{nr_pcs}_PCs.pkl")
major_reconstruct = pd.read_pickle(f"{outdir}/chunk_{i}_major_reconstruct_{nr_pcs}_PCs.pkl")

In [51]:
path_geno = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/genotype"
geno = pd.read_pickle((f"{path_geno}/complete_inbred.pkl"))
pattern1 = re.compile(r"_reconstruct_(\d+)_PCs")
pattern2 = re.compile(r"chunk_(\d+)_")
for nr_pcs in nrs_pcs:
    list_of_chunks = [f for f in os.listdir(outdir) if (m := pattern1.search(f)) and m.group(1) == str(nr_pcs)]
    chunk_indexes = sorted({int(pattern2.search(f).group(1)) for f in list_of_chunks})
    minors = []
    hets = []
    majors = []
    genos_rec = []
    for i in chunk_indexes:
        minor = pd.read_pickle(f"{outdir}/chunk_{i}_minor_reconstruct_{nr_pcs}_PCs.pkl")
        het = pd.read_pickle(f"{outdir}/chunk_{i}_het_reconstruct_{nr_pcs}_PCs.pkl")
        major = pd.read_pickle(f"{outdir}/chunk_{i}_major_reconstruct_{nr_pcs}_PCs.pkl")
        geno_rec = pd.read_pickle(f"{outdir}/chunk_{i}_geno_reconstruct_{nr_pcs}_PCs.pkl")
        minors.append(minor)
        hets.append(het)
        majors.append(major)
        genos_rec.append(geno_rec)
    
    majors_df = pd.concat(majors, axis=1)
    majors_df = majors_df.reindex(columns=geno.columns)
    majors_df.to_pickle(f"{outdir}/major_reconstruct_{nr_pcs}_PCs.pkl")

    minors_df = pd.concat(minors, axis=1)
    minors_df = minors_df.reindex(columns=geno.columns)
    minors_df.to_pickle(f"{outdir}/minor_reconstruct_{nr_pcs}_PCs.pkl")

    hets_df = pd.concat(hets, axis=1)
    hets_df = hets_df.reindex(columns=geno.columns)
    hets_df.to_pickle(f"{outdir}/het_reconstruct_{nr_pcs}_PCs.pkl")

    genos_df = pd.concat(genos_rec, axis=1)
    genos_df = genos_df.reindex(columns=geno.columns)
    genos_df.to_pickle(f"{outdir}/geno_reconstruct_{nr_pcs}_PCs.pkl")

    for i in chunk_indexes:
        os.remove(f"{outdir}/chunk_{i}_minor_reconstruct_{nr_pcs}_PCs.pkl")
        os.remove(f"{outdir}/chunk_{i}_het_reconstruct_{nr_pcs}_PCs.pkl")
        os.remove(f"{outdir}/chunk_{i}_major_reconstruct_{nr_pcs}_PCs.pkl")
        os.remove(f"{outdir}/chunk_{i}_geno_reconstruct_{nr_pcs}_PCs.pkl")
    