# Import libraries

In [9]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # 3D plotting toolkit
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import scipy.stats as stats
import importlib.util
import sys

# Load data

In [10]:
from scripts.parse_vars import parse_variables

# Locations of the simulation parameter file and of the R genotype generator.
path_vars = "../../geno_simulation.txt"
R_directory = "../../rstudio_geno_simulation"
R_file = "create_geno.R"

variables = parse_variables(path_vars)


def _fill_missing(casts):
    """Bind each name in `casts` as a global, converted from `variables`.

    A name that already exists in the global namespace is left untouched
    (presumably so that a driver can pre-set it before this cell runs).
    """
    for name, cast in casts.items():
        if name not in globals():
            globals()[name] = cast(variables[name])


# Core simulation parameters read from the parameter file.
_fill_missing({"G": int, "L": int, "c": int, "k": int, "M": float, "HWE": int})

# Literal defaults, again only applied when nothing was set beforehand.
if 'tools' not in globals():
    tools = ['PCA', 'abyss_counted', 'abyss', 'no_corr']

if 'scenarios' not in globals():
    scenarios = [
        'snp_effect',
        'linear_continuous',
        'non_linear_continuous',
        'discrete_global',
        'discrete_localized',
        'mix_linear_continuous',
        'mix_non_linear_continuous',
        'mix_discrete_global',
        'mix_discrete_localized',
    ]

# Frequency-class thresholds and F, all stored as floats.
_fill_missing({
    "very_rare_threshold_L": float,
    "very_rare_threshold_H": float,
    "rare_threshold_L": float,
    "rare_threshold_H": float,
    "common_threshold_L": float,
    "common_threshold_H": float,
    "F": float,
})
del _fill_missing  # helper is only needed inside this cell

# Genotype table of the simulation run identified by the parameters above.
path_geno = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/genotype"
geno = pd.read_pickle(f"{path_geno}/complete_inbred.pkl")

In [11]:
# Number of PCs used to pick the PC file for the single trial fit below;
# the sweep further down reassigns this name for every available PC count.
nr_pcs = 15

In [12]:
# Same genotype pickle as loaded in the parameter cell; re-reading it here
# makes this cell usable on its own after the parameters are set.
path_geno = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/genotype"
geno = pd.read_pickle(f"{path_geno}/complete_inbred.pkl")

In [13]:
# Load the pickled PC table stored for `nr_pcs` components of this simulation run
# (presumably one row per individual, matching the rows of `geno` -- TODO confirm).
geno_pcs = pd.read_pickle(f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/PCs/geno_{nr_pcs}_PCs.pkl")

In [14]:
# Create the three new DataFrames
minor = (geno == -1).astype(int)
het = (geno == 0).astype(int)
major = (geno == 1).astype(int)

In [15]:
def train_and_reconstruct(df1, df2, learning_rate, epochs,
                          batch_size=64, patience=10, test_size=0.1,
                          random_state=42, seed=None):
    """Fit a linear map (with bias) from ``df1`` to ``df2`` and return the fitted values.

    The model is a single ``Dense`` layer with linear activation, trained with
    Adam on mean squared error. A held-out split is used only to drive early
    stopping; the returned predictions cover every row of ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Predictors. Rows are paired with the rows of ``df2`` by position.
    df2 : pandas.DataFrame
        Targets. Its index and columns label the returned frame.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    epochs : int
        Upper bound on training epochs; early stopping may end training sooner.
    batch_size : int, default 64
        Mini-batch size used by ``model.fit``.
    patience : int, default 10
        Epochs without ``val_loss`` improvement before training stops.
    test_size : float, default 0.1
        Fraction of rows held out for validation.
    random_state : int, default 42
        Seed of the train/validation split.
    seed : int or None, default None
        When given, passed to ``tf.keras.utils.set_random_seed`` so that weight
        initialisation and batch shuffling are repeatable. ``None`` leaves the
        global random state untouched.

    Returns
    -------
    pandas.DataFrame
        Model predictions for all rows of ``df1``, indexed and labelled like ``df2``.

    Raises
    ------
    ValueError
        If ``df1`` and ``df2`` do not have the same number of rows.
    """
    if len(df1) != len(df2):
        raise ValueError(
            f"df1 and df2 must have the same number of rows, got {len(df1)} and {len(df2)}"
        )

    # Keras keeps global state for every model it builds; clearing it keeps
    # memory from growing when this function is called repeatedly.
    tf.keras.backend.clear_session()
    if seed is not None:
        tf.keras.utils.set_random_seed(seed)

    # Safety: ensure float32 for TF
    X = df1.astype("float32").values
    y = df2.astype("float32").values
    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    n_out = y.shape[1]
    inputs = tf.keras.Input(shape=(X.shape[1],), dtype=tf.float32)
    outputs = tf.keras.layers.Dense(n_out, activation="linear", use_bias=True)(inputs)
    model = tf.keras.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mse",
        metrics=["mae"],
    )

    # Stop once val_loss stalls and roll back to the best weights seen.
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=patience, restore_best_weights=True
    )

    model.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=epochs,           # will stop early if val_loss stops improving
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=0,
    )

    # Reconstruction: pass every row of df1 (train and validation) through the model.
    y_hat = model.predict(X, verbose=0)

    # Label the predictions like df2 so both frames can be compared directly.
    return pd.DataFrame(y_hat, index=df2.index, columns=df2.columns)

In [16]:
# Trial run: fit geno_pcs -> geno once with the current `nr_pcs` before the full sweep.
test = train_and_reconstruct(geno_pcs, geno, learning_rate=1e-3, epochs=500)

E0000 00:00:1755167156.735387    1701 cuda_executor.cc:1228] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1755167156.735984    1701 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [18]:
# Collect the PC counts for which a `geno_<n>_PCs.pkl` file exists.
# Only file names of exactly that form are parsed, so stray files in the
# directory cannot crash int() or add a count that has no matching pickle;
# sorting makes the sweep order deterministic.
nrs_pcs = sorted({
    int(parts[1])
    for parts in (
        fname.split("_")
        for fname in os.listdir(f"simulation_data//G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/PCs")
    )
    if len(parts) == 3
    and parts[0] == "geno"
    and parts[1].isdigit()
    and parts[2] == "PCs.pkl"
})

In [19]:
# Display the PC counts that were found on disk.
nrs_pcs

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [20]:
# Make sure the output folder for the reconstructions exists
# (exist_ok=True: no error when it is already there).
os.makedirs(
    f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/maf_reconstruct/",
    exist_ok=True,
)

In [21]:
# The minor/het/major indicator tables do not depend on the number of PCs,
# so their reconstructions are fitted once here instead of once per PC count.
# They are still saved under every per-PC file name below, so the set of
# output files is unchanged. Skipped entirely when no PC files were found.
if nrs_pcs:
    min_reconstruct = train_and_reconstruct(minor, geno, learning_rate=1e-3, epochs=500)
    het_reconstruct = train_and_reconstruct(het, geno, learning_rate=1e-3, epochs=500)
    major_reconstruct = train_and_reconstruct(major, geno, learning_rate=1e-3, epochs=500)

for nr_pcs in nrs_pcs:
    print(nr_pcs)
    # Only the PC-based reconstruction changes with the number of PCs.
    geno_pcs = pd.read_pickle(f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/PCs/geno_{nr_pcs}_PCs.pkl")
    geno_reconstruct = train_and_reconstruct(geno_pcs, geno, learning_rate=1e-3, epochs=500)
    geno_reconstruct.to_pickle(f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/maf_reconstruct/geno_reconstruct_{nr_pcs}_PCs.pkl")
    min_reconstruct.to_pickle(f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/maf_reconstruct/minor_reconstruct_{nr_pcs}_PCs.pkl")
    het_reconstruct.to_pickle(f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/maf_reconstruct/het_reconstruct_{nr_pcs}_PCs.pkl")
    major_reconstruct.to_pickle(f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/maf_reconstruct/major_reconstruct_{nr_pcs}_PCs.pkl")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
