# Import libraries

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import importlib.util
import sys

# Load data

In [3]:
from scripts.parse_vars import parse_variables

# Locations of the simulation parameter file and of the R generator script.
# These are plain literals (nothing is interpolated), so no f-prefix is needed.
path_vars = "../../geno_simulation.txt"
R_directory = "../../rstudio_geno_simulation"
R_file = "create_geno.R"

variables = parse_variables(path_vars)


def _param(name, cast):
    """Return simulation parameter `name`.

    A name that already exists in the notebook namespace is kept as-is
    (no conversion is applied to it); otherwise the entry `variables[name]`
    is converted with `cast`. A missing entry raises KeyError.
    """
    namespace = globals()
    if name in namespace:
        return namespace[name]
    return cast(variables[name])


# Simulation parameters read from the parameter file unless already defined.
G = _param('G', int)
L = _param('L', int)
c = _param('c', int)
k = _param('k', int)
M = _param('M', float)
HWE = _param('HWE', int)

# Default tool and scenario lists; kept only if not already defined.
if 'tools' not in globals():
    tools = ['PCA', 'abyss_counted', 'abyss', 'no_corr']

if 'scenarios' not in globals():
    scenarios = ['snp_effect',
                 'linear_continuous',
                 'non_linear_continuous',
                 'discrete_global',
                 'discrete_localized',
                 'mix_linear_continuous',
                 'mix_non_linear_continuous',
                 'mix_discrete_global',
                 'mix_discrete_localized']

# Threshold parameters (presumably lower/upper bounds of each allele-frequency
# class, judging by the _L/_H suffixes -- confirm against the parameter file).
very_rare_threshold_L = _param('very_rare_threshold_L', float)
very_rare_threshold_H = _param('very_rare_threshold_H', float)
rare_threshold_L = _param('rare_threshold_L', float)
rare_threshold_H = _param('rare_threshold_H', float)
common_threshold_L = _param('common_threshold_L', float)
common_threshold_H = _param('common_threshold_H', float)
F = _param('F', float)

# Genotype folder of the simulation identified by the parameters above.
path_geno = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/genotype"
# NOTE(security): unpickling can execute arbitrary code; only load pickles
# that were produced by this project's own pipeline.
geno = pd.read_pickle(f"{path_geno}/complete_inbred.pkl")

In [4]:
# Display the loaded genotype DataFrame (bare last expression -> rich notebook repr).
geno

Unnamed: 0,C_1_MAF_0.500,C_2_MAF_0.500,C_3_MAF_0.500,C_4_MAF_0.500,C_5_MAF_0.500,C_6_MAF_0.500,C_7_MAF_0.499,C_8_MAF_0.499,C_9_MAF_0.499,C_10_MAF_0.499,...,VR_3991_MAF_0.045,VR_3992_MAF_0.044,VR_3993_MAF_0.044,VR_3994_MAF_0.044,VR_3995_MAF_0.044,VR_3996_MAF_0.043,VR_3997_MAF_0.043,VR_3998_MAF_0.042,VR_3999_MAF_0.042,VR_4000_MAF_0.042
0,1,1,-1,1,-1,-1,1,1,-1,-1,...,1,1,1,1,1,1,0,1,1,1
1,1,1,-1,1,0,0,1,1,-1,-1,...,1,1,1,1,1,1,1,1,0,1
2,1,1,-1,1,-1,-1,1,1,-1,0,...,1,1,1,1,1,1,1,1,1,1
3,1,1,-1,1,-1,-1,1,1,0,-1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,0,1,-1,-1,1,1,-1,-1,...,1,1,1,1,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-1,1,1,0,1,-1,-1,1,0,0,...,1,1,1,1,0,1,1,1,1,1
1996,-1,1,1,-1,1,-1,-1,1,1,-1,...,1,1,1,0,1,1,1,1,1,1
1997,-1,1,1,-1,1,1,-1,1,1,-1,...,1,1,1,0,1,1,1,1,1,1
1998,-1,0,1,-1,1,-1,-1,1,1,-1,...,1,1,0,1,0,1,1,1,1,0


In [6]:
# Number of principal components kept for the saved PC scores and for the
# reconstructions computed from them.
nr_pcs = 5

In [7]:
# Split `geno` into one 0/1 indicator DataFrame per genotype code
# (-1, 0, 1; presumably minor-homozygous, heterozygous and major-homozygous
# as the variable names suggest -- confirm against the simulation encoding).
minor = geno.eq(-1).astype(int)
het = geno.eq(0).astype(int)
major = geno.eq(1).astype(int)

In [None]:
# Make sure both output folders of this simulation exist before anything
# is written into them (no error if they are already there).
for _subdir in ("PCs", "maf_reconstruct"):
    os.makedirs(f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}/{_subdir}/", exist_ok=True)

# Notebook-level StandardScaler instance.
scaler = StandardScaler()


In [25]:
def create_pcs_and_reconstruct(df, nr_pcs, name):
    """Run PCA on `df` and save both the PC scores and the low-rank reconstruction.

    Each column of `df` is standardized (zero mean, unit variance), the first
    `nr_pcs` principal components are extracted, and the data are projected
    back to the original scale from those components only.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix to decompose; columns are standardized individually.
    nr_pcs : int
        Number of principal components to keep.
    name : str
        Prefix used in the names of the two output files.

    Returns
    -------
    None
        Results are only written to disk, under the simulation folder built
        from the notebook-level parameters G, L, c, k, M and F:
        ``PCs/{name}_{nr_pcs}_PCs.pkl`` (PC scores) and
        ``maf_reconstruct/{name}_reconstruct_{nr_pcs}_PCs.pkl`` (reconstruction).
    """
    base_dir = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_F{F}"
    pcs_dir = f"{base_dir}/PCs"
    recon_dir = f"{base_dir}/maf_reconstruct"
    # Create the output folders here so the function does not depend on
    # another cell having been executed first.
    os.makedirs(pcs_dir, exist_ok=True)
    os.makedirs(recon_dir, exist_ok=True)

    # Use a scaler private to this call: the fit used for inverse_transform
    # below is guaranteed to be the one made on this very input.
    scaler = StandardScaler()
    X_std = scaler.fit_transform(df.values)

    # Fixed seed: sklearn may pick a randomized SVD solver for large inputs,
    # which would otherwise make the saved PCs differ from run to run.
    pca = PCA(n_components=nr_pcs, random_state=0)
    pcs = pca.fit_transform(X_std)

    # PC scores, one column per component (PC1 ... PC{nr_pcs}).
    pcs_df = pd.DataFrame(pcs, columns=[f"PC{i + 1}" for i in range(nr_pcs)])
    pcs_df.to_pickle(f"{pcs_dir}/{name}_{nr_pcs}_PCs.pkl")

    # Reconstruct the standardized data from the kept components, then undo
    # the standardization to get back to the original scale.
    X_std_recon = pca.inverse_transform(pcs)
    recon = scaler.inverse_transform(X_std_recon)

    # Label the reconstruction with the input's own columns and index rather
    # than those of the global `geno`, so any conforming frame can be passed.
    recon_df = pd.DataFrame(recon, columns=df.columns, index=df.index)
    recon_df.to_pickle(f"{recon_dir}/{name}_reconstruct_{nr_pcs}_PCs.pkl")

In [27]:
# Decompose the full genotype matrix and each of its three indicator
# matrices, in the same order, with the same number of components.
for _label, _matrix in (("geno", geno), ("minor", minor), ("het", het), ("major", major)):
    create_pcs_and_reconstruct(_matrix, nr_pcs, _label)