# Import libraries

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import importlib.util
import sys
from pathlib import Path
import re
import joblib
from pathlib import Path

# Load data

In [2]:
from scripts.parse_vars import parse_variables, load_config

In [3]:
# Input locations, given relative to the current working directory.
PATH_VARS = Path("../../geno_simulation.txt")
R_DIRECTORY = Path("../../rstudio_geno_simulation")

# Random seed for reproducibility of the flip step
RNG_SEED = 42
rng = np.random.default_rng(RNG_SEED)

# Simulation parameters come from the shared variables file; each one is cast
# explicitly so downstream f-strings format them consistently.
cfg = load_config(PATH_VARS)
G = int(cfg["G"])
L = int(cfg["L"])
c = int(cfg["c"])
k = int(cfg["k"])
M = float(cfg["M"])


In [4]:
# Build prefix pattern for filtering
prefix = f"G{G}_L{L}_c{c}_k{k}_M{M}"

# The prefix must not be followed by another digit; a bare startswith() would
# also accept a different parameter set such as "..._M0.15" when M is 0.1.
prefix_re = re.compile(re.escape(prefix) + r"(?!\d)")

# Filter only relevant directories.
# - os.listdir() returns entries in arbitrary order, so sort for a reproducible
#   ordering of F_values / mixed_cases across runs and machines.
# - Skip plain files (archives, logs, ...) that happen to share the prefix.
filtered = sorted(
    d
    for d in os.listdir("simulation_data")
    if prefix_re.match(d) and os.path.isdir(os.path.join("simulation_data", d))
)

# Extract F values and mixed cases
F_values = []
mixed_cases = []

for d in filtered:
    # Check if F is present. The pattern captures exactly one decimal number,
    # so a trailing "." or a second dotted group can no longer reach float()
    # and raise ValueError.
    match_F = re.search(r"F(\d*\.?\d+)", d)
    if match_F:
        F_values.append(float(match_F.group(1)))

    # Check if mixed is present; everything from "mixed_" to the end of the
    # directory name is kept as the case label.
    match_mixed = re.search(r"(mixed_[^/]+)", d)
    if match_mixed:
        mixed_cases.append(match_mixed.group(1))

In [5]:
def compute_and_save_pcs(
    X: np.ndarray,
    outdir: Path,
    basename: str,
    pc_counts=(1,5,15,35,40,50,100),
    n_max=None,
    randomized=True,
    float_dtype=np.float32,
    seed=42
):
    """Standardize ``X``, fit a single PCA, and save score subsets to ``outdir``.

    The PCA is fitted once with ``n_max`` components; every requested subset is
    then a column slice of the same score matrix, so no refitting happens.

    Files written into ``outdir`` (created if missing):
      * ``{basename}_pca_model.joblib`` - dict with the fitted ``scaler`` and ``pca``
      * ``{basename}_explained_variance.pkl`` - explained variance ratio per PC
      * ``{basename}_{k}_PCs.pkl`` - first ``k`` score columns, for each unique
        ``k`` in ``pc_counts``

    Parameters
    ----------
    X : array-like
        Input matrix; it is cast to ``float_dtype`` before scaling.
    outdir : Path
        Destination directory.
    basename : str
        Prefix used for all output file names.
    pc_counts : iterable of int
        Numbers of leading components to export. Duplicates are ignored.
    n_max : int, optional
        Number of components to fit. When falsy (``None`` or ``0``) the largest
        value in ``pc_counts`` is used.
    randomized : bool
        Use the ``"randomized"`` SVD solver when true, ``"full"`` otherwise.
    float_dtype : dtype
        Dtype ``X`` is cast to before standardization.
    seed : int
        Passed to PCA as ``random_state``.

    Returns
    -------
    tuple
        ``(scores, pca, scaler)`` where ``scores`` holds ``n_max`` columns.

    Raises
    ------
    ValueError
        If ``pc_counts`` is empty while ``n_max`` is not given, if any requested
        count is smaller than 1, or if a requested count exceeds ``n_max``.
    """
    requested = sorted(set(pc_counts))
    if not n_max:
        if not requested:
            raise ValueError("pc_counts is empty and n_max was not provided")
        n_max = requested[-1]

    # Validate up front: previously a bad request only failed inside the write
    # loop (with a pandas shape error), after the model and the smaller subsets
    # had already been written to disk.
    if requested and requested[0] < 1:
        raise ValueError(
            f"pc_counts must contain positive integers, got {requested[0]}"
        )
    if requested and requested[-1] > n_max:
        raise ValueError(
            f"pc_counts requests {requested[-1]} components but n_max is {n_max}"
        )

    outdir.mkdir(parents=True, exist_ok=True)

    # 1) Standardize once
    scaler = StandardScaler(with_mean=True, with_std=True)
    X_std = scaler.fit_transform(X.astype(float_dtype, copy=False))

    # 2) Fit PCA once up to the largest k
    pca = PCA(
        n_components=n_max,
        svd_solver="randomized" if randomized else "full",
        random_state=seed
    )
    scores = pca.fit_transform(X_std)        # shape: (n_samples, n_max)

    # 3) Persist model bits for reuse/repro
    joblib.dump({"scaler": scaler, "pca": pca}, outdir / f"{basename}_pca_model.joblib")
    pd.DataFrame({
        "pc": np.arange(1, n_max+1),
        "explained_variance_ratio": pca.explained_variance_ratio_
    }).to_pickle(outdir / f"{basename}_explained_variance.pkl")

    # 4) Write all requested subsets without refitting
    for k in requested:
        df = pd.DataFrame(scores[:, :k], columns=[f"PC{i}" for i in range(1, k+1)])
        df.to_pickle(outdir / f"{basename}_{k}_PCs.pkl")

    return scores, pca, scaler


In [8]:
# Numbers of leading principal components to export.
pc_range = [
    1, 5, 15, 35, 40, 50, 60,
    100, 110, 130, 150, 170, 200,
]
# The PCA only needs to be fitted up to the largest requested count.
n_max = max(pc_range)


In [9]:
# Fit and store PCs for the genotype matrix of every mixed-case simulation.
for mixed_case in mixed_cases:
    # Input and output locations for this case.
    base_path = f"simulation_data/G{G}_L{L}_c{c}_k{k}_M{M}_{mixed_case}"
    path_geno = f"{base_path}/genotype"
    base = Path(f"{base_path}/PCs")

    geno = pd.read_pickle(f"{path_geno}/complete.pkl")

    # One PCA fit with n_max components; each count in pc_range is saved as a slice.
    compute_and_save_pcs(
        geno,
        base,
        "geno",
        pc_counts=pc_range,
        n_max=n_max,
    )