# CpG set exploration

In [6]:
import joblib
import pickle
import pandas as pd
import numpy as np
import os

In [7]:
# Work from the project directory so that the relative "./data/..." and
# "./output/..." paths used in this notebook resolve correctly.
os.chdir(os.path.expanduser("~/PhD_Workspace/PredictRecurrence/"))


In [8]:
def load_training_data(train_ids, beta_path, clinical_path):
    """Read the clinical table and beta-value matrix, restricted to ``train_ids``.

    Args:
        train_ids: Sample identifiers to select. They are used as row labels
            on both tables, so the returned frames follow this order.
        beta_path: CSV of beta values. Its first column is used as the index
            and the table is transposed after reading, so the file's column
            labels are what get matched against ``train_ids``.
        clinical_path: CSV of clinical data containing a ``Sample`` column,
            which becomes the index.

    Returns:
        A ``(beta_matrix, clinical_data)`` tuple, both row-indexed by
        ``train_ids``.

    Note:
        Selection is label-based (``.loc``); how ids missing from either
        table are handled is left to pandas.
    """
    # Clinical table: key rows by the "Sample" column, then keep the cohort.
    clinical_data = pd.read_csv(clinical_path).set_index("Sample").loc[train_ids]

    # Beta values: transpose so the file's column labels become row labels,
    # then align the rows with the clinical table via the same id list.
    beta_matrix = pd.read_csv(beta_path, index_col=0).T.loc[train_ids]

    print("Loaded training data.")
    return beta_matrix, clinical_data


def beta2m(beta, beta_threshold=1e-3):
    """Convert beta values to M-values, ``log2(beta / (1 - beta))``.

    Args:
        beta: Beta value(s); anything ``np.clip`` accepts (scalar, array, ...).
        beta_threshold: Values are clipped to
            ``[beta_threshold, 1 - beta_threshold]`` before the transform so
            that inputs of exactly 0 or 1 do not produce infinite results.

    Returns:
        The M-value(s) for the clipped input.
    """
    lower = beta_threshold
    upper = 1 - beta_threshold
    bounded = np.clip(beta, lower, upper)
    odds = bounded / (1 - bounded)
    return np.log2(odds)


In [9]:
# Input locations, all relative to the current working directory.

# Serialized cross-validation fold dictionary (read by load_fold_dict).
fold_dict_path = "./output/CoxNet/ERpHER2n/Methylation/Unadjusted/cvfold_ids.pkl"
# Probe annotation table (read with pd.read_csv).
anno_path = "./data/raw/EPIC_probeAnnoObj.csv"
# Header-less CSV whose first column holds the training sample ids.
infile_train_ids = "./data/train/train_subcohorts/ERpHER2n_train_ids.csv"
# Beta-value matrix; transposed on load so sample ids end up as row labels.
infile_betavalues = "./data/train/train_methylation_unadjusted.csv"
# Clinical table; must contain a "Sample" column (used as the index on load).
infile_clinical = "./data/train/train_clinical.csv"

In [None]:
# Read the training cohort: sample ids come from the first column of a
# header-less CSV, then both tables are subset to those ids.
train_ids = pd.read_csv(infile_train_ids, header=None).iloc[:, 0].tolist()
beta_matrix, clinical_data = load_training_data(
    train_ids,
    beta_path=infile_betavalues,
    clinical_path=infile_clinical,
)

# Transform beta values to M-values (betas clipped to [0.001, 0.999] first).
mvals = beta2m(beta_matrix, beta_threshold=1e-3)

In [None]:

# -----------------
# Load fold dict (joblib or pickle)
# -----------------
def load_fold_dict(path):
    """Load the serialized fold dictionary stored at ``path``.

    ``joblib.load`` is tried first; if it raises any ``Exception`` the file
    is read again with the standard ``pickle`` module instead.

    Args:
        path: Location of the serialized object.

    Returns:
        The deserialized object.

    Note:
        Both loaders unpickle the file, which can execute arbitrary code.
        Only use this on files from a trusted source.
    """
    try:
        folds = joblib.load(path)
    except Exception:
        # Deliberate best-effort fallback: whatever went wrong with joblib,
        # retry as a plain pickle. A failure here propagates to the caller.
        with open(path, "rb") as handle:
            folds = pickle.load(handle)
    return folds

# Load the fold dictionary and print its keys as a quick sanity check.
fold_dict = load_fold_dict(fold_dict_path)
print("Loaded fold_dict with keys:", list(fold_dict.keys()))


In [None]:

# -----------------
# Load annotation file
# -----------------
anno = pd.read_csv(anno_path)

# Promote the leading column to the index when at least one of its values
# (in string form) starts with "cg", i.e. looks like a CpG identifier.
first_col = anno.columns[0]
if anno[first_col].astype(str).str.startswith("cg").any():
    anno = anno.set_index(first_col)
    print(f"Annotation index set to column '{first_col}'.")

print("Annotation shape:", anno.shape)

# Done — fold_dict and anno are now loaded and ready.
