In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, RobustScaler, normalize, LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from collections import Counter, defaultdict

import matplotlib.pyplot as plt

In [None]:
# CSV table of sample metadata; parse_metadata() reads its `publication`,
# `sample_file_id` and `sample_disease` columns.
METADATA_PATH = '../../raw_data/cristiano_cfdnas/meta_data.csv'
# Value of the `publication` column used to select the rows of interest.
CRISTIANO_PAPER = 'Genome-wide cell-free DNA fragmentation in patients with cancer'

def parse_metadata(file_path, paper):
    """Map sample file ids to disease labels for a single publication.

    Parameters
    ----------
    file_path : str
        Path of a CSV file with `publication`, `sample_file_id` and
        `sample_disease` columns.
    paper : str
        Only rows whose `publication` equals this value are kept.

    Returns
    -------
    dict
        ``{sample_file_id: sample_disease}`` for the selected rows (empty when
        no row matches).
    """
    table = pd.read_csv(file_path)
    selected = table.loc[table["publication"] == paper]
    return {
        sample_id: disease
        for sample_id, disease in zip(selected["sample_file_id"], selected["sample_disease"])
    }

# sample_file_id -> sample_disease for the rows belonging to CRISTIANO_PAPER.
metadata = parse_metadata(METADATA_PATH, CRISTIANO_PAPER)

In [None]:
def plot_pca(matrix, binary_labels, dhs_sites, stat_name):
    """Write per-DHS diagnostic figures for one statistic into PCA_PLOT_DIR.

    For ``stat_name == 'pfe'`` one violin plot of the values is saved per DHS.
    For every other statistic two figures are saved per DHS: a PC1/PC2 scatter
    with one series per label, and a line plot of the PC1 loadings.

    Parameters
    ----------
    matrix : np.ndarray
        Feature matrix with one row per entry of ``dhs_sites`` /
        ``binary_labels`` (expected to be 2-D, as produced by ``np.vstack``).
    binary_labels : sequence
        Label of each row; each distinct value becomes one scatter series.
    dhs_sites : sequence
        DHS name of each row; figures are produced separately for each name.
    stat_name : str
        Statistic name, used in titles and output file names.

    Returns
    -------
    None
        Figures are written as PNG files under the module-level
        ``PCA_PLOT_DIR``.
    """
    site_array = np.asarray(dhs_sites)
    label_array = np.asarray(binary_labels)
    # Sorted so that colours and processing order are the same on every run
    # (plain set iteration order of strings varies between interpreter runs).
    label_order = sorted(set(binary_labels))

    for dhs in sorted(set(dhs_sites)):
        site_mask = site_array == dhs
        filtered_matrix = matrix[site_mask]
        # Labels restricted to the same rows as filtered_matrix. Indexing the
        # PCA output with a mask over *all* rows fails as soon as more than
        # one DHS is present.
        filtered_labels = label_array[site_mask]

        if stat_name == 'pfe':
            # Every row of this subset belongs to the same DHS, so the figure
            # holds a single violin built from all of its values.
            plt.figure(figsize=(8, 6))
            plt.violinplot([filtered_matrix.ravel()])
            plt.title('Distribution of PFE by DHS')
            plt.xlabel('DHS')
            plt.ylabel('PFE value')
            plt.xticks(ticks=[1], labels=[dhs])
            plt.tight_layout()
            out_path_violin = os.path.join(PCA_PLOT_DIR, f"{stat_name}_{dhs}_violon.png")
            plt.savefig(out_path_violin, dpi=200)
            plt.close()
            continue

        # PCA cannot extract more components than min(n_samples, n_features);
        # PC2 is needed for the scatter, so at least two are required.
        n_components = min(10, *filtered_matrix.shape)
        if n_components < 2:
            print(f"Skipping PCA for {dhs} | {stat_name}: need at least 2 samples and 2 features.")
            continue

        pca = PCA(n_components=n_components)
        X_pca = pca.fit_transform(filtered_matrix)
        expl_var = pca.explained_variance_ratio_

        plt.figure(figsize=(8, 6))
        for group in label_order:
            group_mask = filtered_labels == group
            plt.scatter(
                X_pca[group_mask, 0],
                X_pca[group_mask, 1],
                label=group,
                alpha=0.4
            )

        plt.xlabel(f"PC1 ({expl_var[0]*100:.1f}% var)")
        plt.ylabel(f"PC2 ({expl_var[1]*100:.1f}% var)")
        plt.title(f"PCA of {stat_name.upper()}")
        plt.legend()
        plt.grid(True, linestyle="--", alpha=0.4)
        plt.tight_layout()
        out_path = os.path.join(PCA_PLOT_DIR, f"{stat_name}_{dhs}_pca.png")
        plt.savefig(out_path, dpi=200)
        plt.close()

        pc1_weights = pca.components_[0]
        num_weights = len(pc1_weights)
        feature_indices = np.arange(num_weights)

        plt.figure(figsize=(8, 6))
        plt.plot(feature_indices, pc1_weights)
        # Vertical marker at the middle feature index.
        plt.axvline(x=num_weights//2, color='red', linestyle='--', linewidth=2, label='DHS site window')
        plt.xlabel("Feature Index")
        plt.ylabel("Weight in PC1")
        plt.title(f"PC1 Feature Weights for {stat_name.upper()}")
        # Without a legend the axvline label above is never displayed.
        plt.legend()
        plt.grid(True, linestyle="--", alpha=0.4)
        plt.tight_layout()

        out_path_weights = os.path.join(PCA_PLOT_DIR, f"{stat_name}_{dhs}_pc1_weights.png")
        plt.savefig(out_path_weights, dpi=200)
        plt.close()

        
# Directory holding the per-sample, per-DHS statistic files read by load_vectors().
DATA_DIR = "../../data/test_small/"
# Output directory for the figures saved by plot_pca().
PCA_PLOT_DIR = "../../data/pca_test_small/"
# Directory whose entries provide the DHS names (see DHS_FILES below).
DHS_FOLDER = '../../raw_data/dhs_one'

# Make sure the figure directory exists before anything is saved into it.
os.makedirs(PCA_PLOT_DIR, exist_ok=True)

# DHS names: every directory entry's name up to its first '.'.
# NOTE(review): os.listdir() returns entries in arbitrary order and includes
# everything in the folder — confirm it only contains DHS files.
DHS_FILES = [f.split('.')[0] for f in os.listdir(DHS_FOLDER)]


# hardcoded stats
# Maps statistic name -> (file-name pattern with {sid}/{dhs} placeholders,
# key to read from the .npz archive, or None for plain .npy files).
STATS = {
    "ocf":   ("{sid}__{dhs}_sorted_ocf.npy", None),
    "lwps":  ("{sid}__{dhs}_sorted_lwps.npy", None),
    "ifs":   ("{sid}__{dhs}_sorted_ifs.npz", "ifs_scores"),
    "pfe":   ("{sid}__{dhs}_sorted_pfe.npz", "pfe_scores"),
    "fdi":   ("{sid}__{dhs}_sorted_fdi.npz", "overlapping_fdi_scores"),
}


def load_vectors(stat_name, metadata_cache):
    """Load one statistic for every (sample, DHS) pair that has a file.

    Parameters
    ----------
    stat_name : str
        Key of the module-level ``STATS`` table selecting the file-name
        pattern and, for ``.npz`` files, the array key to read.
    metadata_cache : dict
        ``{sample id: group name}``; a group name of ``'Healthy'`` is labelled
        ``'Healthy'``, anything else ``'Cancerous'``.

    Returns
    -------
    tuple
        ``(matrix, binary_labels, dhs_sites)`` where ``matrix`` stacks the
        flattened vectors row-wise and is standardised per column over all
        rows, and the two lists give the label and DHS name of each row.
        ``(None, None, None)`` when no file was found.

    Raises
    ------
    KeyError
        If ``stat_name`` is not a key of ``STATS``.
    """
    # The pattern and key depend only on the statistic, not on the sample/DHS.
    pattern, key = STATS[stat_name]
    vectors, binary_labels, dhs_sites = [], [], []

    for sid, group_name in metadata_cache.items():
        label = 'Healthy' if group_name == 'Healthy' else 'Cancerous'
        for dhs_name in DHS_FILES:
            path = os.path.join(DATA_DIR, pattern.format(sid=sid, dhs=dhs_name))

            try:
                if path.endswith(".npy"):
                    vec = np.load(path)
                elif path.endswith(".npz"):
                    # np.load returns an open archive for .npz files; the
                    # context manager closes it once the array has been read.
                    with np.load(path) as archive:
                        vec = archive[key]
                else:
                    continue
            except FileNotFoundError:
                # Missing (sample, DHS) combinations are simply skipped.
                continue

            vectors.append(vec.flatten())
            binary_labels.append(label)
            dhs_sites.append(dhs_name)

    if not vectors:
        return None, None, None

    # Standardise each column across all loaded rows (all DHS together).
    matrix = StandardScaler().fit_transform(np.vstack(vectors))
    return matrix, binary_labels, dhs_sites


# Per-statistic results keyed by statistic name. Plain dicts are used because
# a defaultdict without a default factory behaves exactly like a dict.
all_matrices = {}
all_binary_labels = {}
all_dhs_sites = {}
for stat in STATS.keys():
    print(f"\nProcessing: {stat}")
    matrix, binary_labels, dhs_sites = load_vectors(stat, metadata)
    if matrix is not None:
        plot_pca(matrix, binary_labels, dhs_sites, stat)
        all_matrices[stat] = matrix
        all_binary_labels[stat] = binary_labels
        all_dhs_sites[stat] = dhs_sites
    else:
        # No `dhs` variable exists at this level; only the statistic is known.
        print(f"Skipping {stat} — no data found.")
        
# combine by test statistics
# all_matrices / all_binary_labels / all_dhs_sites are keyed by statistic name,
# so the per-statistic matrices are stacked column-wise explicitly.
stat_names = list(all_matrices)
if not stat_names:
    raise ValueError("No statistic produced any data; nothing to classify.")

# Sanity check that every statistic describes the same rows in the same order
# before their columns are placed side by side.
reference_stat = stat_names[0]
for other_stat in stat_names[1:]:
    if (all_binary_labels[other_stat] != all_binary_labels[reference_stat]
            or all_dhs_sites[other_stat] != all_dhs_sites[reference_stat]):
        raise ValueError(
            f"Rows of '{other_stat}' do not line up with '{reference_stat}'; cannot combine statistics."
        )

combined_matrix = np.hstack([all_matrices[name] for name in stat_names])

X = combined_matrix
# NOTE(review): this used `all_labels`, which is not assigned anywhere in the
# visible notebook; the binary labels are the only per-row labels available.
y_strings = np.array(all_binary_labels[reference_stat])

le = LabelEncoder()
y = le.fit_transform(y_strings)
print(f"Label mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

n_pca_components = 10
pipeline = Pipeline([
    ('pca', PCA(n_components=n_pca_components)),
    ('classifier', SVC(probability=True, random_state=42)),
])

n_splits = 10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"Mean Accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}")

In [None]:
# Long-form table of the PFE values with the DHS name and label of each entry.
# NOTE(review): the flattened matrix must have exactly as many entries as
# all_dhs_sites['pfe'], i.e. this assumes one PFE value per row — confirm.
# The 'binary_labels' column is not used further in this cell.
df = pd.DataFrame({
    'value': all_matrices['pfe'].flatten(),
    'category': all_dhs_sites['pfe'],
    'binary_labels': all_binary_labels['pfe'],
})

# One array of values per DHS name, in sorted name order.
categories = sorted(df['category'].unique())
data_to_plot = [df[df['category'] == cat]['value'].values for cat in categories]

plt.figure(figsize=(8, 6))
plt.violinplot(data_to_plot)
plt.title('Distribution of Values by Category')
plt.xlabel("all_dhs_sites['pfe'] Category")
plt.ylabel("all_matrices['pfe'] Value")
# Violins are drawn at positions 1..len(categories); label them with the DHS names.
plt.xticks(ticks=np.arange(1, len(categories) + 1), labels=categories)
plt.tight_layout()
plt.show()
plt.close()

In [None]:
# NOTE(review): this compares the DHS site names of the 'ocf' rows with the
# label string 'Healthy'; all_binary_labels['ocf'] may have been intended — confirm.
np.array(all_dhs_sites['ocf']) == 'Healthy'

In [None]:
# combine by test statistics
# all_matrices and all_binary_labels are dicts keyed by statistic name, so they
# cannot be passed to np.hstack / np.array directly; stack the per-statistic
# matrices column-wise and take the row labels from one statistic.
stat_names = list(all_matrices)
if not stat_names:
    raise ValueError("No statistic produced any data; nothing to classify.")

# Sanity check that every statistic describes the same rows in the same order
# before their columns are placed side by side.
reference_stat = stat_names[0]
for other_stat in stat_names[1:]:
    if (all_binary_labels[other_stat] != all_binary_labels[reference_stat]
            or all_dhs_sites[other_stat] != all_dhs_sites[reference_stat]):
        raise ValueError(
            f"Rows of '{other_stat}' do not line up with '{reference_stat}'; cannot combine statistics."
        )

combined_matrix = np.hstack([all_matrices[name] for name in stat_names])

X = combined_matrix
y_strings = np.array(all_binary_labels[reference_stat])

le = LabelEncoder()
y = le.fit_transform(y_strings)
print(f"Label mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

n_pca_components = 10
pipeline = Pipeline([
    ('pca', PCA(n_components=n_pca_components)),
#     ('classifier', LDA()),
    ('classifier', SVC(probability=True, random_state=42)),
])

n_splits = 10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"Mean Accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}")

In [None]:
# Bar chart of the number of entries per multi-class label.
# NOTE(review): `all_labels` is not assigned in any cell visible here, so this
# raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
label_counter = Counter(all_labels)
plt.bar(label_counter.keys(), label_counter.values())
plt.xticks(rotation=90)
plt.xlabel("Multi-label classes")
plt.ylabel("Number of samples")
plt.title("Distribution of multi-label classes")

In [None]:
# all_binary_labels maps statistic name -> list of per-row labels, so counting
# the mapping itself would treat those lists as counts. Count the labels of the
# first available statistic instead.
# NOTE(review): assumes every statistic carries the same row labels — confirm.
reference_binary_labels = next(iter(all_binary_labels.values()), [])
binary_label_counter = Counter(reference_binary_labels)
plt.bar(list(binary_label_counter.keys()), list(binary_label_counter.values()))
plt.xlabel("Binary classes")
plt.ylabel("Number of samples")
plt.title("Distribution of binary classes")

In [None]:
# Quick consistency check of the combined matrix against the label containers.
# NOTE(review): `all_labels` is not assigned in any visible cell, and
# all_binary_labels is filled as a per-statistic mapping, so its len() is the
# number of statistics rather than the number of rows — confirm the intent.
combined_matrix.shape, len(all_labels), len(all_binary_labels)

In [None]:
import os
import yaml
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
# fdi_meta_path = '../../data/cristiano_cfdnas_dhs_small/feature_matrix_fdi_meta.npz'
# fdi_path = '../../data/cristiano_cfdnas_dhs_small/feature_matrix_fdi.npy'

# Directory containing the feature_matrix_<stat>.npy matrices and their
# feature_matrix_<stat>_meta.npz companions read by load_stat().
feature_dir = '../../data/cristiano_cfdnas_dhs_small/'


def load_stat(stat: str, feature_dir: str):
    """Load the feature matrix of one statistic together with its metadata.

    Parameters
    ----------
    stat : str
        Statistic name; selects ``feature_matrix_<stat>.npy`` and
        ``feature_matrix_<stat>_meta.npz`` inside ``feature_dir``.
    feature_dir : str
        Directory containing both files.

    Returns
    -------
    tuple or None
        ``(X, dhs_sites, labels, sample_ids)``. ``labels`` / ``sample_ids``
        are ``None`` when the archive has no ``binary_labels`` /
        ``sample_ids`` entry. ``None`` is returned when either file is missing.

    Raises
    ------
    KeyError
        If the metadata archive has no ``dhs_sites`` entry.
    """
    matrix_path = os.path.join(feature_dir, f"feature_matrix_{stat}.npy")
    meta_path = os.path.join(feature_dir, f"feature_matrix_{stat}_meta.npz")
    if not (os.path.exists(matrix_path) and os.path.exists(meta_path)):
        return None

    X = np.load(matrix_path)
    # np.load keeps a .npz archive open until it is closed; read everything
    # that is needed inside the context manager so the handle is released.
    with np.load(meta_path) as meta:
        dhs_sites = meta['dhs_sites']
        labels = meta['binary_labels'] if 'binary_labels' in meta else None
        sample_ids = meta['sample_ids'] if 'sample_ids' in meta else None
    return X, dhs_sites, labels, sample_ids


def build_pc1_feature_matrix(X, dhs_sites, sample_ids):
    """Reduce each DHS to one PC1 score per sample.

    For every DHS that has exactly one row for every sample, the rows are put
    in the order of ``np.unique(sample_ids)``, standardised per feature and
    projected onto their first principal component.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix with one row per entry of ``dhs_sites`` / ``sample_ids``.
    dhs_sites : np.ndarray
        DHS name of each row.
    sample_ids : np.ndarray
        Sample identifier of each row.

    Returns
    -------
    tuple
        ``(pc1_matrix, valid_dhs, unique_samples)`` where ``pc1_matrix`` has
        shape ``(n_samples, n_valid_dhs)``; ``(None, None, None)`` when no DHS
        covers all samples.

    Notes
    -----
    The scaler and PCA are fitted on all samples of a DHS at once, so these
    features are not independent of any later train/test split.
    """
    unique_samples = np.unique(sample_ids)
    unique_dhs = np.unique(dhs_sites)
    n_samples = len(unique_samples)

    # map sample to index for final assembly
    sample_index = {s: i for i, s in enumerate(unique_samples)}

    pc1_columns = []
    valid_dhs = []
    for dhs in unique_dhs:
        mask = dhs_sites == dhs
        dhs_samples = sample_ids[mask]
        # Keep the DHS only if it has exactly one row per sample: a missing
        # sample or a duplicated one would give a column whose length or row
        # order no longer matches the other columns.
        if len(dhs_samples) != n_samples or len(np.unique(dhs_samples)) != n_samples:
            continue
        # order rows by global sample ordering
        order = np.argsort([sample_index[s] for s in dhs_samples])
        dhs_matrix = StandardScaler().fit_transform(X[mask][order])
        pc1 = PCA(n_components=1).fit_transform(dhs_matrix).ravel()  # length n_samples
        pc1_columns.append(pc1)
        valid_dhs.append(dhs)

    if not pc1_columns:
        return None, None, None
    pc1_matrix = np.vstack(pc1_columns).T  # shape (n_samples, n_valid_dhs)
    return pc1_matrix, np.array(valid_dhs), unique_samples


def evaluate_stat_pc1(stat: str, feature_dir: str, cv_splits: int):
    """Cross-validate a PCA(1) + SVC pipeline for one statistic.

    Parameters
    ----------
    stat : str
        Statistic name passed to ``load_stat``.
    feature_dir : str
        Directory passed to ``load_stat``.
    cv_splits : int
        Number of stratified, shuffled cross-validation folds.

    Returns
    -------
    dict or None
        ``{'stat', 'auc_mean', 'auc_std', 'n_dhs_used'}`` with the mean and
        standard deviation of the per-fold ROC AUC and the number of DHS that
        cover every sample. ``None`` when the files, the labels, the sample ids
        or a complete DHS are missing.

    Notes
    -----
    NOTE(review): the classifier is fitted on the raw per-row matrix ``X`` with
    the per-row labels, not on the per-sample PC1 matrix; the PC1 matrix is
    only used for the availability check, the shape printout and
    ``n_dhs_used``. Confirm which of the two is the intended input.
    """
    loaded = load_stat(stat, feature_dir)
    if loaded is None:
        return None
    X, dhs_sites, labels, sample_ids = loaded
    if labels is None or sample_ids is None:
        return None

    pc1_matrix, valid_dhs, _ = build_pc1_feature_matrix(X, dhs_sites, sample_ids)
    if pc1_matrix is None:
        return None

    print(pc1_matrix.shape, X.shape)

    y = LabelEncoder().fit_transform(labels)

    pipeline = Pipeline([
        ('pca', PCA(n_components=1)),
        ('classifier', SVC(probability=True, random_state=42)),
    ])

    # Honour the caller's fold count instead of a hard-coded value.
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    print(f"Mean ROC-AUC: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

    # Return the summary so that callers can collect and compare results.
    return {
        'stat': stat,
        'auc_mean': float(np.mean(scores)),
        'auc_std': float(np.std(scores)),
        'n_dhs_used': int(len(valid_dhs)),
    }


# Evaluate the 'fdi' statistic with 10 folds and show whatever the call returns.
r = evaluate_stat_pc1('fdi', feature_dir, 10)
r
# if r is not None:
#     results.append(r)

In [None]:
# fdi_meta_path = '../../data/cristiano_cfdnas_dhs_small/feature_matrix_fdi_meta.npz'
# fdi_path = '../../data/cristiano_cfdnas_dhs_small/feature_matrix_fdi.npy'

# Directory containing the feature_matrix_<stat>.npy matrices and their
# feature_matrix_<stat>_meta.npz companions read by load_stat().
# (Same value as assigned in the earlier cell.)
feature_dir = '../../data/cristiano_cfdnas_dhs_small/'


def load_stat(stat: str, feature_dir: str):
    """Load the feature matrix of one statistic and its binary labels.

    This definition replaces the earlier ``load_stat`` of the notebook, which
    returned four values.

    Parameters
    ----------
    stat : str
        Statistic name; selects ``feature_matrix_<stat>.npy`` and
        ``feature_matrix_<stat>_meta.npz`` inside ``feature_dir``.
    feature_dir : str
        Directory containing both files.

    Returns
    -------
    tuple or None
        ``(X, labels)`` where ``labels`` is ``None`` when the archive has no
        ``binary_labels`` entry. ``None`` is returned when either file is
        missing.
    """
    matrix_path = os.path.join(feature_dir, f"feature_matrix_{stat}.npy")
    meta_path = os.path.join(feature_dir, f"feature_matrix_{stat}_meta.npz")
    if not (os.path.exists(matrix_path) and os.path.exists(meta_path)):
        return None

    X = np.load(matrix_path)
    # np.load keeps a .npz archive open until it is closed; read the labels
    # inside the context manager so the handle is released.
    with np.load(meta_path) as meta:
        labels = meta['binary_labels'] if 'binary_labels' in meta else None
    return X, labels


def evaluate_stat_pc1(stat: str, feature_dir: str, cv_splits: int):
    """Cross-validate a PCA(1) + SVC pipeline on one statistic's feature matrix.

    Parameters
    ----------
    stat : str
        Statistic name passed to ``load_stat``.
    feature_dir : str
        Directory passed to ``load_stat``.
    cv_splits : int
        Number of stratified, shuffled cross-validation folds.

    Returns
    -------
    dict or None
        ``{'stat', 'auc_mean', 'auc_std'}`` with the mean and standard
        deviation of the per-fold ROC AUC, or ``None`` when the files or the
        labels are missing.
    """
    loaded = load_stat(stat, feature_dir)
    if loaded is None:
        return None
    X, labels = loaded
    if labels is None:
        return None

    y = LabelEncoder().fit_transform(labels)

    pipeline = Pipeline([
        ('pca', PCA(n_components=1)),
        ('classifier', SVC(probability=True, random_state=42)),
    ])

    # Honour the caller's fold count instead of a hard-coded value.
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)

    return {
        'stat': stat,
        'auc_mean': float(np.mean(scores)),
        'auc_std': float(np.std(scores)),
    }


# Statistics to evaluate and the fold count handed to evaluate_stat_pc1().
stats = ['fdi', 'pfe', 'ocf', 'lwps', 'ifs']
cv_splits = 10

# Collect one result dict per statistic; statistics for which the evaluation
# returns None are left out.
results = []
for stat in stats:
    r = evaluate_stat_pc1(stat, feature_dir, cv_splits)
    if r is not None:
        results.append(r)

# Bar chart of the mean ROC AUC per statistic with the standard deviation as
# error bar; nothing is drawn when no statistic could be evaluated.
if results:
    stats_order = [r['stat'] for r in results]
    auc_means = [r['auc_mean'] for r in results]
    auc_stds = [r['auc_std'] for r in results]

    plt.figure(figsize=(8, 6))
    bars = plt.bar(stats_order, auc_means, yerr=auc_stds, capsize=5)
    plt.ylabel('Mean ROC AUC (PC1 across DHS)')
    plt.xlabel('Test statistic')
    # Write each mean just left of the bar centre, on top of the bar.
    for bar, val in zip(bars, auc_means):
        plt.text(
            bar.get_x() + bar.get_width()/2 - 0.05,
            bar.get_height(), 
            f"{val:.2f}", 
            ha='right', 
            va='bottom', 
            fontsize=9
        )
    plt.title('Binary classification result (PC1 features + SVM)')
    plt.tight_layout()
    plt.show()
    plt.close()