In [None]:
import os
import sys
# Export TF_ENABLE_ONEDNN_OPTS=0 into this process's environment.
# NOTE(review): presumably meant to switch off TensorFlow's oneDNN code paths and so
# has to be set before TensorFlow is imported -- confirm; the TensorFlow imports in the
# next cell are currently commented out.
os.environ.update({'TF_ENABLE_ONEDNN_OPTS': '0'})

In [None]:
import inspect
import shutil
import json
import cupy as cp
#import cudf
#import dask_cudf
#from dask_cuda import LocalCUDACluster
#import dask
#from dask.distributed import Client, wait
import pyarrow.feather as fth
#import dask.dataframe as dd
#import torch
import pickle
import pandas as pd
import pyaging as pya
from pygam import LinearGAM, LogisticGAM, s
import numpy as np
import scipy.sparse as sp
from scipy import stats, sparse
import seaborn as sns
import matplotlib.pyplot as plt
from numba import jit, prange
import matplotlib.gridspec as gridspec
from mepylome import Manifest
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.gam.api import GLMGam, BSplines
from statsmodels.stats.anova import anova_lm
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer  # Enable experimental features first
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from patsy import dmatrix
#import tensorflow as tf
#tf.get_logger().setLevel('ERROR')
#from tensorflow.keras.layers import Input, Dense
#from tensorflow.keras.models import Model
from scipy.stats import pearsonr

In [None]:
# !pip freeze > requirements.txt

In [None]:
# Locations of the two Feather exports (subset table and final table)
subset_path_feather = '/u/scratch/c/cobeaman/mymetharray_subset_2458_r_654_c_08152024_192025.feather'
final_path_feather = '/u/scratch/c/cobeaman/mymetharray_final_2458_r_731791_c_08152024_192025.feather'

# Read each file into an Arrow table (subset first, then final). No reader options are
# passed, so pyarrow's own defaults apply.
methylation_data_subset, methylation_data_final = (
    fth.read_table(feather_path)
    for feather_path in (subset_path_feather, final_path_feather)
)

# Materialise pandas DataFrames from the Arrow tables, indexed by the 'SampleID' column
methylation_data_subset_pd, methylation_data_final_pd = (
    arrow_table.to_pandas().set_index('SampleID')
    for arrow_table in (methylation_data_subset, methylation_data_final)
)

In [None]:
# Check data types
# print(methylation_data_subset['Female'].dtype)

# Ensure 'Female' column is binary (0 or 1)
# methylation_data_final['Female'] = (methylation_data_final['Female'] == 1).astype(int)
# methylation_data_subset['Female'] = (methylation_data_subset['Female'] == 1).astype(int)

# Remove, in place, every row that contains at least one missing value
# (final table first, then subset)
for frame in (methylation_data_final_pd, methylation_data_subset_pd):
    frame.dropna(inplace=True)

# Print the per-column count of nulls that remain in each table
for frame in (methylation_data_final_pd, methylation_data_subset_pd):
    print(frame.isnull().sum())

# Aggregate any duplicated probes
# methylation_data_subset_pd = pya.pp.epicv2_probe_aggregation(methylation_data_subset_pd)
# methylation_data_final_pd = pya.pp.epicv2_probe_aggregation(methylation_data_final_pd)

In [None]:
@jit(nopython=True, parallel=True)
def parallel_to_numeric(arr):
    """Copy a 2-D numeric array into a new float64 array, one row per parallel task.

    Parameters
    ----------
    arr : 2-D array
        Indexed as ``arr[i, j]``; every element is passed through ``float()``.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as ``arr``.

    Notes
    -----
    The previous version wrapped the conversion in ``try/except ValueError``.
    Numba's nopython mode only accepts a bare ``except`` or ``except Exception``,
    so that form stopped the function from compiling at its first call. The
    handler is dropped rather than widened: this function only receives arrays
    Numba can type, and string-to-NaN coercion is done separately with
    ``pd.to_numeric(..., errors='coerce')`` in ``prepare_data_for_adata``.
    """
    result = np.empty(arr.shape, dtype=np.float64)
    # Only the outermost prange is parallelised by Numba, so the column loop is a
    # plain range.
    for i in prange(arr.shape[0]):
        for j in range(arr.shape[1]):
            result[i, j] = float(arr[i, j])
    return result

def prepare_data_for_adata(df, metadata_cols, chunk_size=1000):
    """Return ``df`` with every non-metadata column converted to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    metadata_cols : list of str
        Columns carried through unchanged. They come first in the result.
    chunk_size : int, default 1000
        Retained so existing calls keep working; no longer used (see Notes).

    Returns
    -------
    pandas.DataFrame
        The metadata columns followed by the converted columns, on the same index
        as ``df``. Values that cannot be parsed as numbers become NaN.

    Notes
    -----
    The previous implementation copied row chunks to the GPU with CuPy and
    immediately copied them back without computing anything on the device. The
    direct cast below yields the same float64 values, needs no GPU, and also works
    for zero-row input, where concatenating an empty list of chunks raised
    ``ValueError``.
    """
    # Separate metadata and data
    metadata = df[metadata_cols]
    data = df.drop(columns=metadata_cols)

    # Coerce column by column; unparseable entries turn into NaN instead of raising
    data = data.apply(pd.to_numeric, errors='coerce')

    # Give every converted column the same float64 dtype
    data_processed = data.astype('float64')

    # Metadata first, then the converted data, aligned on the shared index
    return pd.concat([metadata, data_processed], axis=1)

# Run prepare_data_for_adata on both tables (final first, then subset), treating
# 'Female', 'Age' and 'Diagnosis' as the metadata columns
methylation_data_final_pd_prepared, methylation_data_subset_pd_prepared = (
    prepare_data_for_adata(raw_frame, ['Female', 'Age', 'Diagnosis'])
    for raw_frame in (methylation_data_final_pd, methylation_data_subset_pd)
)

In [None]:
# Add lowercase 'female' / 'age' copies of the 'Female' / 'Age' columns to both
# prepared frames
for df in (methylation_data_final_pd_prepared, methylation_data_subset_pd_prepared):
    for alias, source in (('female', 'Female'), ('age', 'Age')):
        df[alias] = df[source]

In [None]:
import pandas as pd

# Helper: probe IDs listed in a mepylome manifest
def get_probes_from_manifest(manifest_name):
    """Return the set of values in the 'IlmnID' column of the named manifest.

    ``manifest_name`` is handed straight to ``Manifest`` (this notebook uses
    "epicv2", "epic" and "450k").

    NOTE(review): Illumina's EPICv2 manifest suffixes IlmnID values (e.g. ``_TC21``).
    If mepylome keeps those suffixes, comparing against bare ``cg...`` IDs will
    over-report missing probes -- TODO confirm.
    """
    probe_ids = Manifest(manifest_name).data_frame['IlmnID']
    return set(probe_ids)

# Probe-ID sets for the three array manifests (sets keep the differences below cheap)
epicv2_probes_set, epic_probes_set, i450_probes_set = (
    get_probes_from_manifest(array_name) for array_name in ("epicv2", "epic", "450k")
)

# GrimAge2 features: keep the distinct 'var' entries that are cg probes
df_grimage_subcomponents = pd.read_csv('grimage2_subcomponents.csv', index_col=0)
grimage2_probes_set = {
    feature
    for feature in df_grimage_subcomponents['var'].unique()
    if feature.startswith('cg')
}

# cg probe columns present in the prepared final methylation table
cg_probes_set = {
    column
    for column in methylation_data_final_pd_prepared.columns
    if column.startswith('cg')
}

# GrimAge2 probes that each manifest does not list
missing_probes_epicv2 = grimage2_probes_set.difference(epicv2_probes_set)
missing_probes_epic = grimage2_probes_set.difference(epic_probes_set)
missing_probes_i450 = grimage2_probes_set.difference(i450_probes_set)

# cg probes from the methylation table that each manifest does not list
missing_probes_epicv2_BP = cg_probes_set.difference(epicv2_probes_set)
missing_probes_epic_BP = cg_probes_set.difference(epic_probes_set)
missing_probes_i450_BP = cg_probes_set.difference(i450_probes_set)

# Preview helper. The IDs are sorted before slicing so the "First Few ..." columns are
# the same on every run; slicing list(set) directly follows set iteration order, which
# for strings changes between interpreter sessions.
def _first_few(probes, limit=5):
    """Return the ``limit`` alphabetically-first probe IDs joined by ', '."""
    # map(str, ...) keeps the sort from failing if a manifest column holds a non-string
    return ', '.join(sorted(map(str, probes))[:limit])


# GrimAge2 probes with no matching column in the methylation table (used twice below)
_grimage2_missing_in_data = grimage2_probes_set - cg_probes_set

# Organize the summary data: one row per array / probe list. None marks cells that do
# not apply to that row.
summary_data = {
    'Array': ['EPICv2', 'EPIC', '450k', 'GrimAge2'],
    'Total Probes': [
        len(epicv2_probes_set),
        len(epic_probes_set),
        len(i450_probes_set),
        len(grimage2_probes_set)
    ],
    'First Few Probes': [
        _first_few(epicv2_probes_set),
        _first_few(epic_probes_set),
        _first_few(i450_probes_set),
        _first_few(grimage2_probes_set)
    ],
    'Missing Probes in GrimAge2': [
        len(missing_probes_epicv2),
        len(missing_probes_epic),
        len(missing_probes_i450),
        None
    ],
    'First Few Missing in GrimAge2': [
        _first_few(missing_probes_epicv2),
        _first_few(missing_probes_epic),
        _first_few(missing_probes_i450),
        None
    ],
    'Missing Probes in BPDNAm': [
        len(missing_probes_epicv2_BP),
        len(missing_probes_epic_BP),
        len(missing_probes_i450_BP),
        None
    ],
    'First Few Missing in BPDNAm': [
        _first_few(missing_probes_epicv2_BP),
        _first_few(missing_probes_epic_BP),
        _first_few(missing_probes_i450_BP),
        None
    ],
    'Number of Missing GrimAge2 Probes in Methylation Data': [
        None,
        None,
        None,
        len(_grimage2_missing_in_data)
    ],
    'First Few Missing GrimAge2 Probes in Methylation Data': [
        None,
        None,
        None,
        _first_few(_grimage2_missing_in_data)
    ]
}

# Convert to DataFrame for exporting
summary_df = pd.DataFrame(summary_data)

# Save to CSV for sharing
summary_df.to_csv('probe_summary_detailed.csv', index=False)

# Save to Excel for sharing
summary_df.to_excel('probe_summary_detailed.xlsx', index=False)

# Optional: Display the summary
print(summary_df)


In [None]:
import pandas as pd

# Helper: probe IDs listed in a mepylome manifest.
# NOTE(review): this repeats the definition from the previous cell; it is kept so this
# cell can still be run on its own.
def get_probes_from_manifest(manifest_name):
    """Return the set of values in the 'IlmnID' column of the named manifest."""
    probe_ids = Manifest(manifest_name).data_frame['IlmnID']
    return set(probe_ids)

# EPICv2 probe IDs as a set
epicv2_probes_set = get_probes_from_manifest("epicv2")

# GrimAge2 features: keep the distinct 'var' entries that are cg probes
df_grimage_subcomponents = pd.read_csv('grimage2_subcomponents.csv', index_col=0)
grimage2_probes_set = {
    feature
    for feature in df_grimage_subcomponents['var'].unique()
    if feature.startswith('cg')
}

# cg probe columns present in the prepared final methylation table
cg_probes_set = {
    column
    for column in methylation_data_final_pd_prepared.columns
    if column.startswith('cg')
}

# GrimAge2 probes absent from the EPICv2 manifest / from the methylation table
missing_grimage2_in_epicv2 = grimage2_probes_set.difference(epicv2_probes_set)
missing_grimage2_in_methylation = grimage2_probes_set.difference(cg_probes_set)

# Absent from the methylation table even though the EPICv2 manifest lists them
missing_in_methylation_only = missing_grimage2_in_methylation.difference(missing_grimage2_in_epicv2)

# Create a detailed summary DataFrame with the necessary missing probes.
# The probe IDs are sorted before joining so the exported text is identical on every
# run; joining list(set) directly follows set iteration order, which for strings
# changes between interpreter sessions.
missing_specific_summary_df = pd.DataFrame({
    'Description': [
        'GrimAge2 probes missing in EPICv2',
        'GrimAge2 probes missing in methylation data',
        'GrimAge2 probes missing in only methylation but not EPICv2'
    ],
    'Number of Missing Probes': [
        len(missing_grimage2_in_epicv2),
        len(missing_grimage2_in_methylation),
        len(missing_in_methylation_only)
    ],
    'Missing Probes': [
        ', '.join(sorted(missing_grimage2_in_epicv2)),
        ', '.join(sorted(missing_grimage2_in_methylation)),
        ', '.join(sorted(missing_in_methylation_only))
    ]
})

# Sort the DataFrame by 'Number of Missing Probes' in descending order
missing_specific_summary_df = missing_specific_summary_df.sort_values(by='Number of Missing Probes', ascending=False)

# Save to CSV for sharing
missing_specific_summary_df.to_csv('detailed_missing_probes_summary.csv', index=False)

# Save to Excel for sharing
missing_specific_summary_df.to_excel('detailed_missing_probes_summary.xlsx', index=False)

# Optional: Display the summary
print(missing_specific_summary_df)


In [None]:
# # 1. Data Preparation and Exploratory Analysis
# def prepare_data(df, missing_features):
#     # Separate methylation data from metadata
#     methylation_data = df[[col for col in df.columns if col.startswith('cg')]]
#     metadata = df[['female', 'age', 'Diagnosis']]
    
#     # Add missing columns
#     for col in missing_features:
#         if col not in methylation_data.columns and col.startswith('cg'):
#             methylation_data[col] = np.nan
    
#     return methylation_data, metadata

# def exploratory_analysis(df):
#     missing_percentages = df.isnull().mean() * 100
#     print(f"Average percentage of missing values: {missing_percentages.mean():.2f}%")
#     plt.figure(figsize=(10, 5))
#     sns.histplot(missing_percentages, bins=50)
#     plt.title("Distribution of Missing Values Percentage")
#     plt.xlabel("Percentage of Missing Values")
#     plt.show()

# # 2. Initial Imputation with KNN
# def knn_impute(df, n_neighbors=5):
#     imputer = KNNImputer(n_neighbors=n_neighbors)
#     imputed_data = imputer.fit_transform(df)
#     return pd.DataFrame(imputed_data, columns=df.columns, index=df.index)

# # 3. Advanced Imputation with MICE and Random Forest
# def mice_rf_impute(df, max_iter=10, n_estimators=100, random_state=0):
#     imputer = IterativeImputer(
#         estimator=RandomForestRegressor(n_estimators=n_estimators, random_state=random_state),
#         max_iter=max_iter,
#         random_state=random_state
#     )
#     imputed_data = imputer.fit_transform(df)
#     return pd.DataFrame(imputed_data, columns=df.columns, index=df.index)

# # 4. Deep Learning Imputation
# def create_autoencoder(input_dim, encoding_dim):
#     input_layer = Input(shape=(input_dim,))
#     encoded = Dense(encoding_dim, activation='relu')(input_layer)
#     decoded = Dense(input_dim, activation='sigmoid')(encoded)
#     autoencoder = Model(input_layer, decoded)
#     autoencoder.compile(optimizer='adam', loss='mse')
#     return autoencoder

# def deep_learning_impute(df, epochs=100, batch_size=32):
#     # Normalize data to [0, 1] range (suitable for beta values)
#     normalized_data = df.values
    
#     # Create a mask for missing values
#     missing_mask = np.isnan(normalized_data)
    
#     # Replace NaNs with mean for initial input
#     col_mean = np.nanmean(normalized_data, axis=0)
#     normalized_data[missing_mask] = np.take(col_mean, missing_mask.nonzero()[1])
    
#     # Create and train the autoencoder
#     input_dim = df.shape[1]
#     encoding_dim = min(input_dim // 2, 256)  # Cap encoding dim to prevent overfitting
#     autoencoder = create_autoencoder(input_dim, encoding_dim)
    
#     autoencoder.fit(normalized_data, normalized_data, 
#                     epochs=epochs, 
#                     batch_size=batch_size, 
#                     shuffle=True,
#                     validation_split=0.2,
#                     verbose=0)
    
#     # Use the model to impute missing values
#     imputed_data = autoencoder.predict(normalized_data)
    
#     # Replace only the missing values in the original data
#     normalized_data[missing_mask] = imputed_data[missing_mask]
    
#     return pd.DataFrame(normalized_data, columns=df.columns, index=df.index)

# # 5. Ensemble Method
# def ensemble_impute(df, methods=['knn', 'mice_rf', 'deep_learning']):
#     imputed_dfs = []
    
#     if 'knn' in methods:
#         imputed_dfs.append(knn_impute(df))
#     if 'mice_rf' in methods:
#         imputed_dfs.append(mice_rf_impute(df))
#     if 'deep_learning' in methods:
#         imputed_dfs.append(deep_learning_impute(df))
    
#     # Average the results from different methods
#     ensemble_imputed = pd.concat(imputed_dfs).groupby(level=0).mean()
#     return ensemble_imputed

# # 6. Validation and Sensitivity Analysis
# def validate_imputation(original_df, imputed_df, n_samples=1000):
#     # Select a subset of non-missing values to compare
#     non_missing_mask = ~original_df.isnull().any(axis=1)
#     sample_indices = np.random.choice(non_missing_mask.index[non_missing_mask], size=n_samples, replace=False)
    
#     original_sample = original_df.loc[sample_indices]
#     imputed_sample = imputed_df.loc[sample_indices]
    
#     mse = mean_squared_error(original_sample.values.flatten(), imputed_sample.values.flatten())
#     correlation, _ = pearsonr(original_sample.values.flatten(), imputed_sample.values.flatten())
    # NOTE(review): a mis-indented copy of the 'female'/'age' alias loop had been pasted
    # here, in the middle of the disabled validate_imputation code, and made this cell
    # fail with an IndentationError. It is removed because the same loop already runs in
    # its own cell right after the prepare_data_for_adata calls.
#     print(f"Mean Squared Error: {mse:.4f}")
#     print(f"Pearson Correlation: {correlation:.4f}")
    
#     plt.figure(figsize=(10, 5))
#     plt.scatter(original_sample.values.flatten(), imputed_sample.values.flatten(), alpha=0.1)
#     plt.xlabel("Original Values")
#     plt.ylabel("Imputed Values")
#     plt.title("Original vs Imputed Values")
#     plt.show()

In [None]:
# # Main workflow
# def imputation_workflow(df, missing_features):
#     methylation_data, metadata = prepare_data(df, missing_features)
    
#     print("Exploratory Analysis:")
#     exploratory_analysis(methylation_data)
    
#     print("\nPerforming KNN Imputation...")
#     knn_imputed = knn_impute(methylation_data)
    
#     print("\nPerforming MICE with Random Forest Imputation...")
#     mice_rf_imputed = mice_rf_impute(methylation_data)
    
#     print("\nPerforming Deep Learning Imputation...")
#     dl_imputed = deep_learning_impute(methylation_data)
    
#     print("\nPerforming Ensemble Imputation...")
#     ensemble_imputed = ensemble_impute(methylation_data)
    
#     print("\nValidation and Sensitivity Analysis:")
#     validate_imputation(methylation_data, ensemble_imputed)
    
#     # Combine imputed data with metadata
#     final_imputed_data = pd.concat([ensemble_imputed, metadata], axis=1)
    
#     return final_imputed_data

In [None]:
# imputed_methylation_data = imputation_workflow(methylation_data_final_pd_prepared, grimage2_missing_features)