# In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Load the combined dataset once, when this module's top-level code runs.
# The resulting module-level ``df`` is what the ``__main__`` guard hands to main().
input_path = "G:/Pipeline/version2/combined_filtered_whole_dataset_v1.csv"
df = pd.read_csv(input_path)

def load_and_preprocess_data(df, imputation_strategy='mean', fill_value=None):
    """Impute missing values in the numeric features of ``df`` and standardize them.

    Args:
        df: DataFrame containing a ``patient_id`` column plus feature columns.
        imputation_strategy: One of ``'mean'``, ``'median'``,
            ``'most_frequent'``, ``'constant'``, ``'knn'`` or ``'iterative'``.
        fill_value: Replacement value for missing entries. Required when
            ``imputation_strategy='constant'``; ignored by the other strategies.

    Returns:
        A tuple ``(scaled_features, patient_ids)`` where ``scaled_features`` is
        the imputed feature array after ``StandardScaler`` and ``patient_ids``
        is the ``patient_id`` column of ``df``.

    Raises:
        KeyError: If ``df`` has no ``patient_id`` column.
        ValueError: If the strategy is not supported, or if ``'constant'`` is
            requested without a ``fill_value``.
    """
    patient_ids = df['patient_id']

    # Keep numeric and boolean columns only. Merely excluding ``object`` would
    # let datetime/category columns through to the imputer, and a numeric
    # ``patient_id`` would be treated as a feature, so the identifier is
    # dropped explicitly before selecting dtypes.
    df_numerical = (
        df.drop(columns='patient_id')
        .select_dtypes(include=['number', 'bool'])
    )

    # Build the imputer for the requested strategy.
    if imputation_strategy == 'constant':
        if fill_value is None:
            # Previously this case fell through to the generic "unsupported
            # strategy" error, which hid the real problem.
            raise ValueError(
                "imputation_strategy='constant' requires a fill_value."
            )
        imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
    elif imputation_strategy in ('mean', 'median', 'most_frequent'):
        imputer = SimpleImputer(strategy=imputation_strategy)
    elif imputation_strategy == 'knn':
        imputer = KNNImputer(n_neighbors=5, weights='uniform')
    elif imputation_strategy == 'iterative':
        imputer = IterativeImputer(max_iter=10, random_state=42)
    else:
        raise ValueError("Unsupported imputation strategy provided.")

    df_imputed = imputer.fit_transform(df_numerical)

    # Standardize the imputed features.
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df_imputed)

    return scaled_features, patient_ids

def perform_pca(data, variance_threshold=0.95):
    """Fit a PCA on ``data`` and return the projected data with the fitted model.

    ``variance_threshold`` is passed straight through as ``n_components`` of
    ``PCA``. Returns the tuple ``(principal_components, pca)``.
    """
    reducer = PCA(n_components=variance_threshold)
    projected = reducer.fit_transform(data)
    return projected, reducer

# Defaults carried over from the previously hard-coded values, so that a plain
# ``main(df)`` call selects the same patients and writes to the same file.
_DEFAULT_SELECTED_PATIENTS = (
    "me_407_0_0", "me_409_0_0", "me_410_0_0", "me_414_0_0", "me_420_0_0",
    "me_424_0_0", "me_434_0_0", "me_437_0_0", "me_438_0_0", "me_440_0_0",
    "me_442_0_0", "me_443_0_0", "me_444_0_0", "me_445_0_0", "me_446_0_0",
    "me_448_0_0", "me_449_0_0", "me_451_0_0", "me_456_0_0",
)
_DEFAULT_OUTPUT_PATH = 'G:/Pipeline/version2/selected_patients_principal_components.csv'


def main(df, variance_threshold=0.95, imputation_strategy='mean', fill_value=None,
         selected_patients=None, output_path=None):
    """Run preprocessing and PCA on ``df`` and save the selected patients' components.

    The PCA is fitted on every row of ``df``; only the rows whose
    ``patient_id`` is in ``selected_patients`` are written to the CSV.

    Args:
        df: Input DataFrame; must contain a ``patient_id`` column.
        variance_threshold: Forwarded to ``perform_pca``.
        imputation_strategy: Forwarded to ``load_and_preprocess_data``.
        fill_value: Forwarded to ``load_and_preprocess_data``.
        selected_patients: Iterable of patient IDs to export. ``None`` uses
            the built-in default list.
        output_path: Destination CSV path. ``None`` uses the built-in default
            path.
    """
    import warnings

    np.random.seed(42)  # For reproducibility

    patients = list(
        _DEFAULT_SELECTED_PATIENTS if selected_patients is None else selected_patients
    )
    destination = _DEFAULT_OUTPUT_PATH if output_path is None else output_path

    scaled_features, patient_ids = load_and_preprocess_data(
        df, imputation_strategy, fill_value
    )

    # Perform PCA on the entire dataset, not just the selected patients.
    principal_components, pca = perform_pca(scaled_features, variance_threshold)

    # One column per retained component, named PC1..PCk, plus the patient ID.
    num_components = principal_components.shape[1]
    principal_df = pd.DataFrame(
        data=principal_components,
        columns=[f'PC{i+1}' for i in range(num_components)],
    )
    # ``.values`` ignores the original index so IDs align by row position.
    principal_df['patient_id'] = patient_ids.values

    # Surface requested IDs that are absent instead of silently exporting
    # fewer rows than expected.
    present_ids = set(principal_df['patient_id'])
    missing = [pid for pid in patients if pid not in present_ids]
    if missing:
        warnings.warn(
            f"{len(missing)} selected patient(s) not found in the data: {missing}",
            stacklevel=2,
        )

    # Keep only the selected patients and write their components to disk.
    principal_df_selected = principal_df[principal_df['patient_id'].isin(patients)]
    principal_df_selected.to_csv(destination, index=False)

# When executed as a script, run the pipeline with its default settings on the
# DataFrame loaded at module level.
if __name__ == "__main__":
    main(df)
