In [1]:
import os
import pickle 
import time
import traceback 
import warnings
from collections import defaultdict
import random 

import pandas as pd
import numpy as np
import networkx as nx 
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from scipy import stats
from joblib import Parallel, delayed
from tqdm import tqdm

ModuleNotFoundError: No module named 'networkx'

In [None]:
# Suppress specific warnings
# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=RuntimeWarning)


def process_data(otu_file_path, metadata_file_path, output_dir,
                 sample_id='Sample', condition_id='Study.Group'):
    """
    Read an OTU table and its metadata, align them on sample ID, and normalize
    the OTU table to per-sample relative abundances.

    The OTU table is read with its first column as the index (one row per
    sample, one column per OTU). Files whose name ends in ``.tsv`` are read as
    tab-separated; everything else is read as comma-separated.

    Processing steps:
      1. Validate that ``sample_id`` and ``condition_id`` exist in the metadata.
      2. Cast sample IDs to ``str`` on both sides and drop duplicate sample IDs
         (first occurrence wins) in the metadata AND in the OTU table, so the
         two returned frames always share the same index.
      3. Keep only samples present in both inputs.
      4. Drop OTUs that are zero in every sample and samples whose total
         abundance is (numerically) zero.
      5. Divide each row by its row sum.

    Side effects: creates ``<output_dir>/01_Processed_Data`` and writes
    ``feature_table_normalized.csv`` and ``metadata_aligned.csv`` into it.

    Parameters
    ----------
    otu_file_path : str
        Path to the OTU/feature table.
    metadata_file_path : str
        Path to the sample metadata table.
    output_dir : str
        Base output directory.
    sample_id : str
        Name of the metadata column holding sample IDs.
    condition_id : str
        Name of the metadata column holding the condition/group label.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The normalized feature table and the aligned metadata (indexed by
        sample ID, with ``condition_id`` cast to ``str``).

    Raises
    ------
    ValueError
        If a required metadata column is missing or no sample IDs are shared
        between the two inputs.
    """
    print(f" Reading OTU table at: {otu_file_path}")
    print(f" Reading metadata at: {metadata_file_path}")

    processed_data_dir = os.path.join(output_dir, "01_Processed_Data")
    os.makedirs(processed_data_dir, exist_ok=True)

    sep_otu = '\t' if otu_file_path.lower().endswith('.tsv') else ','
    sep_meta = '\t' if metadata_file_path.lower().endswith('.tsv') else ','

    otu_table = pd.read_csv(otu_file_path, index_col=0, sep=sep_otu)
    print(f"  Initial OTU table shape: {otu_table.shape}")
    metadata = pd.read_csv(metadata_file_path, sep=sep_meta)
    print(f"  Initial Metadata shape: {metadata.shape}")

    # Metadata validation
    if sample_id not in metadata.columns:
        raise ValueError(f"Sample ID column '{sample_id}' not found in metadata. Available columns: {metadata.columns.tolist()}")
    if condition_id not in metadata.columns:
        raise ValueError(f"Condition ID column '{condition_id}' not found in metadata. Available columns: {metadata.columns.tolist()}")

    print(f"  Using Sample ID column: '{sample_id}'")
    print(f"  Using Condition ID column: '{condition_id}'")

    metadata[sample_id] = metadata[sample_id].astype(str)

    if not metadata[sample_id].is_unique:
        num_duplicates = metadata[sample_id].duplicated().sum()
        print(f"  Warning: Sample ID column '{sample_id}' contains {num_duplicates} duplicate values. Keeping first occurrence of each.")
        metadata = metadata.drop_duplicates(subset=[sample_id], keep='first')
        print(f"  Metadata shape after dropping duplicates: {metadata.shape}")
    metadata = metadata.set_index(sample_id)

    otu_table.index = otu_table.index.astype(str)
    # A repeated sample row in the OTU table would make `.loc[common_samples]`
    # return more rows for the OTU table than for the metadata, silently
    # breaking the one-to-one alignment. Apply the same keep-first policy here.
    duplicated_otu_rows = otu_table.index.duplicated(keep='first')
    if duplicated_otu_rows.any():
        print(f"  Warning: OTU table contains {int(duplicated_otu_rows.sum())} duplicate sample IDs. Keeping first occurrence of each.")
        otu_table = otu_table.loc[~duplicated_otu_rows]
        print(f"  OTU table shape after dropping duplicates: {otu_table.shape}")

    common_samples = otu_table.index.intersection(metadata.index)
    print(f"  Found {len(common_samples)} common samples between OTU table and metadata.")
    if len(common_samples) == 0:
        raise ValueError("No common samples found. Check Sample ID matching and formatting")
    otu_table = otu_table.loc[common_samples]
    # Explicit copy: the condition column is rewritten below.
    metadata = metadata.loc[common_samples].copy()

    print(f"  Cleaned feature table shape: {otu_table.shape}")
    print(f"  Cleaned metadata shape: {metadata.shape}")

    otu_table = otu_table.astype(float)

    initial_otus = otu_table.shape[1]
    otu_table = otu_table.loc[:, (otu_table > 0).any(axis=0)]
    otus_removed = initial_otus - otu_table.shape[1]
    if otus_removed > 0:
        print(f"  Removed {otus_removed} OTUs that were all zero across samples.")

    sample_sums = otu_table.sum(axis=1)
    valid_samples_mask = sample_sums > 1e-9
    if (~valid_samples_mask).any():
        num_zero_sum = (~valid_samples_mask).sum()
        print(f"  Warning: Removing {num_zero_sum} samples with zero total abundance before normalization.")
        otu_table = otu_table.loc[valid_samples_mask]
        metadata = metadata.loc[valid_samples_mask].copy()
        sample_sums = sample_sums.loc[valid_samples_mask]

    # Normalize to relative abundance (each row sums to 1).
    final_otu_table = otu_table.div(sample_sums, axis=0)
    # Zero-sum rows were removed above, so this is only a safety net.
    final_otu_table = final_otu_table.fillna(0)
    print(f"  Normalized OTU table to relative abundances. Final shape: {final_otu_table.shape}")

    # Per-condition mean table: computed for the informational log line only.
    if not final_otu_table.empty and not metadata.empty:
        metadata[condition_id] = metadata[condition_id].astype(str)
        otu_table_grouped = final_otu_table.groupby(metadata[condition_id]).mean()
        print(f"  Grouped table shape (for info): {otu_table_grouped.shape}")

    # Save processed data
    ft_path = os.path.join(processed_data_dir, "feature_table_normalized.csv")
    meta_path = os.path.join(processed_data_dir, "metadata_aligned.csv")
    final_otu_table.to_csv(ft_path)
    metadata.to_csv(meta_path)
    print(f"  Saved normalized feature table to: {ft_path}")
    print(f"  Saved aligned metadata to: {meta_path}")

    print(" Data processing function finished.")

    return final_otu_table, metadata

In [None]:
def compute_sample_weights(sample_rel_otu):
    """
    Compute the pairwise weight matrix and co-occurrence matrix for one sample.

    A species counts as present when its abundance exceeds 1e-9. For every
    pair ``i < j`` the weight is ``2 / (abundance_j / abundance_i)``, i.e.
    ``2 * abundance_i / abundance_j``, computed only where that ratio is
    larger than 1e-9 in magnitude; the value is then mirrored to ``(j, i)`` so
    the returned matrix is symmetric. The diagonal and every pair involving an
    absent species ``i`` are zero.

    Parameters
    ----------
    sample_rel_otu : array-like, 1-D
        Abundances of every species in a single sample.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``weights`` - symmetric ``(n, n)`` float matrix of edge weights.
        ``cooccurrence`` - ``(n, n)`` integer matrix, 1 where both species are
        present in the sample (diagonal included), 0 otherwise.

    Raises
    ------
    ValueError
        If the input is not one-dimensional.
    """
    abundances = np.asarray(sample_rel_otu)
    if abundances.ndim != 1:
        raise ValueError(
            f"compute_sample_weights expects a 1-D abundance vector, got shape {abundances.shape}"
        )
    n_species = len(abundances)

    present = abundances > 1e-9
    presence_flags = present.astype(int)
    sample_binary_matrix = np.outer(presence_flags, presence_flags)

    # 1 / abundance for present species, 0 for absent ones.
    inverse_abundance = np.zeros_like(abundances, dtype=float)
    np.divide(1.0, abundances, where=present, out=inverse_abundance)

    # ratios[i, j] = abundance_j / abundance_i. A broadcast outer product gives
    # exactly the values of diag(1/a) @ tile(a) without the O(n^3) matrix
    # product and the two extra n-by-n temporaries.
    ratios = inverse_abundance[:, np.newaxis] * abundances[np.newaxis, :]

    # weights[i, j] = 2 / ratios[i, j], skipping the diagonal (no self-edges)
    # and entries whose ratio is effectively zero.
    off_diagonal = ~np.eye(n_species, dtype=bool)
    usable = (np.abs(ratios) > 1e-9) & off_diagonal
    weights = np.zeros_like(ratios)
    np.divide(2.0, ratios, where=usable, out=weights)

    # Symmetrize from the strict upper triangle: W'[i, j] = W'[j, i] = W[i, j] for i < j.
    upper = np.triu(weights, k=1)
    weights_new = upper + upper.T
    weights_new[~np.isfinite(weights_new)] = 0  # guard against inf/NaN from tiny ratios

    return weights_new, sample_binary_matrix

def compute_all_weights(otu_data_subset):
    """
    Average the per-sample weight matrices over a set of samples.

    Each row of the input (one sample) is passed to ``compute_sample_weights``.
    The resulting weight matrices and co-occurrence matrices are summed over
    all rows, and every summed weight is divided by the number of rows in
    which that pair of species co-occurred. Pairs that never co-occur get 0.

    Parameters
    ----------
    otu_data_subset : array-like, 2-D
        Samples in rows, species in columns (e.g. one bootstrap replicate).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The averaged ``(n_species, n_species)`` weight matrix and the summed
        co-occurrence count matrix of the same shape.
    """
    abundances = np.asarray(otu_data_subset)
    _, n_taxa = abundances.shape
    matrix_shape = (n_taxa, n_taxa)

    weight_sum = np.zeros(matrix_shape)
    cooc_sum = np.zeros(matrix_shape)

    for row in abundances:
        row_weights, row_cooc = compute_sample_weights(row)
        # Only accumulate results that have the expected square shape.
        if row_weights.shape != matrix_shape or row_cooc.shape != matrix_shape:
            continue
        weight_sum += row_weights
        cooc_sum += row_cooc

    # Mean weight per pair = summed weight / number of co-occurrences.
    mean_weights = np.zeros_like(weight_sum)
    with np.errstate(divide='ignore', invalid='ignore'):
        np.divide(weight_sum, cooc_sum, out=mean_weights, where=cooc_sum != 0)
    mean_weights[~np.isfinite(mean_weights)] = 0

    return mean_weights, cooc_sum



In [None]:
def create_bootstrap_population(observed_data_df, condition_group, output_dir_base,
                                  n_boots): 
    """
    Generate bootstrap weight matrices for one condition's samples.

    ``n_boots`` replicates are drawn, each by resampling the rows of
    ``observed_data_df`` with replacement (same number of rows as the input,
    using NumPy's global random state). Every replicate is reduced to an
    averaged weight matrix via ``compute_all_weights`` and written to
    ``<output_dir_base>/Intermediates/<condition_group>/matrices/
    bstrap_weight_matrix_<b>.csv``. The element-wise mean and (population)
    standard deviation across replicates are written to ``means.csv`` and
    ``stds.csv`` in the condition directory.

    Any ``bstrap_weight_matrix_*.csv`` files already present in the matrices
    directory are removed first, so a rerun with a smaller ``n_boots`` cannot
    leave stale replicates behind for the filtering step to pick up.

    Parameters
    ----------
    observed_data_df : pandas.DataFrame
        Samples in rows, species in columns, for a single condition.
    condition_group : str
        Condition label; used as a directory name.
    output_dir_base : str
        Base directory under which ``Intermediates/`` is created.
    n_boots : int
        Number of bootstrap replicates.

    Returns
    -------
    str or None
        Path of the condition's intermediate directory, or ``None`` when the
        input is too small or no valid replicate matrix was produced.
    """
    condition_group = str(condition_group)  # must be usable as a path component
    print(f" Bootstrapping {n_boots} replicates for condition: {condition_group}")
    # Define output path for intermediate bootstrap results
    condition_results_dir = os.path.join(output_dir_base, "Intermediates", condition_group)
    matrices_dir = os.path.join(condition_results_dir, "matrices")
    os.makedirs(matrices_dir, exist_ok=True)
    print(f"   Bootstrap intermediate matrices will be saved in: {matrices_dir}")

    raw_data = observed_data_df.to_numpy()
    n_samples, num_species = raw_data.shape
    if n_samples == 0 or num_species < 2:  # Need at least 2 species for interactions
        print(f"   ERROR: Not enough data for bootstrapping ({n_samples} samples, {num_species} species).")
        return None

    # Clear replicates left over from an earlier run: files with an index
    # >= n_boots would otherwise survive and be loaded by the filtering step.
    stale_files = [f for f in os.listdir(matrices_dir)
                   if f.startswith('bstrap_weight_matrix_') and f.endswith('.csv')]
    for stale_name in stale_files:
        os.remove(os.path.join(matrices_dir, stale_name))
    if stale_files:
        print(f"   Removed {len(stale_files)} stale bootstrap matrix file(s) from a previous run.")

    # One index vector per replicate: n_samples rows drawn with replacement.
    bootstrap_indices_list = [np.random.choice(n_samples, size=n_samples, replace=True) for _ in range(n_boots)]

    def process_bootstrap_sample(b, otu_sample_replicate):
        """Compute, save and return the averaged weight matrix of replicate ``b``."""
        w, _ = compute_all_weights(otu_sample_replicate)

        if w.shape != (num_species, num_species):
            print(f"   Warning: Shape mismatch for bootstrap replicate {b}. Expected ({num_species},{num_species}), got {w.shape}")
            # Return None (not a zero matrix) so the replicate is excluded
            # below instead of dragging the mean/std towards zero.
            return None

        matrix_path = os.path.join(matrices_dir, f"bstrap_weight_matrix_{b}.csv")
        np.savetxt(matrix_path, w, delimiter=",")
        return w

    # Replicate datasets are built lazily inside the generator so that only
    # the ones currently being dispatched are held in memory.
    progress = tqdm(bootstrap_indices_list, desc=f"  Bootstrapping {condition_group}", leave=False, ncols=100)
    results_matrices = Parallel(n_jobs=-1)(
        delayed(process_bootstrap_sample)(b, raw_data[indices])
        for b, indices in enumerate(progress)
    )

    valid_results_matrices = [m for m in results_matrices
                              if isinstance(m, np.ndarray) and m.shape == (num_species, num_species)]

    if not valid_results_matrices:
        print(f"   ERROR: No valid bootstrap matrices were generated for {condition_group}.")
        # Best-effort cleanup of directories that ended up empty.
        if not os.listdir(matrices_dir): os.rmdir(matrices_dir)
        if not os.listdir(condition_results_dir): os.rmdir(condition_results_dir)
        return None
    elif len(valid_results_matrices) < n_boots * 0.5:  # Warning if less than half succeeded
        print(f"   WARNING: Only {len(valid_results_matrices)}/{n_boots} bootstrap replicates generated valid matrices for {condition_group}.")

    print("   Calculating mean and std deviation across bootstrap matrices...")
    stacked_matrices = np.stack(valid_results_matrices, axis=0)

    # Element-wise mean and std of every edge weight across replicates.
    bstrap_means = np.mean(stacked_matrices, axis=0)
    bstrap_stds = np.std(stacked_matrices, axis=0)

    np.savetxt(os.path.join(condition_results_dir, "means.csv"), bstrap_means, delimiter=",")
    np.savetxt(os.path.join(condition_results_dir, "stds.csv"), bstrap_stds, delimiter=",")
    print(f"   Saved mean and std matrices to: {condition_results_dir}")
    return condition_results_dir

def _load_bootstrap_matrices(matrices_dir, bootstrap_files):
    """Load the listed bootstrap CSVs, keeping only 2-D matrices whose shape matches the first usable one."""
    expected_shape = None
    matrices = []
    for f in tqdm(bootstrap_files, desc="    Loading matrices", leave=False):
        m = np.loadtxt(os.path.join(matrices_dir, f), delimiter=",", dtype=float)
        if m.ndim == 0:
            m = np.array([[m]])
        if m.ndim != 2:
            continue  # e.g. a single-row file loads as 1-D; not a usable matrix

        if expected_shape is None:
            # The first matrix with at least 2 rows and 2 columns fixes the expected shape.
            if m.shape[0] < 2 or m.shape[1] < 2:
                print(f"   Warning: First loaded matrix {f} has shape {m.shape}. This might be incorrect. Resetting expected shape.")
                continue
            expected_shape = m.shape
            print(f"    Expected matrix shape: {expected_shape}")

        if m.shape == expected_shape:
            matrices.append(m)
        else:
            print(f"   Warning: Shape mismatch loading {f}. Expected {expected_shape}, got {m.shape}. Skipping.")
    return matrices


def _testable_edge_distributions(bs_data_3d):
    """Map each upper-triangle edge (i, j) to its finite bootstrap weights, for edges that can be t-tested.

    An edge is kept only when it has at least two finite bootstrap values and
    their standard deviation exceeds 1e-9; this is evaluated once here rather
    than once per sample.
    """
    num_species = bs_data_3d.shape[1]
    distributions = {}
    for i in range(num_species):
        for j in range(i + 1, num_species):
            edge_weights = bs_data_3d[:, i, j]
            finite_weights = edge_weights[np.isfinite(edge_weights)]
            if len(finite_weights) > 1 and np.std(finite_weights) > 1e-9:
                distributions[i, j] = finite_weights
    return distributions


def filtering_pvals_for_each_sample(df_cond, condition_group, bstrap_intermed_dir,
                                      output_dir_base, pval_thresh,
                                      delete_bootstrap_matrices=True):
    """
    Filter every sample's weight graph against the condition's bootstrap distributions.

    For each sample (row of ``df_cond``) the pairwise weights are computed with
    ``compute_sample_weights``. Every existing edge (i, j) is compared to the
    bootstrap distribution of that edge with a two-sided one-sample t-test
    (bootstrap values vs. the sample's weight as ``popmean``); the edge is
    removed when the p-value is finite and below ``pval_thresh``. Edges whose
    bootstrap distribution has fewer than two finite values or negligible
    spread are never tested and are kept.

    Outputs written:
      * ``<output_dir_base>/Individual_Samples_by_Condition/<condition>/<sample>/``
        ``weights_filtered.csv`` and ``graph_filtered.graphml`` per sample
        (nodes = species with abundance > 1e-9, attribute ``relab``).
      * ``<output_dir_base>/Condition_Summaries/filtering_summary_<condition>.csv``
        and ``feature_table_filtered_<condition>.csv``.

    Parameters
    ----------
    df_cond : pandas.DataFrame
        Samples in rows, species in columns, for one condition.
    condition_group : str
        Condition label; used in directory and file names.
    bstrap_intermed_dir : str
        Directory containing the ``matrices/`` folder of bootstrap CSVs.
    output_dir_base : str
        Base directory for this run's outputs.
    pval_thresh : float
        Edges with p-value below this threshold are removed.
    delete_bootstrap_matrices : bool
        When true, the individual bootstrap CSVs are deleted after they have
        been loaded and validated against ``df_cond``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame) or (None, None)
        The filtered feature table and the per-sample summary table, or
        ``(None, None)`` when no usable bootstrap matrices are available.
    """
    condition_group = str(condition_group)  # must be usable as a path component
    print(f"  Filtering individual samples for condition: {condition_group} (p-value < {pval_thresh} removes edge)")
    matrices_dir = os.path.join(bstrap_intermed_dir, "matrices")

    condition_summary_dir = os.path.join(output_dir_base, "Condition_Summaries")
    sample_results_base_dir = os.path.join(output_dir_base, "Individual_Samples_by_Condition", condition_group)
    os.makedirs(condition_summary_dir, exist_ok=True)
    os.makedirs(sample_results_base_dir, exist_ok=True)

    # --- Load the bootstrap population -------------------------------------
    bootstrap_files = []
    if os.path.isdir(matrices_dir):  # a missing directory is reported like an empty one
        bootstrap_files = [f for f in os.listdir(matrices_dir)
                           if f.startswith('bstrap_weight_matrix_') and f.endswith('.csv')]
    if not bootstrap_files:
        print(f"   ERROR: No bootstrap matrix files found in {matrices_dir}")
        return None, None

    print(f"   Loading bootstrap matrices from {matrices_dir}...")
    bootstrap_matrices = _load_bootstrap_matrices(matrices_dir, bootstrap_files)

    if not bootstrap_matrices:
        print(f"   ERROR: Could not load ANY valid bootstrap matrices with consistent shape from {matrices_dir}. Check intermediate files and bootstrapping logs.")
        if os.path.exists(matrices_dir) and not os.listdir(matrices_dir): os.rmdir(matrices_dir)
        if os.path.exists(bstrap_intermed_dir) and not os.listdir(bstrap_intermed_dir): os.rmdir(bstrap_intermed_dir)
        return None, None

    bs_data_3d = np.stack(bootstrap_matrices, axis=0)
    del bootstrap_matrices
    print(f"   Loaded {bs_data_3d.shape[0]} bootstrap matrices into memory.")

    # Validate against the condition data BEFORE deleting anything, so a
    # mismatch does not also destroy the intermediates needed to diagnose it.
    data_cond = df_cond.to_numpy()
    num_samples, num_species = data_cond.shape
    if bs_data_3d.shape[1:] != (num_species, num_species):
        print(f"   ERROR: Dimension mismatch or missing bootstrap data after stacking. Bootstrap shape {bs_data_3d.shape}, "
              f"condition data has {num_species} species. Cannot proceed.")
        return None, None

    # Optional deletion of the per-replicate files (they are in memory now).
    if delete_bootstrap_matrices:
        print(f"     Attempting to delete individual bootstrap matrix files from {matrices_dir}...")
        for f in bootstrap_files:
            file_path = os.path.join(matrices_dir, f)
            if os.path.exists(file_path):
                os.remove(file_path)
        if os.path.exists(matrices_dir) and not os.listdir(matrices_dir):
            os.rmdir(matrices_dir)

    species_names = df_cond.columns

    print("   Pre-calculating bootstrap distributions for edges...")
    bs_weight_distributions = _testable_edge_distributions(bs_data_3d)

    # The per-edge arrays are copies, so the stacked array can be dropped.
    del bs_data_3d
    print("   Bootstrap distributions extracted, memory released (probably).")

    # --- Filter each sample -------------------------------------------------
    print(f"   Processing {num_samples} samples for filtering...")
    filtering_summary_info = {}
    filtered_otu_table_data = df_cond.copy()
    samples_processed_count = 0

    for counter in tqdm(range(num_samples), desc=f"     Filtering {condition_group}"):
        sample_name = df_cond.index[counter]
        sample_data_rel = data_cond[counter, :]

        sample_output_dir = os.path.join(sample_results_base_dir, str(sample_name))
        os.makedirs(sample_output_dir, exist_ok=True)

        sample_weights_unfiltered, _ = compute_sample_weights(sample_data_rel)
        filtered_sample_weights = sample_weights_unfiltered.copy()

        # Existing edges: non-zero, finite weights in the strict upper triangle.
        # argwhere yields them in the same row-major order as a nested i<j loop.
        edge_mask = np.triu(
            (np.abs(sample_weights_unfiltered) > 1e-9) & np.isfinite(sample_weights_unfiltered),
            k=1,
        )
        edge_indices = np.argwhere(edge_mask).tolist()
        total_edges_in_sample = len(edge_indices)

        for i, j in edge_indices:
            bs_dist = bs_weight_distributions.get((i, j))
            if bs_dist is None:
                continue  # no testable bootstrap distribution: keep the edge

            with np.errstate(invalid='ignore'):
                _, p_val = stats.ttest_1samp(a=bs_dist, popmean=sample_weights_unfiltered[i, j],
                                             alternative='two-sided', nan_policy='omit')

            # Remove the edge when the p-value is finite and significant.
            if np.isfinite(p_val) and p_val < pval_thresh:
                filtered_sample_weights[i, j] = filtered_sample_weights[j, i] = 0

        # Save the filtered weights matrix for this sample
        filtered_matrix_path = os.path.join(sample_output_dir, "weights_filtered.csv")
        np.savetxt(filtered_matrix_path, filtered_sample_weights, delimiter=",")

        # Build the filtered graph: one node per present species ...
        G_filtered = nx.Graph()
        for idx in np.where(sample_data_rel > 1e-9)[0]:
            G_filtered.add_node(species_names[idx], relab=sample_data_rel[idx])
        nodes_in = G_filtered.number_of_nodes()

        # ... and one edge per surviving weight between two present species.
        kept_mask = np.triu(
            (np.abs(filtered_sample_weights) > 1e-9) & np.isfinite(filtered_sample_weights),
            k=1,
        )
        for i, j in np.argwhere(kept_mask).tolist():
            node_i, node_j = species_names[i], species_names[j]
            if node_i in G_filtered and node_j in G_filtered:
                G_filtered.add_edge(node_i, node_j, weight=filtered_sample_weights[i, j])

        # Save filtered graphml
        filtered_graph_path = os.path.join(sample_output_dir, "graph_filtered.graphml")
        nx.write_graphml(G_filtered, filtered_graph_path)

        # Zero out, in the filtered table, every species that is not a node of the graph.
        remaining_nodes = list(G_filtered.nodes())
        cols_to_zero_out = df_cond.columns[~df_cond.columns.isin(remaining_nodes)]
        if not cols_to_zero_out.empty:
            filtered_otu_table_data.loc[sample_name, cols_to_zero_out] = 0

        edges_kept_count = G_filtered.number_of_edges()
        edges_filtered_out_actual = total_edges_in_sample - edges_kept_count

        filtering_summary_info[sample_name] = {
            "nodes_in": nodes_in,  # Nodes initially present with abundance > 0
            "edges_unfiltered": total_edges_in_sample,  # Edges calculated from original sample abundances
            "edges_removed_by_pval": edges_filtered_out_actual,  # Edges absent from the final graph
            "prop_removed": (edges_filtered_out_actual / total_edges_in_sample if total_edges_in_sample > 0 else 0),
            "nodes_out": G_filtered.number_of_nodes(),  # Nodes remaining in the final graph
            "edges_out": edges_kept_count  # Edges remaining in the final graph
        }
        samples_processed_count += 1

    # --- Condition-level outputs -------------------------------------------
    print(f"   Saving condition-level summaries for {condition_group}...")
    summary_df = pd.DataFrame.from_dict(filtering_summary_info, orient="index")
    summary_csv_path = os.path.join(condition_summary_dir, f"filtering_summary_{condition_group}.csv")

    summary_df.to_csv(summary_csv_path)
    print(f"     Saved filtering summary: {summary_csv_path}")

    filtered_otu_table_path = os.path.join(condition_summary_dir, f"feature_table_filtered_{condition_group}.csv")
    filtered_otu_table_data.to_csv(filtered_otu_table_path)
    print(f"     Filtered OTU table saved: {filtered_otu_table_path}")

    print(f"  Finished filtering for condition {condition_group}. Samples processed: {samples_processed_count}/{num_samples}")
    return filtered_otu_table_data, summary_df




In [None]:
def run_bootstrap_filtering_per_condition(feature_table, metadata, output_dir,
                                            condition_id_col, num_bootstraps,
                                            p_value_threshold, min_samples_bootstrap):
    """
    Run bootstrapping and p-value filtering for every condition in the metadata.

    For each unique value of ``metadata[condition_id_col]`` the samples that
    are also present in ``feature_table`` are selected. Conditions with fewer
    than ``min_samples_bootstrap`` such samples are skipped. For the others,
    ``create_bootstrap_population`` is called, followed by
    ``filtering_pvals_for_each_sample`` (with bootstrap-matrix deletion on).
    A condition counts as processed only when both steps return results; a
    failure in either step counts it as skipped.

    All outputs go under
    ``<output_dir>/02_Bootstrap_Filtering_(P_<p>_N_<n>)``.

    Parameters
    ----------
    feature_table : pandas.DataFrame
        Samples in rows, species in columns.
    metadata : pandas.DataFrame
        Indexed by sample ID; must contain ``condition_id_col``.
    output_dir : str
        Base output directory.
    condition_id_col : str
        Metadata column holding the condition label.
    num_bootstraps : int
        Number of bootstrap replicates per condition.
    p_value_threshold : float
        Edges with a p-value below this are removed.
    min_samples_bootstrap : int
        Minimum number of samples a condition needs to be processed.

    Returns
    -------
    (dict, dict)
        ``{condition: filtered feature table}`` and
        ``{condition: filtering summary table}`` for the processed conditions.
    """
    print(f"\n--- Running Bootstrap P-Value Filtering Workflow ---")

    # Create base directory for this run's parameters
    run_params_str = f"P_{p_value_threshold}_N_{num_bootstraps}"
    bootstrap_base_dir = os.path.join(output_dir, f"02_Bootstrap_Filtering_({run_params_str})")

    os.makedirs(bootstrap_base_dir, exist_ok=True)
    print(f" Bootstrap filtering outputs will be saved under: '{bootstrap_base_dir}'")

    all_conditions = metadata[condition_id_col].unique()

    print(f" Processing {len(all_conditions)} conditions based on: '{condition_id_col}'")
    print(f" Parameters: Bootstraps={num_bootstraps}, P-Value Threshold={p_value_threshold}, Min Samples={min_samples_bootstrap}")

    all_filtered_otus = {}
    all_summaries = {}
    conditions_processed_count = 0
    conditions_skipped_count = 0

    for condition_group in all_conditions:
        condition_sample_ids = metadata[metadata[condition_id_col] == condition_group].index
        # Only samples that also exist in the feature table can be used.
        valid_condition_sample_ids = condition_sample_ids.intersection(feature_table.index)

        if len(valid_condition_sample_ids) < min_samples_bootstrap:
            print(f" Skipping condition '{condition_group}': Only {len(valid_condition_sample_ids)} valid samples found (min required: {min_samples_bootstrap}).")
            conditions_skipped_count += 1
            continue

        df_cond = feature_table.loc[valid_condition_sample_ids]

        print("  Starting bootstrap population generation...")
        bstrap_intermed_dir = create_bootstrap_population(
            observed_data_df=df_cond,
            condition_group=condition_group,
            output_dir_base=bootstrap_base_dir,
            n_boots=num_bootstraps
        )

        if bstrap_intermed_dir is None:
            print(f"  ERROR: Bootstrapping failed for condition '{condition_group}'. Skipping filtering step.")
            conditions_skipped_count += 1
            continue

        print("  Bootstrap population generation finished.")

        print("  Starting p-value filtering for samples...")
        filtered_otu_table, summary_df = filtering_pvals_for_each_sample(
            df_cond=df_cond,
            condition_group=condition_group,
            bstrap_intermed_dir=bstrap_intermed_dir,  # Path to intermediates (matrices/, means.csv, stds.csv)
            output_dir_base=bootstrap_base_dir,  # Pass the run-specific base dir
            pval_thresh=p_value_threshold,
            delete_bootstrap_matrices=True  # Clean up the per-replicate files
        )

        # The filtering step signals failure with None results; such a
        # condition must not be reported as finished or counted as processed.
        if filtered_otu_table is None or summary_df is None:
            print(f"  ERROR: P-value filtering failed for condition '{condition_group}'.")
            conditions_skipped_count += 1
            continue

        print("  P-value filtering for samples finished.")
        all_filtered_otus[condition_group] = filtered_otu_table
        all_summaries[condition_group] = summary_df
        conditions_processed_count += 1

    # --- Workflow Finish ---
    print("\n" + "="*50)
    print("Bootstrap P-Value Filtering Workflow Finished.")
    print(f" Conditions processed: {conditions_processed_count}")
    print(f" Conditions skipped (due to sample size or errors): {conditions_skipped_count}")
    print("="*50)
    return all_filtered_otus, all_summaries



In [None]:
from tqdm import tqdm

if __name__ == "__main__":
    # Driver cell: configures the run, optionally (re)builds the normalized
    # feature table + aligned metadata, then optionally runs the per-condition
    # bootstrap p-value filtering. All results are written below `output_dir`.

    print("--- Configuration Settings ---")

    # --- Input locations and metadata column names ---
    otu_file_path = '../data/rvc/OTU_table_full.csv'
    metadata_file_path = '../data/rvc/metadata.tsv'
    sample_id_col = 'Sample-ID'
    condition_id_col = 'Group ID'

    # Base output directory for all results
    output_dir = "outputs"

    # --- Workflow switches ---
    run_feature_processing = True
    run_bootstrap_pval_filtering = True

    # --- Bootstrap filtering parameters ---
    num_bootstraps_pval = 10
    pval_threshold = 0.05
    min_samples_pval = 5  # Minimum samples required per condition for bootstrapping

    # Seed for the global random number generators (applied below).
    random_seed = 42

    # Print Configuration
    print(f"OTU Table Path: {otu_file_path}")
    print(f"Metadata Path: {metadata_file_path}")
    print(f"Sample ID Column: '{sample_id_col}'")
    print(f"Condition ID Column: '{condition_id_col}'")
    print(f"Base Output Directory: '{output_dir}'")
    print(f"Random Seed: {random_seed}")
    print("\nWorkflow Selection:")
    print(f"  Run Feature Processing & Alignment: {run_feature_processing}")
    print(f"  Run Bootstrap P-Value Filtering (p<{pval_threshold}, N={num_bootstraps_pval}, min_samples={min_samples_pval}): {run_bootstrap_pval_filtering}")
    print("-" * 30)

    # Actually apply the seed: previously `random_seed` was assigned but never
    # used, so every run drew different bootstrap replicates.
    # NOTE(review): this seeds the stdlib and NumPy *global* generators only. If
    # the bootstrap helpers use their own Generator objects or run in separate
    # worker processes, the seed must also be passed through to them -- confirm.
    random.seed(random_seed)
    np.random.seed(random_seed)

    # Workflow
    print("\n--- Starting Data Processing and Analysis Workflow ---")
    print(f"Script started at: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    overall_start_time = time.time()

    feature_table = None
    metadata = None

    # Create base output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    print(f"\nOutput directory confirmed: '{os.path.abspath(output_dir)}'")

    # 1. Build (or reload) the normalized feature table and aligned metadata
    if run_feature_processing:
        print("\n[Step 1] Processing input data...")
        feature_table, metadata = process_data(
            otu_file_path, metadata_file_path, output_dir,
            sample_id=sample_id_col,
            condition_id=condition_id_col
        )

        # Fail fast: every later step needs both tables to be non-empty.
        if feature_table is None or metadata is None or feature_table.empty or metadata.empty:
            raise ValueError("Data processing failed to return valid feature table or metadata.")
        print("[Step 1] Data processing finished.")
        print("-"*50)
    else:
        print("\n[Step 1] Processing input data SKIPPED")
        print("   Attempting to load previously processed data...")
        ft_path = os.path.join(output_dir, "01_Processed_Data", "feature_table_normalized.csv")
        meta_path = os.path.join(output_dir, "01_Processed_Data", "metadata_aligned.csv")

        # Give an actionable error instead of a bare read_csv failure when the
        # cached files from an earlier run are not there.
        for cached_path in (ft_path, meta_path):
            if not os.path.isfile(cached_path):
                raise FileNotFoundError(
                    f"Previously processed file not found: '{cached_path}'. "
                    "Set run_feature_processing = True to regenerate it."
                )

        feature_table = pd.read_csv(ft_path, index_col=0)
        metadata = pd.read_csv(meta_path, index_col=0)
        # Ensure indices are strings after loading
        feature_table.index = feature_table.index.astype(str)
        metadata.index = metadata.index.astype(str)
        print(f"   Loaded feature table {feature_table.shape} and metadata {metadata.shape}")
        print("-"*50)

    # 2. Run Bootstrap P-Value Filtering
    if run_bootstrap_pval_filtering:
        print(f"\n[Step 2] Running Bootstrap P-Value Filtering (p < {pval_threshold}, N = {num_bootstraps_pval})...")

        # Returns two dicts keyed by condition: filtered tables and summaries.
        filtered_otu_tables_by_cond, filter_summaries_by_cond = run_bootstrap_filtering_per_condition(
            feature_table=feature_table,
            metadata=metadata,
            output_dir=output_dir,
            condition_id_col=condition_id_col,
            num_bootstraps=num_bootstraps_pval,
            p_value_threshold=pval_threshold,
            min_samples_bootstrap=min_samples_pval
        )
        print("[Step 2] Bootstrap P-Value Filtering finished.")
        print("-"*50)
    else:
        print("\n[Step 2] Bootstrap P-Value Filtering Workflow SKIPPED")
        print("-"*50)

    # --- Final summary ---
    overall_end_time = time.time()
    print("\n" + "="*50)
    print("--- Workflow Finished ---")
    print(f"Total execution time: {overall_end_time - overall_start_time:.2f} seconds")
    print(f"Output saved in base directory: {os.path.abspath(output_dir)}")
    print("Check subdirectories for results from enabled analyses:")
    if run_feature_processing:
        print(" - 01_Processed_Data/")
    if run_bootstrap_pval_filtering:
        run_params_str = f"P_{pval_threshold}_N_{num_bootstraps_pval}"
        print(f" - 02_Bootstrap_Filtering_({run_params_str})/")
    print("="*50)

    print(f"\nScript finished at: {time.strftime('%Y-%m-%d %H:%M:%S')}")

--- Configuration Settings ---
OTU Table Path: ../data/rvc/OTU_table_full.csv
Metadata Path: ../data/rvc/metadata.tsv
Sample ID Column: 'Sample-ID'
Condition ID Column: 'Group ID'
Base Output Directory: 'outputs'

Workflow Selection:
  Run Feature Processing & Alignment: True
  Run Bootstrap P-Value Filtering (p<0.05, N=10, min_samples=5): True
------------------------------

--- Starting Data Processing and Analysis Workflow ---
Script started at: 2025-04-25 13:36:10

Output directory confirmed: '/Users/nandini.gadhia/Documents/projects/ot_omics/notebooks/outputs'

[Step 1] Processing input data...
 Reading OTU table at: ../data/rvc/OTU_table_full.csv
 Reading metadata at: ../data/rvc/metadata.tsv
  Initial OTU table shape: (88, 740)
  Initial Metadata shape: (88, 8)
  Using Sample ID column: 'Sample-ID'
  Using Condition ID column: 'Group ID'
  Found 88 common samples between OTU table and metadata.
  Cleaned feature table shape: (88, 740)
  Cleaned metadata shape: (88, 7)
  Normaliz

                                                                                                    

   Calculating mean and std deviation across bootstrap matrices...
   Saved mean and std matrices to: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-C
  Bootstrap population generation finished.
  Starting p-value filtering for samples...
  Filtering individual samples for condition: UV-C (p-value < 0.05 removes edge)
   Loading bootstrap matrices from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-C/matrices...


    Loading matrices:  60%|██████    | 6/10 [00:00<00:00, 27.04it/s]

    Expected matrix shape: (740, 740)


                                                                    

   Loaded 10 bootstrap matrices into memory.
     Attempting to delete individual bootstrap matrix files from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-C/matrices...
   Pre-calculating bootstrap distributions for edges...
   Bootstrap distributions extracted, memory released (probably).
   Processing 20 samples for filtering...


                                                                                                    

   Saving condition-level summaries for UV-C...
     Saved filtering summary: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/filtering_summary_UV-C.csv
     Filtered OTU table saved: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/feature_table_filtered_UV-C.csv
  Finished filtering for condition UV-C. Samples processed: 20/20
  P-value filtering for samples finished.
  Starting bootstrap population generation...
 Bootstrapping 10 replicates for condition: UV-UC
   Bootstrap intermediate matrices will be saved in: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-UC/matrices


                                                                                                    

   Calculating mean and std deviation across bootstrap matrices...
   Saved mean and std matrices to: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-UC
  Bootstrap population generation finished.
  Starting p-value filtering for samples...
  Filtering individual samples for condition: UV-UC (p-value < 0.05 removes edge)
   Loading bootstrap matrices from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-UC/matrices...


    Loading matrices:  60%|██████    | 6/10 [00:00<00:00, 26.79it/s]

    Expected matrix shape: (740, 740)


                                                                    

   Loaded 10 bootstrap matrices into memory.
     Attempting to delete individual bootstrap matrix files from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-UC/matrices...
   Pre-calculating bootstrap distributions for edges...
   Bootstrap distributions extracted, memory released (probably).
   Processing 20 samples for filtering...


                                                                                                    

   Saving condition-level summaries for UV-UC...
     Saved filtering summary: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/filtering_summary_UV-UC.csv
     Filtered OTU table saved: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/feature_table_filtered_UV-UC.csv
  Finished filtering for condition UV-UC. Samples processed: 20/20
  P-value filtering for samples finished.
  Starting bootstrap population generation...
 Bootstrapping 10 replicates for condition: MV-C
   Bootstrap intermediate matrices will be saved in: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/MV-C/matrices


                                                                                                    

   Calculating mean and std deviation across bootstrap matrices...
   Saved mean and std matrices to: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/MV-C
  Bootstrap population generation finished.
  Starting p-value filtering for samples...
  Filtering individual samples for condition: MV-C (p-value < 0.05 removes edge)
   Loading bootstrap matrices from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/MV-C/matrices...


    Loading matrices:  30%|███       | 3/10 [00:00<00:00, 25.51it/s]

    Expected matrix shape: (740, 740)


                                                                    

   Loaded 10 bootstrap matrices into memory.
     Attempting to delete individual bootstrap matrix files from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/MV-C/matrices...
   Pre-calculating bootstrap distributions for edges...
   Bootstrap distributions extracted, memory released (probably).
   Processing 20 samples for filtering...


                                                                                                    

   Saving condition-level summaries for MV-C...
     Saved filtering summary: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/filtering_summary_MV-C.csv
     Filtered OTU table saved: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/feature_table_filtered_MV-C.csv
  Finished filtering for condition MV-C. Samples processed: 20/20
  P-value filtering for samples finished.
  Starting bootstrap population generation...
 Bootstrapping 10 replicates for condition: V-C
   Bootstrap intermediate matrices will be saved in: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/V-C/matrices


                                                                                                    

   Calculating mean and std deviation across bootstrap matrices...
   Saved mean and std matrices to: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/V-C
  Bootstrap population generation finished.
  Starting p-value filtering for samples...
  Filtering individual samples for condition: V-C (p-value < 0.05 removes edge)
   Loading bootstrap matrices from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/V-C/matrices...


    Loading matrices:  30%|███       | 3/10 [00:00<00:00, 24.70it/s]

    Expected matrix shape: (740, 740)


                                                                    

   Loaded 10 bootstrap matrices into memory.
     Attempting to delete individual bootstrap matrix files from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/V-C/matrices...
   Pre-calculating bootstrap distributions for edges...
   Bootstrap distributions extracted, memory released (probably).
   Processing 20 samples for filtering...


                                                                                                    

   Saving condition-level summaries for V-C...
     Saved filtering summary: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/filtering_summary_V-C.csv
     Filtered OTU table saved: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/feature_table_filtered_V-C.csv
  Finished filtering for condition V-C. Samples processed: 20/20
  P-value filtering for samples finished.
  Starting bootstrap population generation...
 Bootstrapping 10 replicates for condition: UV-C10
   Bootstrap intermediate matrices will be saved in: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-C10/matrices


                                                                                                    

   Calculating mean and std deviation across bootstrap matrices...
   Saved mean and std matrices to: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-C10
  Bootstrap population generation finished.
  Starting p-value filtering for samples...
  Filtering individual samples for condition: UV-C10 (p-value < 0.05 removes edge)
   Loading bootstrap matrices from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-C10/matrices...


    Loading matrices:  60%|██████    | 6/10 [00:00<00:00, 28.96it/s]

    Expected matrix shape: (740, 740)


                                                                    

   Loaded 10 bootstrap matrices into memory.
     Attempting to delete individual bootstrap matrix files from outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Intermediates/UV-C10/matrices...
   Pre-calculating bootstrap distributions for edges...
   Bootstrap distributions extracted, memory released (probably).
   Processing 8 samples for filtering...


                                                                                                    

   Saving condition-level summaries for UV-C10...
     Saved filtering summary: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/filtering_summary_UV-C10.csv
     Filtered OTU table saved: outputs/02_Bootstrap_Filtering_(P_0.05_N_10)/Condition_Summaries/feature_table_filtered_UV-C10.csv
  Finished filtering for condition UV-C10. Samples processed: 8/8
  P-value filtering for samples finished.

Bootstrap P-Value Filtering Workflow Finished.
 Conditions processed: 5
 Conditions skipped (due to sample size or errors): 0
[Step 2] Bootstrap P-Value Filtering finished.
--------------------------------------------------

--- Workflow Finished ---
Total execution time: 169.37 seconds
Output saved in base directory: /Users/nandini.gadhia/Documents/projects/ot_omics/notebooks/outputs
Check subdirectories for results from enabled analyses:
 - 01_Processed_Data/
 - 02_Bootstrap_Filtering_(P_0.05_N_10)/

Script finished at: 2025-04-25 13:39:00


