# Import and sandbox

In [2]:
import numpy as np
import pandas as pd
import torch 
from tqdm import tqdm
import matplotlib.pyplot as plt
import itertools
import yaml

In [3]:

def get_method_variations(method):
    """Enumerate every parameterised variant of one transformation method.

    Args:
        method: Name of the transformation (e.g. ``'apply_rescale'``).

    Returns:
        A list of dicts. Each dict has a ``'method'`` key holding *method*
        and, for methods with options, a ``'params'`` dict holding one
        combination of option values. A method name that is not handled
        below produces an empty list.
    """
    if method == 'apply_inverse_normal_transform':
        # No options to vary: one variant, and no 'params' entry at all.
        return [{'method': method}]

    if method == 'apply_robust_Z_score':
        # 'mad' is a further reduce_by candidate that is not enumerated here.
        grid = itertools.product(
            ['mean', 'median'],   # center_by
            ['iqrs', 'std'],      # reduce_by
            [True, False],        # use_control
        )
        return [
            {
                'method': method,
                'params': {
                    'center_by': center_by,
                    'reduce_by': reduce_by,
                    'use_control': use_control,
                },
            }
            for center_by, reduce_by, use_control in grid
        ]

    if method == 'apply_rescale':
        return [
            {'method': method, 'params': {'scale': scale}}
            for scale in ('0-1', '-1-1')
        ]

    if method == 'apply_spherizing_transform':
        # 'PCA-cor' is a further spherizing candidate that is not enumerated here.
        grid = itertools.product(
            ['PCA', 'ZCA', 'ZCA-cor'],  # spherizing method
            [True, False],              # norm_embeddings
            [True, False],              # use_control
        )
        return [
            {
                'method': method,
                'params': {
                    # The inner 'method' key names the spherizing flavour,
                    # not the transformation itself.
                    'method': s_method,
                    'norm_embeddings': norm_embeddings,
                    'use_control': use_control,
                },
            }
            for s_method, norm_embeddings, use_control in grid
        ]

    # Any other method name has no known variants.
    return []

# Function to check if a sequence is valid based on the constraint
def is_valid_method_sequence(sequence):
    """Tell whether *sequence* respects the Z-score exclusivity constraint.

    Args:
        sequence: Iterable of method names.

    Returns:
        ``False`` when both ``'apply_Z_score'`` and ``'apply_robust_Z_score'``
        occur in *sequence*; ``True`` in every other case.
    """
    # Materialise once so the two membership tests see the same items even
    # when the caller hands in a one-shot iterator.
    present = list(sequence)
    if 'apply_Z_score' not in present:
        return True
    return 'apply_robust_Z_score' not in present

# Function to generate the name of a sequence based on methods and parameters
def generate_sequence_name(sequence):
    """Build a compact name for a sequence of transformation steps.

    Args:
        sequence: Iterable of step dicts, each with a ``'method'`` key and an
            optional ``'params'`` dict.

    Returns:
        The per-step abbreviations joined with ``'_'``. A step whose method
        is not handled below contributes an empty abbreviation.
    """
    def abbreviate(step):
        # Abbreviation for a single step; '' when the method is not handled.
        kind = step['method']
        options = step.get('params', {})

        if kind == 'apply_inverse_normal_transform':
            return 'Int'

        if kind == 'apply_robust_Z_score':
            centering = options.get('center_by', '')
            scaling = options.get('reduce_by', '')
            # Lower-case 'm' marks the median; anything else gets 'M'.
            label = 'rZ' + ('m' if centering == 'median' else 'M')
            if scaling == 'iqrs':
                label += 'i'
            elif scaling == 'mad':
                label += 'm'
            else:
                label += 's'
            if options.get('use_control'):
                label += '_C'
            return label

        if kind == 'apply_rescale':
            target_range = options.get('scale', '')
            return 'Res' + ('01' if target_range == '0-1' else '11')

        if kind == 'apply_spherizing_transform':
            # The spherizing flavour itself is the base of the abbreviation.
            label = options.get('method', '')
            if options.get('norm_embeddings', False):
                label += '_N'
            if options.get('use_control', False):
                label += '_C'
            return label

        return ''

    return '_'.join([abbreviate(step) for step in sequence])


In [4]:
methods = [
    'apply_inverse_normal_transform',
    'apply_robust_Z_score',
    'apply_rescale',
    'apply_spherizing_transform',
]

# Every ordering of 1 to 4 distinct methods, leaving out any ordering that
# would combine the plain and the robust Z-score.
method_sequences = [
    ordering
    for length in (1, 2, 3, 4)
    for ordering in itertools.permutations(methods, length)
    if not ('apply_Z_score' in ordering and 'apply_robust_Z_score' in ordering)
]

# Cap on how many method orderings get expanded. It is applied to the
# orderings themselves, i.e. before their parameter variations are enumerated.
MAX_SEQUENCES = 10000
transformation_sequences = []

for method_sequence in method_sequences[:MAX_SEQUENCES]:
    if not is_valid_method_sequence(method_sequence):
        continue
    # One list of candidate variants per method, in sequence order.
    per_method_variants = [get_method_variations(step) for step in method_sequence]
    # The Cartesian product picks one variant per method.
    for seq_variation in itertools.product(*per_method_variants):
        transformation_sequences.append({
            'name': generate_sequence_name(seq_variation),
            # Shallow copy of each step dict; nested 'params' dicts stay shared.
            'transformations': [dict(step) for step in seq_variation],
        })

# Function to prevent YAML from using aliases (anchors)
def noalias_dumper():
    """Return a ``yaml.SafeDumper`` subclass that treats every object as alias-free.

    Returns:
        A newly defined dumper class whose ``ignore_aliases`` answers ``True``
        for any input, suitable for the ``Dumper=`` argument of ``yaml.dump``.
    """
    class NoAliasDumper(yaml.SafeDumper):
        """SafeDumper variant that never asks for anchors/aliases."""

        def ignore_aliases(self, _data):
            # Unconditional: the object being dumped is not inspected.
            return True

    return NoAliasDumper

# Save to a YAML file
with open('transformation_combinations.yaml', 'w') as out_file:
    yaml.dump(
        transformation_sequences,
        stream=out_file,
        Dumper=noalias_dumper(),   # dumper class that ignores aliases
        default_flow_style=False,  # block style rather than inline flow style
        sort_keys=False,           # keep the insertion order of the dict keys
    )

print(f"Generation completed: {len(transformation_sequences)} sequences saved in 'transformation_combinations.yaml'")


Generation completed: 6915 sequences saved in 'transformation_combinations.yaml'


In [5]:
def filter_sequences_with_ZCA(sequences):
    """Select the sequences that contain at least one ``'ZCA'`` step.

    Args:
        sequences: Iterable of dicts, each with a ``'transformations'`` list
            of step dicts.

    Returns:
        A new list holding, in their original order, the sequences in which
        some step has ``params['method'] == 'ZCA'`` (exact match). Each
        matching sequence appears once, however many such steps it has.
    """
    def uses_zca(candidate):
        # any() stops at the first matching step.
        return any(
            step.get('params', {}).get('method') == 'ZCA'
            for step in candidate['transformations']
        )

    return [candidate for candidate in sequences if uses_zca(candidate)]

# Apply the filter to transformation_sequences
transformation_sequences_with_ZCA = filter_sequences_with_ZCA(transformation_sequences)

# Optionally, save filtered sequences to a new YAML file
with open('transformation_combinations_with_ZCA.yaml', 'w') as out_file:
    yaml.dump(
        transformation_sequences_with_ZCA,
        stream=out_file,
        Dumper=noalias_dumper(),   # dumper class that ignores aliases
        default_flow_style=False,  # block style rather than inline flow style
        sort_keys=False,           # keep the insertion order of the dict keys
    )

print(f"Filtering completed: {len(transformation_sequences_with_ZCA)} sequences containing ZCA saved in 'transformation_combinations_with_ZCA.yaml'")


Filtering completed: 2252 sequences containing ZCA saved in 'transformation_combinations_with_ZCA.yaml'
