### Dependencies and input

In [3]:
import random
import pandas as pd

# Input files, both read as tab-separated tables by the import cells below.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
meta_data_source = "/Users/moritz/Desktop/unpast_1a/unpast_1a_meta_filter.txt"
cluster_data_source = "/Users/moritz/Desktop/unpast_1a/unpast_1a_cluster_final.tsv"

# Metadata columns that main() iterates over, one analysis pass per column.
categorical_attributes = ["gender", "condition"] 
# main() only prints attributes whose empirical p-value is <= this threshold.
p_value_threshold = 0.05
# Number of random draws per attribute and cluster; also the denominator of the p-value.
draws = 1000

### Import of metadata

In [5]:
# Load the tab-separated metadata table; main() reads one column per
# categorical attribute from this frame.
meta_data_df = pd.read_csv(meta_data_source, sep="\t")

### Import of cluster data

In [7]:
# Load the tab-separated cluster table; the next cell reads its 'id' and
# 'sample_indexes' columns.
cluster_data_df = pd.read_csv(cluster_data_source, sep="\t")

### Assignment of sample indexes within given cluster 

In [9]:
# Map every cluster id to the list of integer sample indexes that belong to it.
# The 'sample_indexes' column holds the indexes as one whitespace-separated
# string per cluster, so each string is split and its tokens converted to int.
samples_in_cluster_dict = {
    cluster_key: [int(token) for token in index_string.split()]
    for cluster_key, index_string
    in cluster_data_df.set_index('id')['sample_indexes'].to_dict().items()
}

# ===============================================================

### Metadata assignment within given cluster function

In [12]:
def metadata_per_cluster(meta_data_list, samples_in_cluster_dict) -> dict:
    """Collect the metadata value of every sample in every cluster.

    Args:
        meta_data_list: Sequence of metadata values (e.g. gender or condition);
            it is indexed directly with the sample indexes of each cluster.
        samples_in_cluster_dict: Mapping of cluster id to an iterable of
            integer sample indexes.

    Returns:
        dict mapping each cluster id to the list of metadata values of its
        samples, in the same order as the sample indexes.
    """
    return {
        cluster_key: [meta_data_list[position] for position in member_positions]
        for cluster_key, member_positions in samples_in_cluster_dict.items()
    }

### Bootstrapping function

In [14]:
def bootstrapping(cluster_distribution_dict, draws, meta_data_list, sample_meta_data) -> dict:
    """Build an empirical null distribution of attribute counts for one cluster.

    For every attribute observed in the cluster, ``draws`` random subsets of
    ``meta_data_list`` are drawn, each as large as the cluster, and the number
    of occurrences of the attribute in each subset is recorded. Subsets are
    drawn with ``random.sample``, i.e. without replacement.

    Args:
        cluster_distribution_dict: Mapping whose keys are the attributes
            observed in the cluster (the values are not used here).
        draws: Number of random subsets to draw per attribute.
        meta_data_list: Metadata values of all samples (the population).
        sample_meta_data: Metadata values of the samples in the cluster; only
            its length is used, as the size of each random subset.

    Returns:
        dict mapping each attribute to a list of ``draws`` counts, sorted in
        descending order. With ``draws == 0`` every attribute maps to an
        empty list.

    Raises:
        ValueError: Propagated from ``random.sample`` when the cluster is
            larger than ``meta_data_list``.
    """
    cluster_size = len(sample_meta_data)
    permutation_dict = {}

    for attribute in cluster_distribution_dict:
        counts = [
            random.sample(meta_data_list, k=cluster_size).count(attribute)
            for _ in range(draws)
        ]
        # Sort and store once per attribute instead of after every single draw;
        # the resulting list is the same, without the repeated sorting work.
        counts.sort(reverse=True)
        permutation_dict[attribute] = counts

    return permutation_dict

### Cluster distribution function

In [16]:
def cluster_distribution(sample_meta_data) -> dict:
    """Count how often each metadata value occurs in one cluster.

    Args:
        sample_meta_data: Iterable of hashable metadata values belonging to
            the samples of a single cluster.

    Returns:
        dict mapping each distinct value to its number of occurrences, with
        keys in order of first appearance.
    """
    tally = {}
    for value in sample_meta_data:
        tally[value] = tally.get(value, 0) + 1
    return tally

### Calculation of P-Values function

In [18]:
def calculate_p_value(cluster_distribution_dict, permutation_dict, draws) -> dict:
    """Compute an empirical one-sided p-value for every attribute of a cluster.

    The p-value of an attribute is the fraction of random draws in which the
    attribute occurred at least as often as it was observed in the cluster.
    No pseudo-count is added, so a p-value of exactly 0.0 is possible.

    Args:
        cluster_distribution_dict: Mapping of attribute to its observed number
            of occurrences in the cluster.
        permutation_dict: Mapping of attribute to the list of occurrence
            counts obtained from the random draws. The list does not need to
            be sorted.
        draws: Number of random draws; used as the denominator.

    Returns:
        dict mapping each attribute to its p-value (a float).

    Raises:
        KeyError: If the two dicts do not have exactly the same keys.
        ZeroDivisionError: If ``draws`` is 0.
    """
    if cluster_distribution_dict.keys() != permutation_dict.keys():
        raise KeyError("keys in cluster_distribution_dict and permutation_dict don't match!")

    p_values_dict = {}

    for attribute, occurrence in cluster_distribution_dict.items():
        # Count every draw that is at least as extreme as the observation.
        # Counting all of them (instead of stopping at the first smaller value)
        # keeps the result correct even if the list is not sorted descending.
        at_least_as_extreme = sum(
            1 for count in permutation_dict[attribute] if count >= occurrence
        )
        p_values_dict[attribute] = at_least_as_extreme / draws

    return p_values_dict

### Main

In [20]:
def main():
    """Run the enrichment analysis for every configured categorical attribute.

    For each column named in ``categorical_attributes`` the function looks up
    the metadata values of the samples in every cluster, builds an empirical
    null distribution with ``bootstrapping`` and prints every attribute whose
    p-value is less than or equal to ``p_value_threshold``.

    Reads the module-level names ``categorical_attributes``, ``meta_data_df``,
    ``samples_in_cluster_dict``, ``draws`` and ``p_value_threshold``.
    A failure in one cluster is reported and does not stop the other clusters.
    """
    for category in categorical_attributes:
        print(category)
        print("------------------------------------------------------------------\n")
        meta_data_list = meta_data_df[category].tolist()
        metadata_per_cluster_dict = metadata_per_cluster(meta_data_list, samples_in_cluster_dict)

        for cluster_id, sample_meta_data in metadata_per_cluster_dict.items():
            try:
                cluster_distribution_dict = cluster_distribution(sample_meta_data)
                permutation_dict = bootstrapping(cluster_distribution_dict, draws, meta_data_list, sample_meta_data)
                p_values_dict = calculate_p_value(cluster_distribution_dict, permutation_dict, draws)

                for attribute, p_value in p_values_dict.items():
                    if p_value <= p_value_threshold:
                        print(f"Cluster ID: {cluster_id}| Attribute: {attribute}| P-Value: {p_value};")

            # Catch Exception rather than everything, so that KeyboardInterrupt
            # and SystemExit still stop the run, and report what went wrong
            # instead of hiding the cause.
            except Exception as error:
                print(
                    "Exception occurred during handling of Cluster ID: " + str(cluster_id)
                    + " (" + type(error).__name__ + ": " + str(error) + ")"
                )

        print("\n")
            

In [21]:
# Run the analysis for all configured categorical attributes and print the
# clusters/attributes that pass the p-value threshold.
# NOTE(review): no random seed is set in the visible cells, so the printed
# p-values may differ slightly between runs — consider seeding `random`.
main()

gender
------------------------------------------------------------------

Cluster ID: 15| Attribute: F| P-Value: 0.0;


condition
------------------------------------------------------------------

Cluster ID: 11| Attribute: healthy| P-Value: 0.001;
Cluster ID: 15| Attribute: healthy| P-Value: 0.019;
Cluster ID: 57| Attribute: healthy| P-Value: 0.019;
Cluster ID: 71| Attribute: pah| P-Value: 0.023;
Cluster ID: 76| Attribute: ph-lung| P-Value: 0.003;
Cluster ID: 122| Attribute: healthy| P-Value: 0.026;
Cluster ID: 137| Attribute: healthy| P-Value: 0.003;


