### Dependencies and input

In [2]:
import random
import pandas as pd
import numpy as np

meta_data_source = "/Users/moritz/Desktop/unpast_1a/unpast_1a_meta_filter.txt"
cluster_data_source = "/Users/moritz/Desktop/unpast_1a/unpast_1a_cluster_final.tsv"

p_value_threshold = 0.05
draws = 1000

numerical_attributes = ["age", "height"] 

### Import of metadata

In [4]:
meta_data_df = pd.read_csv(meta_data_source, sep="\t")
#print(meta_data_df)

### Import of cluster data

In [6]:
cluster_data_df = pd.read_csv(cluster_data_source, sep="\t")
#print(cluster_data_df)

### Assignment of sample indexes within given cluster 

In [8]:
# to make a dict with the cluster id as key and the sample indexes as values
samples_in_cluster_dict = cluster_data_df.set_index('id')['sample_indexes'].to_dict()       

# to convert sample_indexes from string to list
for cluster_id, sample_ids in samples_in_cluster_dict.items():
    samples_in_cluster_dict[cluster_id] = [int(sample) for sample in sample_ids.split()]

#print(samples_in_cluster_dict)

# ===============================================================

### Metadata assignment within given cluster function

In [11]:
# to assign metadata information like gender or condition to the corresponding sample_indexes within the given clusters
def metadata_per_cluster(meta_data_list, samples_in_cluster_dict) -> dict:
    metadata_per_cluster_dict = {}
    
    for cluster_id, sample_ids in samples_in_cluster_dict.items():
        values = []
        for index in sample_ids: 
            values.append(meta_data_list[index])
        metadata_per_cluster_dict[cluster_id] = values
    
    return metadata_per_cluster_dict

### Bootstrapping function

In [13]:
# to determine an aproximative distribution from the given metadata
def bootstrapping(draws, meta_data_list, sample_meta_data):
    length_of_cluster = len(sample_meta_data)
    random_draws_mean_list = []
    
    for draw in range(draws):
        random_draws = random.sample(meta_data_list, k=length_of_cluster)
        random_draws_mean = sum(random_draws)/length_of_cluster
        random_draws_mean_list.append(random_draws_mean)
        random_draws_mean_list.sort(reverse=True)
    
    return random_draws_mean_list

### Cluster distribution function

In [15]:
# to get the distribution of sample meta data for a given cluster 
def cluster_distribution(sample_meta_data):
    cluster_occurrance_mean = sum(sample_meta_data)/len(sample_meta_data)
    return cluster_occurrance_mean

### Calculation of P-Values function

In [17]:
# to determine the corresponding P Values with the given cluster distribution and the bootsrapping results
def calculate_p_value(cluster_distribution_mean, permutation_list, draws) -> dict:
    counter = 0
    
    for element in permutation_list:
        if element < cluster_distribution_mean:
            break
        else:
            counter += 1

    p_value = counter/draws

    return p_value

### Main

In [19]:
# main: to harness all loops through the given numerical attributes (defined within the first cell)
def main():
    for category in numerical_attributes:
        print(category)
        print("------------------------------------------------------------------\n")
        meta_data_list = meta_data_df[category].tolist()
        
        metadata_per_cluster_dict = metadata_per_cluster(meta_data_list, samples_in_cluster_dict)
        
        for cluster_id, sample_meta_data in metadata_per_cluster_dict.items():
            try:
                cluster_distribution_mean = cluster_distribution(sample_meta_data)
                #print(cluster_distribution_mean)
                permutation_list = bootstrapping(draws, meta_data_list, sample_meta_data)
                #print(permutation_list)
                p_value = calculate_p_value(cluster_distribution_mean, permutation_list, draws)
                #print(p_value)
                
                if p_value <= p_value_threshold:
                    print("Cluster ID: "+str(cluster_id)+"|", "P-Value: "+str(p_value)+";")
            
            except:
                print("Exception occurred during handling of Cluster ID: "+str(cluster_id))
                
        print("\n")  

In [20]:
main()

age
------------------------------------------------------------------

Cluster ID: 7| P-Value: 0.003;
Cluster ID: 15| P-Value: 0.031;
Cluster ID: 40| P-Value: 0.04;
Cluster ID: 54| P-Value: 0.006;
Cluster ID: 72| P-Value: 0.036;
Cluster ID: 78| P-Value: 0.014;
Cluster ID: 82| P-Value: 0.012;
Cluster ID: 111| P-Value: 0.017;
Cluster ID: 125| P-Value: 0.02;
Cluster ID: 129| P-Value: 0.004;
Cluster ID: 137| P-Value: 0.041;


height
------------------------------------------------------------------

Cluster ID: 5| P-Value: 0.05;
Cluster ID: 26| P-Value: 0.033;
Cluster ID: 45| P-Value: 0.044;
Cluster ID: 53| P-Value: 0.032;
Cluster ID: 66| P-Value: 0.028;
Cluster ID: 69| P-Value: 0.027;
Cluster ID: 71| P-Value: 0.029;
Cluster ID: 75| P-Value: 0.014;
Cluster ID: 76| P-Value: 0.015;
Cluster ID: 81| P-Value: 0.042;
Cluster ID: 82| P-Value: 0.019;
Cluster ID: 91| P-Value: 0.016;
Cluster ID: 96| P-Value: 0.048;
Cluster ID: 100| P-Value: 0.028;
Cluster ID: 108| P-Value: 0.012;
Cluster ID: 111| P