In [1]:
import os
import sys
import urllib.request

## Dask Configuration

In [2]:
from dask_jobqueue import PBSCluster
from pathlib import Path

# Worker jobs `cd` into the notebook's working directory before starting.
working_directory = str(Path.cwd())

# Resources requested for every PBS worker job.
pbs_worker_options = dict(
    cores=4,
    memory="8GB",
    processes=1,
    queue="tamirQ",
    walltime="07:30:00",
    scheduler_options={"dashboard_address": ":12435"},
    # Additional custom options
    log_directory="dask-logs",
    # worker_extra_args=["--lifetime", "25m", "--lifetime-stagger", "4m"],  # for walltime="00:30:00"
    job_script_prologue=[f"cd {working_directory}"],
)

# Launch a scheduler and workers on HPC via PBS
cluster = PBSCluster(**pbs_worker_options)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 44506 instead


In [3]:
# Enable adaptive scaling, bounded between 10 and 60 (workers; with processes=1
# that presumably means one PBS job each — confirm against dask-jobqueue docs).
cluster.adapt(minimum=10, maximum=60)
# Show the PBS submission script generated for a worker job.
print(cluster.job_script())

#!/usr/bin/env bash

#PBS -N dask-worker
#PBS -q tamirQ
#PBS -l select=1:ncpus=4:mem=7630MB
#PBS -l walltime=07:30:00
#PBS -e dask-logs/
#PBS -o dask-logs/
cd /tamir2/moranb/microbiome/Igem_TAU_2021
/tamir2/moranb/microbiome/Igem_TAU_2021/venv/bin/python -m distributed.cli.dask_worker tcp://132.66.112.146:43971 --nthreads 4 --memory-limit 7.45GiB --name dummy-name --nanny --death-timeout 60



In [4]:
from dask.distributed import Client, progress, wait, get_client, get_worker
# Connect a distributed client to the PBS-backed cluster created above.
client = Client(cluster)

In [5]:
import dask.bag as db
import dask.dataframe as dd
from dask import delayed, compute, persist
import json
from collections import defaultdict
import matplotlib
import numpy as np
import pandas as pd
from Bio import AlignIO, Entrez
import re

## Download genomes

In [6]:
from analysis.input_testing_data.generate_input_testing_data_for_modules import generate_testing_data
from modules.main import run_modules

In [7]:
# Configure your email for NCBI Entrez (the contact address Biopython attaches to requests).
# NOTE(review): this is a hard-coded personal address — consider reading it from configuration.
Entrez.email = "bentulila@mail.tau.ac.il"

def download_refseq_genome(organism_name, output_dir="downloads"):
    """Download the RefSeq GenBank flat file of an organism's reference genome.

    Searches the NCBI ``assembly`` database for ``organism_name`` restricted to
    reference genomes, takes the first hit, reads its ``FtpPath_RefSeq`` entry
    and downloads ``<assembly>_genomic.gbff.gz`` into ``output_dir`` as
    ``<organism_name>.gbff.gz``.

    Args:
        organism_name: Organism name used in the Entrez query and as the stem
            of the saved file.
        output_dir: Directory receiving the download; created when missing.

    Returns:
        None. Progress and failures are reported with ``print``; a failed
        download is reported but does not raise.
    """
    print(f"Searching for RefSeq genome for: {organism_name}")

    # Search the assembly database; close the handle even if parsing fails.
    search_handle = Entrez.esearch(
        db="assembly",
        term=f"{organism_name}[Organism] AND reference_genome[filter]",
        retmax=1,
        sort="relevance"
    )
    try:
        search_results = Entrez.read(search_handle)
    finally:
        search_handle.close()

    if not search_results["IdList"]:
        print("No reference genome found.")
        return

    assembly_id = search_results["IdList"][0]

    # Fetch summary to get FTP path
    summary_handle = Entrez.esummary(db="assembly", id=assembly_id, report="full")
    try:
        summary = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    documents = summary['DocumentSummarySet']['DocumentSummary']
    ftp_path = documents[0]['FtpPath_RefSeq'] if documents else None
    if not ftp_path:
        print("No RefSeq FTP path found.")
        return

    print("Found RefSeq FTP path:")
    print(ftp_path)

    # Get base name of directory (a trailing slash would otherwise give an empty name)
    ftp_path = ftp_path.rstrip("/")
    base_name = os.path.basename(ftp_path)
    genomic_prefix = f"{base_name}_genomic"

    # Files to download (Genomic sequence and annotation)
    files_to_download = [
        # f"{genomic_prefix}.fna.gz",  # DNA sequence
        # f"{genomic_prefix}.gff.gz",  # Annotations (GFF)
        f"{genomic_prefix}.gbff.gz"  # Annotations + sequences (GenBank)
    ]

    os.makedirs(output_dir, exist_ok=True)

    for file_name in files_to_download:
        file_url = f"{ftp_path}/{file_name}"
        # Keep the remote extension (".gbff.gz") so that enabling more file
        # types does not make them overwrite each other.
        extension = file_name[len(genomic_prefix):]
        out_path = os.path.join(output_dir, organism_name) + extension
        print(f"Downloading: {file_url}")
        try:
            urllib.request.urlretrieve(file_url, out_path)
            print(f"Saved to: {out_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Failed to download {file_name}: {e}")

In [8]:
# base_output_dir = "/Users/shimka/Documents/Moran's Thesis/arabidopsis_data/genomes"
# genomes_gb_dir_path = os.path.join(base_output_dir, "genomes_gb")
# output_dir = os.path.join(base_output_dir, "refseq_genomes")

# genomes_gb_directory = Path(genomes_gb_dir_path)
# organisms_list = [str(f)[:-5].split("/")[-1].strip() for f in genomes_gb_directory.iterdir() if f.is_file()]
# organisms_list = [o.replace("_", " ") for o in organisms_list if not o.startswith(".")]

In [9]:
# for o in organisms_list:
#     download_refseq_genome(o, output_dir=output_dir)

In [10]:
# import gzip
# import shutil

# def decompress_gz(input_path, output_dir):
#     input_path = Path(input_path)
#     output_dir = Path(output_dir)
#     output_dir.mkdir(parents=True, exist_ok=True)

#     if not input_path.suffix == '.gz':
#         raise ValueError("Input file must have a .gz extension")

#     output_path = output_dir / input_path.with_suffix('').name

#     with gzip.open(input_path, 'rb') as f_in, open(output_path, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

#     print(f"Decompressed to: {output_path}")

# arabidopsis_gbff_dir = "/Users/shimka/PycharmProjects/Igem_TAU_2021/analysis/example_data/arabidopsis_microbiome"
# for organism_file in Path(output_dir).iterdir():
#     if ".DS_Store" in str(organism_file):
#         continue
#     try:
#         decompress_gz(organism_file, arabidopsis_gbff_dir)
#     except Exception as e:
#         print("*****************")
#         print(organism_file)
#         print(e)
#         print("*****************")

## Arabidopsis analysis

In [11]:
# All example inputs live under <current working directory>/analysis/example_data.
project_root = Path(os.getcwd()).resolve()
data_directory_path = os.path.join(project_root, "analysis", "example_data")
arabidopsis_data_path = os.path.join(data_directory_path, "arabidopsis")

# Directory of genome files, the sequence to optimize, and the aligned 16S FASTA.
genomes_gb_path = os.path.join(arabidopsis_data_path, "arabidopsis_microbiome")
zorA_file_path = os.path.join(data_directory_path, "zorA_anti_phage_defense.fasta")
aligned_16s_path = os.path.join(arabidopsis_data_path, "arabidopsis_aligned_16s.fasta")

In [12]:
# Collect the genome files of the microbiome members as path strings.
# `iterdir()` yields entries in arbitrary order; sorting keeps the run
# configurations (and therefore the contents of every `batch-{index}` output
# directory) identical between runs.
genomes_gb_directory = Path(genomes_gb_path)
organisms_list = sorted(str(f) for f in genomes_gb_directory.iterdir() if f.is_file())
len(organisms_list)

34

In [13]:
# Organism name = genome file name without its directory and ".gbff" extension.
# Derived from the path itself instead of a hard-coded machine-specific prefix.
organism_names = [Path(o).name.removesuffix(".gbff") for o in organisms_list]

In [14]:
# Maps each sample identifier to the organism name used throughout the analysis.
# The keys are looked up with the record ids of the aligned 16S FASTA when the
# distance matrix is labelled, so every id in that file must appear here.
samples_to_organism_name_mapping = {
    "Soil_522": "Janibacter limosus",
    "Soil_531": "Paenibacillus oryzisoli",
    "Soil_535": "Paenibacillus silvestris",
    "Soil_538": "Arthrobacter luteolus",
    "Soil_724D2": "Arthrobacter ginsengisoli",
    "Soil_728": "Peribacillus simplex",
    "Soil_729": "Knoellia subterranea",
    "Soil_736": "Peribacillus muralis",
    "Soil_744D2": "Phycicoccus duodecadis",
    "Soil_745": "Paenibacillus aceris",
    "Soil_748": "Pedococcus dokdonensis",
    "Soil_750": "Arthrobacter pascens",
    "Soil_756": "Paenibacillus ferrarius",
    "Soil_761": "Mycolicibacterium smegmatis",
    "Soil_762": "Agromyces allii",
    "Soil_763": "Priestia aryabhattai",
    "Soil_764": "Pseudarthrobacter phenanthrenivorans",
    "Soil_766": "Arthrobacter silvisoli",
    "Soil_768D1": "Peribacillus frigoritolerans",
    "Soil_772": "Nocardioides jensenii",
    "Soil_773": "Terrabacter terrae",
    "Soil_774": "Paenibacillus alginolyticus",
    "Soil_777": "Arthrobacter tumbae",
    "Soil_782": "Rhodanobacter denitrificans",
    "Soil_783": "Nocardioides terrigena",
    "Soil_787": "Rhodanobacter umsongensis",
    "Soil_796": "Pedococcus bigeumensis",
    "Soil_797": "Arthrobacter parietis",
    "Soil_802": "Microcella alkalica",
    "Soil_803": "Nocardioides donggukensis",
    "Soil_805": "Terrabacter tumescens",
    "Soil_809": "Nocardioides panaciterrulae",
    "Soil_810": "Nocardioides baculatus",
    "Soil_811": "Pedococcus aerophilus",
}
# Persist the mapping next to the Arabidopsis example data (overwrites any existing file).
arabidopsis_mapping_path = os.path.join(arabidopsis_data_path, "sample_to_organism.json")

with open(arabidopsis_mapping_path, "w") as file:
    json.dump(samples_to_organism_name_mapping, file)

In [15]:
def k2p(seq1, seq2):
    """Kimura 2-parameter (K2P) distance between two aligned sequences.

    Only positions where both sequences carry one of ``A``, ``C``, ``G``, ``T``
    (upper case) are counted; gaps and ambiguous symbols are skipped. The
    sequences are compared position by position up to the shorter length.

    Args:
        seq1: First aligned sequence (anything convertible with ``str``).
        seq2: Second aligned sequence.

    Returns:
        The K2P distance ``-0.5*ln(1 - 2P - Q) - 0.25*ln(1 - 2Q)`` where ``P``
        and ``Q`` are the transition and transversion proportions;
        ``np.nan`` when no position is comparable; ``np.inf`` when the
        sequences are too divergent for the formula (a logarithm argument
        is zero or negative).
    """
    transitions = {('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')}

    ts_count = 0
    tv_count = 0
    valid_sites = 0

    for a, b in zip(str(seq1), str(seq2)):
        if a not in "ACGT" or b not in "ACGT":
            continue
        valid_sites += 1
        if a == b:
            continue
        # Two different unambiguous bases are either a transition or a transversion.
        if (a, b) in transitions:
            ts_count += 1
        else:
            tv_count += 1

    if valid_sites == 0:
        return np.nan

    P = ts_count / valid_sites
    Q = tv_count / valid_sites

    # Kimura 2-parameter formula. np.log does not raise on a non-positive
    # argument (it returns nan / -inf with a warning), so the saturation case
    # is checked explicitly instead of relying on an exception.
    transition_term = 1 - 2 * P - Q
    transversion_term = 1 - 2 * Q
    if transition_term <= 0 or transversion_term <= 0:
        return np.inf
    return -0.5 * np.log(transition_term) - 0.25 * np.log(transversion_term)

In [16]:
# Load the aligned 16S sequences; upper-case them once because k2p only
# recognises upper-case bases.
alignment = AlignIO.read(aligned_16s_path, "fasta")
seq_ids = [record.id for record in alignment]
upper_case_sequences = [str(record.seq).upper() for record in alignment]

# Symmetric matrix of pairwise K2P distances (diagonal included).
n = len(alignment)
matrix = np.zeros((n, n))

for i, sequence_i in enumerate(upper_case_sequences):
    for j in range(i, n):
        dist = k2p(sequence_i, upper_case_sequences[j])
        matrix[i, j] = matrix[j, i] = dist

# Label rows and columns with organism names instead of sample ids.
new_index = [samples_to_organism_name_mapping[seq_id] for seq_id in seq_ids]
dist_df = pd.DataFrame(matrix, index=new_index, columns=new_index)

# Filter gappy columns in the MSA
# num_seqs = len(alignment)
# gap_threshold = 0.2
# for i in range(alignment.get_alignment_length()):
#     col = alignment[:, i]
#     gap_fraction = col.count('-') / num_seqs
#     if gap_fraction > gap_threshold:
#         print(i)

In [17]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.metrics import davies_bouldin_score
from sklearn.manifold import MDS


In [18]:
# Average-linkage hierarchical clustering on the pairwise K2P distances.
condensed_dist = squareform(dist_df.values)
# print(condensed_dist)
# Project the organisms to Cartesian coordinates suitable for clustering evaluation
embedding = MDS(dissimilarity='precomputed', random_state=0).fit_transform(dist_df.values)
Z = linkage(condensed_dist, method='average')

# Try 2..9 clusters and keep the cut with the lowest Davies-Bouldin index.
# NOTE(review): the recorded run selected 9, the largest value tried — the
# optimum may lie beyond the tested range.
optimal_clusters_count = 2
min_dbi_score = None
cluster_lables = None
for num_clusters in range(2, 10):
    labels = fcluster(Z, num_clusters, criterion='maxclust')
    # Compute cluster quality score (lower DBI is better)
    dbi_score = davies_bouldin_score(embedding, labels)
    print("Davies-Bouldin Index:", dbi_score)
    if min_dbi_score is None or dbi_score < min_dbi_score:
        min_dbi_score = dbi_score
        optimal_clusters_count = num_clusters
        cluster_lables = labels
print(f"best cluster num is: { optimal_clusters_count} with dbi score of {min_dbi_score}")
# print("Cluster assignments:", cluster_lables)

# Group organism names by their cluster label.
cluster_assignment_dict = dict(zip(new_index, cluster_lables))
organism_clusters = list({n: [k for k in cluster_assignment_dict.keys() if cluster_assignment_dict[k] == n]
     for n in set(cluster_assignment_dict.values())}.values())

# The loop variable must not be called `cluster`: that name holds the
# PBSCluster object and a module-level loop would overwrite it.
for organism_cluster in organism_clusters:
    print(organism_cluster)

Davies-Bouldin Index: 0.6431265991363254
Davies-Bouldin Index: 0.48242367007959946
Davies-Bouldin Index: 0.2562005150765968
Davies-Bouldin Index: 0.32414331374614125
Davies-Bouldin Index: 0.3537457377259033
Davies-Bouldin Index: 0.4034959756011239
Davies-Bouldin Index: 0.47599418729659576
Davies-Bouldin Index: 0.25261748815034313
best cluster num is: 9 with dbi score of 0.25261748815034313
['Arthrobacter luteolus', 'Arthrobacter ginsengisoli', 'Arthrobacter pascens', 'Pseudarthrobacter phenanthrenivorans', 'Arthrobacter silvisoli', 'Arthrobacter tumbae', 'Arthrobacter parietis']
['Nocardioides jensenii', 'Nocardioides terrigena', 'Nocardioides donggukensis', 'Nocardioides panaciterrulae', 'Nocardioides baculatus']
['Janibacter limosus', 'Knoellia subterranea', 'Phycicoccus duodecadis', 'Pedococcus dokdonensis', 'Terrabacter terrae', 'Pedococcus bigeumensis', 'Terrabacter tumescens', 'Pedococcus aerophilus']
['Agromyces allii']
['Microcella alkalica']
['Mycolicibacterium smegmatis']
['R

### Optimization Run Code

In [19]:
def run_modules_for_configuration(input):
    """Run the modules for one ``(configuration, module input)`` pair.

    Args:
        input: Indexable pair; element 0 is the run configuration and
            element 1 is the data passed to ``run_modules``.

    Returns:
        Dict with the configuration under ``"configuration"`` and the value
        returned by ``run_modules`` under ``"result"``.
    """
    configuration = input[0]
    module_input = input[1]
    # The output module is skipped for these runs.
    module_result = run_modules(module_input, should_run_output_module=False)
    return dict(configuration=configuration, result=module_result)

In [20]:
# Results are written under <working directory>/analysis/results/arabidopsis.
# Derived from the working directory (the notebook already assumes it is the
# project root) instead of a hard-coded machine-specific absolute path.
base_output_path = os.path.join(os.getcwd(), "analysis", "results", "arabidopsis")

### pairwise

In [19]:
# Pairwise runs: every ordered (wanted, unwanted) pair of distinct organisms.
pairwise_output_path = os.path.join(base_output_path, "pairwise")
for optimization_method in [
    # "single_codon_diff",
    # "single_codon_ratio",
    # "zscore_bulk_aa_diff",
    # "zscore_bulk_aa_ratio",
    "single_wanted_organism",
]:
    run_configurations = []
    for wanted_organism in organisms_list:
        for unwanted_organism in organisms_list:
            if wanted_organism == unwanted_organism:
                continue
            run_configurations.append((wanted_organism, unwanted_organism, optimization_method))

    # Pair every configuration with the module input generated for it.
    inputs = [(
        run_configuration,
        generate_testing_data(
            orf_optimization_method = run_configuration[2],
            orf_optimization_cub_index = "CAI",
            wanted_hosts = [run_configuration[0]],
            unwanted_hosts = [run_configuration[1]],
            genome_path = genomes_gb_path,
            sequence_file_path = zorA_file_path,
            output_path = pairwise_output_path,
        )) for run_configuration in run_configurations]

    print(f"Total number of records is: {len(inputs)}")
    batch_size = 50

    # Process the inputs in batches; each batch is written as JSON text files
    # into its own directory.
    for batch_index, batch_start_index in enumerate(range(0, len(inputs), batch_size)):
        inputs_batch = inputs[batch_start_index: batch_start_index+batch_size]
        inputs_db = db.from_sequence(inputs_batch)
        results_db = inputs_db.map(run_modules_for_configuration)
        batch_file_path = os.path.join(pairwise_output_path, optimization_method, f"batch-{batch_index}")
        batch_file_path = batch_file_path + "-debug"
        results_db.map(json.dumps).to_textfiles(os.path.join(batch_file_path, '*.json'))
        # Completion marker. Path.touch() is portable and, unlike os.mknod,
        # does not raise when the marker already exists on a re-run.
        Path(batch_file_path, "done").touch()


Total number of records is: 1122


Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, ini

### One vs. All

In [None]:
# One-vs-all runs: each organism is the wanted host, all others are unwanted.
single_wanted_output_path = os.path.join(base_output_path, "one_vs_all")
for optimization_method in [
    # "single_codon_diff",
    # "single_codon_ratio",
    "zscore_bulk_aa_diff",
    "zscore_bulk_aa_ratio",
    # "single_wanted_organism",
]:
    run_configurations = []
    for wanted_organism in organisms_list:
        # Keep the order of organisms_list: going through a set would make the
        # unwanted-host order (and the saved configuration) differ between runs.
        unwanted_organisms = [organism for organism in organisms_list if organism != wanted_organism]
        run_configurations.append([wanted_organism, unwanted_organisms, optimization_method])

    # Pair every configuration with the module input generated for it.
    inputs = [(
        run_configuration,
        generate_testing_data(
            orf_optimization_method = run_configuration[2],
            orf_optimization_cub_index = "CAI",
            wanted_hosts = [run_configuration[0]],
            unwanted_hosts = run_configuration[1],
            genome_path = genomes_gb_path,
            sequence_file_path = zorA_file_path,
            output_path = single_wanted_output_path,
        )) for run_configuration in run_configurations]


    print(f"Total number of records is: {len(inputs)}")
    batch_size = 50

    # Process the inputs in batches; each batch is written as JSON text files
    # into its own directory.
    for batch_index, batch_start_index in enumerate(range(0, len(inputs), batch_size)):
        batch_file_path = os.path.join(single_wanted_output_path, optimization_method, f"batch-{batch_index}")
        batch_file_path = batch_file_path + "-debug"
        inputs_batch = inputs[batch_start_index: batch_start_index+batch_size]
        inputs_db = db.from_sequence(inputs_batch)
        results_db = inputs_db.map(run_modules_for_configuration)
        results_db.map(json.dumps).to_textfiles(os.path.join(batch_file_path, '*.json'))
        # Completion marker. Path.touch() is portable and, unlike os.mknod,
        # does not raise when the marker already exists on a re-run.
        Path(batch_file_path, "done").touch()

    # Local debugging
        
    # inputs = [(
    #     run_configuration, 
    #     generate_testing_data(
    #         orf_optimization_method = run_configuration[2],
    #         orf_optimization_cub_index = "CAI",
    #         wanted_hosts = [run_configuration[0]],
    #         unwanted_hosts = run_configuration[1], 
    #         genome_path = genomes_gb_path,
    #         sequence_file_path = zorA_file_path,
    #         output_path = single_wanted_output_path,
    #     )) for run_configuration in run_configurations[:1]]

    
    # raw_result = run_modules_for_configuration(inputs[0])


Total number of records is: 34


Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, ini

### Cluster vs. All

In [None]:
# Cluster-vs-all runs: each 16S-based cluster is the wanted group, all other
# organisms are unwanted.
wanted_cluster_output_path = os.path.join(base_output_path, "cluster_vs_all")
for optimization_method in [
    "single_codon_diff",
    "single_codon_ratio",
    "zscore_bulk_aa_diff",
    "zscore_bulk_aa_ratio",
    "single_wanted_organism",
]:
    run_configurations = []
    for organisms_cluster in organism_clusters:
        # Cluster members are organism names; turn them into genome file paths
        # so they can be compared with the entries of organisms_list.
        wanted_organisms = [os.path.join(genomes_gb_path, wanted_organism)+".gbff" for wanted_organism in organisms_cluster]
        wanted_organisms_set = set(wanted_organisms)
        # Filter in list order so the unwanted-host order is the same on every run.
        unwanted_organisms = [organism for organism in organisms_list if organism not in wanted_organisms_set]

        # Every organism must land in exactly one group; a mismatch means a
        # cluster name did not match any genome file.
        assert len(wanted_organisms) + len(unwanted_organisms) == len(organisms_list), \
            "wanted/unwanted split does not cover organisms_list exactly"

        run_configurations.append([wanted_organisms, unwanted_organisms, optimization_method])

    # Pair every configuration with the module input generated for it.
    inputs = [(
        run_configuration,
        generate_testing_data(
            orf_optimization_method = run_configuration[2],
            orf_optimization_cub_index = "CAI",
            wanted_hosts = run_configuration[0],
            unwanted_hosts = run_configuration[1],
            genome_path = genomes_gb_path,
            sequence_file_path = zorA_file_path,
            output_path = wanted_cluster_output_path,
        )) for run_configuration in run_configurations]

    print(f"Total number of records is: {len(inputs)}")
    batch_size = 50

    # Process the inputs in batches; each batch is written as JSON text files
    # into its own directory.
    for batch_index, batch_start_index in enumerate(range(0, len(inputs), batch_size)):
        batch_file_path = os.path.join(wanted_cluster_output_path, optimization_method, f"batch-{batch_index}")
        batch_file_path = batch_file_path + "-debug"
        inputs_batch = inputs[batch_start_index: batch_start_index+batch_size]
        inputs_db = db.from_sequence(inputs_batch)
        results_db = inputs_db.map(run_modules_for_configuration)
        results_db.map(json.dumps).to_textfiles(os.path.join(batch_file_path, '*.json'))
        # Completion marker. Path.touch() is portable and, unlike os.mknod,
        # does not raise when the marker already exists on a re-run.
        Path(batch_file_path, "done").touch()


Total number of records is: 9


### Liyam's Legacy Analysis code 

####  Evolutionary distance and model performance

In [None]:
# final_tested_org = not_really_small_genome
# # func_options = ['single_codon_global', 'single_codon_local', 'zscore_hill_climbing_average', 'zscore_hill_climbing_weakest_link']
# func_options = ['single_codon_global', 'zscore_hill_climbing_average']

# spearman_dict = {}
# for translation_function in func_options:
#     opt_scores = []
#     msa_scores = []

#     tic = time.time()
#     for org1 in final_tested_org:
#         for org2 in final_tested_org:
#             # if org1.split(' ')[0] == org2.split(' ')[0]:
#             #     continue
#             if org1 == org2:
#                 continue
#             org_dict[org1]['optimized'] = True
#             org_dict[org1]['tai_profile'] = {}
#             org_dict[org1]['tai_std'] = {}
#             org_dict[org1]['tai_avg'] = {}
#             org_dict[org2]['optimized'] = False
#             org_dict[org2]['tai_profile'] = {}
#             org_dict[org2]['tai_std'] = {}
#             org_dict[org2]['tai_avg'] = {}
#             software_dict = {
#                 'sequence': cds,
#                 'tuning_param': 0.5,
#                 'organisms': {},
#                 }
#             software_dict['organisms'][org1] = org_dict[org1]
#             software_dict['organisms'][org2] = org_dict[org2]
#             inner_tic = time.time()
#             final_cds, optimization_index, weakest_score = run_orf_optimization(software_dict)
#             print('TIME: ', time.time()-inner_tic)
#             # alignment_score = pairwise2.align.globalxx(
#             #     ribosomal_dict[org1], ribosomal_dict[org2], score_only=True)
#             # aln_scores.append(alignment_score)
#             msa_score = diff_letters(ribosomal_msa_dict[org1], ribosomal_msa_dict[org2])
#             msa_scores.append(msa_score)
#             opt_scores.append(optimization_index)
#     spearman_dict[translation_function] = spearmanr(msa_scores, opt_scores, nan_policy='omit')
#     print(spearmanr(msa_scores, opt_scores, nan_policy='omit'))
#     trans_func_name = translation_function.replace('_', ' ')
#     plt.scatter(msa_scores, opt_scores, s=0.1)

#             # print(org1, org2, msa_scores, optimization_index)
#     toc = time.time()
#     print(toc-tic)

# print(spearman_dict)
# plt.legend(['single codon optimization', 'hill climbing optimization'], loc ="upper right")
# plt.title(f'Evolutionary distance and model performance')
# plt.xlabel('# Different aligned characters')
# plt.ylabel('Optimization score')
# plt.show()


#### Wanted hosts clustering

In [None]:
from scipy.stats import spearmanr
import numpy as np
from modules import models
from sklearn.cluster import AgglomerativeClustering, KMeans

def dict_to_cluster_np_array(user_input: models.ModuleInput): #todo: make this work on tai as well
    """Build the clustering matrix from the CAI profiles of the optimized organisms.

    Args:
        user_input: Module input whose ``organisms`` expose ``is_optimized``,
            ``cai_profile`` (a mapping) and ``name``.

    Returns:
        Tuple ``(clustering_mat, opt_org_list)``: a NumPy array with one row of
        CAI profile values per optimized organism, and the names of those
        organisms in the same order.
    """
    optimized_organisms = [organism for organism in user_input.organisms if organism.is_optimized]
    profile_rows = [list(organism.cai_profile.values()) for organism in optimized_organisms]
    opt_org_list = [organism.name for organism in optimized_organisms]
    return np.array(profile_rows), opt_org_list


def make_distance_matrix(clustering_mat):
    """Pairwise Spearman distance between the rows of ``clustering_mat``.

    Args:
        clustering_mat: 2-D array; each row is one sample.

    Returns:
        Square array whose entry ``[i, k]`` is ``1 - rho`` with ``rho`` the
        Spearman correlation between rows ``k`` and ``i``.
    """
    sample_count = clustering_mat.shape[0]
    distance_matrix = np.zeros(shape=(sample_count, sample_count))
    for row, col in np.ndindex(sample_count, sample_count):
        rho = spearmanr(clustering_mat[col, :], clustering_mat[row, :])[0]
        # Distance is one minus the similarity (correlation).
        distance_matrix[row, col] = 1 - rho
    return distance_matrix


def create_n_clusters(clustering_mat, n_clus):
    """Cluster the rows of ``clustering_mat`` into ``n_clus`` groups.

    Uses average-linkage agglomerative clustering on the precomputed distance
    matrix returned by ``make_distance_matrix``.

    Args:
        clustering_mat: 2-D array; each row is one sample.
        n_clus: Number of clusters to produce.

    Returns:
        The fitted model's ``labels_``: one cluster label per row.
    """
    dist_metric = 'precomputed'
    distance_mat = make_distance_matrix(clustering_mat)
    try:
        # scikit-learn >= 1.2 names this parameter `metric`; `affinity` was
        # removed in 1.4.
        model = AgglomerativeClustering(n_clusters=n_clus,
                                        metric=dist_metric,
                                        linkage='average')
    except TypeError:
        # Older scikit-learn releases only know `affinity`.
        model = AgglomerativeClustering(n_clusters=n_clus,
                                        affinity=dist_metric,
                                        linkage='average')
    clustering = model.fit(distance_mat)

    return clustering.labels_
#
# def find_best_clustering(clustering_mat, max_clus_num, c_index='dbi', c_method = 'alggomerative' ):
#     dist_metric = 'precomputed'
#     scores = []
#     clusters = []
#
#     distance_mat = make_distance_matrix(clustering_mat)
#     n_samples = distance_mat.shape
#     n_samples = n_samples[0]
#     for n_clus in range(2, min(n_samples, max_clus_num)):
#         ##### clustering options ######
#         # if c_method == 'kmeans':
#         #     clustering = KMeans(n_clusters=n_clus).fit(clustering_mat)
#         # else:
#         clustering = AgglomerativeClustering(n_clusters=n_clus,
#                                              affinity= dist_metric,
#                                              linkage='average').fit(distance_mat, )
#
#         labels = clustering.labels_
#
#         ##### cluster eval indexes ####
#         if c_index == 'dbi':
#             score = davies_bouldin_score(clustering_mat, labels)
#         else:
#             score = silhouette_score(clustering_mat, labels, metric= dist_metric)
#
#         scores.append(score)
#         clusters.append(labels)
#
#     best_score = min(scores)
#     best_clusturing = clusters[scores.index(best_score)]
#     return best_clusturing, best_score


def return_list_of_sub_microbiomes(best_clusturing: list, module_input: models.ModuleInput):
    """Split a module input into one input per cluster of optimized organisms.

    Args:
        best_clusturing: Cluster labels, one per optimized organism, in the
            order those organisms appear in ``module_input.organisms``.
        module_input: The full input; organisms expose ``name`` and
            ``is_optimized``.

    Returns:
        List of ``models.ModuleInput`` objects, one per distinct label. Each
        keeps the original sequence, tuning parameter and clusters count, and
        contains the optimized organisms of that cluster plus every
        non-optimized organism.
    """
    all_organisms = module_input.organisms
    opt_org_list = [org.name for org in all_organisms if org.is_optimized]
    deopt_org_list = [org.name for org in all_organisms if not org.is_optimized]

    # Group the optimized organism names by their cluster label.
    label_by_name = dict(zip(opt_org_list, best_clusturing))
    opt_org_clusters = []
    for label in set(label_by_name.values()):
        members = [name for name, assigned in label_by_name.items() if assigned == label]
        opt_org_clusters.append(members)

    inp_obj_list = []
    for cluster_members in opt_org_clusters:
        kept_names = cluster_members + deopt_org_list
        sub_input = models.ModuleInput(
            organisms=[],
            sequence=module_input.sequence,
            tuning_parameter=module_input.tuning_parameter,
            clusters_count=module_input.clusters_count,
        )
        # Organisms keep their original relative order.
        sub_input.organisms = [org for org in all_organisms if org.name in kept_names]
        inp_obj_list.append(sub_input)
    return inp_obj_list