## Description

In this notebook, we analyze the results of clustering sets of sequences at all stages of processing.
We parse the results of clustering.  
We discard ORFans and huge clusters.   
We extract FASTAs and save them in `_raw_clusters` directories.


## Modules

In [None]:
# %matplotlib notebook

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import os
from Bio import SeqIO
from collections import Counter
from ete3 import NCBITaxa
from path import Path
# ete3 NCBI taxonomy handle; used below for lineage, rank and taxid-to-name lookups.
ncbi = NCBITaxa()

## Data 

In [None]:
# Taxid whose presence in a lineage marks a taxon as fungal
# (presumably the NCBI taxid of kingdom Fungi -- TODO confirm).
fungi_taxid = 4751

In [None]:
# Map the first whitespace-separated column of each non-blank line (sequence ID)
# to the second column (taxid, kept as a string and converted with int() where used).
# The file is read inside a `with` block so the handle is closed deterministically.
with open('final_homolog_taxid_table') as taxid_file:
    taxid_rows = [line.split() for line in taxid_file if line.strip()]
taxid_table = {fields[0]: fields[1] for fields in taxid_rows}

In [None]:
# Set of all accessions (one per non-blank line) in the original sequence set.
# Read inside a `with` block so the file handle is closed deterministically.
with open('standardized_final_accessions') as accession_file:
    all_accessions = {line.strip() for line in accession_file if line.strip()}

In [None]:
# Set of target accessions (one per non-blank line).
# Read inside a `with` block so the file handle is closed deterministically.
with open('basal_accessions.txt') as accession_file:
    target_accessions = {line.strip() for line in accession_file if line.strip()}
len(target_accessions)

In [None]:
# Set of single-family target accessions (one per non-blank line); used later to
# select the clusters worth keeping.
# Read inside a `with` block so the file handle is closed deterministically.
with open('single_family_accessions.txt') as accession_file:
    single_family_target_accessions = {line.strip() for line in accession_file if line.strip()}

## Cluster sanity checks

Check if each cluster contains self, if clusters are separate, and if clusters contain all accessions

**Before running the following cells, run the code in the section titled *Function definitions* at the end of this notebook.**

In [None]:
# Format of `cluster_file`; parse_clustering accepts 'mmseqs2', 'cd-hit' or 'mcl'.
algorithm='mcl'
# Clustering output file that is parsed into `cluster_table`.
cluster_file = 'out.mcl_twoway_filter.mci.I17'
# Directory that receives one FASTA per retained cluster; also the prefix of the summary TSV.
output_dirname = 'first_round_clusters'
# Clusters with fewer distinct taxids than this are discarded during filtering.
min_number_of_species_in_cluster = 4

In [None]:
# Parse the clustering output, then derive per-accession and per-cluster statistics.
cluster_table = {}
cluster_table = parse_clustering(cluster_file, algorithm)   # need to substitute for proper algorithm
# Number of clusters in which each accession appears
occurences = Counter(acc for members in cluster_table.values() for acc in members)
# Accessions assigned to more than one cluster
dups = [acc for acc, count in occurences.items() if count > 1]
cluster_sizes = list(map(len, cluster_table.values()))

In [None]:
# Report basic sanity statistics for the parsed clustering.
# The parsed file is `cluster_file`; the previously referenced `input_fname`
# is not defined anywhere in this notebook and raised NameError on a fresh kernel.
print('Nb of accessions in multiple clusters in', cluster_file, ':', len(dups))
all_acc = {acc for key in cluster_table for acc in cluster_table[key]}
print('Nb of accessions in all clusters:', len(all_acc))
print('Nb of accessions in original sequence file:', len(all_accessions))
print('Difference in nbs of accessions:', len(all_acc) - len(all_accessions))
print('Nb of all clusters:', len(cluster_sizes))
print('Nb of singleton clusters:', sum(x==1 for x in cluster_sizes))
print('Nb of size <= 2 clusters:', sum(x<=2 for x in cluster_sizes))
print("Nb of size >= 4 clusters:", sum(x>=4 for x in cluster_sizes))
print("Nb of size >= 300 clusters:", sum(x>=300 for x in cluster_sizes))
print('Nb of reasonable size clusters:', sum(4 <= x <= 300 for x in cluster_sizes))
print('Average cluster size:', sum(cluster_sizes)/len(cluster_sizes))

In [None]:
# Histograms of cluster sizes on a linear (left) and log10 (right) scale.
fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(8, 4))
ax_linear.hist(cluster_sizes, bins=40)
ax_linear.set_title('Distribution of cluster sizes')
ax_log.hist(np.log10(cluster_sizes), bins=40)
ax_log.set_title('Log-size')
fig.tight_layout()
plt.show()

In [None]:
# Show the IDs of the ten largest clusters, largest first.
print('Example large clusters:')
ids_by_decreasing_size = sorted(cluster_table, key=lambda cid: len(cluster_table[cid]), reverse=True)
print(ids_by_decreasing_size[:10])

## Cluster filtering and selecting FASTAs

In [None]:
# Filter the parsed clusters step by step, then write one FASTA file per retained cluster.
fasta_dir = output_dirname
print('Input file:', cluster_file)
print('Output directory:', fasta_dir)

# Make sure the output directory exists; opening the FASTA files below would fail otherwise.
# NOTE: files left over from clusters that are no longer retained are not removed.
os.makedirs(fasta_dir, exist_ok=True)

print(len(cluster_table), 'clusters to process')

# Discard clusters without target proteins
cluster_table = {k: cluster_table[k] for k in cluster_table if single_family_target_accessions & set(cluster_table[k])}
print('Retained %i clusters with target proteins' % len(cluster_table))

# get taxIDs to discard ORFans and purely fungal clusters
all_remaining_seqids = [seqid for k in cluster_table for seqid in cluster_table[k]]
all_remaining_taxids = set(int(taxid_table[seqid]) for seqid in all_remaining_seqids)
all_lineages = {tx: ncbi.get_lineage(tx) for tx in all_remaining_taxids}
all_remaining_fungal_taxids = {tx for tx in all_remaining_taxids if fungi_taxid in all_lineages[tx]}
all_fungal_lineage_ranks = ncbi.get_rank([lintx for fgtx in all_remaining_fungal_taxids for lintx in all_lineages[fgtx]])


nbs_of_species = {}
nbs_of_fungi = {}
nbs_of_fungal_groups = {}  # families or phyla, depending on the code below
for k in cluster_table:
    seqids = cluster_table[k]
    seqtx = [int(taxid_table[sqid]) for sqid in seqids]
    fungaltx = all_remaining_fungal_taxids & set(seqtx)
    nbs_of_species[k] = len(set(seqtx))
    nbs_of_fungi[k] = len(fungaltx)
    # Inspecting phyla: take the first lineage entry ranked 'phylum' for each fungal taxid
    fungal_phyla = []
    for fgtx in fungaltx:
        current_phylum = None
        for lintx in all_lineages[fgtx]:
            if all_fungal_lineage_ranks[lintx] == 'phylum':
                current_phylum = lintx
                break
        assert current_phylum is not None
        fungal_phyla.append(current_phylum)
    assert len(fungal_phyla) == len(fungaltx)
    nbs_of_fungal_groups[k] = len(set(fungal_phyla))  # note: this ignores incertae sedis


# Keep clusters whose fungal members belong to at most one phylum
cluster_table = {k: cluster_table[k] for k in cluster_table if nbs_of_fungal_groups[k] <= 1}
saved_query_proteins = sum(acc in target_accessions for k in cluster_table for acc in cluster_table[k])
print('Retained %i clusters, %i target proteins, with a single fungal taxonomic group' % (len(cluster_table), saved_query_proteins))


# Keep clusters in which fewer than 60% of the distinct taxids are fungal
cluster_table = {k: cluster_table[k] for k in cluster_table if nbs_of_fungi[k] < 0.6*nbs_of_species[k]}
saved_query_proteins = sum(acc in target_accessions for k in cluster_table for acc in cluster_table[k])
print('Retained %i clusters, %i target proteins, with mostly non-fungal taxonomy' % (len(cluster_table), saved_query_proteins))


# Keep clusters with enough distinct taxids
cluster_table = {k: cluster_table[k] for k in cluster_table if nbs_of_species[k] >= min_number_of_species_in_cluster}
saved_query_proteins = sum(acc in target_accessions for k in cluster_table for acc in cluster_table[k])
print('Retained %i clusters, %i target proteins, with at least %i species' % (len(cluster_table), saved_query_proteins, min_number_of_species_in_cluster))


# Size statistics are only defined when at least one cluster survived the filters
if cluster_table:
    final_cluster_sizes = [len(cluster_table[k]) for k in cluster_table]
    print('Average final cluster size:', sum(final_cluster_sizes)/len(final_cluster_sizes))
    print('Largest cluster contains %i sequences' % max(final_cluster_sizes))
    print('Smallest cluster contains %i sequences' % min(final_cluster_sizes))
else:
    print('No clusters passed the filters')

acc_to_cluster = {acc:key for key in cluster_table for acc in cluster_table[key]}
all_sequences = SeqIO.parse('standardized_final_sequences.fa', 'fasta')
written_sequences = 0
# Clusters whose FASTA has already been opened in this run: the first write truncates
# the file and later writes append, so re-running the cell does not duplicate sequences.
started_clusters = set()
for seq in all_sequences:
    cluster_id = acc_to_cluster.get(seq.id)
    if cluster_id is None:
        # sequence does not belong to any retained cluster
        continue
    write_mode = 'a' if cluster_id in started_clusters else 'w'
    started_clusters.add(cluster_id)
    # fasta_dir is a plain string, so build the path with os.path.join rather than `/`
    with open(os.path.join(fasta_dir, cluster_id + '.fa'), write_mode) as h:
        SeqIO.write(seq, h, 'fasta')
    written_sequences += 1
print('Saved %i sequences' % written_sequences)

In [None]:
# Recompute the sizes from the current cluster table and plot them on a
# linear (left) and log10 (right) scale.
cluster_sizes = [len(members) for members in cluster_table.values()]
fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(8, 4))
ax_linear.hist(cluster_sizes, bins=40)
ax_linear.set_title('Distribution of cluster sizes')
ax_log.hist(np.log10(cluster_sizes), bins=40)
ax_log.set_title('Log-size')
fig.tight_layout()
plt.show()

In [None]:
# Fraction of fungal taxids among the distinct taxids of each cluster:
# for the clusters still in cluster_table, and for every cluster with recorded counts.
fungal_props_filtered = np.array([nbs_of_fungi[cid] / nbs_of_species[cid] for cid in cluster_table])
fungal_props_raw = np.array([n_fungi / nbs_of_species[cid] for cid, n_fungi in nbs_of_fungi.items()])
fig, (ax_initial, ax_final) = plt.subplots(1, 2, figsize=(8, 4))
ax_initial.set_title('Initial proportion of fungal taxa')
# purely fungal clusters (proportion == 1) are left out of the left-hand histogram
not_purely_fungal = fungal_props_raw < 1
ax_initial.hist(fungal_props_raw[not_purely_fungal], bins=40)
ax_final.set_title('Final proportion of fungal taxa')
ax_final.hist(fungal_props_filtered, bins=40)
fig.tight_layout()
plt.show()

Summarizing the clustering results:

In [None]:
# Write one tab-separated row per sequence: cluster ID, sequence ID,
# taxon name, and whether the taxon is fungal.
cluster_summary_file = output_dirname + '_summary.tsv'
print('Saving summary in', cluster_summary_file)
with open(cluster_summary_file, 'w') as h:
    for k, seqids in cluster_table.items():
        seqtx = [int(taxid_table[sqid]) for sqid in seqids]
        seqspec = ncbi.get_taxid_translator(seqtx)
        fungaltx = set(seqtx) & all_remaining_fungal_taxids
        for sid, stx in zip(seqids, seqtx):
            row = (k, sid, seqspec[stx], stx in fungaltx)
            h.write('\t'.join(str(field) for field in row) + '\n')

## Function definitions

Parsing cluster files:

In [None]:
def parse_mmseqs2(cluster_file_path):
    """
    Parse a two-column cluster table (reference ID, member ID per line).

    Returns a dict indexed with cluster names (the reference IDs),
    storing lists of accessions in the cluster, in file order.
    Blank lines are skipped; a non-blank line that does not have exactly
    two whitespace-separated fields raises ValueError.
    """
    cluster_table = {}
    with open(cluster_file_path) as cluster_file:
        for line in cluster_file:
            fields = line.split()
            if not fields:
                # blank line (e.g. at the end of the file): nothing to unpack
                continue
            ref_id, mem_id = fields
            cluster_table.setdefault(ref_id, []).append(mem_id)
    return cluster_table

def parse_cdhit(cluster_file_path):
    """
    Parse a cluster file in which each cluster starts with a '>' header line
    followed by one line per member.

    The cluster name is the header text after '>' with spaces replaced by
    underscores. For member lines, the accession is the third
    whitespace-separated field without its first character and its last
    three characters (e.g. '>ACC...' gives 'ACC').

    Returns a dict mapping cluster names to lists of accessions.
    """
    clusters = {}
    cluster_name = ''
    with open(cluster_file_path) as handle:
        for raw_line in handle:
            if raw_line.startswith('>'):
                # header line: open a new, empty cluster
                cluster_name = raw_line[1:].strip().replace(' ', '_')
                clusters[cluster_name] = []
                continue
            fields = raw_line.strip().split()
            clusters[cluster_name].append(fields[2][1:-3])
    return clusters

def parse_mcl(cluster_file_path):
    """
    Parse a cluster file with one cluster per line, members separated by whitespace.

    Returns a dict mapping the zero-based line number (as a string)
    to the list of accessions on that line.
    """
    with open(cluster_file_path) as handle:
        return {str(line_no): line.split() for line_no, line in enumerate(handle)}

In [None]:
def parse_clustering(cluster_file_path, algorithm):
    """
    Parse a clustering result with the parser matching `algorithm`.

    Parameters:
        cluster_file_path: path to the clustering output file.
        algorithm: one of 'mmseqs2', 'cd-hit' or 'mcl'.

    Returns the dict produced by the selected parser
    (cluster name -> list of accessions).

    Raises NotImplementedError, naming the offending value, for any other algorithm.
    """
    if algorithm == 'mmseqs2':
        return parse_mmseqs2(cluster_file_path)
    if algorithm == 'cd-hit':
        return parse_cdhit(cluster_file_path)
    if algorithm == 'mcl':
        return parse_mcl(cluster_file_path)
    raise NotImplementedError(
        "Unsupported clustering algorithm %r; expected 'mmseqs2', 'cd-hit' or 'mcl'" % (algorithm,)
    )