## Description  
In this notebook, we analyze the processed and recomputed trees to classify them into three classes: vertical evolution, unsupported HGT, supported HGT.      

In [None]:
%matplotlib notebook

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import os
from path import Path
from ete3 import Tree
from Bio import SeqIO
import ete3
from ete3 import Tree, NCBITaxa
import re
from path import Path
from warnings import warn
from collections import Counter
import os
from matplotlib import pyplot as plt
import numpy as np
from collections import defaultdict
from hgt_algorithms import *

In [None]:
# ete3 NCBI taxonomy interface; used below for lineages, species topologies and taxid -> name translation.
ncbi = NCBITaxa()

## Data

In [None]:
# Input directory: every file whose name ends with 'treefile' is read as a gene tree.
tree_directory = Path('third_round_trees')

In [None]:
# Output root for the relabelled gene/species trees (one subdirectory per classification).
final_tree_dir = Path('final_processed_trees') 

In [None]:
# Map protein accession -> taxid, read from a whitespace-separated two-column table.
# Taxids are kept as strings, exactly as they appear in the file; blank lines are skipped.
# The file is opened in a `with` block so the handle is closed once the table is built.
with open('final_homolog_taxid_table') as taxid_file:
    taxid_fields = (line.split() for line in taxid_file)
    taxid_table = {fields[0]: fields[1] for fields in taxid_fields if fields}

In [None]:
# NCBI Taxonomy identifiers of major groups. `fungi_taxid` is matched against the integer
# lineages returned by `ncbi.get_lineage` to decide which species are fungal.
# NOTE(review): `bacteria_taxid` and `animal_taxid` are not referenced in the cells visible here.
fungi_taxid = 4751
bacteria_taxid = 2
animal_taxid = 33208

In [None]:
# Set of target protein accessions, one per line; blank lines are ignored.
# The file is opened in a `with` block so the handle is closed after reading.
with open('basal_accessions.txt') as accession_file:
    target_accessions = {line.strip() for line in accession_file if line.strip()}

## Tree processing

In [None]:
# Result table: the first row is the header, every later row describes one fungal clade.
# Column/value mapping worth knowing when indexing rows later on:
#   'hgt_criterion'  <- cluster_type ('hgt' when the clade displacement index is > 0, else 'homoplasy')
#   'well_supported' <- 'Yes' / 'No'
#   'split_distance' <- cdi (displacement index of this clade)
#   'repositionable' <- min_cdi_in_G (second element of the optimal clade location)
hgt_data = [['cluster_id', 
             'hgt_criterion',
             'well_supported',
             'fungal_LCA',
             'gene_origin',
             'sister1_LCA',
             'sister2_LCA',
             'joint_sister_LCA',
             'fungal_accessions',
             'gene_tree_leaves',
             'fungal_species_nb', 
             'all_species_nb', 
             'split_distance',
             'repositionable',
             'fungal_clade_support',
             'average adjacent branch support',
             'fungal_branch_length']]

# os.listdir returns entries in arbitrary order; sorting makes the processing order
# (and therefore the row order of the result table) reproducible between runs.
tree_files = sorted(f for f in os.listdir(tree_directory) if f.endswith('treefile'))
nb_of_files = len(tree_files)
print(nb_of_files, 'trees to process')

# Counters of processed / filtered-out trees.
nb_of_all_trees = 0
ORFan_trees = 0
purely_fungal_trees = 0
no_target_proteins = 0

# Per-tree and per-clade summaries used by the analysis cells further down.
all_branch_lengths = []       # one list of branch lengths per tree file, in `tree_files` order
fg_clade_branch_lengths = []  # one entry per fungal clade
tree_sizes = []               # number of leaves of every tree file, in `tree_files` order

for file_id, filename in enumerate(tree_files):
    if not file_id % 10:
        print(file_id, 'out of', nb_of_files)
    # Cluster accession = file name without its last extension.
    cluster_accession = '.'.join(filename.split('.')[:-1])
    
    # Per-tree defaults. Most of these are placeholders that are either overwritten in the
    # clade loop below or not read again in this cell; they are kept so that the global
    # state left behind by a skipped tree stays well defined.
    hgt_criterion = ''    
    bdi = ''
    repositionable = ''
    sister1_lca = ''
    sister2_lca = ''
    joint_lca = ''
    fungal_lca = ''
    well_supported = ''
    gene_origin = ''
    fg_support = ''
    fungal_branch_length = -1
    average_posthgt_substitution = -1  # average length from fungal leaves to root of fungal clade
    
    
    G = Tree(tree_directory / filename, format=1) 
    nb_of_all_trees += 1
    # Record the leaf count of every tree; the summary cell reports how many trees exceed 300 leaves.
    tree_sizes.append(len(G))
    # With format=1 the internal node names carry the branch supports; copy them to `support`.
    for n in G.traverse('postorder'):
        if n.is_leaf():
            n.support = 100.  # by definition
        elif not n.is_root():
            if n.name:
                n.support = float(n.name)
            else:
                n.support = 100  # IQTree doesn't assign support values for identical proteins 
                
#            # Version for two types of branch support from IQTree (Ultrafast Bootstrap + SH-aLRT):
#             assert not n.name or len(n.name.split('/')) == 2
#             if len(n.name.split('/')) == 2:
#                 n.support = float(n.name.split('/')[1])
#                 n.name = '' 
#             else:
#                 n.support = 0
            
    # Midpoint rooting; branch lengths are recorded for every tree, including those skipped below.
    G.set_outgroup(G.get_midpoint_outgroup())
    all_branch_lengths.append([n.dist for n in G.traverse() if not n.is_root()])
    
    # Labelling internal nodes:
    for i, n in enumerate(G.traverse('postorder')):
        if not n.is_leaf():
            n.name = 'node%i' % i
            
    # Get taxonomic information
    protein_accessions = {l.name for l in G}
    accession_to_taxid = {acc: taxid_table[acc] for acc in protein_accessions}
    all_species = set(accession_to_taxid.values())
    # Trees with fewer than four species are counted as ORFan trees and skipped.
    if len(all_species) < 4:
        ORFan_trees += 1
        continue
    lineages = {tx: ncbi.get_lineage(tx) for tx in all_species} # note: lineages coded in integers here
    fungal_taxids = {tx for tx in all_species if fungi_taxid in lineages[tx]} # note: taxids coded in strings here 
    fungal_proteins = {acc for acc in protein_accessions if taxid_table[acc] in fungal_taxids}
    target_proteins = fungal_proteins.intersection(target_accessions)
    
    # Skip trees without any target protein ...
    if not target_proteins:
        no_target_proteins += 1
        continue
    # ... and trees with at most one non-fungal species.
    if len(fungal_taxids) >= len(all_species) - 1:
        purely_fungal_trees += 1
        continue
        
    # Identify displaced clades
    S = ncbi.get_topology(all_species, intermediate_nodes=False)
    UG = UnrootedForest(G)
    US = UnrootedForest(S, binary=False)
    # Iterated below as (clade_edge, cdi) pairs.
    fungal_cdis = UG.get_clade_displacements(fungal_proteins, US, accession_to_taxid)
    assert fungal_cdis,'Fungal clades not found'
    # Identify if correct placement of fungal clade exists in G
    # This can be done once for all fungal clades
    predicted_fungal_clade_location = UG.get_optimal_clade_location(fungal_proteins, US, accession_to_taxid)
    min_cdi_in_G = predicted_fungal_clade_location[1]
    
    
    # Check each identified clade for other HGT criteria
    # Check supports and branch lengths
    for clade_edge, cdi in fungal_cdis:
        # The three edges meeting at clade_edge[0]; their mean support is the local support measure.
        adjacent_nodes = UG.get_neighbours(clade_edge[0])
        adjacent_edges = [frozenset((clade_edge[0], adj_node)) for adj_node in adjacent_nodes]
        assert len(adjacent_edges) == 3
        supports = [UG.edge_supports[edge] for edge in adjacent_edges]
        average_support = sum(supports)/3.
        fungal_branch_length = UG.edge_lengths[frozenset(clade_edge)]
        fungal_branch_support = UG.edge_supports[frozenset(clade_edge)]
        fg_clade_branch_lengths.append(fungal_branch_length)
        if cdi > 0:
            cluster_type = 'hgt' 
        else:
            cluster_type = 'homoplasy'
        # Get gene origin
        # The two clades hanging off clade_edge[0] other than the fungal clade itself.
        neighbour_clade_leaves = [UG.get_clade_leaves((clade_edge[0], adj_node)) for adj_node in adjacent_nodes if adj_node != clade_edge[1]]
        assert len(neighbour_clade_leaves) == 2, 'Improper number of neighbours'
        # mask other fungal proteins
        neighbour_clade_leaves = [[leaf_name for leaf_name in clade if leaf_name not in fungal_proteins] for clade in neighbour_clade_leaves]
        neighbour_clade_species = [set(accession_to_taxid[leaf_name] for leaf_name in clade) for clade in neighbour_clade_leaves]
        all_neighbour_species = neighbour_clade_species[0] | neighbour_clade_species[1]
        # The taxid of the root of each species topology is used as the LCA of that species set.
        sister1_lca = ncbi.get_topology(neighbour_clade_species[0]).taxid
        sister2_lca = ncbi.get_topology(neighbour_clade_species[1]).taxid
        joint_neighbour_lca = ncbi.get_topology(all_neighbour_species).taxid
        fungal_clade_leaves = UG.get_clade_leaves(clade_edge)
        fungal_species = set([accession_to_taxid[leaf_name] for leaf_name in fungal_clade_leaves])
        fungal_lca = ncbi.get_topology(fungal_species).taxid
        # Gene origin: if both sisters share an LCA, that taxon is the origin; if exactly one
        # sister's LCA equals the joint LCA, the other sister's LCA is reported;
        # otherwise the origin cannot be determined.
        if cdi == 0:
            gene_origin = 'N/A'
        elif sister1_lca == sister2_lca:
            assert sister1_lca == joint_neighbour_lca
            gene_origin = joint_neighbour_lca
        elif sister1_lca == joint_neighbour_lca:
            gene_origin = sister2_lca
        elif sister2_lca == joint_neighbour_lca:
            gene_origin = sister1_lca
        else:
            gene_origin = 'Undetermined'
        
        # A transfer is well supported when the clade is displaced, the mean adjacent support is
        # at least 90, min_cdi_in_G is zero and the origin could be determined.
        highly_certain = cluster_type == 'hgt' and average_support >= 90 and min_cdi_in_G == 0 and gene_origin!='Undetermined'
        highly_certain = 'Yes' if highly_certain else 'No'
        assert cluster_accession and cluster_type and highly_certain and fungal_lca and \
                gene_origin and sister1_lca and sister2_lca and joint_neighbour_lca
        # Note on the accession column: accessions of duplicated sequences can appear here
        # (due to new proteome releases); they are kept because the xenolog analyses need
        # the complete clades.
        hgt_data.append([cluster_accession, 
                         cluster_type,
                         highly_certain,
                         fungal_lca,
                         gene_origin,
                         sister1_lca,
                         sister2_lca,
                         joint_neighbour_lca,
                         '"' + ','.join(fungal_clade_leaves) + '"',
                         len(G),
                         len(fungal_species),
                         len(all_species), 
                         cdi,
                         min_cdi_in_G,
                         fungal_branch_support, 
                         average_support,
                         fungal_branch_length])


Diagnostic information (state left over from the last tree processed by the loop above):

In [None]:
# Inspect the state left behind by the tree-processing loop.
# `file_id`, `filename` and `G` always belong to the last tree that was read, whereas `S`,
# `fungal_cdis`, `predicted_fungal_clade_location` and `supports` are only assigned for trees
# that pass the loop's filters - if the last tree was skipped they still hold the values of an
# earlier tree. `fungal_branch_length` is reset to -1 at the start of every tree.
print(file_id, filename)
print(G.get_ascii(attributes=['name', 'support']))
print(S.get_ascii(attributes=['sci_name', 'taxid']))
print(fungal_cdis)
print(predicted_fungal_clade_location)
print(supports)
print(fungal_branch_length)

In [None]:
def _clade_accessions(row):
    """Return the fungal accessions of a result row (column 8) as a list of strings.

    The column stores a comma-separated list wrapped in double quotes. The quotes are
    removed before splitting; otherwise the first and the last accession keep a stray
    '"' and can never match an entry of `target_accessions`.
    """
    return row[8].replace('"', '').split(',')


# Row subsets used by several statistics below (the header row is excluded).
clade_rows = hgt_data[1:]
hgt_rows = [row for row in clade_rows if row[1] == 'hgt']
homoplasy_rows = [row for row in clade_rows if row[1] == 'homoplasy']
supported_rows = [row for row in clade_rows if row[2] == 'Yes']

print('Processing results:')

print('All trees:', nb_of_all_trees)
print('All fungal clades:', len(clade_rows))
#print('Trees with fungi:', nb_of_trees_with_any_fungus)
#print('Trees with fungal and non-fungal species:', nb_of_trees_with_mixed_species)
#print('Trees with monophyletic EDF fungi in mixed-species trees:', nb_of_trees_with_monophyletic_edf_fungi)
print('Number of trees with >300 leaves:', sum(x>300 for x in tree_sizes))
#print('Trees without target proteins:', no_target_proteins)

print('Nb of HGT candidates:')
print(len(clade_rows))
# Sorted so that the criteria are always reported in the same order.
criteria = sorted(set(row[1] for row in clade_rows))
print('Nb of HGT clades per criterion:')
for cr in criteria:
    print(cr, sum(row[1] == cr for row in clade_rows))
print('Nb of HGT trees per criterion:')
for cr in criteria:
    print(cr, len(set(row[0] for row in clade_rows if row[1] == cr)))
print('Nb of fungal transferred proteins with weak support:', sum(len(_clade_accessions(row)) for row in hgt_rows))
print('Nb of EDF transferred proteins with weak support:', sum(len(target_accessions.intersection(_clade_accessions(row))) for row in hgt_rows))
print('Number of well supported transfers:', len(supported_rows))
print('Number of trees with a well-supported transfer:', len(set(row[0] for row in supported_rows)))
# Guard against an empty candidate list instead of failing with ZeroDivisionError.
supported_proportion = round(len(supported_rows)/len(hgt_rows), 2) if hgt_rows else float('nan')
print('Proportion of well-supported transfers among displaced clade transfers:', supported_proportion)
# print('Number of well supported transfers with SH support > 85:', sum(x[2] == 'Yes' and x[15] > 85 for x in hgt_data))
print('Nb of well-supported fungal xenologs:', sum(len(_clade_accessions(row)) for row in supported_rows))
print('Nb of well-supported target xenologs:', sum(len(target_accessions.intersection(_clade_accessions(row))) for row in supported_rows))
print('Nb of fungal proteins with homoplasy:', sum(len(_clade_accessions(row)) for row in homoplasy_rows))
print('Nb of EDF proteins with homoplasy:', sum(len(target_accessions.intersection(_clade_accessions(row))) for row in homoplasy_rows))


**Saving the result table:**

In [None]:
# Serialise the result table (header included) as tab-separated text, one row per line.
table_lines = ['\t'.join(str(field) for field in row) for row in hgt_data]
with open('hgt_results.tsv', 'w') as results_file:
    results_file.write('\n'.join(table_lines))
    results_file.write('\n')

**Relabeling and saving the trees:**

In [None]:
# Make sure the output root exists; an already existing one is left untouched.
try:
    os.mkdir(final_tree_dir)
except FileExistsError:
    pass

# Create one subdirectory per classification, or empty it if it is left over from an earlier run.
for class_dirname in ('supported_hgt', 'unsupported_hgt', 'homoplasy', 'identity'):
    class_dir = final_tree_dir / class_dirname
    try:
        os.mkdir(class_dir)
    except FileExistsError:
        for stale_file in os.listdir(class_dir):
            os.remove(class_dir / stale_file)

# Destination of clades that are not well supported, keyed by their criterion.
criterion_dirnames = {'hgt': 'unsupported_hgt',
                      'homoplasy': 'homoplasy',
                      'identity': 'identity'}

# One gene tree / species tree pair is written per result row (i.e. per fungal clade).
for row in hgt_data[1:]:
    cluster_id = row[0]
    gene_tree = Tree(tree_directory / cluster_id + '.treefile', format=1)
    leaf_taxids = {leaf.name: int(taxid_table[leaf.name]) for leaf in gene_tree}
    sci_names = ncbi.get_taxid_translator(list(leaf_taxids.values()))
    species_tree = ncbi.get_topology(set(leaf_taxids.values()), intermediate_nodes=False)
    # Gene-tree leaves are renamed to species names; accession and taxid are kept as features.
    for leaf in gene_tree:
        leaf.accession = leaf.name
        leaf.taxid = leaf_taxids[leaf.name]
        leaf.name = sci_names[leaf.taxid].replace(' ', '_')
    # Species-tree leaves are named by taxid; translate them to species names as well.
    for leaf in species_tree:
        leaf.name = sci_names[int(leaf.name)].replace(' ', '_')

    # Well-supported clades take precedence over the criterion-based classification.
    if row[2] == 'Yes':
        destination = 'supported_hgt'
    elif row[1] in criterion_dirnames:
        destination = criterion_dirnames[row[1]]
    else:
        raise RuntimeError('sth went terribly wrong!')

    gene_tree.write(outfile=final_tree_dir / destination / cluster_id + '.genetree',
                    features=['accession', 'taxid'], format_root_node=False, format=1)
    species_tree.write(outfile=final_tree_dir / destination / cluster_id + '.speciestree',
                       format_root_node=False, format=9)

## Initial analysis of tree properties   
Used as a part of a manual verification of the results.   
It's recommended to also take a look at the trees themselves, as well as partial results from all stages of the analysis.  

Branch lengths:

In [None]:
# all branch lengths for trees with displaced clade criterion:
dcl_blen = [l for bl, hgt in zip(all_branch_lengths, hgt_data[1:]) for l in bl if hgt[1] == 'hgt' ]
# all branch lengths for well-supported HGTs:
wsp_blen = [l for bl, hgt in zip(all_branch_lengths, hgt_data[1:]) for l in bl if hgt[2]]

In [None]:
# Side-by-side branch-length histograms: all displaced clades vs. well-supported ones.
plt.figure(figsize=(8, 2))
branch_length_panels = [('All displaced clades', dcl_blen),
                        ('Well-supported', wsp_blen)]
for panel_nb, (panel_title, lengths) in enumerate(branch_length_panels, start=1):
    plt.subplot(1, 2, panel_nb)
    plt.title(panel_title)
    plt.hist(lengths, bins=80)
plt.tight_layout()
plt.show()

In [None]:
# Fungal clade supports for trees with displaced clade criterion:
dcl_fgsup = [hgt[-3] for hgt in hgt_data[1:] if hgt[1] == 'hgt' ]
# Fungal clade suports for well-supported HGTs:
wsp_fgsup = [hgt[-3] for hgt in hgt_data[1:] if hgt[2] == 'Yes']

In [None]:
# Local average supports for trees with displaced clade criterion:
dcl_avsup = [hgt[-2] for hgt in hgt_data[1:] if hgt[1] == 'hgt' ]
# Local average suports for well-supported HGTs:
wsp_avsup = [hgt[-2] for hgt in hgt_data[1:] if hgt[2] == 'Yes']

In [None]:
# 2x2 grid of support histograms: fungal-clade (FG) support on the top row, local average
# (LOC) support on the bottom row; all displaced clades on the left, well-supported on the right.
plt.figure(figsize=(8, 4))
support_panels = [('All displaced clades, FG support', dcl_fgsup),
                  ('Well-supported, FG support', wsp_fgsup),
                  ('All displaced clades, LOC support', dcl_avsup),
                  ('Well-supported, LOC support', wsp_avsup)]
for panel_nb, (panel_title, support_values) in enumerate(support_panels, start=1):
    plt.subplot(2, 2, panel_nb)
    plt.title(panel_title)
    plt.hist(support_values, bins=80)
plt.tight_layout()
plt.show()

Reasons why HGT candidates lack sufficient support:

In [None]:
# clade_id[i] = number of earlier rows of `hgt_data` with the same cluster id, i.e. a running
# per-cluster index of the clade. The header row is included so that `clade_id` lines up with
# `hgt_data` element by element. A running counter replaces re-scanning all previous rows.
rows_seen_per_cluster = Counter()
clade_id = []
for hgt in hgt_data:
    clade_id.append(rows_seen_per_cluster[hgt[0]])
    rows_seen_per_cluster[hgt[0]] += 1

In [None]:
# Pair every row with its per-cluster clade index so that each clade gets a unique key.
indexed_rows = list(zip(hgt_data, clade_id))
# Rows meeting the displaced clade criterion (the header row never matches 'hgt').
hgt_clades = [(row, clid) for row, clid in indexed_rows if row[1] == 'hgt']

# 'repositionable' column (4th from the end) greater than zero.
repositionable_problem = {(row[0], clid) for row, clid in hgt_clades if row[-4] > 0}
# Donor could not be determined.
origin_problem = {(row[0], clid) for row, clid in hgt_clades if row[4] == 'Undetermined'}
# branch_length_problem = [hgt[0] for hgt in hgt_data if hgt[-1] >= 2]
# Average adjacent branch support (2nd column from the end) below 90.
clade_support_problem = {(row[0], clid) for row, clid in hgt_clades
                         if row[-2] != 'N/A (leaf)' and int(row[-2]) < 90}
homoplasy = {(row[0], clid) for row, clid in indexed_rows if row[1] == 'homoplasy'}

In [None]:
# Report how many clades fall into each problem category.
problem_report = (('Problem with repositionable:', repositionable_problem),
                  ('Problem with gene origin:', origin_problem),
                  # ('Problem with branch length:', branch_length_problem),
                  ('Problem with fungal clade support:', clade_support_problem),
                  ('Homoplasy:', homoplasy))
for problem_label, problem_clades in problem_report:
    print(problem_label, len(problem_clades))
# print('Any problem:', len(repositionable_problem | origin_problem | branch_length_problem | clade_support_problem), 'out of', len(dcl_data))
# print('Non-problem:', len(dcl_data) - len(repositionable_problem | origin_problem | branch_length_problem | clade_support_problem))

In [None]:
# Input for the Venn diagram: descriptive label -> set of (cluster id, clade index) pairs
# of displaced clades showing that problem.
venn_data = {'Not repositionable' : repositionable_problem,
             'Undeterminable donor': origin_problem, 
             'Low branch support': clade_support_problem}

In [None]:
from venn import venn

In [None]:
venn(venn_data)
# Save before showing: on backends where plt.show() closes the figure, a later savefig
# would write an empty image.
# NOTE(review): CLUSTER_SUBDIR is not defined in the cells visible here - presumably it comes
# from `hgt_algorithms`; confirm, and make sure the 'Results' directory exists.
plt.savefig('Results/%s_clade_problems.png' % CLUSTER_SUBDIR, dpi=600)
plt.show()

## Select a random set of trees for manual inspection

In [None]:
import numpy.random as rd

In [None]:
# The 'well_supported' column holds the strings 'Yes' / 'No'; both are truthy, so the value
# has to be compared explicitly. dict.fromkeys de-duplicates while keeping the table order,
# so a cluster with several well-supported clades is listed once.
well_supported_hgt_clusters = list(dict.fromkeys(hgt[0] for hgt in hgt_data[1:] if hgt[2] == 'Yes'))
len(well_supported_hgt_clusters)

In [None]:
# Draw up to 10 *distinct* clusters for manual inspection. Candidates are de-duplicated and
# sampling is done without replacement, so no cluster can appear twice; the sample size is
# capped so that fewer than 10 candidates do not raise an error.
# NOTE(review): no random seed is set, so the selection differs between runs.
candidate_clusters = sorted(set(well_supported_hgt_clusters))
randomly_selected_clusters = rd.choice(candidate_clusters,
                                       min(10, len(candidate_clusters)),
                                       replace=False)
print(randomly_selected_clusters)

In [None]:
# Store the selected cluster ids, one per line (print appends the final newline).
with open('cluster_list_for_manual_verification.txt', 'w') as cluster_list_file:
    print('\n'.join(randomly_selected_clusters), file=cluster_list_file)