# Evolutionary Similarity Index

Set of functions to assess evolutionary similarity between gene families using $I_{ES}$.

Accepted input files are: amino acid [FASTA](https://en.wikipedia.org/wiki/FASTA_format), 
                                    [NEWICK](https://en.wikipedia.org/wiki/Newick_format), and 
                                    ".mldist" from [IQTree](http://www.iqtree.org/).
                                    
To start, execute the cell below by selecting it and pressing `CTRL+ENTER`, and then click the generated `Run All` button below. Or just go to the **Toolbar** above and select `Run`->`Run all cells`.

An **Error** message will be reported after the empty input form at the bottom of the notebook; fill the form in with your data to proceed. 

>If in GOOGLE COLAB, the `Run All` and `Submit` buttons may not work; in both cases, execute the cells below through the **Toolbar** menu.

In [None]:
import ipywidgets as widgets
# Imported in this cell because it is the first one executed; the button
# callback below needs both names before any later cell has run.
from IPython.display import Javascript, display


#
# Run all
#
def run_all(ev):
    """Ask the notebook front end to execute every cell below the current one.

    Parameters
    ----------
    ev
        Argument supplied by the button's click event; it is not used.
    """
    # The request is a JavaScript snippet run by the front end, so it only
    # works where the front end exposes `IPython.notebook`.
    display(Javascript('IPython.notebook.execute_cells_below()'))


start_running = widgets.Button(description='Run All',
                               button_style='success',
                               tooltip='Click here to start.')
# Register the callback function; passing the button itself would make the
# click fail because a Button object is not callable.
start_running.on_click(run_all)

display(start_running)

Test if in GOOGLE COLAB environment, and if yes, install `ETE3` and `iGraph` packages


In [None]:
# @title Test if in GOOGLE COLAB enviroment, and if yes, install missing packages
try:
  import google.colab
except ModuleNotFoundError:
    in_colab = False
else:
    in_colab = True
    !pip install python-igraph
    !pip install ete3
    
    from google.colab import files
    
    import requests
    functions = requests.get(
        'https://raw.githubusercontent.com/lthiberiol/evolSimIndex/master/base_functions.py'
    )
    with open('base_functions.py', 'w') as functions_handle:
        functions_handle.write(functions.text)

In [None]:
from io                     import BytesIO, StringIO
from IPython.display        import Javascript, FileLink

import numpy      as np
import pandas     as pd

import multiprocessing
import itertools
import re
import base_functions

## Simple parameters and data input interface

In [None]:
#
# Data input buttons
# Source of the pairwise distances; the value 0 stands for "nothing selected".
evol_dist_source = widgets.Dropdown(
    options=[
        ('', 0),
        ('FASTA files', 'fasta'),
        ('IQTree ".mldist" files', 'matrix'),
        ('newick files', 'tree'),
    ],
    disabled=False,
    indent=False,
    value=0,
    layout={'width': 'auto'},
)

# Starts disabled; it is enabled by the dropdown observer when FASTA is chosen.
must_align = widgets.Checkbox(
    value=False,
    disabled=True,
    indent=False,
    description='Provided FASTAS are not yet aligned',
    layout={'width': 'auto'},
)

# Ticked when leaf names carry a genome identifier only.
gene_ids = widgets.Checkbox(
    value=False,
    disabled=False,
    indent=False,
    description=(
        'Sequences are identified by genome only '
        '(all sequences from the same genome have the same name)'
    ),
    layout={'width': 'auto'},
)

# Minimum number of taxa shared by two gene families for them to be compared.
min_taxa_overlap = widgets.IntText(value=5, indent=False, disabled=False)

# Character separating genome and gene ids; 0 stands for "nothing selected".
genome_gene_sep = widgets.Dropdown(
    options=[
        ('', 0),
        ('<genome>_<gene>', '_'),
        ('<genome>|<gene>', '|'),
        ('<genome>.<gene>', '.'),
    ],
    disabled=False,
    indent=True,
    value=0,
    layout={'width': 'auto'},
)

# Upload control: any extension accepted, several files allowed, and
# disabled until a distance source has been selected.
input_files = widgets.FileUpload(
    accept='',
    multiple=True,
    disabled=True,
)

def toggle_align_widgets(dropdown_source):
    """Update the upload and alignment widgets after the source dropdown changes.

    Parameters
    ----------
    dropdown_source
        Change notification; only its `new` attribute (the newly selected
        value) is read.
    """
    selected = dropdown_source.new

    # Uploading is allowed only once a non-empty source has been picked.
    input_files.disabled = not selected

    # The alignment checkbox is meaningful for FASTA input only.
    is_fasta = selected == 'fasta'
    must_align.disabled = not is_fasta
    if not is_fasta:
        must_align.value = False
    
def toggle_genome_gene_sep(checkbox):
    """Disable and reset the separator dropdown while the genome-only box is ticked.

    Parameters
    ----------
    checkbox
        Change notification; only its `new` attribute is read.
    """
    genome_only = checkbox.new
    genome_gene_sep.disabled = genome_only
    if not genome_only:
        return
    # Back to the empty placeholder option.
    genome_gene_sep.value = 0
        
def clear_uploads(*args):
    """Discard uploaded files and reset the source and separator selections.

    Registered as the click handler of the "Clear upload" button; any
    positional arguments passed by the event are ignored.
    """
    # Empty the upload container in place and reset the widget's private counter.
    # NOTE(review): `.value.clear()` and `_counter` look specific to one
    # ipywidgets version -- confirm before upgrading.
    input_files.value.clear()
    input_files._counter = 0
    input_files.disabled = False
    
    # `evol_dist_source` is observed by `toggle_align_widgets`, which also
    # writes `input_files.disabled`; keep this assignment after the line above.
    evol_dist_source.value = 0
    genome_gene_sep.value  = 0
    
    # Forget that the example data set was requested.
    example.flag = False
        
# Button that discards the current upload and selections.
clear_button = widgets.Button(
    description='Clear upload',
    button_style='warning',
    tooltip='Click to clear uploaded files',
)
clear_button.on_click(clear_uploads)

# React to value changes of the source dropdown and the genome-only checkbox.
evol_dist_source.observe(toggle_align_widgets, names='value')
gene_ids.observe(toggle_genome_gene_sep, names='value')


#
# load example
#
example = widgets.Button(description='Load example',
                         button_style='success',
                         tooltip     ='Load example parameters')
# Ad-hoc attribute on the button: True once the example data set was requested.
# Later cells read it to skip the upload check and to pick the example matrices.
example.flag = False    
def load_example(*args):
    """Fill the form with the example parameters (click handler of `example`).

    Positional arguments passed by the click event are ignored.
    """
    evol_dist_source.disabled = False
    evol_dist_source.value    = 'matrix'
    
    genome_gene_sep.value     = '_'
    
    # Must come after the `evol_dist_source.value` assignment: that dropdown is
    # observed by `toggle_align_widgets`, which also writes `input_files.disabled`.
    input_files.disabled      = True
    
    example.flag = True
    
example.on_click(load_example)

#
# submit data
#
def run_all(ev):
    """Ask the notebook front end to execute all cells below; `ev` is unused."""
    execute_below = Javascript('IPython.notebook.execute_cells_below()')
    display(execute_below)


# Clicking "Submit" runs the cells below with the values held by the form.
submit = widgets.Button(
    description="Submit",
    button_style='success',
    tooltip='Click here to continue with provided data',
)
submit.on_click(run_all)

#
# threads
#
# Slider bounded by 1 and the CPU count reported by `multiprocessing`.
num_threads = widgets.IntSlider(min=1, max=multiprocessing.cpu_count())

#
# download through colab
#
if in_colab:
    def download(*args):
        """Hand 'Ies.csv' to Colab's `files.download`; event arguments are ignored."""
        files.download('Ies.csv')

    # Button shown after the results are written when running in Colab.
    download_csv = widgets.Button(
        description='Download Ies.csv',
        button_style='success',
    )
    download_csv.on_click(download)

In [None]:
# Render the input form: each labelled control sits in its own row, followed by
# the upload / example buttons and the clear and submit buttons.
# The first label points at the "Load example" button because the dropdown
# itself has no "example" entry.
display(widgets.HBox([widgets.Label('Source of pairwise distances (or click "Load example" below): '), 
                      evol_dist_source]),
        must_align,
        gene_ids,
        widgets.HBox([widgets.Label('Genome and gene ids are separated by which character: '),
                      genome_gene_sep]),
        
        widgets.HBox([widgets.Label('Minimum taxa containing both assessed gene families: '),
                      min_taxa_overlap]),
        
        widgets.HBox([widgets.Label('Number of threads to use: '),
                      num_threads]),
        
        widgets.HBox([input_files, example]),
        clear_button,
        submit
       )

If the `Submit` button above is not working, in the **Toolbar** click `Cell`->`Run All Below`

#### Parsing provided data and parameters

In [None]:
# @title Breaking "run all" if no data was uploaded

# Stop here unless more than one file was uploaded or the example was requested;
# raising keeps the cells below from running on an empty form.
if input_files._counter <= 1 and not example.flag:
    raise ValueError('You must upload at least two files!')

In [None]:
# Pick the regular expression that splits a leaf name into two groups
# (genome id, gene id) according to the separator chosen in the form.
# Raw strings keep the backslashes without invalid-escape warnings.
if   genome_gene_sep.value == '_':
    # The genome part must look like GCA_/GCF_ followed by digits and an
    # optional one-digit ".N" suffix; either "_" or "|" may follow it.
    parse_leaf = re.compile(r'^(GC[AF]_\d+(?:\.\d)?)[_|](.*)$')
elif genome_gene_sep.value == '|':
    parse_leaf = re.compile(r'^(\S+?)\|(\S+)$')
elif genome_gene_sep.value == '.':
    # The genome part must be purely numeric.
    parse_leaf = re.compile(r'^(\d+?)\.(.*)$')
elif gene_ids.value:
    # Ticking the genome-only checkbox resets the separator to the empty
    # option, so no pattern applies. Bind the name explicitly instead of
    # leaving it undefined (or stale from an earlier run).
    # NOTE(review): assumes `correlate_evolution` does not use `parse_leaf`
    # when `gene_ids` is True -- confirm in base_functions.
    parse_leaf = None
else:
    raise ValueError('You must select which character separates genome and gene ids, '
                     'or tick the "Sequences are identified by genome only" box!')
    
# At least two shared taxa are enforced; the widget is updated so the form
# shows the value actually used.
if min_taxa_overlap.value < 2:
    min_taxa_overlap.value = 2

corr_evol = base_functions.correlate_evolution(gene_ids       =gene_ids.value,
                                               parse_leaf     =parse_leaf,
                                               min_taxa_overlap=min_taxa_overlap.value)

In [None]:
# @title Loading data...

# One distance matrix per gene family, with the family name at the same
# position in `group_names`.
dist_matrices = []
group_names   = []

if evol_dist_source.value == 'tree':
    # Uploaded newick files: decode the bytes and let `corr_evol` derive the matrix.
    for file_name, file_itself in input_files.value.items():
        dist_matrices.append( 
           corr_evol.get_matrix_from_tree(file_itself['content'].decode('utf-8')) 
        )
        group_names.append( file_name )
        
elif evol_dist_source.value == 'matrix':
    if not input_files._counter and example.flag:
        # Nothing uploaded and the example was requested: read the example
        # matrices from the project's repository by URL.
        for file_itself in ['000284', '000302', '000304', '000321', '000528', 
#                             '000574', '000575', '000595', '000602', '000607',
#                             '000611', '000617', '000620', '000621', '000625',
                            '000632', '000645', '000647', '000657', '000663']:
            dist_matrices.append(corr_evol.load_matrix(
                f'https://raw.githubusercontent.com/lthiberiol/evolSimIndex/master/tests/{file_itself}.mldist'
            ))
            group_names.append( file_itself )
    else:
        # Uploaded ".mldist" files are handed over as in-memory byte streams.
        for file_name, file_itself in input_files.value.items():
            dist_matrices.append( 
                corr_evol.load_matrix(BytesIO(file_itself['content'])) 
            )
            group_names.append( file_name )

else:
    # Any other selection (including the empty one) is not loaded by this
    # cell; fail here with a clear message instead of passing empty lists on.
    raise ValueError(
        f'Unsupported source of pairwise distances: {evol_dist_source.value!r}. '
        'This cell only loads newick files and IQTree ".mldist" files.'
    )

## Assessing evolutionary similarity between gene families!

In [None]:
%%time 

print(f'Assessing Ies between {len(group_names)} genes')
print(f'\t**using {num_threads.value} threads\n')

# Both lists are in the same order, so the i-th pair of matrices matches
# the i-th pair of names.
matrix_combinations     = itertools.combinations(dist_matrices, 2)
group_name_combinations = itertools.combinations(group_names,   2)
# reshape(-1, 2) keeps the array two-dimensional even when there are no
# pairs, so the column slicing below cannot fail on an empty result.
group_name_combinations = np.array( list(group_name_combinations) ).reshape(-1, 2)

# The context manager shuts the worker processes down even if a task raises;
# starmap() returns only after every pair has been processed.
with multiprocessing.Pool(processes=num_threads.value) as pool:
    results = pool.starmap(corr_evol.assess_coevolution, matrix_combinations)

coevol_df = pd.DataFrame(columns=['R_squared', 'Ibc', 'Ies'], 
                         data   =results)

coevol_df['gene1'] = group_name_combinations[:, 0]
coevol_df['gene2'] = group_name_combinations[:, 1]

coevol_df.to_csv('Ies.csv')

# Offer the results file: a plain link locally, the download button
# (defined only when running in Colab) otherwise.
if not in_colab:
    local_file = FileLink('Ies.csv', result_html_prefix="Click here to download: ")
    display(local_file)
else:
    display(download_csv)