### Goal

Combine DIAMOND (BLASTP) and PANNZER annotations into one table per species.

### Input

- Directory of diamond (BLASTP) annotations
- Directory of PANNZER annotations

### Output

One tab-separated file per species, named `<species>_annotations_combined.tsv`.

In [1]:
import pandas as pd
import glob
from functools import reduce
from collections import defaultdict

#### Annotation directories

In [2]:
# Input directories. Each value keeps its trailing "/" because the cells below
# build file patterns by plain string concatenation (e.g. diamond_dir + "*best_hit").
diamond_dir = "diamond/"
pannzer_dir = "pannzer/"
# Must stay defined: the InterPro listing cell globs interpro_dir + "*short.tsv"
# and raises NameError when this assignment is commented out.
interpro_dir = "interpro_files/"

#### List diamond files

In [3]:
# Every file in the DIAMOND directory whose name ends in "best_hit".
diamond_pattern = diamond_dir + "*best_hit"
diamond_files = glob.glob(diamond_pattern)

In [4]:
# Collect the InterPro files whose names end in "short.tsv" and show the list
# as the cell output.
interpro_pattern = interpro_dir + "*short.tsv"
interpro_files = glob.glob(interpro_pattern)
interpro_files

['interpro_files/hvul_combined_short.tsv',
 'interpro_files/epal_combined_short.tsv',
 'interpro_files/hech_combined_short.tsv',
 'interpro_files/mvir_combined_short.tsv',
 'interpro_files/adig_combined_short.tsv',
 'interpro_files/dgig_combined_short.tsv',
 'interpro_files/chem_combined_short.tsv',
 'interpro_files/pdam_combined_short.tsv',
 'interpro_files/hsym_combined_short.tsv',
 'interpro_files/hvu2_combined_short.tsv',
 'interpro_files/nvec_combined_short.tsv',
 'interpro_files/ofav_combined_short.tsv',
 'interpro_files/amil_combined_short.tsv',
 'interpro_files/aaur_combined_short.tsv',
 'interpro_files/aten_combined_short.tsv',
 'interpro_files/spis_combined_short.tsv']

#### Function that combines diamond and pannzer annotations

In [5]:
def _resolve_pannzer_file(directory, species, suffix):
    """Return the first path matching directory + species + suffix, or raise FileNotFoundError."""
    pattern = directory + species + suffix
    matches = glob.glob(pattern)
    if not matches:
        # The bare `glob.glob(...)[0]` used to fail with an uninformative
        # "IndexError: list index out of range" here.
        raise FileNotFoundError(
            "no PANNZER file for species " + repr(species) + " matches " + pattern
        )
    return matches[0]


def combine_annotations(diamond_file, species, annotation_dir=None):
    """Merge one species' DIAMOND hits with its PANNZER descriptions and GO terms.

    Parameters
    ----------
    diamond_file : str
        Path to a header-less, tab-separated DIAMOND table with 13 columns
        (gene_id, blastp_hit, blastp_hit_id, pident, length, mismatch,
        gapopen, qstart, qend, sstart, send, evalue, bitscore).
    species : str
        Prefix of the PANNZER files ``<species>_descriptions.out`` and
        ``<species>_GO_terms.out``.
    annotation_dir : str, optional
        Directory prefix (including the trailing separator) that holds the
        PANNZER files. When omitted, the module-level ``pannzer_dir`` is used,
        so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Outer join on ``gene_id`` with the columns ``gene_id``, ``blastp_hit``,
        ``evalue``, ``pannzer_desc``, ``pannzer_name``, ``molecular_function``,
        ``biological_process`` and ``cellular_component``. Genes missing from
        an input have NaN in that input's columns.

    Raises
    ------
    FileNotFoundError
        If either PANNZER file for ``species`` cannot be found.
    """
    if annotation_dir is None:
        annotation_dir = pannzer_dir

    # Locate both PANNZER inputs before loading anything, so a missing file is
    # reported immediately and with its name.
    pannzer_desc_file = _resolve_pannzer_file(annotation_dir, species, "_descriptions.out")
    pannzer_GO_file = _resolve_pannzer_file(annotation_dir, species, "_GO_terms.out")

    diamond_annotations = pd.read_csv(
        diamond_file,
        sep="\t",
        names=['gene_id', 'blastp_hit', 'blastp_hit_id', 'pident',
               'length', 'mismatch', 'gapopen', 'qstart',
               'qend', 'sstart', 'send', 'evalue', 'bitscore'],
    )
    diamond_annotations = diamond_annotations[['gene_id', 'blastp_hit', 'evalue']]

    pannzer_desc = pd.read_csv(pannzer_desc_file, sep="\t").rename(
        columns={'qpid': 'gene_id', 'desc': 'pannzer_desc', 'genename': 'pannzer_name'}
    )
    # Keep only the first description row listed for each gene.
    pannzer_desc = pannzer_desc[['gene_id', 'pannzer_desc', 'pannzer_name']].drop_duplicates('gene_id')

    pannzer_GO = pd.read_csv(pannzer_GO_file, sep="\t")
    top_ranked = pannzer_GO[pannzer_GO['ARGOT_rank'] == 1]
    # One two-column table per ontology, built from the rows with ARGOT_rank == 1.
    go_tables = []
    for ontology, column in (('MF', 'molecular_function'),
                             ('BP', 'biological_process'),
                             ('CC', 'cellular_component')):
        subset = top_ranked.loc[top_ranked['ontology'] == ontology, ['qpid', 'desc']]
        go_tables.append(subset.rename(columns={'qpid': 'gene_id', 'desc': column}))

    annotation_list = [diamond_annotations, pannzer_desc] + go_tables
    # Successive outer joins keep genes that appear in only some of the inputs.
    return reduce(
        lambda left, right: pd.merge(left, right, on='gene_id', how='outer'),
        annotation_list,
    )

#### Combine annotations for each species

In [6]:
# Write one combined annotation table per DIAMOND file found above.
for diamond_file in diamond_files:
    # The species code is the file name up to its first ".".
    file_name = diamond_file.rsplit("/", 1)[-1]
    species = file_name.partition(".")[0]
    print("combine annotations for species", species)
    annotations_combined = combine_annotations(diamond_file, species)
    output_path = species + "_annotations_combined.tsv"
    annotations_combined.to_csv(output_path, sep="\t", index=False)

combine annotations for species aaur
combine annotations for species nvec
combine annotations for species aten
combine annotations for species dgig
combine annotations for species spis
combine annotations for species adig
combine annotations for species hvul
combine annotations for species amil
combine annotations for species hsym
combine annotations for species mvir
combine annotations for species chem
combine annotations for species hech
combine annotations for species hvu2
combine annotations for species pdam
combine annotations for species epal
combine annotations for species ofav
