In [1]:
import cobra
import os
import glob
import pandas as pd

# Matplotlib defaults
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.ticker as mticker
matplotlib.rcParams["savefig.dpi"] = 300
matplotlib.rcParams['figure.dpi'] = 300
#Rivanna fonts should include sans-serif now, but this could still raise an error
matplotlib.rcParams['font.sans-serif'] = "Arial"
matplotlib.rcParams['font.family'] = "sans-serif"

SMALLER_SIZE = 8
SMALL_SIZE = 14
MEDIUM_SIZE = 16
BIGGER_SIZE = 16

matplotlib.rc('font', size=SMALL_SIZE)          # controls default text sizes
matplotlib.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
matplotlib.rc('axes', labelsize=SMALL_SIZE)    # fontsize of the x and y labels
matplotlib.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
matplotlib.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
matplotlib.rc('legend', fontsize=SMALLER_SIZE)    # legend fontsize
matplotlib.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [2]:
model_dict_json = dict()
path = "/home/mac9jc/paradigm/models/"
os.chdir(path)

for filename in glob.glob(os.path.join(path, 'final_denovo_*.json')):
    key = filename.split('/')[len(filename.split('/'))-1]
    key = key[:-5]
    key = key[13:]
    model_dict_json[key] = cobra.io.load_json_model(filename)
    
model_dict_gf = dict()
        
for filename in glob.glob(os.path.join(path, 'gf_*.xml')):
    key = filename.split('/')[len(filename.split('/'))-1]
    key = key[:-4]
    key = key[3:]
    if key.startswith('P'):
        key = key #GOOD
    else: key = 'NA'
    if key != 'NA':
        model_dict_gf[key] = cobra.io.read_sbml_model(filename)
        
del model_dict_json['TgondiiRH'] #del model_dict_gf['TgondiiRH']
del model_dict_json['GintestinalisAssemblageAWB'] #del model_dict_gf['GintestinalisAssemblageAWB']
del model_dict_json['Pvivax-likePvl01']

In [24]:
def rename_organisms(file_o_genes):
    
    #model_dict_gf['LmajorSD75.1'] = model_dict_gf.pop('LmajorSD75')
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('. ','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace(' strain ','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('subsp','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace(' ','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace(':','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('-','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('/','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('isolate','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('strain','', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('LdonovaniCLSL','LdonovaniCL-SL', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('EhistolyticaHM1IMSSA','EhistolyticaHM1IMSS-A', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('AcastellaniistrNeff','AcastellaniiNeff', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('TcruziCLBrenerNonEsmeraldolike','TcruziCLBrenerNon-Esmeraldo-like', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('NcaninumLiverpool','NcaninumLIV', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('PknowlesiMalayanStrainPk1A','PknowlesiMalayanPk1A', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('TgondiiGAB22007GALDOM2','TgondiiGAB2-2007-GAL-DOM2', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('EcuniculiEcunIIIL','EcuniculiEcunIII-L', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('EhistolyticaHM1IMSSB','EhistolyticaHM1IMSS-B', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('HhammondiH.H.34','HhammondiHH34', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('TcruziSylvioX1012012','TcruziSylvioX10-1-2012', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('BayalaiB08376','BayalaiB08-376', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('TcruziSylvioX101','TcruziSylvioX10-1', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('TcruziCLBrenerEsmeraldolike','TcruziCLBrenerEsmeraldo-like', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('GAssemblageBGS','GintestinalisAssemblageBGS', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('GAssemblageBGS_B','GintestinalisAssemblageBGS_B', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('GAssemblageEP15','GintestinalisAssemblageEP15', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('TbruceibruceiTREU927','TbruceiTREU927', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('PrelictumSGS1like','PrelictumSGS1-like', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('GAssemblageA2DH','GintestinalisAssemblageADH', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('TbruceiLister4272018','TbruceiLister427_2018', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('PvivaxlikePvl01','Pvivax-likePvl01', regex=False)
    file_o_genes['Organism'] = file_o_genes['Organism'].str.replace('Pfragilenilgiri','PfragileNilgiri', regex=False)
    #gene_file['Organism'] = gene_file['Organism'].str.replace('','TgondiiRH', regex=False)
    #gene_file['Organism'] = gene_file['Organism'].str.replace('','GintestinalisAssemblageAWB', regex=False)
    
    return(file_o_genes)

def id_genes_not_in_model(model_dict, file_o_genes, model_genes_denovo_input, model_genes_gf_input):
    
    # model_dict: dictionary of models with keys being the species ID and values are filled with the model
    # the input for this script will typically be the de novo models

    gene_in_model = dict()
    gene_in_model_gf = dict()

    for species, model in model_dict.items():

        #subset gene_file for species
        temp_gene_file = file_o_genes[file_o_genes['Organism'].str.lower() == species.lower()]

        #preallocate - DE NOVO MODEL
        species_gene_not_in_model = dict()
        model_version = 'de novo'

        #assess genes - DE NOVO MODEL
        for row in temp_gene_file.iterrows():
            row_use = row[1]
            gene = row_use['Gene ID']
            if gene not in model_genes_denovo_input[species]:
                EC = row_use['EC numbers']
                name = row_use['Product Description']
                GO2 = row_use['Curated GO Functions']
                GO1 = row_use['Computed GO Functions']
                present = False
            else: 
                present = True# gene in model
            gene_in_model[gene] = {'species':species,'model_version':model_version,'EC':EC, 'name':name,'GO, curated':GO1,'GO, computed':GO2, 'present':present}

        #preallocate - GAPFILLED MODEL
        species_gene_not_in_model = dict()
        species_count_with_EC = 0
        model_version = 'gf'

        #get other model
        if species in model_dict_gf.keys():
            #assess genes - GAPFILLED MODEL
            for row in temp_gene_file.iterrows():
                row_use = row[1]
                gene = row_use['Gene ID']
                if gene not in model_genes_gf_input[species]:
                    EC = row_use['EC numbers']
                    name = row_use['Product Description']
                    GO2 = row_use['Curated GO Functions']
                    GO1 = row_use['Computed GO Functions']
                    present = False
                else: 
                    present = True # gene in model
                gene_in_model_gf[gene] = {'species':species,'model_version':model_version,'EC':EC, 'name':name,'GO, curated':GO1,'GO, computed':GO2, 'present':present}

    output1 = pd.DataFrame.from_dict(gene_in_model, orient='index')
    output1 = output1.rename(columns = {'index':'gene'})
    output2 = pd.DataFrame.from_dict(gene_in_model_gf, orient='index')
    output2 = output2.rename(columns = {'index':'gene'})
    output = output1.append(output2, ignore_index=False)
    
    return(output)


def get_gene_options(model_input):
    
    model_genes = list()
    
    for g in model_input.genes:
        gene_id = g.id
    
        if gene_id[0:4] == 'rna_': gene_id = gene_id.replace('rna_', '')
            
        if gene_id[0:6] == 'mRNA1_': gene_id = gene_id.replace('mRNA1_', '')
            
        if '.?-p1' in gene_id: gene_id = gene_id.split('.?-p1')[0]
        
        if ':' in gene_id: gene_id = gene_id.split(':')[0]
            
        if "-t" in gene_id: gene_id = gene_id.split('-t')[0]
            
        if "." in gene_id: gene_id = gene_id.split('.')[0]
            
        if "-RA-p1" in gene_id: gene_id = gene_id.split('-RA-p1')[0]
            
        if "-1-p1" in gene_id: gene_id = gene_id.split('-1-p1')[0]
            
        if "-T1-p1" in gene_id: gene_id = gene_id.split('-T1-p1')[0]
                 
        if "-p1" in gene_id: gene_id = gene_id.split('-p1')[0]
    
        model_genes.append(gene_id)
    #if gene_id == g.id: print(gene_id)
                
    return(model_genes)





In [52]:
def summarize_df(df_input, model_dict, gene_file_input, model_dict_gf_input, model_genes_denovo_dict_list, model_genes_gf_dict_list):
    
    df_denovo = df_input[df_input['model_version'] == 'de novo']
    df_gf = df_input[df_input['model_version'] == 'gf']
    
    df_denovo_present = df_denovo[df_denovo['present'] == True]
    df_denovo_absent = df_denovo[df_denovo['present'] == False]
    
    df_gf_present = df_gf[df_gf['present'] == True]
    df_gf_absent = df_gf[df_gf['present'] == False]
    
    dict_genes = dict()
    
    for species in model_dict.keys():
        
        
        # All EuPathDB genes
        EuPathDB_genes = set(gene_file_input[gene_file_input['Organism'] == species]['Gene ID'])
        # EuPathDB genes that WERE in de novo model
        EuPathDB_genes_in_denovo = set(df_denovo_present[df_denovo_present['species'] == species].index)
        # EuPathDB genes that WERE NOT in de novo model
        EuPathDB_genes_NOT_in_denovo = set(df_denovo_absent[df_denovo_absent['species'] == species].index)

        # All de novo genes
        denovo_genes = set(model_genes_denovo_dict_list[species])
        # De novo genes on EuPathDB (same list as EuPathDB_genes_in_denovo)
        denovo_on_EuPathDB = denovo_genes.intersection(EuPathDB_genes)
        # De novo genes NOT on EuPathDB
        unique_to_denovo = denovo_genes.difference(EuPathDB_genes)

        # check lists
        if len(denovo_on_EuPathDB.difference(EuPathDB_genes_in_denovo)) > 0 or len(EuPathDB_genes_in_denovo.difference(denovo_on_EuPathDB)) >0:
            print('error in de novo genes on EuPathDB: ',species)

        if len(EuPathDB_genes_in_denovo) + len(EuPathDB_genes_NOT_in_denovo) == len(EuPathDB_genes): a = 0
        else: print('error in EuPathDB split: ',species)

        if len(unique_to_denovo) + len(denovo_on_EuPathDB) == len(denovo_genes): a = 0
        else: print('error in de novo split: ',species)
        
        if species not in model_dict_gf_input.keys(): 
            N_EuPathDB_genes_in_gf = 'NaN'
            N_EuPathDB_genes_NOT_in_gf = 'NaN'
            N_gf_genes = 'NaN'
            N_gf_on_EuPathDB = 'NaN' # same as EuPathDB_in_gf
            N_unique_to_gf = 'NaN'
        else:
            # EuPathDB genes that WERE in de novo model
            EuPathDB_genes_in_gf = set(df_gf_present[df_gf_present['species'] == species].index)
            # EuPathDB genes that WERE NOT in de novo model
            EuPathDB_genes_NOT_in_gf = set(df_gf_absent[df_gf_absent['species'] == species].index)
            
            # All gf genes
            gf_genes = (set(model_genes_gf_dict_list[species]))
            # gf genes on EuPathDB (same list as EuPathDB_genes_in_gf)
            gf_on_EuPathDB = set(gf_genes.intersection(EuPathDB_genes)) #### HERE
            # gf genes NOT on EuPathDB
            unique_to_gf = set(gf_genes.difference(EuPathDB_genes))
            
            # check lists # ISSUE WITH FIRST TWO
            if len(gf_on_EuPathDB.difference(EuPathDB_genes_in_gf)) > 0 or len(EuPathDB_genes_in_gf.difference(gf_on_EuPathDB)) >0:
                print('gf - error in gf genes on EuPathDB: ',species) # PLASMODIUM ONLY

            if len(EuPathDB_genes_in_gf) + len(EuPathDB_genes_NOT_in_gf) == len(EuPathDB_genes): a = 0
            else: print('gf - error in EuPathDB split: ',species)

            if len(unique_to_gf) + len(gf_on_EuPathDB) == len(gf_genes): a = 0
            else: print('gf - error in de novo split: ',species)
                
            N_EuPathDB_genes_in_gf = len(EuPathDB_genes_in_gf)
            N_EuPathDB_genes_NOT_in_gf = len(EuPathDB_genes_NOT_in_gf)
            N_gf_genes = len(gf_genes)
            N_gf_on_EuPathDB = len(gf_on_EuPathDB)
            N_unique_to_gf = len(unique_to_gf)
            

        dict_genes[species] = {'total EuPathDB genes' : len(EuPathDB_genes),
                               'total de novo genes' : len(denovo_genes),
                               'total semi-curated genes': N_gf_genes,
                               '# in de novo' : len(denovo_on_EuPathDB),
                              '# in semi-curated' : N_gf_on_EuPathDB,
                               '# not in de novo' : len(EuPathDB_genes_NOT_in_denovo),
                              '# not in semi-curated' : N_EuPathDB_genes_NOT_in_gf,
                              '# only in de novo' : len(unique_to_denovo), 
                              '# only in semi-curated' : N_unique_to_gf,
                              
                              'EuPathDB genes' : EuPathDB_genes,
                               'de novo genes' : denovo_genes,
                               'in de novo' : denovo_on_EuPathDB,
                               'not in de novo' : EuPathDB_genes_NOT_in_denovo,
                              'only in de novo' : unique_to_denovo} 
        
    df_use = pd.DataFrame.from_dict(dict_genes, orient='index')

    return(df_use)

In [7]:
model_genes_denovo = dict()
model_genes_gf = dict()
for species, model in model_dict_json.items():
    model_genes_denovo[species] = get_gene_options(model)
for species, model in model_dict_gf.items():
    model_genes_gf[species] = get_gene_options(model)

In [5]:
gene_file = pd.read_csv('/home/mac9jc/paradigm/data/EuPathDB_GOTerms_metabolic_process.txt', sep="\t")
gene_file = rename_organisms(gene_file)

In [25]:
df = id_genes_not_in_model(model_dict_json, gene_file, model_genes_denovo, model_genes_gf)
df.head()

Unnamed: 0,species,model_version,EC,name,"GO, curated","GO, computed",present
13J3.01,TbruceiLister427,de novo,,"RNA polymerase III pseudogene, possible",DNA binding;DNA-directed 5'-3' RNA polymerase ...,,False
A0H76_1008,Heriocheircanceri,de novo,,hypothetical protein,DNA binding,,False
A0H76_1010,Heriocheircanceri,de novo,,CLPB,ATP binding,,False
A0H76_1016,Heriocheircanceri,de novo,,PRI2,DNA primase activity,,False
A0H76_102,Heriocheircanceri,de novo,,RS28,structural constituent of ribosome,,False


In [53]:
data_all = summarize_df(df, model_dict_json,  gene_file, model_dict_gf, model_genes_denovo, model_genes_gf)

In [66]:

print(sum(~(data_all['# not in de novo'] + data_all['# in de novo'] == data_all['total EuPathDB genes'])))
print(sum(~(data_all['# only in de novo'] + data_all['# in de novo'] == data_all['total de novo genes'])))
data_all_sub = data_all[~(data_all['# only in semi-curated'] == 'NaN')] 
print(sum(~(data_all_sub['# only in semi-curated'] + data_all_sub['# in semi-curated'] == data_all_sub['total semi-curated genes'])))

0
0
0


In [65]:
data_all_sub.shape
data_all.shape

(189, 14)

In [None]:
gene_file[gene_file['Organism'] == species].head() # source_id might not need converstion

In [None]:
# for text in response to reviewers
temp = df[(df['model_version'] =='de novo') & (df['species'] =='TgondiiME49')]
print(temp.loc['TGME49_321660']['GO, curated'])
print(temp.shape)
#temp
temp[pd.notna(temp['EC'])]

In [None]:
# percent of genes that have GO terms and EC and are not in model
df_denovo = df[df['model_version'] =='de novo']
df_gf = df[df['model_version'] =='gf']
dict_genes_temp = dict()
dict_genes = dict()
for species in model_dict_json.keys():
    species_df = gene_file[gene_file['Organism'] == species]
    N_EuPathDB_genes_withoutEC = species_df[species_df['EC numbers'] != species_df['EC numbers']].shape[0]
    percent_EuPathDB_genes_withoutEC = 100* N_EuPathDB_genes_withoutEC/species_df.shape[0]
    species_denovo_df = df_denovo[df_denovo['species'] == species]
    N_missing_from_denovo_withoutEC = species_denovo_df[species_denovo_df['EC'] != species_denovo_df['EC']].shape[0]
    percent_missing_from_denovo_withoutEC = 100* N_missing_from_denovo_withoutEC/species_denovo_df.shape[0]
    species_gf_df = df_gf[df_gf['species'] == species]
    N_missing_from_gf_withoutEC = species_gf_df[species_gf_df['EC'] != species_gf_df['EC']].shape[0]
    if species not in model_dict_gf.keys(): 
        N_missing_from_gf = 'NaN'
        percent_missing_from_gf_withoutEC = 'NaN'
    else:
        percent_missing_from_gf_withoutEC = 100* N_missing_from_gf_withoutEC/species_gf_df.shape[0]
    dict_genes[species] = {'all annotations' : percent_EuPathDB_genes_withoutEC,
                           'omitted from\nde novo' : percent_missing_from_denovo_withoutEC,
                          'omitted from\nsemi-curated' : percent_missing_from_gf_withoutEC}
    dict_genes_temp[species] = {'all annotations' : N_EuPathDB_genes_withoutEC,
                           'omitted from\nde novo' : N_missing_from_denovo_withoutEC,
                          'omitted from\nsemi-curated' : N_missing_from_gf_withoutEC}
    
EC_data = pd.DataFrame.from_dict(dict_genes, orient='index')
EC_data.head()


In [None]:

GO_gene_IN_model = list()

for species, model in model_dict_json.items():
     
    #subset gene_file for species
    temp_gene_file = gene_file[gene_file['Organism'].str.lower() == species.lower()]
    EuPathDB_list = temp_gene_file['Gene ID'].tolist()
    
    #assess genes - DE NOVO MODEL
    model_version = 'de novo'
    for g in model.genes:
        gene = g.id
        gene_options = [gene.split('-')[0], gene.split('.')[0], gene.split(':')[0], gene.split('A-')[0],
                       gene.split('.1-p1')[0], gene.split('.mRNA')[0]]
        if 'rna_' in gene[0:4]:
            gene_options = gene_options + [gene.split('rna_')[1].split('-')[0]]
        if any(item in EuPathDB_list for item in gene_options):
            GO_gene_IN_model.append({'species':species,'gene':g.id,
                                  'model_version':model_version,'present?':'present'})
        else:
            GO_gene_IN_model.append({'species':species,'gene':g.id,
                                  'model_version':model_version,'present?':'absent'})
    
    #assess genes - GAPFILLED (SEMI-CURATED) MODEL
    model_version = 'gf'
    if species in model_dict_gf.keys():
        model_gf = model_dict_gf[species]
        
        #assess genes - GAPFILLED MODEL
        for g in model.genes:
            gene = g.id
            gene_options = [gene.split('-')[0], gene.split('.')[0], gene.split(':')[0], gene.split('A-')[0],
                           gene.split('.1-p1')[0], gene.split('.mRNA')[0]]
            if 'rna_' in gene[0:4]:
                gene_options = gene_options + [gene.split('rna_')[1].split('-')[0]]
            if any(item in EuPathDB_list for item in gene_options):
                GO_gene_IN_model.append({'species':species,'gene':g.id,
                                      'model_version':model_version,
                                      'present?':'present'})
            else:
                GO_gene_IN_model.append({'species':species,'gene':g.id,
                                      'model_version':model_version,
                                      'present?':'absent'})
            
    
df = pd.DataFrame(GO_gene_IN_model)

In [None]:
print(len([g.id for g in model_dict_json['AalgeraePRA109'].genes]))
print(len(list(set([g.id for g in model_dict_json['AalgeraePRA109'].genes]))))

In [None]:
df[(df['model_version'] =='de novo') & (df['species'] =='AalgeraePRA109') & (df['present?'] == 'present')].shape[0] + df[(df['model_version'] =='de novo') & (df['species'] =='AalgeraePRA109') & (df['present?'] == 'absent')].shape[0] 

In [None]:
data_all.head()

In [None]:
 
df_denovo = df[df['model_version'] == 'de novo']
df_gf = df[df['model_version'] == 'gf']

df_ab_denovo = df_denovo[df_denovo['present?'] =='absent']
df_pres_denovo = df_denovo[df_denovo['present?'] =='present']

df_ab_gf = df_gf[df_gf['present?'] =='absent']
df_pres_gf = df_gf[df_gf['present?'] =='present']

dict_genes = dict()
for species in model_dict_json.keys():
    
    N_EuPathDB_genes_in_model_denovo = df_pres_denovo[df_pres_denovo['species'] == species].shape[0]
    N_EuPathDB_genes_in_model_gf = df_pres_gf[df_pres_gf['species'] == species].shape[0]
    N_genes_in_model_not_on_EuPathDB_denovo = df_ab_denovo[df_ab_denovo['species'] == species].shape[0]
    N_genes_in_model_not_on_EuPathDB_gf = df_ab_gf[df_ab_gf['species'] == species].shape[0]
    
    if species not in model_dict_gf.keys(): 
        N_EuPathDB_genes_in_model_gf = 'NaN' 
        N_genes_in_model_not_on_EuPathDB_gf = 'NaN'
        
    dict_genes[species] = {'absent from\nEuPathDB\n(de novo)':N_genes_in_model_not_on_EuPathDB_denovo,
                          'absent from\nEuPathDB\n(semi-curated)':N_genes_in_model_not_on_EuPathDB_gf,
                          'on EuPathDB\n(de novo)':N_EuPathDB_genes_in_model_denovo,
                          'on EuPathDB\n(semi-curated)':N_EuPathDB_genes_in_model_gf}
    
data_opp = pd.DataFrame.from_dict(dict_genes, orient='index')
data_opp.head()

In [None]:
gene_file = pd.read_csv('/home/mac9jc/paradigm/data/EuPathDB_GOTerms_amino_acid.txt', sep="\t")
gene_file = rename_organisms(gene_file)
[df, genes_in_model_AA] = id_genes_not_in_model(model_dict_json, gene_file, model_genes_denovo, model_genes_gf)
data_amino = summarize_df(df, model_dict_json)
data_amino.head()

In [None]:
gene_file = pd.read_csv('/home/mac9jc/paradigm/data/EuPathDB_GOTerm_nucleotide.txt', sep="\t")
gene_file = rename_organisms(gene_file)
[df, genes_in_model_nts] = id_genes_not_in_model(model_dict_json, gene_file, model_genes_denovo, model_genes_gf)
data_nucleotides = summarize_df(df, model_dict_json)
data_nucleotides.head()

In [None]:
gene_file = pd.read_csv('/home/mac9jc/paradigm/data/EuPathDB_GOTerm_transport.txt', sep="\t")
gene_file = rename_organisms(gene_file)
[df, genes_in_model_trans] = id_genes_not_in_model(model_dict_json, gene_file, model_genes_denovo, model_genes_gf)
data_transporters = summarize_df(df, model_dict_json)
data_transporters.head()

In [None]:
colors = ['white','grey','grey']

# all annotations
fig,ax = plt.subplots()
x1 = data_all['EuPathDB genes'].tolist()
x2 = data_all['# missing\nfrom de novo'].tolist()
x3 = [x for x in data_all['# missing\nfrom semi-curated'].tolist() if x != 'NaN']
bplot = ax.boxplot([x1,x2,x3],
                   showfliers=False, patch_artist=True,
                   labels = data_all.columns.tolist(),
                   medianprops = dict(color="black",linewidth=1.5)) 
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_edgecolor("black")
ax.set_title('All metabolic annotations')
ax.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(bottom = 0.35, right = 0.5)
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_comparison_all_annotations.svg")
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_comparison_all_annotations.png")

# amino acid annotations 
fig,ax = plt.subplots()
x1 = data_amino['EuPathDB genes'].tolist()
x2 = data_amino['# missing\nfrom de novo'].tolist()
x3 = [x for x in data_amino['# missing\nfrom semi-curated'].tolist() if x != 'NaN']
bplot = ax.boxplot([x1,x2,x3],
                   showfliers=False, patch_artist=True,
                   labels = data_all.columns.tolist(),
                   medianprops = dict(color="black",linewidth=1.5)) 
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_edgecolor("black")
ax.set_title('Amino acid annotations')
ax.set_xticklabels(rotation=-25, ha='left', labels = ['Total EuPathDB genes','missing from de novo','missing from semi-curated'])
plt.subplots_adjust(bottom = 0.35, right = 0.5)
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_comparison_AA_annotations.svg")
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_comparison_AA_annotations.png")

# nucleotide annotations 
fig,ax = plt.subplots()
x1 = data_nucleotides['EuPathDB genes'].tolist()
x2 = data_nucleotides['# missing\nfrom de novo'].tolist()
x3 = [x for x in data_nucleotides['# missing\nfrom semi-curated'].tolist() if x != 'NaN']
bplot = ax.boxplot([x1,x2,x3],
                   showfliers=False, patch_artist=True,
                   labels = data_all.columns.tolist(),
                   medianprops = dict(color="black",linewidth=1.5)) 
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_edgecolor("black")
ax.set_title('Nucleotide annotations')
ax.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(bottom = 0.35, right = 0.5)
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_comparison_NTs_annotations.svg")
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_comparison_NTs_annotations.png")

# transporter annotations 
fig,ax = plt.subplots()
x1 = data_transporters['EuPathDB genes'].tolist()
x2 = data_transporters['# missing\nfrom de novo'].tolist()
x3 = [x for x in data_transporters['# missing\nfrom semi-curated'].tolist() if x != 'NaN']
bplot = ax.boxplot([x1,x2,x3],
                   showfliers=False, patch_artist=True,
                   labels = data_all.columns.tolist(),
                   medianprops = dict(color="black",linewidth=1.5)) 
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_edgecolor("black")
ax.set_title('Transporter annotations')
ax.set_xticklabels(rotation=-25, ha='left', labels = ['Total EuPathDB genes','missing from de novo','missing from semi-curated'])
plt.subplots_adjust(bottom = 0.35, right = 0.5)
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_comparison_transporter_annotations.svg")
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_comparison_transporter_annotations.png")


In [None]:
fig,ax = plt.subplots()
x1 = EC_data['all annotations'].tolist()
x2 = EC_data['omitted from\nde novo'].tolist()
x3 = [x for x in EC_data['omitted from\nsemi-curated'].tolist() if x != 'NaN']
bplot = ax.boxplot([x1,x2,x3],
                   showfliers=False, patch_artist=True,
                   labels = data_all.columns.tolist(),
                   medianprops = dict(color="black",linewidth=1.5)) 
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)
colors = ['white','grey','grey']
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_edgecolor("black")
ax.set_title('% genes without EC number')
ax.set_xticklabels(rotation=-25, ha='left', labels = ['Total EuPathDB genes','missing from de novo','missing from semi-curated'])
plt.subplots_adjust(bottom = 0.35, right = 0.5)
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_EC_eval.svg")
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/EuPathDB_EC_eval.png")



In [None]:
fig,ax = plt.subplots()
x1 = data_opp['absent from\nEuPathDB\n(de novo)'].tolist()
x2 = [x for x in data_opp['on EuPathDB\n(de novo)'].tolist() if x == x]
x3 = [x for x in data_opp['absent from\nEuPathDB\n(semi-curated)'].tolist() if x != 'NaN']
x4 = [x for x in data_opp['on EuPathDB\n(semi-curated)'].tolist() if x != 'NaN']
bplot = ax.boxplot([x1,x2,x3,x4],
                   showfliers=False, patch_artist=True,
                   labels = data_opp.columns.tolist(),
                   medianprops = dict(color="black",linewidth=1.5))  
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)
colors = ['white','grey','white','grey']
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_edgecolor("black")
ax.set_title('Genes per model:')
ax.set_xticklabels(rotation=-25, ha='left', labels = ['not on EuPathDB (de novo)','on EuPathDB (de novo)','not on EuPathDB (semi-curated)','on EuPathDB (semi-curated)'])
plt.subplots_adjust(bottom = 0.35, right = 0.5)
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/Diamond_not_in_EuPathDB.svg")
fig.savefig("/home/mac9jc/paradigm/data/results/figures_for_reviewers/Diamond_not_in_EuPathDB.png")

