In [7]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os
from src.tools import Prodigal 
from src.files import FASTAFile
from src.reference import Reference


In [2]:
# How should I select the genomes for model training?
# Assembly accession -> display name, listed in the order the genomes are processed.
names = {
    'GCF_000005845.2': 'E. coli K-12',
    'GCF_000009045.1': 'B. subtilis 168',
    'GCF_000006765.1': 'P. aeruginosa',
    'GCF_000195955.2': 'M. tuberculosis',
}

# Dicts preserve insertion order, so this is the same ordering as the mapping above.
genome_ids = list(names)

In [4]:
prodigal = Prodigal()
for genome_id in genome_ids:
    # Skip genomes whose Prodigal output file already exists on disk.
    output_path = f'../data/prodigal/{genome_id}_protein.faa'
    if os.path.exists(output_path):
        continue
    prodigal.run(f'../data/ncbi/genomes/{genome_id}_genomic.fna', output_path)

In [6]:
# Want to get the number of proteins in the NCBI reference.
# Both dictionaries are keyed by genome ID; one FASTAFile per genome and per source.
prodigal_fasta_files = dict(
    (accession, FASTAFile(path=f'../data/prodigal/{accession}_protein.faa'))
    for accession in genome_ids
)
ncbi_fasta_files = dict(
    (accession, FASTAFile(path=f'../data/ncbi/proteins/{accession}_protein.faa'))
    for accession in genome_ids
)

In [24]:
# Inspect the hits of a single genome. The accession is pinned explicitly instead of
# being read from a loop variable left behind by another cell, so this cell produces
# the same result regardless of which cells ran before it.
inspected_genome_id = 'GCF_000195955.2' # M. tuberculosis
all_hits_df = Reference.load(f'../data/compare/{inspected_genome_id}_all_hits.csv')
top_hits_df = Reference.load(f'../data/compare/{inspected_genome_id}_top_hits.csv')

# Queries categorized as 'conflict' that have more than one hit.
ids = top_hits_df[(top_hits_df.category == 'conflict') & (top_hits_df.n_hits > 1)].index
# Of those, the queries with at least one in-frame hit. De-duplicated so that a query
# with several in-frame hits is listed (and later displayed) only once.
ids_to_inspect = all_hits_df[all_hits_df.query_id.isin(ids) & all_hits_df.in_frame].query_id.drop_duplicates()

In [25]:
# Display the top-hit rows for the query IDs collected in ids_to_inspect.
top_hits_df.loc[ids_to_inspect]

Unnamed: 0_level_0,n_hits,n_hits_same_strand,n_hits_opposite_strand,n_hits_in_frame,top_hit_feature,top_hit_contig_id,top_hit_product,top_hit_note,top_hit_pseudo,top_hit_locus_tag,...,overlap_stop,overlap_length,top_hit_overlap_fraction,query_overlap_fraction,exact_match,in_frame,in_frame_c_terminus,in_frame_n_terminus,category,sequence_identity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NC_000962.3_104,2,2,0,1,repeat_region,NC_000962,none,"REP-2, len: 1503 nt. REP251, member of REP13E1...",False,none,...,104669,957,0.636727,0.996875,False,False,False,False,conflict,0.0
NC_000962.3_3531,3,3,0,1,mobile_element,NC_000962,none,"IS1560-2, len: 1568 nt. Possible Insertion seq...",False,none,...,3800796,780,0.497449,1.0,False,False,False,False,conflict,0.0
NC_000962.3_3616,3,3,0,1,repeat_region,NC_000962,none,This region is a possible MT-complex-specific ...,False,none,...,3884917,960,0.699708,1.0,False,False,False,False,conflict,0.0


In [27]:
# subject_protein_id of every hit whose query is among the IDs under inspection.
hits_to_inspect_mask = all_hits_df.query_id.isin(ids_to_inspect)
all_hits_df.loc[hits_to_inspect_mask, ['subject_protein_id']]

Unnamed: 0,subject_protein_id
143,NP_214608.1
144,none
5173,none
5174,NP_217903.1
5175,NP_217904.1
5289,NP_217983.1
5290,none
5291,NP_217984.1


In [11]:
# Load the top-hits table of every genome, tag each with its genome ID, stack them,
# and attach the display name looked up from `names`.
top_hits_frames = [
    Reference.load(f'../data/compare/{accession}_top_hits.csv').assign(genome_id=accession)
    for accession in genome_ids
]
compare_df = pd.concat(top_hits_frames)
compare_df = compare_df.assign(name=compare_df.genome_id.map(names))

In [16]:
def table_1(compare_df:pd.DataFrame, prodigal_fasta_files:dict, ncbi_fasta_files:dict, path:str=None):
    '''Build a per-genome summary of how the Prodigal predictions compare to the NCBI reference.

    :param compare_df: Comparison results with the columns genome_id, name, category and exact_match.
    :param prodigal_fasta_files: Maps genome ID to a sized collection of Prodigal sequences; only len() is used.
    :param ncbi_fasta_files: Maps genome ID to a sized collection of NCBI sequences; only len() is used.
    :param path: If given, the table is additionally written to this location as a CSV file.
    :return: A DataFrame indexed by genome name (in order of first appearance in compare_df) with the
        columns ncbi_n_cds, prodigal_n_cds, percent_matches and percent_exact_matches. The two percent_*
        columns hold fractions between 0 and 1 (float dtype), not values out of 100.
    '''
    # Resolve genome IDs to display names from compare_df itself, so the function does not
    # depend on a notebook-level global.
    id_name_pairs = compare_df[['genome_id', 'name']].drop_duplicates()
    name_by_genome_id = dict(zip(id_name_pairs['genome_id'], id_name_pairs['name']))

    def count_sequences(fasta_files:dict) -> pd.Series:
        # Genome IDs with no known name keep their ID as label, and therefore drop out
        # when the Series is aligned to the table index below.
        return pd.Series({name_by_genome_id.get(genome_id, genome_id):len(fasta_file) for genome_id, fasta_file in fasta_files.items()})

    table_df = pd.DataFrame(index=compare_df['name'].unique())
    table_df['ncbi_n_cds'] = count_sequences(ncbi_fasta_files)
    table_df['prodigal_n_cds'] = count_sequences(prodigal_fasta_files)

    # Work on positional copies of the needed columns so that repeated index labels in
    # compare_df (it may be a concatenation of several tables) cannot affect the grouping.
    flags_df = pd.DataFrame({
        'name':compare_df['name'].to_numpy(),
        'is_match':(compare_df['category'] == 'match').to_numpy(),
        'is_exact_match':compare_df['exact_match'].to_numpy()})
    grouped = flags_df.groupby('name', sort=False)
    group_sizes = grouped.size()
    # sum / group size (rather than mean), so every row of a group counts in the denominator.
    table_df['percent_matches'] = (grouped['is_match'].sum() / group_sizes).astype(float)
    table_df['percent_exact_matches'] = (grouped['is_exact_match'].sum() / group_sizes).astype(float)

    if path is not None:
        table_df.to_csv(path)
    return table_df

# Keyword arguments make it explicit which FASTA dictionary is which.
table_1(
    compare_df=compare_df,
    prodigal_fasta_files=prodigal_fasta_files,
    ncbi_fasta_files=ncbi_fasta_files,
)

Unnamed: 0,ncbi_n_cds,prodigal_n_cds,percent_matches,percent_exact_matches
E. coli K-12,4298,4319,0.953461,0.900208
B. subtilis 168,4237,4226,0.977993,0.88973
P. aeruginosa,5572,5681,0.977997,0.896849
M. tuberculosis,3906,4085,0.92754,0.717013


In [None]:
# What statistics do I care about?
# (1) Total number of genes in reference. 
# (2) Total number of predicted genes. 
# (3) Total number of matched coding sequences (not including pseudogenes).
# (4) Number of exact matches. 
# (5) Number of genes with boundary errors (characterize the nature of the boundary errors later on).
# (6) The false discovery rate. 
# (7) The number of missed genes. 