In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os
from src.tools import Prodigal 
from src.files import FASTAFile
from src.reference import Reference, compare

%load_ext autoreload 
%autoreload 2


In [3]:
# How should I select the genomes for model training?
# Display name for every genome currently selected, keyed by its NCBI genome ID.
names = {
    'GCF_000005845.2': 'E. coli K-12',
    'GCF_000009045.1': 'B. subtilis 168',
    'GCF_000006765.1': 'P. aeruginosa',
    'GCF_000195955.2': 'M. tuberculosis',
}

# The genome IDs in the order they are declared above; this list drives every per-genome loop below.
genome_ids = list(names)

In [9]:
prodigal = Prodigal()

# Phase 1: run Prodigal only for genomes whose predicted-protein FASTA is not already on disk.
query_paths = [f'../data/prodigal/{genome_id}_protein.faa' for genome_id in genome_ids]
for genome_id, query_path in zip(genome_ids, query_paths):
    if os.path.exists(query_path):
        continue # Cached from a previous run.
    prodigal.run(f'../data/ncbi/genomes/{genome_id}_genomic.fna', query_path)

# Phase 2: compare each set of predictions against the matching GenBank reference.
# overwrite=True is passed on every run, so the comparison is requested afresh each time.
reference_paths = [f'../data/ncbi/gbffs/{genome_id}_genomic.gbff' for genome_id in genome_ids]
for query_path, reference_path in zip(query_paths, reference_paths):
    compare(query_path, reference_path, overwrite=True)

ReferenceAnnotator._check: 100%|██████████| 234/234 [00:00<00:00, 824.90it/s]
ReferenceAnnotator._check: 100%|██████████| 10/10 [00:00<00:00, 1050.41it/s]


ReferenceAnnotator._check: Downgraded 0 "match" sequences to "intergenic" or "conflict".
ReferenceAnnotator._check: Upgraded 0 "intergenic" or "conflict" sequences to "match.
compare: Reference comparison complete. Results written to ../data/compare


ReferenceAnnotator._check: 100%|██████████| 388/388 [00:00<00:00, 543.16it/s]
ReferenceAnnotator._check: 100%|██████████| 11/11 [00:00<00:00, 2709.02it/s]


ReferenceAnnotator._check: Downgraded 0 "match" sequences to "intergenic" or "conflict".
ReferenceAnnotator._check: Upgraded 1 "intergenic" or "conflict" sequences to "match.
compare: Reference comparison complete. Results written to ../data/compare


ReferenceAnnotator._check: 100%|██████████| 455/455 [00:00<00:00, 902.35it/s]
ReferenceAnnotator._check: 100%|██████████| 33/33 [00:00<00:00, 1329.87it/s]


ReferenceAnnotator._check: Downgraded 0 "match" sequences to "intergenic" or "conflict".
ReferenceAnnotator._check: Upgraded 6 "intergenic" or "conflict" sequences to "match.
compare: Reference comparison complete. Results written to ../data/compare


ReferenceAnnotator._check: 100%|██████████| 862/862 [00:01<00:00, 832.14it/s] 
ReferenceAnnotator._check: 100%|██████████| 57/57 [00:00<00:00, 1917.43it/s]


ReferenceAnnotator._check: Downgraded 0 "match" sequences to "intergenic" or "conflict".
ReferenceAnnotator._check: Upgraded 2 "intergenic" or "conflict" sequences to "match.
compare: Reference comparison complete. Results written to ../data/compare


In [5]:
# Want to get the number of proteins in the NCBI reference. 
# Both dictionaries are keyed by genome ID: one holds the Prodigal predictions, the other the NCBI reference proteins.
prodigal_fasta_files = dict(
    (genome_id, FASTAFile(path=f'../data/prodigal/{genome_id}_protein.faa'))
    for genome_id in genome_ids
)
ncbi_fasta_files = dict(
    (genome_id, FASTAFile(path=f'../data/ncbi/proteins/{genome_id}_protein.faa'))
    for genome_id in genome_ids
)

In [11]:
# Stack the per-genome top-hit tables, tagging each row with its genome ID and display name.
per_genome_dfs = [
    Reference.load(f'../data/compare/{genome_id}_top_hits.csv').assign(genome_id=genome_id)
    for genome_id in genome_ids
]
compare_df = pd.concat(per_genome_dfs).assign(name=lambda df: df.genome_id.map(names))

In [12]:
def table_1(compare_df:pd.DataFrame, prodigal_fasta_files:dict, ncbi_fasta_files:dict, path:str=None):
    '''Build the per-genome summary table of CDS counts and match rates.

    :param compare_df: One row per compared sequence. Must contain the columns `genome_id`, `name`,
        `category` and `exact_match`.
    :param prodigal_fasta_files: Maps genome ID to an object whose `len()` is the number of Prodigal-predicted proteins.
    :param ncbi_fasta_files: Maps genome ID to an object whose `len()` is the number of NCBI reference proteins.
    :param path: Optional output location. If given, the table is also written there as a CSV.
    :return: A DataFrame indexed by genome name (in order of first appearance in `compare_df`) with the columns
        `ncbi_n_cds`, `prodigal_n_cds`, `percent_matches` and `percent_exact_matches`. The two `percent_*`
        columns are fractions of that genome's rows in `compare_df` (i.e. 0.95, not 95).
    '''
    # Resolve display names from compare_df itself, so the function does not depend on a notebook-level global.
    name_by_genome_id = compare_df[['genome_id', 'name']].dropna().drop_duplicates('genome_id').set_index('genome_id')['name']

    def _n_cds(fasta_files:dict) -> pd.Series:
        # Count the sequences per genome, keyed by display name. Genomes which do not occur in compare_df
        # have no row in the table, so they are skipped rather than raising a KeyError.
        counts = {name_by_genome_id[genome_id]:len(fasta_file) for genome_id, fasta_file in fasta_files.items() if genome_id in name_by_genome_id.index}
        return pd.Series(counts, dtype='int64')

    # sort=False is cosmetic here; the final row order comes from `names_in_order`.
    n_rows = compare_df.groupby('name', sort=False).size()
    n_matches = (compare_df['category'] == 'match').groupby(compare_df['name'], sort=False).sum()
    n_exact_matches = compare_df.groupby('name', sort=False)['exact_match'].sum()

    columns = dict()
    columns['ncbi_n_cds'] = _n_cds(ncbi_fasta_files)
    columns['prodigal_n_cds'] = _n_cds(prodigal_fasta_files)
    # Casting to float keeps these columns numeric (float64) instead of object dtype.
    columns['percent_matches'] = (n_matches / n_rows).astype(float)
    columns['percent_exact_matches'] = (n_exact_matches / n_rows).astype(float)

    # Passing an explicit index aligns every column to the genome names in order of first appearance.
    names_in_order = pd.Index(compare_df['name'].unique())
    table_df = pd.DataFrame(columns, index=names_in_order)

    if path is not None:
        table_df.to_csv(path)
    return table_df

# Show the summary table as this cell's output; no path is passed.
table_1(compare_df, prodigal_fasta_files, ncbi_fasta_files)

Unnamed: 0,ncbi_n_cds,prodigal_n_cds,percent_matches,percent_exact_matches
E. coli K-12,4298,4319,0.954388,0.900208
B. subtilis 168,4237,4226,0.977993,0.88973
P. aeruginosa,5572,5681,0.977997,0.896849
M. tuberculosis,3906,4085,0.928274,0.717013


In [None]:
# Want to characterize the amount of length difference

In [None]:
# What statistics do I care about?
# (1) Total number of genes in reference. 
# (2) Total number of predicted genes. 
# (3) Total number of matched coding sequences (not including pseudogenes).
# (4) Number of exact matches. 
# (5) Number of genes with boundary errors (characterize the nature of the boundary errors later on).
# (6) The false discovery rate. 
# (7) The number of missed genes. 