In [2]:
import pandas as pd 
import numpy as np 
from utils import * 
import seaborn as sns 
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from src.files import FASTAFile, InterProScanFile
from src import fillna
from src.reference import compare, Reference
from src.tools import download, Prodigal

%load_ext autoreload 
%autoreload 2

# TODO: Did Prodigal find ribosome binding sites for the spurious sequences?
# TODO: How frequently do Shine-Dalgarno sequences occur when they are not associated with a protein?
# TODO: I am operating under the assumption that the model is underpredicting spurious sequences in Campylobacterota,
#   but is this true? Maybe get a better sense using genomes with a more solid ground truth. 

In [None]:
# How should I select the genomes for model training?
# Display name for each model organism, keyed by its NCBI genome accession.
names = {
    'GCF_000005845.2': 'E. coli K-12',
    'GCF_000009045.1': 'B. subtilis 168',
    'GCF_000006765.1': 'P. aeruginosa',
    'GCF_000195955.2': 'M. tuberculosis',
}
# Accessions in processing order; derived from the mapping so the two can never drift apart.
genome_ids = list(names)

# The steps below are disabled; re-enable them to regenerate the files under ../data.
# Step 1: fetch the genome, GBFF and protein files for every accession.
# ncbi = download.NCBI()
# ncbi.get_genomes(genome_ids, include=['gbff', 'genome', 'protein'], dirs={'genome':'../data/ncbi/genomes', 'gbff':'../data/ncbi/gbffs', 'protein':'../data/ncbi/proteins'})
# ncbi.cleanup()

# Step 2: run Prodigal on each downloaded genome.
# prodigal = Prodigal()
# for genome_id in genome_ids:
#     prodigal.run(f'../data/ncbi/genomes/{genome_id}_genomic.fna', f'../data/prodigal/{genome_id}_protein.faa')

# Step 3: compare each Prodigal output against the matching GBFF reference.
# query_paths = [f'../data/prodigal/{genome_id}_protein.faa' for genome_id in genome_ids]
# reference_paths = [f'../data/ncbi/gbffs/{genome_id}_genomic.gbff' for genome_id in genome_ids]
# for query_path, reference_path in zip(query_paths, reference_paths):
#     compare(query_path, reference_path, overwrite=True)

In [None]:
# Build (once) a single table of NCBI reference proteins for all model organisms by joining each
# genome's protein FASTA file with the CDS annotations from its GBFF file. The result is cached as
# a CSV, and the whole cell is skipped when that file already exists.
# NOTE(review): `os`, `tqdm` and `GBFFFile` are not imported explicitly in the visible cells;
#   presumably they arrive via `from utils import *` -- confirm.
if not os.path.exists('../data/model_organisms_ncbi.csv'):
    ncbi_dfs = list() # One merged DataFrame per genome; concatenated after the loop.
    for genome_id in tqdm(genome_ids, desc='Building dataset from NCBI reference.'):
        protein_path = f'../data/ncbi/proteins/{genome_id}_protein.faa'
        gbff_path = f'../data/ncbi/gbffs/{genome_id}_genomic.gbff'

        gbff_df = GBFFFile(gbff_path).to_df()
        # Keep only the coding sequences that are not flagged as pseudogenes.
        gbff_df = gbff_df[(gbff_df.feature == 'CDS') & (~gbff_df.pseudo)].copy()
        gbff_df['copy_number'] = gbff_df.protein_id.map(gbff_df.protein_id.value_counts()) # There are sometimes multiple copies of the same protein. 
        gbff_df = gbff_df.drop_duplicates('protein_id').copy()
        gbff_df = gbff_df.drop(columns=['seq']) # Use the sequences from the protein DataFrame, just to make sure everything is equal. 
        gbff_df = gbff_df.set_index('protein_id')
        gbff_df.index.name = 'id'

        protein_df = FASTAFile(path=protein_path).to_df(prodigal_output=False)
        protein_df = protein_df.drop(columns=['description'])

        # Sanity checks: both tables must describe exactly the same set of protein IDs, once each.
        assert len(protein_df) == len(gbff_df), 'Expected the number of non-pseudo CDS entries in the GBFF file to match the entries in the FASTA file.'
        assert np.all(np.sort(protein_df.index) == np.sort(gbff_df.index)), 'Expected the protein IDs of the non-pseudo CDS entries in the GBFF file to match the IDs in the FASTA file.'
        assert protein_df.index.is_unique and gbff_df.index.is_unique, 'Expected the indices of both DataFrames to be unique.'

        ncbi_dfs.append(protein_df.merge(gbff_df, left_index=True, right_index=True).assign(genome_id=genome_id))
    ncbi_df = pd.concat(ncbi_dfs)
    # Drop sequences of 2000 residues or more.
    # NOTE(review): presumably a length limit of the downstream model -- confirm.
    ncbi_df = ncbi_df[ncbi_df.seq.apply(len) < 2000].copy()
    ncbi_df.to_csv('../data/model_organisms_ncbi.csv')


In [10]:
def _evidence_type_from_note(note, n_hits) -> str:
    '''Map one top-hit note and its hit count to an evidence-type label.'''
    # A missing note is read back from CSV as NaN (a float); treat anything that is not a string as
    # empty text so the substring checks below cannot raise a TypeError.
    note = note if isinstance(note, str) else ''
    if ('Evidence 1' in note) or ('Evidence 2' in note):
        return 'experiment'
    if ('Evidence 3' in note) or ('Evidence 4' in note):
        return 'similar to sequence'
    if n_hits > 0:
        # There is a hit, but its note carries none of the recognised evidence codes.
        return 'ab initio prediction'
    return 'none'


def fix_b_subtilis(path:str='../data/compare/GCF_000009045.1_top_hits.csv'):
    '''Recompute the `top_hit_evidence_type` column of a top-hits CSV from the `top_hit_note` column,
    and write the table back to the same path (the file is overwritten in place).

    The label is derived from "Evidence N" codes found in the note text:
    codes 1 and 2 give 'experiment', codes 3 and 4 give 'similar to sequence'. Rows without a
    recognised code are labelled 'ab initio prediction' when `n_hits` is positive and 'none'
    otherwise. Rows whose note is missing are handled like rows without a recognised code.

    :param path: Path to the top-hits CSV; it must contain `top_hit_note` and `n_hits` columns.
    '''
    df = pd.read_csv(path, index_col=0, dtype={'top_hit_partial':str, 'top_hit_translation_table':str, 'top_hit_codon_start':str})
    df['top_hit_evidence_type'] = [_evidence_type_from_note(note, n_hits) for note, n_hits in zip(df.top_hit_note, df.n_hits)]
    df.to_csv(path)

# Assemble (once) the table of Prodigal predictions for all model organisms from the per-genome
# top-hits files. The result is cached as a CSV, and the cell is a no-op when that file exists.
if not os.path.exists('../data/model_organisms_prodigal.csv'):
    # Rewrite the evidence-type column of the B. subtilis top-hits file before it is loaded below.
    fix_b_subtilis()

    top_hits_dfs = []
    for genome_id in genome_ids:
        top_hits_df = Reference.load(f'../data/compare/{genome_id}_top_hits.csv')
        top_hits_df['genome_id'] = genome_id # Remember which genome each row came from.
        top_hits_dfs.append(top_hits_df)

    prodigal_df = pd.concat(top_hits_dfs)
    # Keep only the query sequences shorter than 2000 residues.
    is_short_enough = prodigal_df.query_seq.apply(len) < 2000
    prodigal_df = prodigal_df[is_short_enough].copy()
    prodigal_df.to_csv('../data/model_organisms_prodigal.csv')

In [22]:
model_name = 'campylobacterota_v3'


def _load_results(predict_path:str, dataset_path:str) -> pd.DataFrame:
    '''Load a prediction table and join it with the dataset it was computed from.

    The two CSVs are joined on their index. Every column whose name contains `model_name` has that
    substring replaced by 'model', so downstream cells can refer to the selected model's columns
    as `model_*`. A `name` column with the organism display name is added by mapping `genome_id`
    through `names`.

    :param predict_path: CSV holding the model predictions.
    :param dataset_path: CSV holding the dataset (it must provide a `genome_id` column).
    :return: The joined DataFrame.
    '''
    predict_df = pd.read_csv(predict_path, index_col=0)
    dataset_df = pd.read_csv(dataset_path, index_col=0)
    results_df = predict_df.merge(dataset_df, left_index=True, right_index=True)
    renamed_columns = {col:col.replace(model_name, 'model') for col in results_df.columns}
    results_df = results_df.rename(columns=renamed_columns)
    results_df['name'] = results_df.genome_id.map(names)
    return results_df


ncbi_results_df = _load_results('../data/results/model_organisms_ncbi_predict.csv', '../data/model_organisms_ncbi.csv')
prodigal_results_df = _load_results('../data/results/model_organisms_prodigal_predict.csv', '../data/model_organisms_prodigal.csv')

In [20]:
# Per organism, count the NCBI reference proteins whose `model_output_0` value exceeds 0.99.
# NOTE(review): output 0 presumably corresponds to the "spurious" class -- confirm against the model.
ncbi_results_df.groupby('name').apply(
    lambda group: group['model_output_0'].gt(0.99).sum(),
    include_groups=False,
)

name
B. subtilis 168      52
E. coli K-12        147
M. tuberculosis    2194
P. aeruginosa       618
dtype: int64

In [23]:

# Per organism, count the Prodigal-predicted proteins whose `model_output_0` value exceeds 0.99.
# NOTE(review): output 0 presumably corresponds to the "spurious" class -- confirm against the model.
prodigal_results_df.groupby('name').apply(
    lambda group: group['model_output_0'].gt(0.99).sum(),
    include_groups=False,
)

name
B. subtilis 168      48
E. coli K-12        133
M. tuberculosis    2269
P. aeruginosa       606
dtype: int64

In [26]:
# Inspect the NCBI reference proteins with `model_output_0` above 0.99 for a single organism.
# Only the value of a cell's last expression is displayed, so exactly one organism is selected
# here; set `organism` to another display name (e.g. 'M. tuberculosis') to view that one instead.
organism = 'B. subtilis 168'
ncbi_results_df[(ncbi_results_df.name == organism) & (ncbi_results_df.model_output_0 > 0.99)]

Unnamed: 0_level_0,model_label,model_output_0,model_output_1,campylobacterota_v2_label,campylobacterota_v2_output_0,campylobacterota_v2_output_1,campylobacterota_v1_label,campylobacterota_v1_output_0,campylobacterota_v1_output_1,seq,...,translation_table,codon_start,evidence_type,evidence_category,evidence_details,evidence_source,used_pgap,copy_number,genome_id,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NP_388071.1,0,0.999851,0.000149,0,0.999827,0.000173,0,0.999832,0.000168,MRFLKALPRRAEVQYDCLDRTLETQENVNLNIRVNVKEVATWGVNT...,...,11,1,ab initio prediction,none,2.0,AMIGene,False,1,GCF_000009045.1,B. subtilis 168
NP_388232.1,0,0.996246,0.003754,0,0.994412,0.005588,0,0.99626,0.00374,MNRSGKHLISSIILYPRPSGECISSISLDKQTQATTSPLYFCWREK,...,11,1,experiment,none,"publication(s) with functional evidences, PMID...",none,False,1,GCF_000009045.1,B. subtilis 168
NP_388785.1,0,0.996983,0.003017,0,0.996932,0.003068,0,0.996417,0.003583,MKKANPFTHAGLPFLLFPSIMFLSNKSMEYVVFHLDLVYYVTHTPR...,...,11,1,ab initio prediction,none,2.0,AMIGene,False,1,GCF_000009045.1,B. subtilis 168
NP_388851.1,0,0.999418,0.000582,0,0.99914,0.00086,0,0.998702,0.001298,MFIKQFHIGAANLLFCFRERFFRSDRALKSAVRNISVKKGMELTLH...,...,11,1,experiment,none,"publication(s) with functional evidences, PMID...",none,False,1,GCF_000009045.1,B. subtilis 168
NP_388855.1,0,0.999211,0.000789,0,0.999039,0.000961,0,0.998911,0.001089,MSFITIVNWELVQFVSVSMIHEYVSHRSVYLYRYSFPRCSN,...,11,1,ab initio prediction,none,2.0,AMIGene,False,1,GCF_000009045.1,B. subtilis 168
NP_390255.2,0,0.999755,0.000245,0,0.999903,9.7e-05,0,0.999828,0.000172,MYRPVWRWPADLYFFCPERHYPARPAIFSVRLSANENGDHKTPFPS...,...,11,1,ab initio prediction,none,2.0,AMIGene,False,1,GCF_000009045.1,B. subtilis 168
NP_390534.1,0,0.998504,0.001496,0,0.99829,0.00171,0,0.998424,0.001576,MLLKNWPSRRIQRDKSKRAGIGGTNNRIPYTLLLCYVNVQKPFRIVDL,...,11,1,ab initio prediction,none,2.0,AMIGene,False,1,GCF_000009045.1,B. subtilis 168
NP_390545.3,0,0.990424,0.009576,0,0.9925,0.0075,0,0.991708,0.008292,MRLYTEYIPNKNKLINQQQYHSDLQDLLHQWILQLLQLDVLPLSNR...,...,11,1,ab initio prediction,none,2.0,AMIGene,False,1,GCF_000009045.1,B. subtilis 168
NP_390821.3,0,0.996077,0.003923,0,0.996306,0.003694,0,0.991644,0.008356,MKKRKLAWQQEFVKISFIKSRHYSQPTSIMMGGRMMKKEPERYIYD...,...,11,1,ab initio prediction,none,2.0,AMIGene,False,1,GCF_000009045.1,B. subtilis 168
NP_391102.1,0,0.996617,0.003384,0,0.996216,0.003784,0,0.99115,0.00885,MCYNRESNFANQTIDFYCYTEYNGFMTTKGANVMTEVEMKEQVQEV...,...,11,1,ab initio prediction,none,2.0,AMIGene,False,1,GCF_000009045.1,B. subtilis 168


In [28]:
# For the E. coli K-12 Prodigal predictions with `model_output_0` above 0.99, tally the values of
# the `category` column.
# NOTE(review): the categories presumably come from the Prodigal-vs-reference comparison -- confirm.
prodigal_results_df.loc[
    prodigal_results_df.name.eq('E. coli K-12') & prodigal_results_df.model_output_0.gt(0.99),
    'category',
].value_counts()

category
match         74
conflict      37
intergenic    22
Name: count, dtype: int64