In [2]:
import pandas as pd 
import numpy as np 
from utils import * 
import seaborn as sns 
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from src.files import FASTAFile, InterProScanFile
from src import fillna
from src.reference import compare, Reference
from src.tools import download, Prodigal

%load_ext autoreload 
%autoreload 2

# TODO: Did Prodigal find ribosome binding sites for the spurious sequences?
# TODO: How frequently do Shine-Dalgarno sequences occur when they are not associated with a protein?
# TODO: I am operating under the assumption that the model is underpredicting spurious sequences in Campylobacterota,
#   but is this true? Maybe get a better sense using genomes with a more solid ground truth. 

In [None]:
# How should I select the genomes for model training?
# NCBI assembly accession -> display name used for each organism in tables and plots.
names = {
    'GCF_000005845.2': 'E. coli K-12',
    'GCF_000009045.1': 'B. subtilis 168',
    'GCF_000006765.1': 'P. aeruginosa',
    'GCF_000195955.2': 'M. tuberculosis',
}

# The accessions on their own, in the same order as the mapping above.
genome_ids = list(names)

# ncbi = download.NCBI()
# ncbi.get_genomes(genome_ids, include=['gbff', 'genome', 'protein'], dirs={'genome':'../data/ncbi/genomes', 'gbff':'../data/ncbi/gbffs', 'protein':'../data/ncbi/proteins'})
# ncbi.cleanup()

# prodigal = Prodigal()
# for genome_id in genome_ids:
#     prodigal.run(f'../data/ncbi/genomes/{genome_id}_genomic.fna', f'../data/prodigal/{genome_id}_protein.faa')

# query_paths = [f'../data/prodigal/{genome_id}_protein.faa' for genome_id in genome_ids]
# reference_paths = [f'../data/ncbi/gbffs/{genome_id}_genomic.gbff' for genome_id in genome_ids]
# for query_path, reference_path in zip(query_paths, reference_paths):
#     compare(query_path, reference_path, overwrite=True)

# Model seems to be doing a very poor job of generalizing to organisms in different phyla. 

In [32]:
def apply_threshold(results_df:pd.DataFrame, threshold:float=0.5):
    """Return a copy of the rows where at least one model output exceeds the threshold.

    :param results_df: Frame with model_output_0 and model_output_1 columns.
    :param threshold: Value that either output must strictly exceed for the row to be kept.
    :return: A new DataFrame holding only the retained rows; the input is not modified.
    """
    above_0 = results_df['model_output_0'] > threshold
    above_1 = results_df['model_output_1'] > threshold
    keep = above_0 | above_1
    return results_df.loc[keep].copy()

In [None]:
def fix_b_subtilis(path:str='../data/compare/GCF_000009045.1_top_hits.csv'):
    """Add a top_hit_evidence_type column to the top-hits CSV at path, rewriting the file in place.

    Each row is classified from its top_hit_note text and n_hits value, first match wins:
      - note contains 'Evidence 1' or 'Evidence 2'  -> 'experiment'
      - note contains 'Evidence 3' or 'Evidence 4'  -> 'similar to sequence'
      - n_hits > 0                                  -> 'ab initio prediction'
      - otherwise                                   -> 'none'

    :param path: CSV file to read and overwrite. It must contain top_hit_note and n_hits columns;
        the first column is used as the index.
    """
    df = pd.read_csv(path, index_col=0, dtype={'top_hit_partial':str, 'top_hit_translation_table':str, 'top_hit_codon_start':str})

    # Empty note cells are read back as NaN (a float), and a substring test on a float raises
    # TypeError. Match against a string-only copy so those rows fall through to the n_hits check;
    # the note column written back to disk is left untouched.
    notes = df['top_hit_note'].fillna('').astype(str)
    has_code = lambda code : notes.str.contains(f'Evidence {code}', regex=False)

    # np.select takes the first condition that holds, which reproduces the if/elif priority above.
    conditions = [has_code(1) | has_code(2), has_code(3) | has_code(4), df['n_hits'] > 0]
    labels = ['experiment', 'similar to sequence', 'ab initio prediction']
    df['top_hit_evidence_type'] = np.select(conditions, labels, default='none')
    df.to_csv(path)


In [22]:
model_name = 'campylobacterota_v3'

# Predictions on the NCBI reference proteins, joined on the index to their metadata. The
# model-specific column prefix is swapped for a generic 'model' so later cells do not depend on it.
ncbi_results_df = (
    pd.read_csv('../data/results/model_organisms_ncbi_predict.csv', index_col=0)
    .merge(pd.read_csv('../data/model_organisms_ncbi.csv', index_col=0), left_index=True, right_index=True)
    .rename(columns=lambda col : col.replace(model_name, 'model'))
)
ncbi_results_df['name'] = ncbi_results_df['genome_id'].map(names)

# Same treatment for the predictions on the Prodigal-called proteins.
prodigal_results_df = (
    pd.read_csv('../data/results/model_organisms_prodigal_predict.csv', index_col=0)
    .merge(pd.read_csv('../data/model_organisms_prodigal.csv', index_col=0), left_index=True, right_index=True)
    .rename(columns=lambda col : col.replace(model_name, 'model'))
)
prodigal_results_df['name'] = prodigal_results_df['genome_id'].map(names)

In [33]:
def table_1(results_df:pd.DataFrame, threshold:float=0.99, path:str=None):
    """Tabulate, per category, how many confident predictions carry each model label.

    Rows are first filtered with apply_threshold. Rows whose top hit is a 'hypothetical protein'
    with evidence type 'ab initio prediction' are then dropped (their count is printed). The
    remaining rows are counted by model_label within each category.

    :param results_df: Frame with category, model_label, top_hit_product and top_hit_evidence_type
        columns, plus whatever columns apply_threshold reads.
    :param threshold: Confidence threshold forwarded to apply_threshold.
    :param path: Accepted for interface compatibility; not used by this function.
    :return: DataFrame indexed by category with columns n_spurious, p_spurious, n_real, p_real.
        Percentages are formatted strings, relative to the category size before any filtering.
    """
    # Denominators for the percentages: category sizes before any filtering.
    totals = results_df.groupby('category').size()
    results_df = apply_threshold(results_df, threshold=threshold)

    # A hit to a hypothetical protein that is only supported by ab initio prediction is not
    # trusted as ground truth, so those rows are excluded from the counts.
    mask = (results_df['top_hit_product'] == 'hypothetical protein') & (results_df['top_hit_evidence_type'] == 'ab initio prediction')
    print(f'table_1: {mask.sum()} sequences align to a suspect protein in the reference.')
    results_df = results_df[~mask].copy()

    table_df = pd.DataFrame(index=results_df['category'].unique())
    category_totals = totals.reindex(table_df.index)

    # Counting with a grouped boolean sum (rather than groupby.apply) keeps the result a Series
    # even when no rows survive the filters, so an empty table is returned instead of an error.
    for label, suffix in ((0, 'spurious'), (1, 'real')):
        counts = results_df['model_label'].eq(label).groupby(results_df['category']).sum()
        counts = counts.reindex(table_df.index, fill_value=0).astype(int)
        table_df[f'n_{suffix}'] = counts
        table_df[f'p_{suffix}'] = [f'{p * 100:.2f}%' for p in counts / category_totals]

    return table_df

# Per-category summary of the Prodigal predictions that pass a 0.99 confidence threshold.
table_1(prodigal_results_df, threshold=0.99)

table_1: 30 sequences align to a suspect protein in the reference.


Unnamed: 0,n_spurious,p_spurious,n_real,p_real
intergenic,189,44.37%,70,16.43%
match,2699,15.37%,6466,36.83%
conflict,154,67.54%,11,4.82%
pseudogene,6,9.84%,27,44.26%


In [23]:

# For each organism, count the Prodigal sequences whose model_output_0 exceeds 0.99
# (presumably class 0 = spurious, matching table_1's n_spurious -- confirm).
prodigal_results_df.groupby('name').apply(lambda df : (df.model_output_0 > 0.99).sum(), include_groups=False)

name
B. subtilis 168      48
E. coli K-12        133
M. tuberculosis    2269
P. aeruginosa       606
dtype: int64

In [28]:
# Category breakdown of the E. coli K-12 sequences with model_output_0 above 0.99.
prodigal_results_df[(prodigal_results_df.name == 'E. coli K-12') & (prodigal_results_df.model_output_0 > 0.99)].category.value_counts()

category
match         74
conflict      37
intergenic    22
Name: count, dtype: int64

In [30]:
# Number of Prodigal sequences in each category, per organism (no confidence filtering).
prodigal_results_df.groupby('name').category.value_counts()

name             category  
B. subtilis 168  match         4119
                 pseudogene      35
                 intergenic      34
                 conflict        24
E. coli K-12     match         4117
                 intergenic     119
                 conflict        65
                 pseudogene      17
M. tuberculosis  match         3776
                 intergenic     159
                 conflict       128
                 pseudogene       9
P. aeruginosa    match         5544
                 intergenic     114
                 conflict        11
Name: count, dtype: int64