In [2]:
import pandas as pd 
import numpy as np 
from utils import * 
import seaborn as sns 
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from src.files import FASTAFile, InterProScanFile
from src import fillna
from src.reference import compare, Reference
from src.tools import download, Prodigal

%load_ext autoreload 
%autoreload 2

# TODO: Did Prodigal find ribosome binding sites for the spurious sequences?
# TODO: How frequently do Shine-Dalgarno sequences occur when they are not associated with a protein?
# TODO: I am operating under the assumption that the model is underpredicting spurious sequences in Campylobacterota,
#   but is this true? Maybe get a better sense using genomes with a more solid ground truth. 

In [None]:
# How should I select the genomes for model training?
# Display name for each NCBI genome accession; insertion order is the order the genomes are processed in.
names = {
    'GCF_000005845.2': 'E. coli K-12',
    'GCF_000009045.1': 'B. subtilis 168',
    'GCF_000006765.1': 'P. aeruginosa',
    'GCF_000195955.2': 'M. tuberculosis',
}

# The genome IDs are exactly the keys of the name lookup, in the same order.
genome_ids = list(names)

# ncbi = download.NCBI()
# ncbi.get_genomes(genome_ids, include=['gbff', 'genome', 'protein'], dirs={'genome':'../data/ncbi/genomes', 'gbff':'../data/ncbi/gbffs', 'protein':'../data/ncbi/proteins'})
# ncbi.cleanup()

# prodigal = Prodigal()
# for genome_id in genome_ids:
#     prodigal.run(f'../data/ncbi/genomes/{genome_id}_genomic.fna', f'../data/prodigal/{genome_id}_protein.faa')

# query_paths = [f'../data/prodigal/{genome_id}_protein.faa' for genome_id in genome_ids]
# reference_paths = [f'../data/ncbi/gbffs/{genome_id}_genomic.gbff' for genome_id in genome_ids]
# for query_path, reference_path in zip(query_paths, reference_paths):
#     compare(query_path, reference_path, overwrite=True)

In [None]:
# Build the NCBI reference dataset only when the cached CSV is missing.
if not os.path.exists('../data/model_organisms_ncbi.csv'):
    # Per-genome DataFrames are collected under their own name, so a failed assertion part-way
    # through the loop never leaves ncbi_df bound to a list.
    ncbi_dfs = list()
    for genome_id in tqdm(genome_ids, desc='Building dataset from NCBI reference.'):
        protein_path = f'../data/ncbi/proteins/{genome_id}_protein.faa'
        gbff_path = f'../data/ncbi/gbffs/{genome_id}_genomic.gbff'

        # Reduce the GBFF annotations to one row per protein ID among the non-pseudo CDS features.
        gbff_df = GBFFFile(gbff_path).to_df()
        gbff_df = gbff_df[(gbff_df.feature == 'CDS') & (~gbff_df.pseudo)].copy()
        gbff_df['copy_number'] = gbff_df.protein_id.map(gbff_df.protein_id.value_counts()) # There are sometimes multiple copies of the same protein. 
        gbff_df = gbff_df.drop_duplicates('protein_id').copy()
        gbff_df = gbff_df.drop(columns=['seq']) # Use the sequences from the protein DataFrame, just to make sure everything is equal. 
        gbff_df = gbff_df.set_index('protein_id')
        gbff_df.index.name = 'id'

        protein_df = FASTAFile(path=protein_path).to_df(prodigal_output=False)
        protein_df = protein_df.drop(columns=['description'])

        # Sanity checks before the index-on-index merge: same number of entries, same set of IDs, no duplicates.
        assert len(protein_df) == len(gbff_df), 'Expected the number of non-pseudo CDS entries in the GBFF file to match the entries in the FASTA file.'
        assert np.all(np.sort(protein_df.index) == np.sort(gbff_df.index)), 'Expected the protein IDs of the non-pseudo CDS entries in the GBFF file to match the IDs in the FASTA file.'
        assert protein_df.index.is_unique and gbff_df.index.is_unique, 'Expected the indices of both DataFrames to be unique.'

        ncbi_dfs.append(protein_df.merge(gbff_df, left_index=True, right_index=True).assign(genome_id=genome_id))
    ncbi_df = pd.concat(ncbi_dfs)
    ncbi_df = ncbi_df[ncbi_df.seq.apply(len) < 2000].copy() # Keep only rows whose seq has length under 2000.
    ncbi_df.to_csv('../data/model_organisms_ncbi.csv')


In [10]:
def fix_b_subtilis(path:str='../data/compare/GCF_000009045.1_top_hits.csv'):
    """Add a top_hit_evidence_type column to the top-hits CSV at path, overwriting the file in place.

    Each row is labelled from the text of its top_hit_note:
      - 'experiment' if the note contains 'Evidence 1' or 'Evidence 2';
      - 'similar to sequence' if it contains 'Evidence 3' or 'Evidence 4';
      - 'ab initio prediction' if neither applies but n_hits > 0;
      - 'none' otherwise.

    A missing note (an empty CSV field, which pandas reads back as NaN) is treated as an
    empty string, so such rows are labelled by n_hits alone instead of raising a TypeError.

    :param path: Path to the CSV to update. It must contain top_hit_note and n_hits columns.
    """
    df = pd.read_csv(path, index_col=0, dtype={'top_hit_partial':str, 'top_hit_translation_table':str, 'top_hit_codon_start':str})

    def get_evidence_type(note, n_hits) -> str:
        # Empty CSV fields come back as float NaN, which does not support the "in" operator.
        note = note if isinstance(note, str) else ''
        if ('Evidence 1' in note) or ('Evidence 2' in note):
            return 'experiment'
        if ('Evidence 4' in note) or ('Evidence 3' in note):
            return 'similar to sequence'
        if n_hits > 0:
            return 'ab initio prediction'
        return 'none'

    df['top_hit_evidence_type'] = [get_evidence_type(note, n_hits) for note, n_hits in zip(df.top_hit_note, df.n_hits)]
    df.to_csv(path)

# Build the Prodigal-side dataset once; the whole cell is skipped when the CSV is already on disk.
if not os.path.exists('../data/model_organisms_prodigal.csv'):
    # Rewrites the B. subtilis top-hits CSV with a top_hit_evidence_type column before it is loaded below.
    fix_b_subtilis()
    # Load the top hits for every genome, tag each table with its genome ID, and stack them.
    prodigal_df = pd.concat([
        Reference.load(f'../data/compare/{genome_id}_top_hits.csv').assign(genome_id=genome_id)
        for genome_id in genome_ids
    ])
    # Keep only rows whose query_seq has length under 2000.
    prodigal_df = prodigal_df.loc[prodigal_df.query_seq.map(len) < 2000].copy()
    prodigal_df.to_csv('../data/model_organisms_prodigal.csv')