In [None]:
# CLAIM: Model does not detect nested gene pairs. 

In [47]:
import pandas as pd
from utils import *  
import os 
import numpy as np 
from scipy.stats.contingency import expected_freq
import re
from src.reference import annotate
from src.files import BLASTJsonFile
import seaborn as sns
from scipy.stats import chisquare

%load_ext autoreload 
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def write_csv(df:pd.DataFrame, path:str=None):
    """Write a fixed subset of summary columns from `df` to a CSV file.

    Only the columns listed in `columns` below are written; the DataFrame index
    is written as the first CSV column (pandas default).

    :param df: Frame containing at least the columns listed in `columns`.
    :param path: Destination CSV path. Required, despite the `None` default.
    :raises ValueError: If `path` is None. `DataFrame.to_csv(None)` would only
        return the CSV text (which would be discarded here), so nothing would be
        written even though a success message was printed.
    :raises KeyError: If any of the expected columns is missing from `df`.
    """
    if path is None:
        raise ValueError('write_csv: A destination path is required, but path is None.')

    columns = ['species', 'genome_id', 'top_hit_protein_id', 'top_hit_product', 'top_hit_evidence_type', 'length', 'top_hit_length', 'top_hit_spurious']
    df[columns].to_csv(path)
    print(f'write_csv: Wrote {len(df)} sequences to {path}.')

In [91]:
# Load the dataset and pass it through annotate() before deriving any columns.
dataset_df = annotate(pd.read_csv('../data/results/results-2/dataset.csv', index_col=0))

# Flag top hits whose product is the generic 'hypothetical protein' label.
dataset_df['top_hit_hypothetical'] = dataset_df['top_hit_product'].eq('hypothetical protein')

# Give every query the same set of query_* fields that exist for the top hits.
dataset_df['query_codon_start'] = 1
dataset_df['query_id'] = dataset_df.index
dataset_df['query_product'] = 'hypothetical protein'
# Currently disabled:
# dataset_df['query_rbs_motif'] = ~(dataset_df.query_rbs_motif.str.contains('T') |( dataset_df.query_rbs_motif == 'none'))
dataset_df['query_seq'] = dataset_df['seq']

# Lengths are taken from the sequence strings; make sure these are in units of amino acids.
dataset_df['query_length'] = dataset_df['seq'].apply(len)
dataset_df['top_hit_length'] = dataset_df['top_hit_seq'].apply(len)

dataset_df['top_hit_id'] = dataset_df['top_hit_protein_id']
dataset_df['top_hit_gc_content'] = dataset_df['top_hit_nt_seq'].apply(get_gc_content)

# Drop genomes on which Prodigal did horrifically (a different translation table was probably needed).
# One of the excluded genomes also belongs to an endosymbiont which is not assigned a phylum.
exclude_genome_ids = ['GCF_029854295.1', 'GCF_021057185.1', 'GCF_016097415.1']
is_excluded_genome = dataset_df['genome_id'].isin(exclude_genome_ids)
dataset_df = dataset_df[~is_excluded_genome].copy()

In [97]:
# A model output must be strictly greater than this value for a sequence to be called 'real' or 'spurious'.
threshold = 0.9


def _read_predictions(path):
    """Read a prediction CSV (first column as index) and strip '_v2' from its column names."""
    df = pd.read_csv(path, index_col=0)
    df.columns = [col.replace('_v2', '') for col in df.columns]
    return df


def _add_model_labels(df, threshold):
    """Add the boolean 'spurious', 'real' and 'uncertain' columns and a string 'model_label' column.

    'spurious' is model_output_0 > threshold, 'real' is model_output_1 > threshold, and 'uncertain'
    marks rows that are neither. The frame is modified in place and also returned.
    """
    df['spurious'] = (df['model_output_0'] > threshold).to_numpy()
    df['real'] = (df['model_output_1'] > threshold).to_numpy()
    df['uncertain'] = ~df['real'] & ~df['spurious']
    # Conditions are checked in order, so a row satisfying both 'real' and 'spurious' is labeled 'real'.
    conditions = [df['real'].to_numpy(), df['spurious'].to_numpy(), df['uncertain'].to_numpy()]
    df['model_label'] = np.select(conditions, ['real', 'spurious', 'uncertain'], default='none')
    return df


# Predictions for the query sequences, joined with the dataset on the shared index.
results_df = _read_predictions('../data/results/results-2/dataset_predict.csv')
results_df = results_df.merge(dataset_df, left_index=True, right_index=True, how='inner')
results_df = _add_model_labels(results_df, threshold)

# Predictions for the top hits. The index must be unique so that it can be used as a lookup
# table in Series.map below; only the first row is kept for each duplicated index value.
top_hit_results_df = _read_predictions('../data/results/results-2/top_hits_predict.csv')
top_hit_results_df = top_hit_results_df[~top_hit_results_df.index.duplicated(keep='first')].copy()
top_hit_results_df = _add_model_labels(top_hit_results_df, threshold)


# Expose each prediction field under both a query_* and a top_hit_* name. The top-hit value is
# looked up by top_hit_protein_id; IDs absent from top_hit_results_df map to NaN.
with pd.option_context('future.no_silent_downcasting', True):
    for field in ['model_label', 'model_output_1', 'model_output_0', 'real', 'spurious', 'uncertain']:
        results_df[f'query_{field}'] = results_df[field]
        results_df[f'top_hit_{field}'] = results_df.top_hit_protein_id.map(top_hit_results_df[field])

In [99]:
# IDs that are left out of the nested-conflict CSV below.
# NOTE(review): presumably nested conflicts that were judged to be fragments -- confirm how this list was curated.
nested_fragmented_ids = [
    'NZ_AP025523.1_333',
    'NZ_AP025523.1_686',
    'NZ_AP025523.1_1533',
    'NZ_AP025523.1_3060',
    'NZ_CP065383.1_468',
    'NZ_NIGF01000018.1_4',
    'NZ_NIGF01000027.1_4',
    'NZ_NIGF01000006.1_140',
    'NC_014960.1_581',
]

nested_conflict_df = results_df[is_nested_cds_conflict(results_df)]
is_fragmented = nested_conflict_df.index.isin(nested_fragmented_ids)

# NOTE(review): the CSV omits the IDs listed above, but the FASTA file keeps them -- confirm this is intentional.
write_csv(nested_conflict_df[~is_fragmented], path='../data/results/results-2/conflict_nested.csv')
write_fasta(nested_conflict_df, path='../data/results/results-2/conflict_nested.faa', add_top_hit=True)

write_csv: Wrote 76 sequences to ../data/results/results-2/conflict_nested.csv.
write_fasta: Wrote 170 sequences to ../data/results/results-2/conflict_nested.faa


In [93]:
def get_nested_results(results_df):
    """Re-express nested CDS conflicts in terms of a parent and a daughter sequence.

    Rows are kept when `is_cds_conflict(results_df)` is True and `overlap_type` is 'nested'.
    Within each kept row, the query is treated as the daughter when `query_length` is strictly
    smaller than `top_hit_length`; otherwise the top hit is treated as the daughter.

    Every field that exists as both a `query_<field>` and a `top_hit_<field>` column is emitted as
    `parent_<field>` and `daughter_<field>`. Only columns that *start with* those prefixes count
    as fields; a column that merely contains 'query' or 'top_hit' somewhere in its name is ignored
    (substring matching would invent fields with no matching columns and raise a KeyError).

    :param results_df: Frame with `overlap_type`, `query_length`, `top_hit_length`,
        `overlap_length` and `same_strand` columns, plus paired `query_*` / `top_hit_*` columns.
    :return: A new frame indexed like the kept rows, containing the `parent_*` / `daughter_*`
        columns (fields in sorted order) followed by `overlap_length` and `same_strand`.
    """
    query_prefix, top_hit_prefix = 'query_', 'top_hit_'

    is_nested_conflict = is_cds_conflict(results_df) & (results_df.overlap_type == 'nested')
    results_df = results_df[is_nested_conflict].copy()
    results_df['query_nested'] = results_df.query_length < results_df.top_hit_length

    # 'query_nested' is a helper column added above, not a paired field, so it is skipped.
    query_fields = {col[len(query_prefix):] for col in results_df.columns if col.startswith(query_prefix) and (col != 'query_nested')}
    top_hit_fields = {col[len(top_hit_prefix):] for col in results_df.columns if col.startswith(top_hit_prefix)}
    fields = sorted(query_fields & top_hit_fields)

    query_is_daughter = results_df['query_nested'].to_numpy()

    # Collect all columns first and build the frame once, rather than inserting columns one at a time.
    columns = {}
    for field in fields:
        query_values = results_df[f'{query_prefix}{field}']
        top_hit_values = results_df[f'{top_hit_prefix}{field}']
        columns[f'parent_{field}'] = np.where(query_is_daughter, top_hit_values, query_values)
        columns[f'daughter_{field}'] = np.where(query_is_daughter, query_values, top_hit_values)

    df = pd.DataFrame(columns, index=results_df.index)
    df['overlap_length'] = results_df.overlap_length
    df['same_strand'] = results_df.same_strand
    return df

# NOTE: this rebinds results_df to the parent/daughter table returned by get_nested_results, which
# only has parent_*/daughter_* columns plus overlap_length and same_strand. Cells that expect the
# original query_*/top_hit_* columns must be run before this one, or results_df rebuilt first.
results_df = get_nested_results(results_df)
# Number of rows (nested conflicts) in the parent/daughter table.
print('Num. nested conflicts:', len(results_df))

Num. nested conflicts: 85


In [76]:
# Count nested pairs for each combination of parent and daughter model calls.
# Each entry is (printed label, parent column, daughter column).
label_combinations = [
    ('Num. parent and daughter spurious:', 'parent_spurious', 'daughter_spurious'),
    ('Num. parent and daughter real:', 'parent_real', 'daughter_real'),
    ('Num. parent real and daughter spurious:', 'parent_real', 'daughter_spurious'),
    ('Num. parent spurious daughter real:', 'parent_spurious', 'daughter_real'),
]
for message, parent_col, daughter_col in label_combinations:
    print(message, (results_df[daughter_col] & results_df[parent_col]).sum())

Num. parent and daughter spurious: 13
Num. parent and daughter real: 9
Num. parent real and daughter spurious: 32
Num. parent spurious daughter real: 13
