In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from src.reference import annotate

In [6]:
# Load the dataset (the first CSV column becomes the index) and run it through the
# project's `annotate` helper before any derived columns are added.
dataset_df = annotate(pd.read_csv('../data/results/results-2/dataset.csv', index_col=0))

# True where the top hit's product is exactly the generic 'hypothetical protein' name.
dataset_df['top_hit_hypothetical'] = dataset_df['top_hit_product'].eq('hypothetical protein')

# Lengths are measured directly on the sequence strings; the intent is for both to be
# expressed in amino acids.
for length_col, seq_col in (('query_length', 'seq'), ('top_hit_length', 'top_hit_seq')):
    dataset_df[length_col] = dataset_df[seq_col].map(len)

# Genomes removed from the analysis: Prodigal performed very poorly on them (possibly a
# different translation table was needed). One of them also belongs to an endosymbiont
# that is not assigned a phylum.
exclude_genome_ids = ['GCF_029854295.1', 'GCF_021057185.1', 'GCF_016097415.1']
dataset_df = dataset_df.loc[~dataset_df['genome_id'].isin(exclude_genome_ids)].copy()

In [30]:
# A sequence is called 'spurious' when model_output_0 exceeds this value and 'real' when
# model_output_1 exceeds it; if neither does, it is 'uncertain'.
threshold = 0.9

# --- Predictions for the query sequences -------------------------------------------
results_df = pd.read_csv('../data/results/results-2/dataset_predict.csv', index_col=0)
# Strip the '_v2' tag from column names (e.g. 'model_v2_output_0' -> 'model_output_0').
results_df.columns = [col.replace('_v2', '') for col in results_df.columns]
# Keep only the sequences that survived the filtering applied to dataset_df.
results_df = results_df.merge(dataset_df, left_index=True, right_index=True, how='inner')

# A comparison already yields a boolean column; no np.where(..., True, False) needed.
results_df['spurious'] = results_df['model_output_0'] > threshold
results_df['real'] = results_df['model_output_1'] > threshold
results_df['uncertain'] = ~results_df['real'] & ~results_df['spurious']
# np.select takes the first matching condition, so 'real' wins over 'spurious' if both hold.
results_df['model_label'] = np.select(
    [
        results_df['real'].to_numpy(),
        results_df['spurious'].to_numpy(),
        results_df['uncertain'].to_numpy(),
    ],
    ['real', 'spurious', 'uncertain'],
    default='none',
)

# --- Predictions for each query's top hit ------------------------------------------
top_hit_results_df = pd.read_csv('../data/results/results-2/top_hits_predict.csv', index_col=0)
top_hit_results_df.columns = [col.replace('_v2', '') for col in top_hit_results_df.columns]
# Series.map needs a unique index on the lookup, so keep the first row per top-hit ID.
top_hit_results_df = top_hit_results_df[~top_hit_results_df.index.duplicated(keep='first')].copy()

# Look the raw outputs up once. Top-hit IDs with no prediction map to NaN, and every
# comparison against NaN is False, so those rows get False for all three flags and the
# label 'none'. This avoids fillna() on an object column and therefore the need for the
# pandas 'future.no_silent_downcasting' option.
top_hit_output_0 = results_df['top_hit_protein_id'].map(top_hit_results_df['model_output_0'])
top_hit_output_1 = results_df['top_hit_protein_id'].map(top_hit_results_df['model_output_1'])

results_df['top_hit_real'] = top_hit_output_1 > threshold
results_df['top_hit_spurious'] = top_hit_output_0 > threshold
results_df['top_hit_model_output_1'] = top_hit_output_1
results_df['top_hit_model_output_0'] = top_hit_output_0
results_df['top_hit_uncertain'] = (top_hit_output_0 <= threshold) & (top_hit_output_1 <= threshold)
# Condition order reproduces the previous chain of overwrites: 'uncertain' took
# precedence over 'spurious', which took precedence over 'real'.
results_df['top_hit_model_label'] = np.select(
    [
        results_df['top_hit_uncertain'].to_numpy(),
        results_df['top_hit_spurious'].to_numpy(),
        results_df['top_hit_real'].to_numpy(),
    ],
    ['uncertain', 'spurious', 'real'],
    default='none',
)

In [47]:
# Sequences for the adjusted conflicts, as stored on disk.
adjusted_dataset_df = pd.read_csv('../data/results/results-2/adjusted_conflict.csv', index_col=0)

# Predictions for those sequences: strip the '_v2' tag from the column names, attach the
# sequence metadata by index, then derive the thresholded flags and the label.
# NOTE: `threshold` must already be defined by an earlier cell.
adjusted_results_df = (
    pd.read_csv('../data/results/results-2/adjusted_conflict_predict.csv', index_col=0)
    .rename(columns=lambda col: col.replace('_v2', ''))
    .merge(adjusted_dataset_df, left_index=True, right_index=True, how='inner')
    .assign(
        spurious=lambda frame: frame['model_output_0'] > threshold,
        real=lambda frame: frame['model_output_1'] > threshold,
        # Neither confidently real nor confidently spurious.
        uncertain=lambda frame: ~(frame['real'] | frame['spurious']),
        # Replaces the 'model_label' column read from the file with the thresholded label.
        model_label=lambda frame: np.select(
            [
                frame['real'].to_numpy(),
                frame['spurious'].to_numpy(),
                frame['uncertain'].to_numpy(),
            ],
            ['real', 'spurious', 'uncertain'],
            default='none',
        ),
    )
)

In [51]:
# Inspection only: list the columns currently present on adjusted_results_df.
adjusted_results_df.columns

Index(['model_v3_label', 'model_v3_output_0', 'model_v3_output_1',
       'model_label', 'model_output_0', 'model_output_1', 'model_v1_label',
       'model_v1_output_0', 'model_v1_output_1', 'index.1', 'id', 'seq',
       'nt_seq', 'truncation_length', 'terminus', 'overlap_type',
       'overlap_length', 'original_overlap_length', 'conflict_id', 'species',
       'pair_id', 'gc_content', 'spurious', 'real', 'uncertain'],
      dtype='object')

In [52]:
# Attach the original (pre-adjustment) prediction and sequence to every adjusted row.
# The first character of `id` decides where the original record is read from:
#   'W' -> the first results_df row whose `top_hit_protein_id` equals the id
#          (its top_hit_* columns are used);
#   'N' -> the results_df row whose index label equals the id (its own columns are used).
# Ids with any other prefix get NaN in the original_* columns.
original_cols = [
    'original_model_label',
    'original_model_output_0',
    'original_model_output_1',
    'original_seq',
]

# First row per top-hit ID, indexed by that ID, so each lookup is a single .loc
# instead of a full scan of results_df.
top_hit_lookup = (
    results_df[['top_hit_protein_id', 'top_hit_model_label', 'top_hit_model_output_0',
                'top_hit_model_output_1', 'top_hit_seq']]
    .drop_duplicates(subset='top_hit_protein_id', keep='first')
    .set_index('top_hit_protein_id')
)

# Build ONE record per distinct id. Building one per row and merging on `id` would
# duplicate rows (k occurrences of an id -> k*k rows after the merge).
records = []
for id_ in adjusted_results_df['id'].unique():
    row_ = {'id': id_}
    if id_[0] == 'W':
        if id_ not in top_hit_lookup.index:
            raise KeyError(f'No row in results_df has top_hit_protein_id == {id_!r}')
        df_ = top_hit_lookup.loc[id_]
        row_['original_model_label'] = df_['top_hit_model_label']
        row_['original_model_output_0'] = df_['top_hit_model_output_0']
        row_['original_model_output_1'] = df_['top_hit_model_output_1']
        row_['original_seq'] = df_['top_hit_seq']
    elif id_[0] == 'N':
        df_ = results_df.loc[id_]
        row_['original_model_label'] = df_.model_label
        row_['original_model_output_0'] = df_.model_output_0
        row_['original_model_output_1'] = df_.model_output_1
        row_['original_seq'] = df_.seq
    records.append(row_)

# `df` keeps its previous name; it now holds one row per distinct id. Passing `columns`
# guarantees the original_* columns exist even if no id matched either prefix.
df = pd.DataFrame(records, columns=['id'] + original_cols)

# Drop original_* columns left by a previous run so the cell can be re-executed without
# producing '_x'/'_y' suffixed duplicates. `validate` makes pandas raise if the lookup
# ever stops being unique per id.
adjusted_results_df = (
    adjusted_results_df
    .drop(columns=original_cols, errors='ignore')
    .merge(df, on='id', how='left', validate='many_to_one')
)


In [45]:
# Inspection only: list the columns of the adjusted-conflict dataset.
adjusted_dataset_df.columns

Index(['index.1', 'id', 'seq', 'nt_seq', 'truncation_length', 'terminus',
       'overlap_type', 'overlap_length', 'original_overlap_length',
       'conflict_id', 'species', 'pair_id', 'gc_content'],
      dtype='object')

In [53]:
# Show the rows whose label differs between the original and the adjusted prediction,
# with the truncation length and both sequences for comparison.
label_changed = adjusted_results_df['original_model_label'].ne(adjusted_results_df['model_label'])
adjusted_results_df.loc[
    label_changed,
    ['original_model_label', 'model_label', 'truncation_length', 'seq', 'original_seq'],
]

Unnamed: 0,original_model_label,model_label,truncation_length,seq,original_seq
0,spurious,real,0,MKESRSHCFIFCGIESIFNCKPEKLPHFLWILCHFTTRDIGSQQLG...,MKESRSHCFIFCGIESIFNCKPEKLPHFLWILCHFTTRDIGSQQLG...
3,spurious,real,0,MFCRIKQQIEALFQSSRSLSRPPTGRHIEVPIFRADTGLHPLTIVP...,MFCRIKQQIEALFQSSRSLSRPPTGRHIEVPIFRADTGLHPLTIVP...
7,spurious,real,46,MRDMSFVERVFHAFITCGTRELVVSIPANNPRYVSNIRIL,MELFLKYRRTDNKTKLIKFHKTIGIWNIERPYITQQNTPIHIITNI...
8,spurious,real,46,MRDMSFVERVFHAFITCGTRELVVSIPANNPRYVSNIRIL,MELFLKYRRTDNKTKLIKFHKTIGIWNIERPYITQQNTPIHIITNI...
9,spurious,real,0,MNRSATGFERSADLPKTVLNGQFANHAPIFSRGC,MNRSATGFERSADLPKTVLNGQFANHAPIFSRGC
...,...,...,...,...,...
1036,real,spurious,0,MCRMFGIVTTKSQSIAPWMTGVEPSLRSLAIADKSGEPNPDGWGVA...,MCRMFGIVTTKSQSIAPWMTGVEPSLRSLAIADKSGEPNPDGWGVA...
1037,real,spurious,0,MAKCENCGKKTVFGHRRSFSMRATNRAFRPNLQKVLVIENGRKVHK...,MAKCENCGKKTVFGHRRSFSMRATNRAFRPNLQKVLVIENGRKVHK...
1038,real,spurious,0,MAKCENCGKKTVFGHRRSFSMRATNRAFRPNLQKVLVIENGRKVHK...,MAKCENCGKKTVFGHRRSFSMRATNRAFRPNLQKVLVIENGRKVHK...
1043,real,uncertain,0,MSKQTKTASQNTKSRQVVAVNRRARHDYEIGETYEAGIALVGTEVK...,MSKQTKTASQNTKSRQVVAVNRRARHDYEIGETYEAGIALVGTEVK...
