In [1]:
import pandas as pd
import numpy as np
import os
import re

In [7]:
# Function definitions
def get_diff_row(df1, df2, on):
    """Return the rows that appear in only one of two frames.

    The frames are outer-merged on ``on`` with ``indicator=True``; every row
    whose ``_merge`` indicator is not ``'both'`` is returned, so the result
    holds the keys found only in ``df1`` (``'left_only'``) or only in ``df2``
    (``'right_only'``). The ``_merge`` column is kept in the result.

    Args:
        df1: Left DataFrame.
        df2: Right DataFrame.
        on: Column name (or list of names) to merge on.

    Returns:
        DataFrame of the unmatched rows of the outer merge.
    """
    merged = df1.merge(df2, how='outer', on=on, indicator=True)
    unmatched = merged['_merge'] != 'both'
    return merged[unmatched]
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    if not whole:
        # Nothing to compare against; report 0% instead of dividing by zero.
        return 0.0
    return part / whole * 100


def _summarize_antigen(df, antigen, important_cols):
    """Print the per-antigen counts and return them.

    Returns a tuple ``(num_given, num_correct, num_incorrect, incorrect_df)``
    for the antigen named ``antigen`` (``'O'`` or ``'H'``).
    """
    given = df['given_' + antigen]
    predicted = df['predicted_' + antigen]

    has_given = given.notnull()
    unpredicted = has_given & predicted.isnull()
    correct = has_given & ~unpredicted & (given == predicted)
    incorrect = has_given & ~unpredicted & ~correct

    num_given = df[has_given].shape[0]
    num_unpredicted = df[unpredicted].shape[0]
    num_correct = df[correct].shape[0]
    incorrect_df = df[incorrect][important_cols]
    num_incorrect = incorrect_df.shape[0]

    print("number of given_%s serotypes is %d"
            % (antigen, num_given))
    print("number of unpredicted_%s serotypes is %d or %.2f%%"
            % (antigen, num_unpredicted, _percent(num_unpredicted, num_given)))
    print("number of correctly predicted_%s serotypes is %d or %.2f%%"
            % (antigen, num_correct, _percent(num_correct, num_given)))
    print("number of incorrectly predicted_%s serotypes is %d or %.2f%%"
            % (antigen, num_incorrect, _percent(num_incorrect, num_given)))
    return num_given, num_correct, num_incorrect, incorrect_df


def summarize_result(result_df):
    """Classify every row of a given-vs-predicted table and print a summary.

    Rows are assigned, in priority order, to exactly one of these groups:

    * no prediction: both ``predicted_O`` and ``predicted_H`` are null;
    * incorrect: for at least one antigen both the given and the predicted
      value are present and they differ;
    * correct: for each antigen the given value is either null or equal to
      the predicted value;
    * semicorrect: the "correct" condition holds for one antigen only;
    * remaining: anything else (only counted, not returned).

    Afterwards per-antigen counts (given / unpredicted / correct / incorrect)
    and the overall concordance and discrepancy rates are printed. When no
    serotype is given for an antigen (or at all) the percentages are reported
    as 0.00% instead of raising ``ZeroDivisionError``.

    Args:
        result_df: DataFrame with the columns ``genome_name``, ``given_O``,
            ``predicted_O``, ``O_info``, ``given_H``, ``predicted_H``,
            ``H_info`` and ``serotype_tag``.

    Returns:
        Tuple ``(no_df, incorrect_df, correct_df, semicorrect_df,
        incorrect_o_df, incorrect_h_df, result_df)``; the last element is the
        unmodified input object.
    """
    df = result_df.copy()
    important_cols = ['genome_name', 'given_O', 'predicted_O', 'O_info', 'given_H', 'predicted_H', 'H_info', 'serotype_tag']
    print('all predictions(%d):' % result_df.shape[0])

    given_o, predicted_o = df['given_O'], df['predicted_O']
    given_h, predicted_h = df['given_H'], df['predicted_H']

    # No prediction: both predicted antigens are missing.
    no_prediction = predicted_o.isnull() & predicted_h.isnull()
    no_df = df[no_prediction][important_cols]
    print('%d no predictions' % no_df.shape[0])

    # Incorrect: at least one antigen has both values present and they differ.
    wrong_o = given_o.notnull() & predicted_o.notnull() & (given_o != predicted_o)
    wrong_h = given_h.notnull() & predicted_h.notnull() & (given_h != predicted_h)
    any_wrong = wrong_o | wrong_h
    incorrect_df = df[~no_prediction & any_wrong][important_cols]
    print('%d incorrect predictions' % incorrect_df.shape[0])

    # An antigen is "satisfied" when nothing was given or the prediction matches.
    satisfied_o = given_o.isnull() | (given_o == predicted_o)
    satisfied_h = given_h.isnull() | (given_h == predicted_h)

    # Correct: every given serotype is matched by its prediction.
    all_satisfied = satisfied_o & satisfied_h
    correct_df = df[~no_prediction & ~any_wrong & all_satisfied][important_cols]
    print('%d correct predictions' % correct_df.shape[0])

    # Semicorrect: one antigen is satisfied, the other has no prediction.
    some_satisfied = satisfied_o | satisfied_h
    semicorrect_df = df[~no_prediction & ~any_wrong & ~all_satisfied & some_satisfied][important_cols]
    print('%d semicorrect predictions' % semicorrect_df.shape[0])

    # Whatever is left over after the four groups above.
    remaining_df = df[~no_prediction & ~any_wrong & ~all_satisfied & ~some_satisfied]
    print('%d remaining predictions' % remaining_df.shape[0])

    # Per-antigen summary, accumulated into the overall totals.
    given_count = 0
    correct_count = 0
    incorrect_count = 0
    incorrect_by_antigen = {}
    for antigen in ('O', 'H'):
        num_given, num_correct, num_incorrect, wrong_df = _summarize_antigen(
            df, antigen, important_cols)
        given_count += num_given
        correct_count += num_correct
        incorrect_count += num_incorrect
        incorrect_by_antigen[antigen] = wrong_df

    print("Overall concordance=%.2f%%(%d/%d)" % (_percent(correct_count, given_count), correct_count, given_count))
    print("Overall discrepancies=%.2f%%(%d/%d)" % (_percent(incorrect_count, given_count), incorrect_count, given_count))

    return (no_df, incorrect_df, correct_df, semicorrect_df,
            incorrect_by_antigen['O'], incorrect_by_antigen['H'], result_df)

def remove_blacklist_genomes(df):
    """Drop the rows flagged as blacklisted and report how many were removed.

    Args:
        df: DataFrame with a boolean ``blacklisted`` column.

    Returns:
        A new DataFrame holding only the rows where ``blacklisted`` is False.
        The row counts before and after, and the number removed, are printed.
    """
    rows_before = df.shape[0]
    print('Start with %d rows' % rows_before)

    kept = df[~df['blacklisted']]
    rows_after = kept.shape[0]

    print('%d rows removed for blacklisted genome' % (rows_before - rows_after))
    print('End with %d rows' % rows_after)
    return kept

def mark_blacklist_genomes(df, blacklist_file):
    """Add a boolean ``blacklisted`` column based on a blacklist CSV.

    The blacklist is read with ``pd.read_csv`` and outer-merged with ``df`` on
    ``genome_name``. A row is flagged when its genome occurs in the blacklist.
    Because the merge is an outer join, genomes present only in the blacklist
    also show up as rows of the result (flagged True).

    Args:
        df: DataFrame with a ``genome_name`` column.
        blacklist_file: Path or file-like object of a CSV that has a
            ``genome_name`` column.

    Returns:
        The merged DataFrame with the extra ``blacklisted`` column; the
        temporary ``_merge`` indicator column is removed.
    """
    blacklist = pd.read_csv(blacklist_file)
    merged = df.merge(blacklist, how='outer', on='genome_name', indicator=True)
    # Anything that is not exclusive to the left frame was found in the blacklist.
    merged['blacklisted'] = merged['_merge'] != 'left_only'
    return merged.drop('_merge', axis=1)

def read_info_df(file, blacklist_file='blacklist.csv'):
    """Load the given-serotype table and drop blacklisted genomes.

    The CSV's ``Unnamed: 0`` column is dropped and the remaining four columns
    are renamed, in order, to ``genome_name``, ``given_H``, ``given_O`` and
    ``serotype_tag``. Non-null ``given_H`` / ``given_O`` values are cast to
    int and prefixed with ``'H'`` / ``'O'``; missing values stay null.

    Args:
        file: Path (or file-like object) of the serotype info CSV.
        blacklist_file: CSV listing the genomes to exclude. Defaults to
            ``'blacklist.csv'``, the path that used to be hard-coded.

    Returns:
        DataFrame of the non-blacklisted genomes, including the
        ``blacklisted`` column added by ``mark_blacklist_genomes``.
    """
    df = pd.read_csv(file).drop('Unnamed: 0', axis=1)
    df.columns = ['genome_name', 'given_H', 'given_O', 'serotype_tag']

    for column, prefix in (('given_H', 'H'), ('given_O', 'O')):
        # Only non-null values are converted; index alignment on assignment
        # leaves the rows that had no value as NaN.
        df[column] = prefix + df[column].dropna().astype(int).astype(str)

    df = mark_blacklist_genomes(df, blacklist_file)
    return remove_blacklist_genomes(df)

def read_output_file(file):
    """Load the prediction output CSV into a normalized DataFrame.

    Only the columns ``index``, ``O_prediction``, ``O_info``, ``H_prediction``
    and ``H_info`` are kept; they are renamed to ``genome_name``,
    ``predicted_O``, ``O_info``, ``predicted_H`` and ``H_info``. A prediction
    written as ``'-'`` is replaced by NaN.

    Args:
        file: Path (or file-like object) of the result CSV.

    Returns:
        DataFrame with the five renamed columns.
    """
    column_names = {
        'index': 'genome_name',
        'O_prediction': 'predicted_O',
        'O_info': 'O_info',
        'H_prediction': 'predicted_H',
        'H_info': 'H_info',
    }
    raw = pd.read_csv(file)
    df = raw[list(column_names)].rename(columns=column_names)

    # '-' is the placeholder for "no prediction" in the result file.
    for column in ('predicted_O', 'predicted_H'):
        df.loc[df[column] == '-', column] = np.nan
    return df

def create_merge_df(info_file, output_file):
    """Join the predictions with the given serotypes, one row per genome.

    The prediction table (``read_output_file``) is inner-merged with the
    given-serotype table (``read_info_df``) on ``genome_name``, so only
    genomes present in both appear in the result.

    Args:
        info_file: CSV with the given serotypes.
        output_file: CSV with the predicted serotypes.

    Returns:
        DataFrame with the columns ``genome_name``, ``given_O``,
        ``predicted_O``, ``O_info``, ``given_H``, ``predicted_H``, ``H_info``,
        ``serotype_tag`` and ``blacklisted``, in that order.
    """
    predictions = read_output_file(output_file)
    given = read_info_df(info_file)
    merged = predictions.merge(given, how='inner', on='genome_name')

    ordered_columns = [
        'genome_name',
        'given_O', 'predicted_O', 'O_info',
        'given_H', 'predicted_H', 'H_info',
        'serotype_tag', 'blacklisted',
    ]
    return merged[ordered_columns]

In [8]:
# Build the merged given-vs-predicted table and print its accuracy summary.
merge_df = create_merge_df('enterobase_serotype.csv', 'output/enterobase/output.csv')
results = summarize_result(merge_df)
# NOTE(review): the frame returned by get_diff_row is neither assigned nor the
# last expression of the cell, so it is discarded; the call's only visible
# effect is the row-count printout that read_info_df produces when it re-reads
# the info file.
get_diff_row(read_info_df('enterobase_serotype.csv'), merge_df, 'genome_name')
# Last expression of the cell: the merged table.
merge_df

Start with 5844 rows
489 rows removed for blacklisted genome
End with 5355 rows
all predictions(5355):
44 no predictions
282 incorrect predictions
4283 correct predictions
746 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 5223
number of unpredicted_O serotypes is 827 or 15.83%
number of correctly predicted_O serotypes is 4233 or 81.05%
number of incorrectly predicted_O serotypes is 163 or 3.12%
number of given_H serotypes is 2570
number of unpredicted_H serotypes is 44 or 1.71%
number of correctly predicted_H serotypes is 2384 or 92.76%
number of incorrectly predicted_H serotypes is 142 or 5.53%
Overall concordance=84.91%(6617/7793)
Overall discrepancies=3.91%(305/7793)
Start with 5844 rows
489 rows removed for blacklisted genome
End with 5355 rows


Unnamed: 0,genome_name,given_O,predicted_O,O_info,given_H,predicted_H,H_info,serotype_tag,blacklisted
0,ESC_AA7875AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False
1,ESC_AA7899AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False
2,ESC_AA7930AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False
3,ESC_AA7942AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False
4,ESC_AA7966AA_AS,O113,O113,Alignment found,H21,H21,Alignment found,O113:H21,False
5,ESC_AA7970AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False
6,ESC_AA7978AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False
7,ESC_AA7989AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False
8,ESC_AA8002AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False
9,ESC_AA8009AA_AS,O157,O157,Alignment found,,H7,Alignment found,O157,False


In [9]:
# Print the (rows, columns) shape of the merged table, then recompute the summary.
print(merge_df.shape)
# summarize_result returns a 7-tuple; its last element is the frame passed in.
results = summarize_result(merge_df)

(5355, 9)
all predictions(5355):
44 no predictions
282 incorrect predictions
4283 correct predictions
746 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 5223
number of unpredicted_O serotypes is 827 or 15.83%
number of correctly predicted_O serotypes is 4233 or 81.05%
number of incorrectly predicted_O serotypes is 163 or 3.12%
number of given_H serotypes is 2570
number of unpredicted_H serotypes is 44 or 1.71%
number of correctly predicted_H serotypes is 2384 or 92.76%
number of incorrectly predicted_H serotypes is 142 or 5.53%
Overall concordance=84.91%(6617/7793)
Overall discrepancies=3.91%(305/7793)


In [28]:
def _mismatch_summary(result_df, antigen):
    """Build the table of genomes whose given ``antigen`` was not reproduced."""
    source_cols = ['genome_name', 'given_' + antigen, 'predicted_' + antigen, antigen + '_info']
    df = result_df[source_cols].rename(columns={
        'given_' + antigen: 'given',
        'predicted_' + antigen: 'predicted',
        antigen + '_info': 'info'})
    # Keep rows that have a given serotype which the prediction does not match
    # (this includes rows without any prediction). The copy makes the frame
    # independent of result_df before a column is added to it.
    df = df[df['given'].notnull() & (df['given'] != df['predicted'])].copy()
    print(df.shape)

    info = df['info']
    # Conditions are checked in order; the first one that holds wins.
    conditions = [
        (info == 'No alignment found').to_numpy(),
        # Cannot fire after the filter above; kept to mirror the original rules.
        df['given'].isnull().to_numpy(),
        (df['genome_name'] == 'GCA_000617165.2_Ec02-3404').to_numpy(),
        info.isnull().to_numpy(),
    ]
    explanations = [
        'No matching alignment',
        'Insufficient serotype info',
        'identified as non-ecoli by reference marker alignment',
        'File system error',
    ]
    df['explanation'] = np.select(conditions, explanations, default='Not sure')

    df = df.drop(['info'], axis=1)
    return df.reset_index(drop=True)


def result_to_summary(result_df, output_path='enterobase_summary.xlsx'):
    """Write the mismatched O and H serotypes to an Excel workbook.

    For each antigen (``'O'`` then ``'H'``) the rows whose given serotype is
    present but differs from the prediction are collected, labelled with an
    explanation, and written to a sheet named ``'<antigen> serotype'`` with
    missing values shown as ``'-'``. The shape of each table (before the
    explanation column is added) is printed.

    Explanations, first match wins:
        * ``'No matching alignment'``: the info column says
          ``'No alignment found'``;
        * ``'identified as non-ecoli by reference marker alignment'``: the
          genome ``GCA_000617165.2_Ec02-3404``;
        * ``'File system error'``: the info column is null;
        * ``'Not sure'``: anything else.

    Args:
        result_df: DataFrame with ``genome_name`` and, for each antigen,
            ``given_<antigen>``, ``predicted_<antigen>`` and
            ``<antigen>_info`` columns.
        output_path: Workbook to write. Defaults to
            ``'enterobase_summary.xlsx'``, the path that used to be
            hard-coded.

    Returns:
        List ``[o_df, h_df]`` with the columns ``genome_name``, ``given``,
        ``predicted`` and ``explanation``.
    """
    summary_dfs = []
    # The context manager saves and closes the workbook, also on error;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path) as writer:
        for antigen in ['O', 'H']:
            df = _mismatch_summary(result_df, antigen)
            df.to_excel(writer, sheet_name='%s serotype' % antigen, na_rep='-')
            summary_dfs.append(df)
    return summary_dfs
# results[-1] is the frame that was passed to summarize_result; write its Excel
# summary and keep the first returned table (the O antigen one).
df = result_to_summary(results[-1])[0]

(990, 4)
(186, 4)


Unnamed: 0,genome_name,given,predicted,explanation
0,ESC_AA8228AA_AS,O103,,No matching alignment
1,ESC_AA8336AA_AS,O157,,No matching alignment
2,ESC_AA8870AA_AS,O103,,No matching alignment
3,ESC_AA9454AA_AS,O26,,No matching alignment
4,ESC_AA9488AA_AS,O157,,No matching alignment
5,ESC_AA9657AA_AS,O103,,No matching alignment
6,ESC_AA9661AA_AS,O157,O121,Not sure
7,ESC_AA9677AA_AS,O157,,No matching alignment
8,ESC_AA9748AA_AS,O157,,No matching alignment
9,ESC_BA0283AA_AS,O157,O146,Not sure
