In [135]:
import pandas as pd
import numpy as np
import os
import re

In [136]:
def get_diff_row(df1, df2, on):
    """Return the rows of an outer merge that are present in only one frame.

    The result keeps the ``_merge`` indicator column, whose value is
    ``'left_only'`` for rows found only in ``df1`` and ``'right_only'`` for
    rows found only in ``df2``.
    """
    merged = pd.merge(df1, df2, on=on, how='outer', indicator=True)
    one_sided = merged['_merge'] != 'both'
    return merged[one_sided]
def parse_assembly_summary(file, max_o=186, max_h=56):
    """Read a tab-separated assembly summary and extract declared O/H serotypes.

    Parameters
    ----------
    file : str, path or file-like
        Tab-separated table with at least the columns ``assembly_accession``
        (used as the index), ``ftp_path``, ``organism_name`` and
        ``infraspecific_name``.
    max_o, max_h : int, optional
        Largest O / H antigen number accepted; rows whose extracted number
        exceeds the limit are dropped. Defaults (186 and 56) match the values
        previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        The input rows that carry at least one serotype within the limits,
        with three added columns: ``genome_name``, ``given_O`` and ``given_H``.
    """
    df = pd.read_csv(file, sep='\t', index_col='assembly_accession')

    # Genome name is the last component of each row's own path. Taking the
    # last column of a split(expand=True) yields None for any path that has
    # fewer segments than the longest one in the table.
    df['genome_name'] = df['ftp_path'].str.rsplit('/', n=1).str[-1]

    # Raw strings avoid invalid-escape warnings. The lookbehind uses
    # [A-Za-z]: the range [A-z] also covers '[', '\\', ']', '^', '_' and '`',
    # which would reject serotypes written like "strain_O157".
    o_pattern = r'(?<![A-Za-z])(O\d{1,3})'
    h_pattern = r'\:(H\d{1,2})'

    # Prefer the serotype in organism_name; fall back to infraspecific_name.
    # Assigning the filled Series back (instead of fillna(inplace=True) on an
    # attribute-accessed column) keeps this working under Copy-on-Write.
    for column, pattern in (('given_O', o_pattern), ('given_H', h_pattern)):
        from_organism = df['organism_name'].str.extract(pattern, expand=False)
        from_infraspecific = df['infraspecific_name'].str.extract(pattern, expand=False)
        df[column] = from_organism.fillna(from_infraspecific)

    # Remove rows without any serotype
    df = df[df['given_O'].notnull() | df['given_H'].notnull()].copy()

    # Filter out O and H serotypes whose number is above the accepted maximum;
    # a missing antigen is not a reason to reject the row.
    valid_h = (df['given_H'].str[1:].astype(float) <= max_h) | df['given_H'].isnull()
    valid_o = (df['given_O'].str[1:].astype(float) <= max_o) | df['given_O'].isnull()
    return df[valid_h & valid_o].copy()

In [142]:
def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or NaN when ``whole`` is zero."""
    return part / whole * 100 if whole else float('nan')


def _summarize_antigen(df, antigen, columns):
    """Print the breakdown for one antigen ('O' or 'H') and return its tallies.

    Returns ``(num_given, num_correct, num_incorrect, incorrect_rows)`` where
    ``incorrect_rows`` is the ``columns`` view of the rows whose given and
    predicted values are both present but differ.
    """
    given = df['given_' + antigen]
    predicted = df['predicted_' + antigen]

    # get number of given serotypes
    has_given = given.notnull()
    num_given = df[has_given].shape[0]
    print("number of given_%s serotypes is %d"
          % (antigen, num_given))

    # get number of unpredicted serotypes
    unpredicted = has_given & predicted.isnull()
    num_unpredicted = df[unpredicted].shape[0]
    print("number of unpredicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_unpredicted, _percent(num_unpredicted, num_given)))

    correct = has_given & ~unpredicted & (given == predicted)
    num_correct = df[correct].shape[0]
    print("number of correctly predicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_correct, _percent(num_correct, num_given)))

    incorrect = has_given & ~unpredicted & ~correct
    incorrect_rows = df[incorrect][columns]
    num_incorrect = incorrect_rows.shape[0]
    print("number of incorrectly predicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_incorrect, _percent(num_incorrect, num_given)))

    return num_given, num_correct, num_incorrect, incorrect_rows


def summarize_result(result_df):
    """Classify every genome's prediction and print concordance statistics.

    ``result_df`` must contain the columns listed in ``important_cols`` below.
    Rows are split into mutually exclusive groups, tested in this order:

    * no prediction   - both predicted antigens are missing
    * incorrect       - a given antigen has a prediction that differs from it
    * correct         - every given antigen is matched by its prediction
    * semicorrect     - one given antigen is matched, the other is unpredicted
    * remaining       - anything not captured above

    Per-antigen and overall percentages are printed; a percentage whose
    denominator is zero (e.g. no H serotype was given at all) is printed as
    ``nan`` instead of raising ``ZeroDivisionError``.

    Returns
    -------
    tuple
        ``(no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df,
        incorrect_h_df, df)`` where ``df`` is a copy of ``result_df`` and all
        other frames are restricted to ``important_cols``.
    """
    important_cols = [
        'organism_name', 'genome_name', 'infraspecific_name',
        'given_O', 'predicted_O', 'O_info',
        'given_H', 'predicted_H', 'H_info'
    ]

    df = result_df.copy()
    print('all predictions(%d):' % result_df.shape[0])

    o_matches = df['given_O'] == df['predicted_O']
    h_matches = df['given_H'] == df['predicted_H']

    # no prediction: both predicted antigens are None
    s1 = df['predicted_O'].isnull() & df['predicted_H'].isnull()
    no_df = df[s1][important_cols]
    print('%d no predictions' % no_df.shape[0])

    # incorrect prediction: not in no_df, at least one prediction is wrong
    s2 = (
        (df['given_O'].notnull() & df['predicted_O'].notnull() & (df['given_O'] != df['predicted_O'])) |
        (df['given_H'].notnull() & df['predicted_H'].notnull() & (df['given_H'] != df['predicted_H']))
    )
    incorrect_df = df[~s1 & s2][important_cols]
    print('%d incorrect predictions' % incorrect_df.shape[0])

    # correct prediction: not in the groups above, and for each given
    # serotype the correct prediction is made
    s3 = (
        (df['given_O'].isnull() | o_matches) &
        (df['given_H'].isnull() | h_matches)
    )
    correct_df = df[~s1 & ~s2 & s3][important_cols]
    print('%d correct predictions' % correct_df.shape[0])

    # semicorrect prediction: not in the groups above; one correct prediction
    # is made, no prediction for the other
    s4 = (
        (df['given_O'].isnull() | o_matches) |
        (df['given_H'].isnull() | h_matches)
    )
    semicorrect_df = df[~s1 & ~s2 & ~s3 & s4][important_cols]
    print('%d semicorrect predictions' % semicorrect_df.shape[0])

    # remaining prediction: whatever none of the masks above captured
    remaining_df = df[~s1 & ~s2 & ~s3 & ~s4]
    print('%d remaining predictions' % remaining_df.shape[0])

    # Summary: per-antigen tallies, then totals across both antigens
    given_o, correct_o, wrong_o, incorrect_o_df = _summarize_antigen(df, 'O', important_cols)
    given_h, correct_h, wrong_h, incorrect_h_df = _summarize_antigen(df, 'H', important_cols)

    given_count = given_o + given_h
    correct_count = correct_o + correct_h
    incorrect_count = wrong_o + wrong_h
    print("Overall concordance=%.2f%%(%d/%d)"
          % (_percent(correct_count, given_count), correct_count, given_count))
    print("Overall discrepancies=%.2f%%(%d/%d)"
          % (_percent(incorrect_count, given_count), incorrect_count, given_count))

    return no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df, incorrect_h_df, df

In [143]:
def result_to_summary(df, output_path='genbank_result_summary.csv'):
    """Build a table of given/predicted serotype mismatches with an explanation.

    Every genome contributes up to two rows (one for H, one for O); a row is
    kept only when at least one of given/predicted is present and the two
    differ. Each kept row receives the first explanation that applies, in the
    order the rules are listed below, and ``'Not sure'`` when none applies.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named in ``h_interested_cols`` and
        ``o_interested_cols``.
    output_path : str or None, optional
        Where the summary is written as CSV (missing values as ``N/A``).
        Defaults to the previously hard-coded ``'genbank_result_summary.csv'``;
        pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``genome_name``, ``given``, ``predicted``, ``explanation``.
    """
    h_interested_cols = ['genome_name', 'given_H', 'predicted_H', 'H_info', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']
    o_interested_cols = ['genome_name', 'given_O', 'predicted_O', 'O_info', 'wzx', 'wzy', 'wzm', 'wzt']
    interested_cols = ['genome_name', 'given', 'predicted', 'info']
    mislabelled_genomes = ['GCA_000695155_1', 'GCA_000234315_2', 'GCA_000671295_1', 'GCA_000520035_1', 'GCA_001940375_1', 'GCA_000285655_3', 'GCA_000617365_2', 'GCA_000617425_1', 'GCA_000619345_1', 'GCA_000632575_1', 'GCA_000617045_2']

    def rename_antigen_cols(df, antigen):
        """Give one antigen's columns generic names and keep only mismatches."""
        df = df.rename(columns={
            'given_'+antigen: 'given',
            'predicted_'+antigen: 'predicted',
            antigen+'_info': 'info'})
        df = df[df['given'].notnull() | df['predicted'].notnull()]
        df = df[df['given'] != df['predicted']]
        return df

    # split into two dfs, then stack them (H rows first, then O rows).
    # DataFrame.append was removed in pandas 2.0, so pd.concat is used.
    h_df = rename_antigen_cols(df[h_interested_cols], 'H')
    o_df = rename_antigen_cols(df[o_interested_cols], 'O')
    summary = pd.concat(
        [h_df[interested_cols], o_df[interested_cols]],
        ignore_index=True,
    )

    # Reduce e.g. 'GCA_000695155.1_ASM...' to 'GCA_000695155_1' so it can be
    # compared against the entries of mislabelled_genomes.
    genome_key = (
        summary['genome_name']
        .str.replace('.', '_', regex=False)
        .str.split('_').str[:3].str.join('_')
    )

    # Ordered rules: the first matching rule wins for each row.
    rules = [
        (genome_key.isin(mislabelled_genomes), 'potentially mislabelled'),
        (summary['info'] == 'No alignment found', 'No matching alignment'),
        (summary['given'].isnull(), 'Insufficient serotype info'),
        (summary['genome_name'] == 'GCA_000617165.2_Ec02-3404',
         'identified as non-ecoli by reference marker alignment'),
        (summary['info'].isnull(), 'File system error'),
    ]

    # Object dtype from the start: the column holds strings, and writing
    # strings into a float NaN column is an incompatible-dtype assignment.
    summary['explanation'] = pd.Series(np.nan, index=summary.index, dtype=object)
    for applies, label in rules:
        unexplained = summary['explanation'].isnull()
        summary.loc[applies & unexplained, 'explanation'] = label
    summary.loc[summary['explanation'].isnull(), 'explanation'] = 'Not sure'

    summary = summary.drop(columns=['info'])
    if output_path is not None:
        summary.to_csv(output_path, na_rep='N/A')
    return summary

In [144]:
# Declared serotypes from the assembly summary, one row per genome.
mapping_df = parse_assembly_summary('assembly_summary.csv')[
    ['genome_name', 'organism_name', 'infraspecific_name', 'given_O', 'given_H']
]

# Predictions: the genome name is the part of 'index' before '_genomic';
# a '-' prediction means "nothing predicted" and is stored as NaN.
output_df = (
    pd.read_csv('output/genbank/output.csv')
    .assign(genome_name=lambda d: d['index'].str.split('_genomic').str[0])
    .loc[:, ['O_prediction', 'O_info', 'H_prediction', 'H_info', 'genome_name']]
    .rename(columns={'O_prediction': 'predicted_O', 'H_prediction': 'predicted_H'})
    .assign(
        predicted_O=lambda d: d['predicted_O'].mask(d['predicted_O'] == '-'),
        predicted_H=lambda d: d['predicted_H'].mask(d['predicted_H'] == '-'),
    )
)

# Keep only genomes that have both a declared serotype and a prediction.
merge_df = pd.merge(mapping_df, output_df, on='genome_name', how='inner')
print(merge_df.shape)
results = summarize_result(merge_df)
# Not the cell's last expression, so this frame is evaluated but not rendered.
results[1]
# Remove the genomes below from folder
diff_df = get_diff_row(mapping_df, output_df, 'genome_name')
list(diff_df.genome_name)

(536, 9)
all predictions(536):
1 no predictions
8 incorrect predictions
488 correct predictions
39 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 535
number of unpredicted_O serotypes is 26 or 4.86%
number of correctly predicted_O serotypes is 504 or 94.21%
number of incorrectly predicted_O serotypes is 5 or 0.93%
number of given_H serotypes is 387
number of unpredicted_H serotypes is 15 or 3.88%
number of correctly predicted_H serotypes is 368 or 95.09%
number of incorrectly predicted_H serotypes is 4 or 1.03%
Overall concordance=94.58%(872/922)
Overall discrepancies=0.98%(9/922)


['GCA_000147855.3_ASM14785v3',
 'GCA_000258025.1_ASM25802v1',
 'GCA_000281775.1_ASM28177v1',
 'GCA_001309885.1_ASM130988v1',
 'GCA_001309895.1_ASM130989v1',
 'GCA_001309905.1_ASM130990v1',
 'GCA_001309925.1_ASM130992v1',
 'GCA_001309965.1_ASM130996v1',
 'GCA_001440355.1_ECO5_020',
 'GCA_001440565.1_ASM144056v1',
 'GCA_001440615.1_ASM144061v1',
 'GCA_001440645.1_ASM144064v1',
 'GCA_001440655.1_ASM144065v1',
 'GCA_001440665.1_ASM144066v1',
 'GCA_001637865.1_ASM163786v1',
 'GCA_001638005.1_ASM163800v1',
 'GCA_001700405.1_ASM170040v1',
 'GCA_002109305.1_E_coli_SHECO001_v1',
 'GCA_002111485.1_E_coli_SHECO002_v1',
 'GCA_002111495.1_E_coli_SHECO003_v1',
 'GCA_002215095.1_ASM221509v1',
 'GCA_002215115.1_ASM221511v1',
 'GCA_002215155.1_ASM221515v1',
 'GCA_002317635.1_ASM231763v1',
 'GCA_002317645.1_ASM231764v1',
 'GCA_002317655.1_ASM231765v1',
 'GCA_002317695.1_ASM231769v1',
 'GCA_002317705.1_ASM231770v1',
 'GCA_002317725.1_ASM231772v1',
 'GCA_002317755.1_ASM231775v1',
 'GCA_002317775.1_ASM2317

In [145]:
# Existing in-silico summary; rows without a file name cannot be matched.
df = pd.read_excel('in_silico_summary.xlsx').dropna(subset=['file'])

# One 'O:H' string per genome from the merged results, with '-' standing in
# for a missing value, keyed by the '<accession>_lcl.fasta' file name.
df2 = (
    results[-1]
    .assign(file=lambda d: d['genome_name'].str.split('.').str[0] + '_lcl.fasta')
    .fillna('-')
    .assign(**{'new predicted': lambda d: d['predicted_O'] + ':' + d['predicted_H']})
    .loc[:, ['file', 'new predicted']]
)

merged_df = pd.merge(df, df2, on='file', how='left')
merged_df.to_excel('modified_in_silico_summary.xlsx', index=False)

# Files from the spreadsheet that have no counterpart in the new results.
merged_df[merged_df['new predicted'].isnull()]

Unnamed: 0,file,genbank provided,analysis,ectyper predicted,new predicted
406,GCA_000749525_lcl.fasta,O91H21,Serotype,O181:H49,
407,GCA_000749545_lcl.fasta,O91H21,Serotype,O91:H21,
408,GCA_000749565_lcl.fasta,O91H21,Serotype,O91:H21,
409,GCA_000749575_lcl.fasta,O91H21,Serotype,O91:H21,
411,GCA_000819645_lcl.fasta,O78,Serotype,O78:H19,
421,GCA_001268885_lcl.fasta,O74,Serotype,O66:H45,
422,GCA_001268925_lcl.fasta,O76,Serotype,-:H34,


In [147]:
# Show the merged result row(s) whose genome name contains this accession number.
results[-1].loc[lambda d: d['genome_name'].str.contains('001693635')]

Unnamed: 0,genome_name,organism_name,infraspecific_name,given_O,given_H,predicted_O,O_info,predicted_H,H_info
444,GCA_001693635.1_ASM169363v1,Escherichia coli,strain=O177:H21,O177,H21,O177,Alignment found,H21,Alignment found
