In [1]:
import pandas as pd
import numpy as np
import os
import re

In [74]:
# Load the assembly summary table (tab-separated), keyed by assembly accession.
df = pd.read_csv('assembly_summary.csv', sep='\t', index_col='assembly_accession')

# Get genome name: the last column produced by splitting the FTP path on '/'.
df['genome_name'] = df['ftp_path'].str.split('/', expand=True).iloc[:, -1]

# Add O serotype info.
# Raw strings hand the escapes (\:, \040, \d) to the regex engine untouched instead
# of relying on Python's handling of invalid string escapes, which emits a warning.
# \040 is the octal escape for a space, so the pattern matches an 'O' followed by
# 1-3 digits when preceded by ':', ' ' or '|'.
serotype_re = r'[\:\040|]([O]\d{1,3})'
df['given_O'] = df['organism_name'].str.extract(serotype_re, expand=False)

# Add H serotype info (same pattern, with 'H' instead of 'O').
serotype_re2 = r'[\:\040|]([H]\d{1,3})'
df['given_H'] = df['organism_name'].str.extract(serotype_re2, expand=False)

# Remove rows without O/H serotype
df = df[df['given_O'].notnull() | df['given_H'].notnull()].copy()

# Snapshot taken before the filters below; merged back at the end of the cell to
# list the genomes those filters removed.
temp_df = df.copy()

# Filter out serotype numbers above the hard-coded limits (H <= 56, O <= 186).
# A missing serotype compares False against the limit, so it is allowed explicitly.
filter_s = (df['given_H'].str[1:].astype(float)<=56) | df['given_H'].isnull()
filter_s2 = (df['given_O'].str[1:].astype(float)<=186) | df['given_O'].isnull()
df = df[filter_s & filter_s2].copy()

# Remove ones that are potentially strain names: rows where the text following
# 'Escherichia coli ' in organism_name equals the text following '=' in
# infraspecific_name.
df = df[((df.organism_name.str.split('Escherichia coli ').str[-1]) != \
                      (df.infraspecific_name.str.split('=').str[-1]))].copy()

# mapping_df keeps the filtered table for the later cells; df is then rebound to
# the outer merge against the pre-filter snapshot.
mapping_df = df
df = mapping_df.merge(temp_df, on='genome_name', how='outer', indicator=True)
# Rows not marked 'both' exist on only one side of the merge.
display(df[df._merge!='both']['genome_name'])

452            GCA_000014845.1_ASM1484v1
453           GCA_000176535.2_ASM17653v2
454           GCA_000176675.2_ASM17667v2
455           GCA_000176695.2_ASM17669v2
456           GCA_000190855.1_ASM19085v1
457           GCA_000190895.1_ASM19089v1
458           GCA_000190915.1_ASM19091v1
459           GCA_000191015.1_ASM19101v1
460           GCA_000210475.1_ASM21047v1
461           GCA_000215165.1_ASM21516v1
462    GCA_000241975.1_Esch_coli_H397_V1
463    GCA_000242035.1_Esch_coli_H494_V1
464    GCA_000261405.1_Esch_coli_H730_V1
465           GCA_000332755.1_ASM33275v1
466                   GCA_000340235.1_O8
467           GCA_000403965.1_ASM40396v1
468           GCA_000403985.1_ASM40398v1
469              GCA_001021615.1_APECO18
470          GCA_001021635.1_ASM102163v1
471          GCA_001620375.1_ASM162037v1
472          GCA_001874485.1_ASM187448v1
473          GCA_002109645.1_ASM210964v1
474          GCA_002109685.1_ASM210968v1
475          GCA_002109765.1_ASM210976v1
476          GCA

In [75]:
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``, or NaN when ``whole`` is zero."""
    return part / whole * 100 if whole else float('nan')


def _summarize_antigen(df, antigen, important_cols):
    """Print the breakdown for one antigen ('O' or 'H') and return its counts.

    Returns a tuple ``(num_given, num_correct, incorrect_antigen_df)`` where
    ``incorrect_antigen_df`` holds the ``important_cols`` of the rows whose given
    and predicted serotype are both present but differ.
    """
    given = df['given_' + antigen]
    predicted = df['predicted_' + antigen]

    # get number of given serotypes
    total_s = given.notnull()
    num_total = int(total_s.sum())
    print("number of given_%s serotypes is %d"
            %(antigen, num_total))

    # get number of unpredicted serotypes
    unpredicted_s = total_s & predicted.isnull()
    num_unpredicted = int(unpredicted_s.sum())
    print("number of unpredicted_%s serotypes is %d or %.2f%%"
            %(antigen, num_unpredicted, _percent(num_unpredicted, num_total)))

    correct_s = total_s & ~unpredicted_s & (given == predicted)
    num_correct = int(correct_s.sum())
    print("number of correctly predicted_%s serotypes is %d or %.2f%%"
            %(antigen, num_correct, _percent(num_correct, num_total)))

    incorrect_s = total_s & ~unpredicted_s & ~correct_s
    incorrect_antigen_df = df[incorrect_s][important_cols]
    num_incorrect = incorrect_antigen_df.shape[0]
    print("number of incorrectly predicted_%s serotypes is %d or %.2f%%"
            %(antigen, num_incorrect, _percent(num_incorrect, num_total)))

    return num_total, num_correct, incorrect_antigen_df


def summarize_result(result_df):
    """Print a concordance report for serotype predictions and return the subsets.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Must contain the columns 'organism_name', 'genome_name',
        'infraspecific_name', 'given_O', 'predicted_O', 'O_info', 'given_H',
        'predicted_H' and 'H_info'.  Missing serotypes are expected to be null.

    Returns
    -------
    tuple
        ``(no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df,
        incorrect_h_df, df)`` where the first six are row subsets restricted to
        the columns listed above and the last is ``result_df`` itself.

    Percentages are reported as NaN (instead of raising ZeroDivisionError) when
    the table contains no given serotype for an antigen.
    """
    important_cols = [
        'organism_name', 'genome_name', 'infraspecific_name',
        'given_O', 'predicted_O', 'O_info',
        'given_H', 'predicted_H', 'H_info'
    ]

    df = result_df
    print('all predictions(%d):' %result_df.shape[0])

    # no prediction if:
    #     both predicted antigens are null
    s1 = (df['predicted_O'].isnull() & df['predicted_H'].isnull())
    no_df = df[s1][important_cols]
    print('%d no predictions' %no_df.shape[0])

    # incorrect prediction if:
    #     not in 'no_df'
    #     at least one antigen has both a given and a predicted value that differ
    s2 = (
        ((df['given_O'].notnull() & df['predicted_O'].notnull()) & (df['given_O']!=df['predicted_O'])) |
        ((df['given_H'].notnull() & df['predicted_H'].notnull()) & (df['given_H']!=df['predicted_H']))
    )
    incorrect_df = df[~s1 & s2][important_cols]
    print('%d incorrect predictions' %incorrect_df.shape[0])

    # correct prediction if:
    #     not in 'no_df'
    #     not in 'incorrect_df'
    #     for each given serotype, the correct prediction is made
    s3 = (
        (df['given_O'].isnull() | (df['given_O']==df['predicted_O'])) &
        (df['given_H'].isnull() | (df['given_H']==df['predicted_H']))
    )
    correct_df = df[~s1 & ~s2 & s3][important_cols]
    print('%d correct predictions' %correct_df.shape[0])

    # semicorrect prediction if:
    #     not in 'no_df'
    #     not in 'incorrect_df'
    #     not in 'correct_df'
    #     one correct prediction is made, no prediction for the other
    s4 = (
        (df['given_O'].isnull() | (df['given_O']==df['predicted_O'])) |
        (df['given_H'].isnull() | (df['given_H']==df['predicted_H']))
    )
    semicorrect_df = df[~s1 & ~s2 & ~s3 & s4][important_cols]
    print('%d semicorrect predictions' %semicorrect_df.shape[0])

    # remaining predictions: rows matched by none of the categories above
    remaining_df = df[~s1 & ~s2 & ~s3 & ~s4]
    print('%d remaining predictions' %remaining_df.shape[0])

    # Summary: per-antigen breakdown, then totals over both antigens.
    given_o, correct_o, incorrect_o_df = _summarize_antigen(df, 'O', important_cols)
    given_h, correct_h, incorrect_h_df = _summarize_antigen(df, 'H', important_cols)

    given_count = given_o + given_h
    correct_count = correct_o + correct_h
    incorrect_count = incorrect_o_df.shape[0] + incorrect_h_df.shape[0]
    print("Overall concordance=%.2f%%(%d/%d)" %(_percent(correct_count, given_count), correct_count,given_count))
    print("Overall discrepancies=%.2f%%(%d/%d)" %(_percent(incorrect_count, given_count), incorrect_count, given_count))

    return no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df, incorrect_h_df, df

In [76]:
# Read the predictor output; the genome name is the part of the 'index' value
# that precedes '_genomic'.  The 'index' column itself is then discarded.
df = pd.read_csv('output/genbank/output.csv')
df = df.assign(genome_name=df['index'].str.split('_genomic').str[0]).drop(columns='index')

# Columns are renamed by position, so this list must match the CSV's column order
# (with genome_name appended last).
df.columns = [
    'predicted_O', 'O_info', 'predicted_H', 'H_info',
    'wzx', 'wzy', 'wzm', 'wzt',
    'fliC', 'fllA', 'flkA', 'flmA', 'flnA',
    'genome_name',
]

# A '-' prediction is treated as missing.
for antigen_col in ('predicted_O', 'predicted_H'):
    df.loc[df[antigen_col] == '-', antigen_col] = np.nan

# Attach the predictions to every genome in the mapping table (left join), then
# print the concordance report.
df2 = mapping_df.copy()
df = df2.merge(df, on='genome_name', how='left')
results = summarize_result(df)

all predictions(452):
2 no predictions
6 incorrect predictions
409 correct predictions
35 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 451
number of unpredicted_O serotypes is 23 or 5.10%
number of correctly predicted_O serotypes is 425 or 94.24%
number of incorrectly predicted_O serotypes is 3 or 0.67%
number of given_H serotypes is 385
number of unpredicted_H serotypes is 15 or 3.90%
number of correctly predicted_H serotypes is 366 or 95.06%
number of incorrectly predicted_H serotypes is 4 or 1.04%
Overall concordance=94.62%(791/836)
Overall discrepancies=0.84%(7/836)


In [80]:
# The last element of the summarize_result tuple is the full merged table.
df = results[-1]

# Columns carried into the per-antigen discrepancy tables.
o_interested_cols = [
    'genome_name', 'given_O', 'predicted_O', 'O_info',
    'wzx', 'wzy', 'wzm', 'wzt',
]
h_interested_cols = [
    'genome_name', 'given_H', 'predicted_H', 'H_info',
    'fliC', 'fllA', 'flkA', 'flmA', 'flnA',
]
# split into two dfs
def rename_antigen_cols(df, antigen):
    """Return the rows of ``df`` where given and predicted serotype disagree.

    The antigen-specific columns ``given_<antigen>``, ``predicted_<antigen>`` and
    ``<antigen>_info`` are renamed to ``given``, ``predicted`` and ``info``.
    Rows where both values are null are dropped, as are rows where the two
    values are equal.  The input frame is not modified.
    """
    column_map = {
        'given_' + antigen: 'given',
        'predicted_' + antigen: 'predicted',
        antigen + '_info': 'info',
    }
    renamed = df.rename(columns=column_map)
    # NaN != NaN is True, so the "at least one value present" test is what
    # excludes rows with neither a given nor a predicted serotype.
    has_value = renamed['given'].notnull() | renamed['predicted'].notnull()
    differs = renamed['given'] != renamed['predicted']
    return renamed[has_value & differs]
# Build one long table of per-antigen discrepancies: H rows first, then O rows.
h_df = rename_antigen_cols(df[h_interested_cols], 'H')
o_df = rename_antigen_cols(df[o_interested_cols], 'O')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([h_df, o_df], ignore_index=True)
interested_cols = ['genome_name', 'given', 'predicted', 'info']
# .copy() so the column added below is written to an independent frame rather
# than to a selection of the concatenated one.
df = df[interested_cols].copy()
# Object dtype so string labels can be assigned without a dtype-mismatch warning.
df['explanation'] = pd.Series(np.nan, index=df.index, dtype=object)

# The rules below are applied in order; each one only labels rows that are still
# unexplained, so earlier rules take precedence.
df.loc[(df['info']=='No alignment found') & df.explanation.isnull(), 'explanation'] = 'No matching alignment'
df.loc[df.given.isnull() & df.explanation.isnull(), 'explanation'] = 'Insufficient serotype info'
df.loc[(df.genome_name=='GCA_000632575.1_Ec2009C-4747') & df.explanation.isnull(), 'explanation'] = 'Mislabelled, see "image/gen_tre1.png"'
df.loc[(df.genome_name=='GCA_000619345.1_Ec2009C-3292') & df.explanation.isnull(), 'explanation'] = 'Mislabelled, see "image/gen_tre2.png"'
df.loc[(df.genome_name=='GCA_000617165.2_Ec02-3404') & df.explanation.isnull(), 'explanation'] = 'identified as non-ecoli by reference marker alignment'
df.loc[df['info'].isnull() & df.explanation.isnull(), 'explanation'] = 'File system error'
# Anything left over has no known explanation.
df.loc[df.explanation.isnull(), 'explanation'] = 'Not sure'

df = df.drop(['info'], axis=1)
df.to_csv('genbank_result_summary.csv', na_rep='N/A')
# Show the rows attributed to a file system error.
df[df.explanation=='File system error']

Unnamed: 0,genome_name,given,predicted,explanation
