In [1]:
import pandas as pd
import numpy as np
import os
import re

In [136]:
# Largest antigen numbers accepted as a serotype label; rows whose extracted
# number is above these limits are filtered out as incorrect serotypes.
MAX_H_ANTIGEN = 56
MAX_O_ANTIGEN = 186

df = pd.read_csv('assembly_summary.csv', sep='\t', index_col='assembly_accession')

# Genome name = last component of the FTP path. Taking the last element of each
# row's own split (instead of the last column of an expanded split) gives the
# right value even when the paths do not all have the same number of components.
df['genome_name'] = df['ftp_path'].str.rsplit('/', n=1).str[-1]

# Given O / H antigen: the letter followed by 1-3 digits, preceded by ':', a
# space or '|' in the organism name. Raw strings keep `\d` from being parsed as
# an (invalid) Python string escape.
serotype_re = r'[: |](O\d{1,3})'
df['given_O'] = df['organism_name'].str.extract(serotype_re, expand=False)

serotype_re2 = r'[: |](H\d{1,3})'
df['given_H'] = df['organism_name'].str.extract(serotype_re2, expand=False)

# Keep only rows that carry at least one of the two antigens.
df = df[df['given_O'].notnull() | df['given_H'].notnull()].copy()

# Snapshot taken before the range filter; used below to list what the filter drops.
temp_df = df.copy()

# Filter out incorrect serotypes: a missing antigen is accepted, a present one
# must have a number within the allowed range.
filter_s = (df['given_H'].str[1:].astype(float) <= MAX_H_ANTIGEN) | df['given_H'].isnull()
filter_s2 = (df['given_O'].str[1:].astype(float) <= MAX_O_ANTIGEN) | df['given_O'].isnull()
df = df[filter_s & filter_s2].copy()

# Disabled for now (might filter out legitimate genomes): remove the ones whose
# "serotype" is potentially just the strain name.
# df = df[((df.organism_name.str.split('Escherichia coli ').str[-1]) !=
#          (df.infraspecific_name.str.split('=').str[-1]))].copy()

# Later cells merge the prediction output against this filtered table.
mapping_df = df

# Show the genomes removed by the range filter: rows of the pre-filter snapshot
# that have no partner in the filtered table.
df = mapping_df.merge(temp_df, on='genome_name', how='outer', indicator=True)
display(df[df._merge != 'both']['genome_name'])

463           GCA_000176535.2_ASM17653v2
464           GCA_000176675.2_ASM17667v2
465           GCA_000176695.2_ASM17669v2
466           GCA_000190855.1_ASM19085v1
467           GCA_000190895.1_ASM19089v1
468           GCA_000190915.1_ASM19091v1
469           GCA_000191015.1_ASM19101v1
470           GCA_000210475.1_ASM21047v1
471    GCA_000241975.1_Esch_coli_H397_V1
472    GCA_000242035.1_Esch_coli_H494_V1
473    GCA_000261405.1_Esch_coli_H730_V1
474          GCA_002109645.1_ASM210964v1
475          GCA_002109685.1_ASM210968v1
476          GCA_002109765.1_ASM210976v1
477          GCA_002109775.1_ASM210977v1
478          GCA_002109805.1_ASM210980v1
479          GCA_002110115.1_ASM211011v1
480          GCA_002110165.1_ASM211016v1
481          GCA_002110225.1_ASM211022v1
482          GCA_002110245.1_ASM211024v1
483          GCA_002110265.1_ASM211026v1
484          GCA_002110275.1_ASM211027v1
485          GCA_002110545.1_ASM211054v1
486          GCA_002110555.1_ASM211055v1
Name: genome_name, dtype: object

In [137]:
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def _summarize_antigen(df, antigen, report_cols):
    """Print the per-antigen figures for ``antigen`` ('O' or 'H').

    Returns ``(num_given, num_correct, incorrect_df)`` where ``incorrect_df``
    holds the ``report_cols`` of the rows whose given antigen was predicted
    but with a different value.
    """
    given = df['given_' + antigen]
    predicted = df['predicted_' + antigen]

    # Rows that have a given serotype for this antigen.
    has_given = given.notnull()
    num_given = int(has_given.sum())
    print("number of given_%s serotypes is %d"
            %(antigen, num_given))

    # Given but nothing predicted.
    unpredicted = has_given & predicted.isnull()
    num_unpredicted = int(unpredicted.sum())
    print("number of unpredicted_%s serotypes is %d or %.2f%%"
            %(antigen, num_unpredicted, _percent(num_unpredicted, num_given)))

    # Given, predicted, and equal.
    correct = has_given & ~unpredicted & (given == predicted)
    num_correct = int(correct.sum())
    print("number of correctly predicted_%s serotypes is %d or %.2f%%"
            %(antigen, num_correct, _percent(num_correct, num_given)))

    # Given, predicted, and different.
    incorrect = has_given & ~unpredicted & ~correct
    incorrect_df = df[incorrect][report_cols]
    num_incorrect = incorrect_df.shape[0]
    print("number of incorrectly predicted_%s serotypes is %d or %.2f%%"
            %(antigen, num_incorrect, _percent(num_incorrect, num_given)))

    return num_given, num_correct, incorrect_df


def summarize_result(result_df):
    """Classify every row of ``result_df`` and print a concordance report.

    ``result_df`` must contain the columns 'organism_name', 'genome_name',
    'infraspecific_name', 'given_O', 'predicted_O', 'O_info', 'given_H',
    'predicted_H' and 'H_info'.

    Rows are assigned to the first category that applies:
        no prediction   both predicted antigens are null
        incorrect       at least one antigen is given, predicted and different
        correct         every given antigen is matched by its prediction
        semicorrect     one antigen is correct (or not given), the other is
                        given but not matched
        remaining       anything else

    Percentages are reported as 0.00 when their denominator is zero (no given
    serotype for an antigen, or an empty frame) instead of raising
    ZeroDivisionError.

    Returns the tuple ``(no_df, incorrect_df, correct_df, semicorrect_df,
    incorrect_o_df, incorrect_h_df, df)``; all but the last are restricted to
    the columns listed above and the last is ``result_df`` itself.
    """
    important_cols = [
        'organism_name', 'genome_name', 'infraspecific_name',
        'given_O', 'predicted_O', 'O_info',
        'given_H', 'predicted_H', 'H_info'
    ]

    df = result_df
    print('all predictions(%d):' %result_df.shape[0])

    o_matches = df['given_O'] == df['predicted_O']
    h_matches = df['given_H'] == df['predicted_H']

    # No prediction: both predicted antigens are null.
    s1 = (df['predicted_O'].isnull() & df['predicted_H'].isnull())
    no_df = df[s1][important_cols]
    print('%d no predictions' %no_df.shape[0])

    # Incorrect: not in no_df and at least one prediction is wrong.
    s2 = (
        ((df['given_O'].notnull() & df['predicted_O'].notnull()) & (df['given_O']!=df['predicted_O'])) |
        ((df['given_H'].notnull() & df['predicted_H'].notnull()) & (df['given_H']!=df['predicted_H']))
    )
    incorrect_df = df[~s1 & s2][important_cols]
    print('%d incorrect predictions' %incorrect_df.shape[0])

    # Correct: none of the above, and for each given serotype the correct
    # prediction is made.
    s3 = (
        (df['given_O'].isnull() | o_matches) &
        (df['given_H'].isnull() | h_matches)
    )
    correct_df = df[~s1 & ~s2 & s3][important_cols]
    print('%d correct predictions' %correct_df.shape[0])

    # Semicorrect: none of the above; one correct prediction is made and there
    # is no prediction for the other antigen.
    s4 = (
        (df['given_O'].isnull() | o_matches) |
        (df['given_H'].isnull() | h_matches)
    )
    semicorrect_df = df[~s1 & ~s2 & ~s3 & s4][important_cols]
    print('%d semicorrect predictions' %semicorrect_df.shape[0])

    # Whatever is left over (only counted, not returned).
    remaining_df = df[~s1 & ~s2 & ~s3 & ~s4]
    print('%d remaining predictions' %remaining_df.shape[0])

    # Per-antigen summary, then the overall figures across both antigens.
    given_o, correct_o, incorrect_o_df = _summarize_antigen(df, 'O', important_cols)
    given_h, correct_h, incorrect_h_df = _summarize_antigen(df, 'H', important_cols)

    given_count = given_o + given_h
    correct_count = correct_o + correct_h
    incorrect_count = incorrect_o_df.shape[0] + incorrect_h_df.shape[0]
    print("Overall concordance=%.2f%%(%d/%d)" %(_percent(correct_count, given_count), correct_count,given_count))
    print("Overall discrepancies=%.2f%%(%d/%d)" %(_percent(incorrect_count, given_count), incorrect_count, given_count))

    return no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df, incorrect_h_df, df

In [138]:
def result_to_summary(df, output_path='genbank_result_summary.csv'):
    """Build a table explaining every given/predicted serotype mismatch.

    ``df`` must contain 'genome_name' plus, for each antigen X in ('O', 'H'),
    the columns 'given_X', 'predicted_X' and 'X_info', and the gene columns
    'wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA'.

    The H rows and then the O rows whose given and predicted values differ
    (and are not both missing) are stacked into one frame with the columns
    'genome_name', 'given', 'predicted' and 'explanation'. The explanation is
    the first rule that applies, in the order listed in ``rules`` below, and
    'Not sure' when none applies.

    The frame is written to ``output_path`` as CSV (missing values as 'N/A');
    pass ``output_path=None`` to skip writing. The frame is returned.
    """
    h_interested_cols = ['genome_name', 'given_H', 'predicted_H', 'H_info', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']
    o_interested_cols = ['genome_name', 'given_O', 'predicted_O', 'O_info', 'wzx', 'wzy', 'wzm', 'wzt']
    interested_cols = ['genome_name', 'given', 'predicted', 'info']
    mislabelled_genomes = ['GCA_000695155_1', 'GCA_000234315_2', 'GCA_000671295_1', 'GCA_000520035_1', 'GCA_001940375_1', 'GCA_000285655_3', 'GCA_000617365_2', 'GCA_000617425_1', 'GCA_000619345_1', 'GCA_000632575_1', 'GCA_000617045_2']

    def rename_antigen_cols(df, antigen):
        # Give both antigens the same column names and keep only the rows
        # where given and predicted differ and are not both missing.
        df = df.rename(columns={
            'given_'+antigen: 'given',
            'predicted_'+antigen:'predicted',
            antigen+'_info':'info'})
        df = df[df['given'].notnull() | df['predicted'].notnull()]
        df = df[df['given'] != df['predicted']]
        return df

    def accession_key(genome_name):
        # 'GCA_000695155.1_ASM...' -> 'GCA_000695155_1', the format used in
        # mislabelled_genomes.
        return '_'.join(genome_name.replace('.', '_').split('_')[:3])

    h_df = rename_antigen_cols(df[h_interested_cols], 'H')
    o_df = rename_antigen_cols(df[o_interested_cols], 'O')
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # The result is a fresh frame, so adding a column below is not a
    # set-on-a-slice.
    df = pd.concat([h_df[interested_cols], o_df[interested_cols]], ignore_index=True)

    # (condition, explanation) pairs in priority order: the first match wins.
    rules = [
        (df['genome_name'].apply(accession_key).isin(mislabelled_genomes), 'potentially mislabelled'),
        (df['info'] == 'No alignment found', 'No matching alignment'),
        (df['given'].isnull(), 'Insufficient serotype info'),
        (df['genome_name'] == 'GCA_000617165.2_Ec02-3404', 'identified as non-ecoli by reference marker alignment'),
        (df['info'].isnull(), 'File system error'),
    ]
    conditions = [np.asarray(mask, dtype=bool) for mask, _ in rules]
    explanations = [text for _, text in rules]
    # Building the column in one step avoids writing strings into a float
    # (all-NaN) column.
    df['explanation'] = np.select(conditions, explanations, default='Not sure')

    df = df.drop(['info'], axis=1)
    if output_path is not None:
        df.to_csv(output_path, na_rep='N/A')
    return df

In [139]:
# Load the prediction output. The genome name is whatever precedes '_genomic'
# in the 'index' column, which is then no longer needed.
df = pd.read_csv('output/genbank/output.csv')
df['genome_name'] = df['index'].str.split('_genomic').str[0]
df = df.drop(columns=['index'])

# Positional rename: this relies on the column order of the CSV, with
# genome_name last because it was appended above.
df.columns = [
    'predicted_O', 'O_info', 'predicted_H', 'H_info',
    'wzx', 'wzy', 'wzm', 'wzt',
    'fliC', 'fllA', 'flkA', 'flmA', 'flnA',
    'genome_name',
]

# A '-' prediction is turned into NaN so that isnull()/notnull() see it as missing.
df['predicted_O'] = df['predicted_O'].mask(df['predicted_O'] == '-')
df['predicted_H'] = df['predicted_H'].mask(df['predicted_H'] == '-')

# Attach the predictions to every genome of the mapping table (left join), then
# print the concordance report.
df2 = mapping_df.copy()
df = pd.merge(df2, df, on='genome_name', how='left')
results = summarize_result(df)

all predictions(463):
2 no predictions
9 incorrect predictions
415 correct predictions
37 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 460
number of unpredicted_O serotypes is 25 or 5.43%
number of correctly predicted_O serotypes is 431 or 93.70%
number of incorrectly predicted_O serotypes is 4 or 0.87%
number of given_H serotypes is 388
number of unpredicted_H serotypes is 15 or 3.87%
number of correctly predicted_H serotypes is 367 or 94.59%
number of incorrectly predicted_H serotypes is 6 or 1.55%
Overall concordance=94.10%(798/848)
Overall discrepancies=1.18%(10/848)


In [144]:
# Load the existing in-silico summary sheet and keep only the rows that name a file.
df = pd.read_excel('in_silico_summary.xlsx')
df = df.loc[df['file'].notnull()]

# From the merged result frame (last element of `results`) derive the join key:
# the genome name up to its first '.', plus '_lcl.fasta'.
df2 = results[-1].copy()
df2['file'] = [name.split('.')[0] + '_lcl.fasta' for name in df2['genome_name']]

# Missing values become '-' so the two predictions can be joined as 'O:H'.
df2 = df2.fillna('-')
df2['new predicted'] = [
    ':'.join(pair) for pair in zip(df2['predicted_O'], df2['predicted_H'])
]
df2 = df2[['file', 'new predicted']]

# Add the new prediction to the sheet, save it, and display the rows that
# received no new prediction.
merged_df = pd.merge(df, df2, on='file', how='left')
merged_df.to_excel('modified_in_silico_summary.xlsx')
merged_df[merged_df['new predicted'].isnull()]

Unnamed: 0,file,genbank provided,analysis,ectyper predicted,new predicted
387,GCA_000681435_lcl.fasta,O25,Serotype,"""novel"":H4",
406,GCA_000749525_lcl.fasta,O91H21,Serotype,O181:H49,
407,GCA_000749545_lcl.fasta,O91H21,Serotype,O91:H21,
408,GCA_000749565_lcl.fasta,O91H21,Serotype,O91:H21,
409,GCA_000749575_lcl.fasta,O91H21,Serotype,O91:H21,
411,GCA_000819645_lcl.fasta,O78,Serotype,O78:H19,
414,GCA_000835045_lcl.fasta,O157,Serotype,O157:H19,
415,GCA_000835055_lcl.fasta,O157:,Serotype,O157:H19,
421,GCA_001268885_lcl.fasta,O74,Serotype,O66:H45,
422,GCA_001268925_lcl.fasta,O76,Serotype,-:H34,


In [141]:
# Display the merged result rows whose genome name contains the digits
# '001693635'. The frame is the last element returned by summarize_result.
df = results[-1]
df.loc[df['genome_name'].str.contains('001693635')]

Unnamed: 0,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,version_status,...,H_info,wzx,wzy,wzm,wzt,fliC,fllA,flkA,flmA,flnA
