In [1]:
import pandas as pd
import json
import numpy as np

In [79]:
def clean_prediction_df(prediction_df,
                        allele_mapping_path='allele_mapping.csv',
                        genome_mapping_path='genome-serotype_mapping.csv'):
    """Join raw serotype predictions with allele sources and the given serotypes.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Raw prediction table with the string columns 'genome name',
        'predicted O', 'predicted H', 'O allele' and 'H allele'.
        The frame itself is not modified.
    allele_mapping_path : str, optional
        CSV with the columns 'H allele' and 'H source'. The same table is used
        to look up the source of both the O and the H allele.
    genome_mapping_path : str, optional
        CSV with the columns 'genome name', 'given O' and 'given H'.

    Returns
    -------
    pandas.DataFrame
        One row per prediction whose genome name occurs in the genome mapping
        (inner join), with the columns genome_name, given_O, predicted_O,
        given_H, predicted_H, O_allele, O_source, H_allele, H_source.
        '-' placeholders are replaced by NaN.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.

    Notes
    -----
    Provenance of the two mapping files (they were generated once by code that
    used to be kept, disabled, inside this function):

    * allele mapping: for every serotype in 'serotype_dict.json' and every
      allele listed under it, 'H source' is the allele's ``name`` and
      'H allele' is ``'<serotype>-<num>'``. The table was written with
      ``to_csv(..., index_label=False)``.
    * genome-serotype mapping: for every FASTA file in the genome directory the
      description of the first record is split on '|'; field 0 is the genome
      name and field 1 the serotype tag. For each of O and H, the first match
      of "the letter followed by one to three digits, not preceded by 'Non-'
      and not followed by a further digit" supplies the given serotype number
      (empty when there is no match).
    """
    allele_df = pd.read_csv(allele_mapping_path)

    # Select and rename the lookup columns by name instead of by position so
    # that the result does not depend on the column order of the CSV.
    h_lookup = allele_df[['H allele', 'H source']]
    o_lookup = h_lookup.rename(
        columns={'H allele': 'O allele', 'H source': 'O source'})

    merged = prediction_df.merge(h_lookup, on='H allele', how='left')
    merged = merged.merge(o_lookup, on='O allele', how='left')

    # Keep the second delimiter-separated piece of each value. `.str[1]`
    # yields NaN when a value has no second piece, and - unlike
    # `split(..., expand=True)[1]` - does not fail when no row has one.
    merged['genome name'] = merged['genome name'].str.split('|').str[1]
    merged['predicted O'] = merged['predicted O'].str.split('O').str[1]
    merged['predicted H'] = merged['predicted H'].str.split('H').str[1]

    # Given serotypes are read as strings so that they compare equal to the
    # (string) predicted serotype numbers.
    genome_df = pd.read_csv(
        genome_mapping_path,
        index_col='genome name',
        dtype={'given O': str, 'given H': str},
    )
    merged = merged.merge(
        genome_df, left_on='genome name', right_index=True, how='inner')

    # Normalise every kind of "missing" ('-' placeholder, None) to NaN.
    merged = merged.replace('-', np.nan).fillna(value=np.nan)

    # Rename by name (not by position) and put the columns in report order.
    merged = merged.rename(columns={
        'genome name': 'genome_name',
        'given O': 'given_O', 'predicted O': 'predicted_O',
        'given H': 'given_H', 'predicted H': 'predicted_H',
        'O allele': 'O_allele', 'O source': 'O_source',
        'H allele': 'H_allele', 'H source': 'H_source',
    })
    return merged[[
        'genome_name',
        'given_O', 'predicted_O',
        'given_H', 'predicted_H',
        'O_allele', 'O_source',
        'H_allele', 'H_source'
    ]]

In [89]:
def _percent(part, total):
    """Return `part` as a percentage of `total`; 0.0 when `total` is 0."""
    return part / total * 100 if total else 0.0


def _summarize_antigen(df, antigen):
    """Print the accuracy counts for one antigen ('O' or 'H').

    Only rows with a given serotype for that antigen are counted. Returns the
    rows whose prediction is present but differs from the given serotype.
    """
    given = df['given_' + antigen]
    predicted = df['predicted_' + antigen]

    has_given = given.notnull()
    unpredicted = has_given & predicted.isnull()
    correct = has_given & ~unpredicted & (given == predicted)
    incorrect = has_given & ~unpredicted & ~correct

    num_total = int(has_given.sum())
    num_unpredicted = int(unpredicted.sum())
    num_correct = int(correct.sum())
    incorrect_df = df[incorrect]
    num_incorrect = incorrect_df.shape[0]

    print("number of given_%s serotypes is %d"
          % (antigen, num_total))
    print("number of unpredicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_unpredicted, _percent(num_unpredicted, num_total)))
    print("number of correctly predicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_correct, _percent(num_correct, num_total)))
    print("number of incorrectly predicted_%s serotypes is %d or %.2f%%"
          % (antigen, num_incorrect, _percent(num_incorrect, num_total)))
    return incorrect_df


def summarize_result(result_df):
    """Classify every prediction, print the counts and return the subsets.

    Each row falls into the first category that applies:

    1. no prediction  - both predicted antigens are missing;
    2. incorrect      - at least one antigen has both a given and a predicted
                        value and the two differ;
    3. correct        - every given antigen is matched by its prediction;
    4. semicorrect    - at least one antigen is either not given or predicted
                        correctly (i.e. one right, the other unpredicted);
    5. remaining      - anything else (only counted, not returned).

    Afterwards per-antigen counts (unpredicted / correct / incorrect, as a
    share of the rows with a given serotype) are printed for O and for H.
    A share is reported as 0.00% when no row has a given serotype.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Needs the columns given_O, predicted_O, given_H and predicted_H, with
        missing values represented as null.

    Returns
    -------
    tuple
        (no_df, incorrect_df, correct_df, semicorrect_df,
         incorrect_o_df, incorrect_h_df, result_df)
    """
    df = result_df
    given_o, predicted_o = df['given_O'], df['predicted_O']
    given_h, predicted_h = df['given_H'], df['predicted_H']

    print('all predictions(%d):' % df.shape[0])

    # 1. Nothing was predicted for either antigen.
    no_prediction = predicted_o.isnull() & predicted_h.isnull()
    no_df = df[no_prediction]
    print('%d no predictions' % no_df.shape[0])

    # 2. A comparable pair (given and predicted both present) disagrees.
    o_wrong = given_o.notnull() & predicted_o.notnull() & (given_o != predicted_o)
    h_wrong = given_h.notnull() & predicted_h.notnull() & (given_h != predicted_h)
    any_wrong = o_wrong | h_wrong
    incorrect_df = df[~no_prediction & any_wrong]
    print('%d incorrect predictions' % incorrect_df.shape[0])

    # An antigen is "satisfied" when it was not given or was predicted right.
    o_satisfied = given_o.isnull() | (given_o == predicted_o)
    h_satisfied = given_h.isnull() | (given_h == predicted_h)

    # 3. Both antigens satisfied.
    all_satisfied = o_satisfied & h_satisfied
    correct_df = df[~no_prediction & ~any_wrong & all_satisfied]
    print('%d correct predictions' % correct_df.shape[0])

    # 4. Exactly the rows where only one antigen is satisfied.
    some_satisfied = o_satisfied | h_satisfied
    semicorrect_df = df[
        ~no_prediction & ~any_wrong & ~all_satisfied & some_satisfied]
    print('%d semicorrect predictions' % semicorrect_df.shape[0])

    # 5. Whatever is left over.
    remaining_df = df[
        ~no_prediction & ~any_wrong & ~all_satisfied & ~some_satisfied]
    print('%d remaining predictions' % remaining_df.shape[0])

    # Per-antigen summary.
    incorrect_o_df = _summarize_antigen(df, 'O')
    incorrect_h_df = _summarize_antigen(df, 'H')

    return (no_df, incorrect_df, correct_df, semicorrect_df,
            incorrect_o_df, incorrect_h_df, df)

In [90]:
# old allele library
# Load the raw predictions stored in 'legacy.json', normalise them with
# clean_prediction_df and print the accuracy summary. result1 keeps the tuple
# of DataFrames returned by summarize_result.
raw_df = pd.read_json('legacy.json')
clean_df = clean_prediction_df(raw_df)
result1 = summarize_result(clean_df)

all predictions(5810):
28 no predictions
774 incorrect predictions
4275 correct predictions
733 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 5671
number of unpredicted_O serotypes is 867 or 15.29%
number of correctly predicted_O serotypes is 4326 or 76.28%
number of incorrectly predicted_O serotypes is 478 or 8.43%
number of given_H serotypes is 2887
number of unpredicted_H serotypes is 50 or 1.73%
number of correctly predicted_H serotypes is 2433 or 84.27%
number of incorrectly predicted_H serotypes is 404 or 13.99%


In [91]:
# old allele library + 97% length cutoff
# Same pipeline as the other runs, applied to the predictions stored in
# 'legacy2.json'. Note that raw_df and clean_df are overwritten by every run;
# only result2 is specific to this one.
raw_df = pd.read_json('legacy2.json')
clean_df = clean_prediction_df(raw_df)
result2 = summarize_result(clean_df)

all predictions(5810):
31 no predictions
762 incorrect predictions
4235 correct predictions
782 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 5671
number of unpredicted_O serotypes is 925 or 16.31%
number of correctly predicted_O serotypes is 4282 or 75.51%
number of incorrectly predicted_O serotypes is 464 or 8.18%
number of given_H serotypes is 2887
number of unpredicted_H serotypes is 52 or 1.80%
number of correctly predicted_H serotypes is 2432 or 84.24%
number of incorrectly predicted_H serotypes is 403 or 13.96%


In [92]:
# new allele library
# Same pipeline as the other runs, applied to the predictions stored in
# 'new.json'. raw_df and clean_df are overwritten; the summary tuple is kept
# in result3.
raw_df = pd.read_json('new.json')
clean_df = clean_prediction_df(raw_df)
result3 = summarize_result(clean_df)

all predictions(5792):
0 no predictions
781 incorrect predictions
4337 correct predictions
674 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 5653
number of unpredicted_O serotypes is 780 or 13.80%
number of correctly predicted_O serotypes is 4379 or 77.46%
number of incorrectly predicted_O serotypes is 494 or 8.74%
number of given_H serotypes is 2879
number of unpredicted_H serotypes is 28 or 0.97%
number of correctly predicted_H serotypes is 2451 or 85.13%
number of incorrectly predicted_H serotypes is 400 or 13.89%


In [93]:
# new allele library + 97% length cutoff
# Same pipeline as the other runs, applied to the predictions stored in
# 'new2.json'. raw_df and clean_df are overwritten; the summary tuple is kept
# in result4.
raw_df = pd.read_json('new2.json')
clean_df = clean_prediction_df(raw_df)
result4 = summarize_result(clean_df)

all predictions(5789):
0 no predictions
767 incorrect predictions
4293 correct predictions
729 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 5650
number of unpredicted_O serotypes is 841 or 14.88%
number of correctly predicted_O serotypes is 4331 or 76.65%
number of incorrectly predicted_O serotypes is 478 or 8.46%
number of given_H serotypes is 2879
number of unpredicted_H serotypes is 30 or 1.04%
number of correctly predicted_H serotypes is 2450 or 85.10%
number of incorrectly predicted_H serotypes is 399 or 13.86%


In [97]:
# oct30
# Same pipeline as the other runs, applied to 'oct30_output/output.json'.
raw_df = pd.read_json('oct30_output/output.json')
clean_df = clean_prediction_df(raw_df)
result5 = summarize_result(clean_df)
# result5[6] is the last element of the tuple returned by summarize_result,
# i.e. the full cleaned table; display its rows whose given O serotype is '151'.
result5[6][result5[6].given_O=='151']

all predictions(5789):
17 no predictions
674 incorrect predictions
3706 correct predictions
1392 semicorrect predictions
0 remaining predictions
number of given_O serotypes is 5650
number of unpredicted_O serotypes is 1577 or 27.91%
number of correctly predicted_O serotypes is 3710 or 65.66%
number of incorrectly predicted_O serotypes is 363 or 6.42%
number of given_H serotypes is 2879
number of unpredicted_H serotypes is 30 or 1.04%
number of correctly predicted_H serotypes is 2450 or 85.10%
number of incorrectly predicted_H serotypes is 399 or 13.86%


Unnamed: 0,genome_name,given_O,predicted_O,given_H,predicted_H,O_allele,O_source,H_allele,H_source
1657,ESC_GA6847AA_AS,151,,4.0,4,,,H4-2,1__fliC__fliC-H4__43 AJ605764.1;flagellin;H4
1808,ESC_GA8882AA_AS,151,118.0,11.0,11,O118-2,9__wzy__wzy-O118-Gp3__341 HM204927.1;O antigen...,H11-18,part_of_ESC_GA8882AA_AS
1810,ESC_GA8836AA_AS,151,118.0,11.0,11,O118-2,9__wzy__wzy-O118-Gp3__341 HM204927.1;O antigen...,H11-18,part_of_ESC_GA8882AA_AS
1909,ESC_GA8952AA_AS,151,118.0,,11,O118-2,9__wzy__wzy-O118-Gp3__341 HM204927.1;O antigen...,H11-18,part_of_ESC_GA8882AA_AS
2200,ESC_GA9397AA_AS,151,,,11,,,H11-18,part_of_ESC_GA8882AA_AS
2387,ESC_HA1807AA_AS,151,118.0,,10,O118-2,9__wzy__wzy-O118-Gp3__341 HM204927.1;O antigen...,H10-4,part_of_ESC_GA8794AA_AS
3522,ESC_HA8317AA_AS,151,10.0,,30,O10-4,part_of_ESC_HA7958AA_AS,H30-1,1__fliC__fliC-H30__31 AY250011.1;flagellin;H30
3569,ESC_HA8276AA_AS,151,78.0,41.0,9,O78-4,part_of_ESC_LA5784AA_AS,H9-1,1__fliC__fliC-H9__76 AY249994.1;flagellin;H9
