In [1]:
import pandas as pd

In [50]:
# Load the ectyper predictions from a JSON file into a DataFrame and display it.
# The path is relative, so it is resolved against the kernel's working directory.
json_output_file = 'all_genome_output.json'
ectyper_df = pd.read_json(json_output_file)
# NOTE(review): the output recorded for this cell shows 'lcl|'-prefixed genome
# names and 'H21'/'O113'-style values, while a later display of ectyper_df shows
# them without the prefix/letter. The step that strips them is not visible in
# this notebook -- confirm where it happens before relying on a fresh run.
ectyper_df

Unnamed: 0,genome name,predicted H,predicted O
0,lcl|ESC_AA7966AA_AS,H21,O113
1,lcl|ESC_AA7970AA_AS,H7,O157
2,lcl|ESC_BA4596AA_AS,H1,O85
3,lcl|ESC_BA1890AA_AS,H10,O49
4,lcl|ESC_BA1526AA_AS,H10,O6
5,lcl|ESC_AA9715AA_AS,H10,O121
6,lcl|ESC_CA6887AA_AS,H11,O103
7,lcl|ESC_CA6032AA_AS,H11,-
8,lcl|ESC_CA5330AA_AS,H11,O103
9,lcl|ESC_CA5018AA_AS,H11,O26


In [35]:
# Get genome file paths as a list
import os
import re

from Bio import SeqIO

# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook can run elsewhere.
genome_dir = '/home/sam/Projects/MoreSerotype/temp/genomes'
genome_names = os.listdir(genome_dir)
genome_files = [os.path.join(genome_dir, genome_name) for genome_name in genome_names]

# One pre-compiled pattern per antigen: the antigen letter followed by one to
# three digits, not preceded by "Non-" and not followed by another digit.
# Raw strings keep "\d" from being treated as an (invalid) string escape.
_SEROTYPE_PATTERNS = {
    key: re.compile(r"(?<!Non-)" + key + r"(\d{1,3})(?!\d)")
    for key in ('O', 'H')
}


def _parse_serotype_tag(serotype_tag):
    """Extract the O and H antigen numbers from a serotype tag.

    Args:
        serotype_tag: free-text tag such as 'O157:H7'.

    Returns:
        dict with keys 'O' and 'H'; each value is the digit string of the
        first match for that antigen, or '' when the tag has no match.
    """
    serotypes = {'O': '', 'H': ''}
    for key, pattern in _SEROTYPE_PATTERNS.items():
        match = pattern.search(serotype_tag)
        if match:
            serotypes[key] = match.group(1)
    return serotypes


# Create serotype dataframe
serotype_data = []
for genome_file in genome_files:
    # Only the first record of each file is read; the handle is opened
    # explicitly so it is closed as soon as that record has been parsed.
    with open(genome_file) as handle:
        first_record = next(SeqIO.parse(handle, "fasta"), None)
    if first_record is None:
        # File contains no FASTA records: nothing to add.
        continue
    # Header layout is '<genome name>|<serotype tag>'; a header without a
    # tag yields an empty tag instead of raising IndexError.
    fields = first_record.description.split('|')
    genome_name = fields[0]
    serotype_tag = fields[1] if len(fields) > 1 else ''
    serotypes = _parse_serotype_tag(serotype_tag)
    serotype_data.append({
        'genome name': genome_name,
        'given O': serotypes['O'],
        'given H': serotypes['H'],
        'serotype tag': serotype_tag
    })

# Passing the columns explicitly keeps the frame well-formed even when no
# genome produced a record.
df2 = pd.DataFrame(
    serotype_data,
    columns=['genome name', 'given O', 'given H', 'serotype tag'],
)
# NOTE(review): missing antigens are stored as '' (not NaN), whereas
# summarize_result tests with isnull()/notnull() -- confirm they are converted
# before the summary is computed.
serotype_df = df2[['genome name', 'given H', 'given O', 'serotype tag']].copy()
serotype_df

Unnamed: 0,genome name,given H,given O,serotype tag
0,ESC_HA7792AA_AS,,,Ox13
1,ESC_HA9975AA_AS,21,98,O98:H21.
2,ESC_HA7457AA_AS,7,157,O157:H7
3,ESC_IA1636AA_AS,11,26,O26:H11
4,ESC_HA7911AA_AS,,40,O40
5,ESC_GA6789AA_AS,,26,O26
6,ESC_HA8681AA_AS,7,104,O104:H7
7,ESC_HA1758AA_AS,,30,O30
8,ESC_IA0097AA_AS,21,116,O116:H21
9,ESC_IA0176AA_AS,49,181,O181:H49


In [49]:
# Combine the ectyper predictions with the given serotypes by genome name.
# The right join keeps every row of serotype_df, whether or not ectyper_df
# has a matching genome.
df = ectyper_df.merge(serotype_df, on='genome name', how='right')
merged_df = df.loc[:, [
    'genome name',
    'given O', 'predicted O',
    'given H', 'predicted H',
    'serotype tag',
]].copy()
# NOTE(review): this cell displays ectyper_df rather than the merged_df it
# has just built -- confirm that is intended.
ectyper_df

Unnamed: 0,genome name,predicted H,predicted O
0,ESC_AA7966AA_AS,21,113
1,ESC_AA7970AA_AS,7,157
2,ESC_BA4596AA_AS,1,85
3,ESC_BA1890AA_AS,10,49
4,ESC_BA1526AA_AS,10,6
5,ESC_AA9715AA_AS,10,121
6,ESC_CA6887AA_AS,11,103
7,ESC_CA6032AA_AS,11,
8,ESC_CA5330AA_AS,11,103
9,ESC_CA5018AA_AS,11,26


In [47]:
# Inner-join mapping_df with merged_df, matching mapping_df's 'genome name'
# column against merged_df's index (right_index=True), then preview the result.
# NOTE(review): mapping_df is not defined in any cell visible in this notebook,
# and the output recorded for this cell is KeyError: 'genome name' --
# presumably mapping_df has no such column; verify before re-running.
final_df = mapping_df.merge(merged_df, left_on='genome name', right_index=True, how='inner')
final_df.head()

KeyError: 'genome name'

In [46]:
# Load the allele mapping table.
# NOTE(review): absolute, machine-specific path with a doubled leading slash --
# confirm the intended location.
allele_df = pd.read_csv('//home/sam/Projects/Experiment/allele_mapping.csv')
# Attach allele data to the merged serotype table, first by H allele.
# NOTE(review): merged_df is built elsewhere in this notebook from genome /
# serotype columns only, and the output recorded for this cell is
# KeyError: 'H allele' -- the frame merged here presumably needs allele
# columns first; verify.
final_df = merged_df.merge(allele_df, left_on='H allele', right_on='H allele', how='left')
# Relabel the same table's columns in place so it can be joined a second time,
# now on the O allele. Assumes allele_df has exactly two columns -- TODO confirm.
allele_df.columns = ['O allele', 'O source']
final_df = final_df.merge(allele_df, left_on='O allele', right_on='O allele', how='left')
final_df.head()

KeyError: 'H allele'

In [38]:
def _percentage(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return 100.0 * count / total if total else 0.0


def _summarize_antigen(df, antigen, important_cols):
    """Print the per-antigen prediction counts and return the wrong predictions.

    Args:
        df: frame with 'given <antigen>' and 'predicted <antigen>' columns.
        antigen: antigen letter used in the column names and messages ('O' or 'H').
        important_cols: columns to keep in the returned frame.

    Returns:
        Rows where the antigen is given and predicted but the two differ.
    """
    given = df['given %s' % antigen]
    predicted = df['predicted %s' % antigen]

    # Rows that have a given serotype for this antigen.
    total_s = given.notnull()
    num_total = int(total_s.sum())
    print("number of given %s serotypes is %d"
            %(antigen, num_total))

    # Given, but nothing predicted.
    unpredicted_s = total_s & predicted.isnull()
    num_unpredicted = int(unpredicted_s.sum())
    print("number of unpredicted %s serotypes is %d or %.2f%%"
            %(antigen, num_unpredicted, _percentage(num_unpredicted, num_total)))

    # Given, predicted, and equal.
    correct_s = total_s & ~unpredicted_s & (given == predicted)
    num_correct = int(correct_s.sum())
    print("number of correctly predicted %s serotypes is %d or %.2f%%"
            %(antigen, num_correct, _percentage(num_correct, num_total)))

    # Given, predicted, and different.
    incorrect_s = total_s & ~unpredicted_s & ~correct_s
    incorrect_antigen_df = df[incorrect_s][important_cols]
    num_incorrect = incorrect_antigen_df.shape[0]
    print("number of incorrectly predicted %s serotypes is %d or %.2f%%"
            %(antigen, num_incorrect, _percentage(num_incorrect, num_total)))

    return incorrect_antigen_df


def summarize_result(result_df):
    """Classify every prediction row and print summary counts.

    Rows are split into mutually exclusive groups, tested in this order:
    no prediction, incorrect, correct, semicorrect, remaining. Per-antigen
    totals for O and H are printed afterwards. Percentages are reported as
    0.00 when no serotype is given for an antigen, instead of dividing by zero.

    Args:
        result_df: frame with the columns 'genome name', 'given O',
            'predicted O', 'given H', 'predicted H' and 'serotype tag'.
            Missing values must be null (NaN/None) to be recognised.

    Returns:
        Tuple (no_df, incorrect_df, correct_df, semicorrect_df,
        incorrect_o_df, incorrect_h_df, df), where df is result_df itself and
        the other frames are restricted to the six columns listed above.
    """
    important_cols = ['genome name', 'given O', 'predicted O', 'given H', 'predicted H', 'serotype tag']

    df = result_df
    print('all predictions(%d):' %result_df.shape[0])

    given_o, predicted_o = df['given O'], df['predicted O']
    given_h, predicted_h = df['given H'], df['predicted H']

    # No prediction: both predicted antigens are null.
    s1 = predicted_o.isnull() & predicted_h.isnull()
    no_df = df[s1][important_cols]
    print('%d no predictions' %no_df.shape[0])

    # Incorrect: not in no_df, and at least one antigen that is both given
    # and predicted has a different predicted value.
    s2 = (
        ((given_o.notnull() & predicted_o.notnull()) & (given_o != predicted_o)) |
        ((given_h.notnull() & predicted_h.notnull()) & (given_h != predicted_h))
    )
    incorrect_df = df[~s1 & s2][important_cols]
    print('%d incorrect predictions' %incorrect_df.shape[0])

    # Correct: not in the groups above, and every given antigen is matched
    # by its prediction.
    o_ok = given_o.isnull() | (given_o == predicted_o)
    h_ok = given_h.isnull() | (given_h == predicted_h)
    s3 = o_ok & h_ok
    correct_df = df[~s1 & ~s2 & s3][important_cols]
    print('%d correct predictions' %correct_df.shape[0])

    # Semicorrect: not in the groups above; one antigen is matched (or not
    # given) while the other has no prediction.
    s4 = o_ok | h_ok
    semicorrect_df = df[~s1 & ~s2 & ~s3 & s4][important_cols]
    print('%d semicorrect predictions' %semicorrect_df.shape[0])

    # Anything not captured by the four groups above.
    remaining_df = df[~s1 & ~s2 & ~s3 & ~s4]
    print('%d remaining predictions' %remaining_df.shape[0])

    # Per-antigen summary
    incorrect_o_df = _summarize_antigen(df, 'O', important_cols)
    incorrect_h_df = _summarize_antigen(df, 'H', important_cols)

    return no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df, incorrect_h_df, df

In [24]:
# Summarize final_df. new_result is the 7-tuple returned by summarize_result:
# (no_df, incorrect_df, correct_df, semicorrect_df, incorrect_o_df, incorrect_h_df, df).
# NOTE(review): the cells that assign final_df in this notebook have KeyError
# outputs recorded, so the frame summarized here presumably came from an
# earlier kernel state -- confirm it can be rebuilt on a fresh run.
new_result = summarize_result(final_df)

all predictions(528):
49 no predictions
57 incorrect predictions
371 correct predictions
51 semicorrect predictions
0 remaining predictions
number of given O serotypes is 515
number of unpredicted O serotypes is 94 or 18.25%
number of correctly predicted O serotypes is 391 or 75.92%
number of incorrectly predicted O serotypes is 30 or 5.83%
number of given H serotypes is 427
number of unpredicted H serotypes is 31 or 7.26%
number of correctly predicted H serotypes is 367 or 85.95%
number of incorrectly predicted H serotypes is 29 or 6.79%


In [26]:
# Element 4 of the summarize_result tuple is incorrect_o_df: rows where the O
# serotype is both given and predicted but the two values differ.
new_result[4]

Unnamed: 0,given O,predicted O,given H,predicted H
73,73.0,17.0,45.0,45.0
75,73.0,17.0,46.0,46.0
116,154.0,100.0,11.0,11.0
124,151.0,118.0,11.0,11.0
168,19.0,129.0,23.0,23.0
189,178.0,153.0,19.0,19.0
206,43.0,131.0,46.0,46.0
212,73.0,17.0,31.0,31.0
267,75.0,16.0,6.0,6.0
319,73.0,93.0,10.0,10.0
