In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
def clean_prediction_df(prediction_df):
    '''
    # Generate Allele Mapping
    serotype_dict = json.load(open('serotype_dict.json'))
    new_list = []
    for serotype, alleles in serotype_dict.items():
        for allele in alleles:
            new_entry = {
                'H source': allele.get('name'),
                'H allele': '-'.join([serotype, str(allele.get('num'))])
            }
            new_list.append(new_entry)
    allele_df = pd.DataFrame(new_list)
    allele_df.to_csv('allele_mapping.csv', index_label=False)
    '''
    allele_df = pd.read_csv('allele_mapping.csv')
    allele_df.head()

    # Merge with allele data
    prediction_df = prediction_df.merge(allele_df, left_on='H allele', right_on='H allele', how='left')
    allele_df.columns = ['O allele', 'O source']
    prediction_df = prediction_df.merge(allele_df, left_on='O allele', right_on='O allele', how='left')
    prediction_df.head()

    # Read ectyper output
    # Data cleaning
    prediction_df['genome name'] = prediction_df['genome name'].str.split('|', expand=True)[1]
    prediction_df['predicted O'] = prediction_df['predicted O'].str.split('O', expand=True)[1]
    prediction_df['predicted H'] = prediction_df['predicted H'].str.split('H', expand=True)[1]
    prediction_df.index = prediction_df['genome name']
    prediction_df = prediction_df[['predicted H','H source', 'predicted O', 'O source']]
    prediction_df.head()

    '''
    # Get genome file paths as a list
    import os
    genome_dir = '/home/sam/Projects/MoreSerotype/temp/genomes'
    genome_names = os.listdir(genome_dir)
    genome_files = []
    for genome_name in genome_names:
        file = os.path.join(genome_dir, genome_name)
        genome_files.append(file)

    # Create serotype dataframe
    from Bio import SeqIO
    import re
    serotype_data = []
    for genome_file in genome_files:
        records = SeqIO.parse(genome_file, "fasta")
        entry = None
        for record in records:
            name = record.description
            genome_name = name.split('|')[0]
            serotype_tag = name.split('|')[1]
            serotypes = {'O': '','H': ''}
            for key in ['O','H']:
                regex = re.compile("(?<!(Non-))("+key+"\d{1,3})(?!\d)")
                results = regex.findall(serotype_tag)
                results_len = len(results)
                if results_len > 0:
                    serotypes[key] = results[0][1][1:]
            entry = {
                'genome name': genome_name,
                'given O': serotypes['O'],
                'given H': serotypes['H'],
                'serotype tag': serotype_tag
            }
            if serotypes['O'] == '' and serotypes['H'] =='':
                pass
                # print(name)
            break
        if entry is not None:
            serotype_data.append(entry)
    df2 = pd.DataFrame(serotype_data)
    df2.index = df2['genome name']
    df2 = df2[['given O', 'given H']]

    df2.to_csv('genome-serotype_mapping.csv')
    '''
    geneome_df = pd.read_csv('genome-serotype_mapping.csv', index_col='genome name', dtype={'given O':str, 'given H':str})
    

    # Merge two data frames
    prediction_df = prediction_df.merge(geneome_df, left_index=True, right_index=True, how='inner')
    prediction_df = prediction_df[['predicted O', 'given O', 'predicted H', 'given H', 'O source', 'H source']]
    prediction_df.head()
    prediction_df.fillna(value=np.nan, inplace=True)
    return prediction_df

In [70]:
def summarize_result(result_df):
    df = result_df
    print('all predictions(%d):' %result_df.shape[0])
    print(result_df.count())
    '''
    no prediction if:
        both predicted antigen are None
    '''
    s1 = (df['predicted O'].isnull() & df['predicted H'].isnull())
    no_df = df[s1]
    print('\n%d no predictions' %no_df.shape[0])
    print(no_df.count())
    '''
    incorrect prediction if:
        not in 'no_df'
        at least one prediction is wrong
    '''
    s2 = (
        ((df['given O'].notnull() & df['predicted O'].notnull()) & (df['given O']!=df['predicted O'])) |
        ((df['given H'].notnull() & df['predicted H'].notnull()) & (df['given H']!=df['predicted H']))
    )
    incorrect_df = df[~s1 & s2]
    print('\n%d incorrect predictions' %incorrect_df.shape[0])
    print(incorrect_df.count())
    '''
    correct prediction if:
        not in 'no_df'
        not in 'incorrect_df'
        for each given serotype, the correct prediction is made
    '''
    s3 = (
        (df['given O'].isnull() | (df['given O']==df['predicted O'])) &
        (df['given H'].isnull() | (df['given H']==df['predicted H']))
    )
    correct_df = df[~s1 & ~s2 & s3]
    print('\n%d correct predictions' %correct_df.shape[0])
    print(correct_df.count())
    '''
    semicorrect prediction if:
        not in 'no_df'
        not in 'incorrect_df'
        not in 'correct_df'
        at least one the correct prediction is made
    '''
    s4 = (
        (df['given O'].isnull() | (df['given O']==df['predicted O'])) |
        (df['given H'].isnull() | (df['given H']==df['predicted H']))
    )
    semicorrect_df = df[~s1 & ~s2 & ~s3 & s4]
    print('\n%d semicorrect predictions' %semicorrect_df.shape[0])
    print(correct_df.count())
    '''
    remaining prediction
    '''
    remaining_df = df[~s1 & ~s2 & ~s3 & ~s4]
    print('\n%d remaining predictions' %remaining_df.shape[0])
    print(remaining_df.count())

In [71]:
# old allele library
summarize_result(clean_prediction_df(pd.read_json('legacy.json')))

all predictions(5810):
predicted O    4875
given O        5671
predicted H    5669
given H        2887
O source          0
H source          0
dtype: int64

28 no predictions
predicted O     0
given O        28
predicted H     0
given H        13
O source        0
H source        0
dtype: int64

774 incorrect predictions
predicted O    632
given O        758
predicted H    757
given H        513
O source         0
H source         0
dtype: int64

4275 correct predictions
predicted O    4213
given O        4153
predicted H    4209
given H        1919
O source          0
H source          0
dtype: int64

733 semicorrect predictions
predicted O    4213
given O        4153
predicted H    4209
given H        1919
O source          0
H source          0
dtype: int64

0 remaining predictions
predicted O    0
given O        0
predicted H    0
given H        0
O source       0
H source       0
dtype: int64


In [72]:
# old allele library + 97% length cutoff
summarize_result(clean_prediction_df(pd.read_json('legacy2.json')))

all predictions(5810):
predicted O    4815
given O        5671
predicted H    5665
given H        2887
O source          0
H source          0
dtype: int64

31 no predictions
predicted O     0
given O        31
predicted H     0
given H        13
O source        0
H source        0
dtype: int64

762 incorrect predictions
predicted O    613
given O        746
predicted H    744
given H        510
O source         0
H source         0
dtype: int64

4235 correct predictions
predicted O    4171
given O        4113
predicted H    4170
given H        1886
O source          0
H source          0
dtype: int64

782 semicorrect predictions
predicted O    4171
given O        4113
predicted H    4170
given H        1886
O source          0
H source          0
dtype: int64

0 remaining predictions
predicted O    0
given O        0
predicted H    0
given H        0
O source       0
H source       0
dtype: int64


In [73]:
# new allele library
summarize_result(clean_prediction_df(pd.read_json('new.json')))

all predictions(5792):
predicted O    4955
given O        5653
predicted H    5690
given H        2879
O source       4955
H source       5690
dtype: int64

0 no predictions
predicted O    0
given O        0
predicted H    0
given H        0
O source       0
H source       0
dtype: int64

781 incorrect predictions
predicted O    647
given O        765
predicted H    767
given H        511
O source       647
H source       767
dtype: int64

4337 correct predictions
predicted O    4284
given O        4214
predicted H    4273
given H        1970
O source       4284
H source       4273
dtype: int64

674 semicorrect predictions
predicted O    4284
given O        4214
predicted H    4273
given H        1970
O source       4284
H source       4273
dtype: int64

0 remaining predictions
predicted O    0
given O        0
predicted H    0
given H        0
O source       0
H source       0
dtype: int64


In [74]:
# new allele library + 97% length cutoff
summarize_result(clean_prediction_df(pd.read_json('new2.json')))

all predictions(5789):
predicted O    4889
given O        5650
predicted H    5686
given H        2879
O source       4889
H source       5686
dtype: int64

0 no predictions
predicted O    0
given O        0
predicted H    0
given H        0
O source       0
H source       0
dtype: int64

767 incorrect predictions
predicted O    626
given O        751
predicted H    752
given H        508
O source       626
H source       752
dtype: int64

4293 correct predictions
predicted O    4238
given O        4170
predicted H    4230
given H        1933
O source       4238
H source       4230
dtype: int64

729 semicorrect predictions
predicted O    4238
given O        4170
predicted H    4230
given H        1933
O source       4238
H source       4230
dtype: int64

0 remaining predictions
predicted O    0
given O        0
predicted H    0
given H        0
O source       0
H source       0
dtype: int64
