In [49]:
import pandas as pd
import json
import numpy as np

In [67]:
def _token_after(series, sep):
    """Return the second `sep`-delimited field of every value in `series`.

    Values that are missing, or that do not contain `sep`, yield NaN.
    """
    # Cast to object so that an all-missing (float NaN) column still exposes
    # the `.str` accessor instead of raising AttributeError.
    return series.astype(object).str.split(sep).str[1]


def clean_prediction_df(prediction_df,
                        allele_mapping_path='allele_mapping.csv',
                        genome_mapping_path='genome-serotype_mapping.csv'):
    """Join raw predictions with allele sources and the given serotypes.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Raw prediction table. It must contain the columns 'genome name',
        'predicted O', 'predicted H', 'O allele' and 'H allele'.
        The frame passed in is not modified.
    allele_mapping_path : str, optional
        CSV file with the columns 'H allele' and 'H source'. The same table
        is used to look up both the H and the O allele.
    genome_mapping_path : str, optional
        CSV file with the columns 'genome name', 'given O' and 'given H'.

    Returns
    -------
    pandas.DataFrame
        Indexed by genome name, with the columns 'predicted O', 'given O',
        'predicted H', 'given H', 'O source' and 'H source'. Only genomes
        present in both the predictions and the genome mapping are kept
        (inner join on the index). Missing values are NaN.
    """
    # --- Allele -> source lookup -------------------------------------------
    # The allele mapping file was generated once with:
    #
    #     serotype_dict = json.load(open('serotype_dict.json'))
    #     new_list = []
    #     for serotype, alleles in serotype_dict.items():
    #         for allele in alleles:
    #             new_entry = {
    #                 'H source': allele.get('name'),
    #                 'H allele': '-'.join([serotype, str(allele.get('num'))])
    #             }
    #             new_list.append(new_entry)
    #     allele_df = pd.DataFrame(new_list)
    #     allele_df.to_csv('allele_mapping.csv', index_label=False)
    allele_df = pd.read_csv(allele_mapping_path)
    h_sources = allele_df[['H allele', 'H source']]
    # Rename by column name rather than by position so the lookup stays
    # correct whatever the column order in the CSV file is.
    o_sources = h_sources.rename(
        columns={'H allele': 'O allele', 'H source': 'O source'})

    merged = prediction_df.merge(h_sources, on='H allele', how='left')
    merged = merged.merge(o_sources, on='O allele', how='left')

    # --- Data cleaning -----------------------------------------------------
    # Keep the second '|'-separated field of the genome name and the text
    # following the 'O' / 'H' prefix of each predicted antigen.
    cleaned = merged.assign(**{
        'genome name': _token_after(merged['genome name'], '|'),
        'predicted O': _token_after(merged['predicted O'], 'O'),
        'predicted H': _token_after(merged['predicted H'], 'H'),
    }).set_index('genome name')
    cleaned = cleaned[['predicted H', 'H source', 'predicted O', 'O source']]

    # --- Given serotypes ---------------------------------------------------
    # The genome/serotype mapping file was generated once with:
    #
    #     # Get genome file paths as a list
    #     import os
    #     genome_dir = '/home/sam/Projects/MoreSerotype/temp/genomes'
    #     genome_names = os.listdir(genome_dir)
    #     genome_files = []
    #     for genome_name in genome_names:
    #         file = os.path.join(genome_dir, genome_name)
    #         genome_files.append(file)
    #
    #     # Create serotype dataframe
    #     from Bio import SeqIO
    #     import re
    #     serotype_data = []
    #     for genome_file in genome_files:
    #         records = SeqIO.parse(genome_file, "fasta")
    #         entry = None
    #         for record in records:
    #             name = record.description
    #             genome_name = name.split('|')[0]
    #             serotype_tag = name.split('|')[1]
    #             serotypes = {'O': '','H': ''}
    #             for key in ['O','H']:
    #                 regex = re.compile("(?<!(Non-))("+key+"\d{1,3})(?!\d)")
    #                 results = regex.findall(serotype_tag)
    #                 results_len = len(results)
    #                 if results_len > 0:
    #                     serotypes[key] = results[0][1][1:]
    #             entry = {
    #                 'genome name': genome_name,
    #                 'given O': serotypes['O'],
    #                 'given H': serotypes['H'],
    #                 'serotype tag': serotype_tag
    #             }
    #             break
    #         if entry is not None:
    #             serotype_data.append(entry)
    #     df2 = pd.DataFrame(serotype_data)
    #     df2.index = df2['genome name']
    #     df2 = df2[['given O', 'given H']]
    #     df2.to_csv('genome-serotype_mapping.csv')
    genome_df = pd.read_csv(
        genome_mapping_path,
        index_col='genome name',
        # Read serotype numbers as text so they compare equal to the
        # string tokens extracted from the predictions.
        dtype={'given O': str, 'given H': str},
    )

    # --- Merge predictions with given serotypes ----------------------------
    result = cleaned.merge(
        genome_df, left_index=True, right_index=True, how='inner')
    result = result[['predicted O', 'given O', 'predicted H', 'given H',
                     'O source', 'H source']]
    # Normalise every missing marker (e.g. None) to NaN.
    return result.fillna(value=np.nan)

In [71]:
def summarize_result(result_df):
    """Print counts and a five-row sample of incorrect, missing and correct rows.

    `result_df` must provide the columns 'given O', 'predicted O',
    'given H' and 'predicted H'. Nothing is returned; the three reports
    are written to standard output in the order: incorrect predictions,
    no predictions, correct predictions.
    """
    given_o = result_df['given O']
    predicted_o = result_df['predicted O']
    given_h = result_df['given H']
    predicted_h = result_df['predicted H']

    # Incorrect: for at least one antigen, both the given and the predicted
    # value are present and they differ.
    o_conflict = (given_o.notnull() & predicted_o.notnull()) & (given_o != predicted_o)
    h_conflict = (given_h.notnull() & predicted_h.notnull()) & (given_h != predicted_h)
    incorrect_mask = o_conflict | h_conflict

    # No prediction: neither antigen was predicted.
    missing_mask = predicted_o.isnull() & predicted_h.isnull()

    # Correct: every antigen that was given is matched by the prediction
    # (an antigen that was not given places no constraint on the row).
    o_satisfied = given_o.isnull() | (given_o == predicted_o)
    h_satisfied = given_h.isnull() | (given_h == predicted_h)
    correct_mask = o_satisfied & h_satisfied

    reports = (
        ('%d incorrect predictions', incorrect_mask),
        ('%d no predictions', missing_mask),
        ('%d correct predictions', correct_mask),
    )
    for template, mask in reports:
        subset = result_df[mask]
        print(template % subset.shape[0])
        print(subset.head())

In [80]:
# Predictions made with the old allele library (legacy.json)
summarize_result(result_df=clean_prediction_df(prediction_df=pd.read_json('legacy.json')))

774 incorrect predictions
                predicted O given O predicted H given H  O source  H source
genome name                                                                
ESC_AA9715AA_AS         121     157          10     NaN       NaN       NaN
ESC_AA8012AA_AS          26     157          11     NaN       NaN       NaN
ESC_AA7929AA_AS          26     157          11     NaN       NaN       NaN
ESC_AA9661AA_AS         121     157          15     NaN       NaN       NaN
ESC_AA8510AA_AS         121     157          19     NaN       NaN       NaN
28 no predictions
                predicted O given O predicted H given H  O source  H source
genome name                                                                
ESC_BA9928AA_AS         NaN     145         NaN     NaN       NaN       NaN
ESC_CA1039AA_AS         NaN     145         NaN     NaN       NaN       NaN
ESC_CA0497AA_AS         NaN     145         NaN     NaN       NaN       NaN
ESC_FA8178AA_AS         NaN     112         

In [81]:
# Predictions made with the old allele library plus the 97% length cutoff (legacy2.json)
summarize_result(result_df=clean_prediction_df(prediction_df=pd.read_json('legacy2.json')))

762 incorrect predictions
                predicted O given O predicted H given H  O source  H source
genome name                                                                
ESC_AA9715AA_AS         121     157          10     NaN       NaN       NaN
ESC_AA8012AA_AS          26     157          11     NaN       NaN       NaN
ESC_AA7929AA_AS          26     157          11     NaN       NaN       NaN
ESC_AA9661AA_AS         121     157          15     NaN       NaN       NaN
ESC_AA8510AA_AS         121     157          19     NaN       NaN       NaN
31 no predictions
                predicted O given O predicted H given H  O source  H source
genome name                                                                
ESC_BA9928AA_AS         NaN     145         NaN     NaN       NaN       NaN
ESC_CA1039AA_AS         NaN     145         NaN     NaN       NaN       NaN
ESC_CA0497AA_AS         NaN     145         NaN     NaN       NaN       NaN
ESC_FA8178AA_AS         NaN     112         

In [82]:
# Predictions made with the new allele library (new.json)
summarize_result(result_df=clean_prediction_df(prediction_df=pd.read_json('new.json')))

781 incorrect predictions
                predicted O given O predicted H given H  \
genome name                                               
ESC_AA9715AA_AS         121     157          10     NaN   
ESC_AA8012AA_AS          26     157          11     NaN   
ESC_AA7929AA_AS          26     157          11     NaN   
ESC_AA9661AA_AS         121     157          15     NaN   
ESC_AA8510AA_AS         121     157          19     NaN   

                                                          O source  \
genome name                                                          
ESC_AA9715AA_AS  8__wzx__wzx-O121__141 AY208937.1;O antigen fli...   
ESC_AA8012AA_AS  8__wzx__wzx-O26__234 AF529080.1;O antigen flip...   
ESC_AA7929AA_AS  8__wzx__wzx-O26__234 AF529080.1;O antigen flip...   
ESC_AA9661AA_AS  8__wzx__wzx-O121__141 AY208937.1;O antigen fli...   
ESC_AA8510AA_AS  8__wzx__wzx-O121__141 AY208937.1;O antigen fli...   

                                H source  
genome name               

In [84]:
# Predictions made with the new allele library plus the 97% length cutoff (new2.json)
summarize_result(result_df=clean_prediction_df(prediction_df=pd.read_json('new2.json')))

767 incorrect predictions
                predicted O given O predicted H given H  \
genome name                                               
ESC_AA9715AA_AS         121     157          10     NaN   
ESC_AA8012AA_AS          26     157          11     NaN   
ESC_AA7929AA_AS          26     157          11     NaN   
ESC_AA9661AA_AS         121     157          15     NaN   
ESC_AA8510AA_AS         121     157          19     NaN   

                                                          O source  \
genome name                                                          
ESC_AA9715AA_AS  8__wzx__wzx-O121__141 AY208937.1;O antigen fli...   
ESC_AA8012AA_AS  8__wzx__wzx-O26__234 AF529080.1;O antigen flip...   
ESC_AA7929AA_AS  8__wzx__wzx-O26__234 AF529080.1;O antigen flip...   
ESC_AA9661AA_AS  8__wzx__wzx-O121__141 AY208937.1;O antigen fli...   
ESC_AA8510AA_AS  8__wzx__wzx-O121__141 AY208937.1;O antigen fli...   

                                H source  
genome name               