In [1]:
import pandas as pd

In [2]:
# Load the ectyper results (JSON) into a DataFrame.
json_output_file = 'all_genome_output.json'
df = pd.read_json(json_output_file)

# Data cleaning: split each column on its separator and keep only field 1,
# i.e. the piece of the genome name after the first '|' and the piece of each
# prediction after the first 'O' / 'H'.
for column, separator in [('genome name', '|'),
                          ('predicted O', 'O'),
                          ('predicted H', 'H')]:
    df[column] = df[column].str.split(separator, expand=True)[1]

# Index the table by genome name and keep just the two prediction columns.
df = df.set_index('genome name')[['predicted H', 'predicted O']]
df

Unnamed: 0_level_0,predicted H,predicted O
genome name,Unnamed: 1_level_1,Unnamed: 2_level_1
ESC_AA7966AA_AS,21,113
ESC_AA7970AA_AS,7,157
ESC_BA4596AA_AS,1,85
ESC_BA1890AA_AS,10,49
ESC_BA1526AA_AS,10,6
ESC_AA9715AA_AS,10,121
ESC_CA6887AA_AS,11,103
ESC_CA6032AA_AS,11,
ESC_CA5330AA_AS,11,103
ESC_CA5018AA_AS,11,26


In [3]:
# Build the list of full paths for every entry found in the genome directory.
import os
genome_dir = '/home/sam/Projects/MoreSerotype/temp/genomes'
genome_names = os.listdir(genome_dir)
genome_files = [os.path.join(genome_dir, entry) for entry in genome_names]

In [5]:
# Create serotype dataframe
#
# For every genome file, read the header of its first FASTA record, split it
# on '|' into "<genome name>|<serotype tag>", and extract the given O and H
# antigen numbers from the serotype tag.
from Bio import SeqIO
import re

# One pattern per antigen, compiled once instead of once per genome.
# It matches the antigen letter followed by 1-3 digits, provided the match is
# not directly preceded by "Non-" and not directly followed by another digit.
# Group 2 holds the letter plus digits (e.g. "O157").
antigen_patterns = {
    antigen: re.compile(r"(?<!(Non-))(" + antigen + r"\d{1,3})(?!\d)")
    for antigen in ['O', 'H']
}

serotype_data = []
for genome_file in genome_files:
    # Only the first record of each file is inspected. A file that yields no
    # record at all is skipped, so it can neither re-append the previous
    # genome's row nor fail on an undefined row.
    record = next(iter(SeqIO.parse(genome_file, "fasta")), None)
    if record is None:
        continue

    header_fields = record.description.split('|')
    genome_name = header_fields[0]
    # Raises IndexError when the header contains no '|' separator.
    serotype_tag = header_fields[1]

    # Antigens that are not found in the tag stay as the empty string.
    serotypes = {'O': '', 'H': ''}
    for antigen, pattern in antigen_patterns.items():
        match = pattern.search(serotype_tag)
        if match:
            # Drop the leading antigen letter and keep only the digits.
            serotypes[antigen] = match.group(2)[1:]

    serotype_data.append({
        'genome name': genome_name,
        'given O': serotypes['O'],
        'given H': serotypes['H'],
        'serotype tag': serotype_tag
    })

# Index by genome name and keep only the two antigen columns.
df2 = pd.DataFrame(serotype_data)
df2.index = df2['genome name']
df2 = df2[['given O', 'given H']]
df2.describe()

Unnamed: 0,given O,given H
count,6154,6154.0
unique,193,56.0
top,157,
freq,2439,3104.0


In [95]:
# Join predictions (df) onto the given serotypes (df2) by index, keeping every
# row of df2 (right join).
df3 = df.merge(df2, how='right', left_index=True, right_index=True)

# Convert every column to a numeric dtype, downcasting to the smallest signed
# integer type where possible.
df3 = df3.apply(pd.to_numeric, downcast='signed')

# Render floats without decimals from here on (global pandas display option).
pd.options.display.float_format = '{:,.0f}'.format

# Put the given columns before the predicted ones.
column_order = ['given O', 'given H', 'predicted O', 'predicted H']
df3 = df3[column_order]
df3

Unnamed: 0_level_0,given O,given H,predicted O,predicted H
genome name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ESC_AA7875AA_AS,157,,,
ESC_AA7899AA_AS,157,,,
ESC_AA7929AA_AS,157,,26,11
ESC_AA7930AA_AS,157,,,
ESC_AA7942AA_AS,157,,,
ESC_AA7966AA_AS,113,21,113,21
ESC_AA7970AA_AS,157,,157,7
ESC_AA7978AA_AS,157,,,
ESC_AA7989AA_AS,157,,,
ESC_AA8002AA_AS,157,,,


In [103]:
# find conflict

# Incorrect prediction: for the O or the H antigen, the given and the
# predicted value are both present and they differ.
o_both_present = df3['given O'].notna() & df3['predicted O'].notna()
h_both_present = df3['given H'].notna() & df3['predicted H'].notna()
o_conflict = o_both_present & df3['given O'].ne(df3['predicted O'])
h_conflict = h_both_present & df3['given H'].ne(df3['predicted H'])
incorrect_df = df3[o_conflict | h_conflict]
print('%d incorrect predictions' % len(incorrect_df))
print(incorrect_df.describe())

# No prediction: both predicted antigens are missing.
nothing_predicted = df3['predicted O'].isna() & df3['predicted H'].isna()
no_df = df3[nothing_predicted]
print('%d no predictions' % len(no_df))
print(no_df.describe())

# Correct prediction: every antigen that was given is matched by the
# prediction (an antigen that was not given is not checked).
o_ok = df3['given O'].isna() | df3['given O'].eq(df3['predicted O'])
h_ok = df3['given H'].isna() | df3['given H'].eq(df3['predicted H'])
correct_df = df3[o_ok & h_ok]
print('%d correct predictions' % len(correct_df))
print(correct_df.describe())

763 incorrect predictions
       given O  given H  predicted O  predicted H
count      746      507          626          751
mean        91       22           76           18
std         78       18           60           13
min          1        1            1            1
25%         25        9           15            7
50%         86       17           83           16
75%        146       31          127           25
max        428      252          187           56
1173 no predictions
       given O  given H  predicted O  predicted H
count    1,163      456            0            0
mean       124       10          nan          nan
std         51        8          nan          nan
min          1        1          nan          nan
25%        103        7          nan          nan
50%        157        7          nan          nan
75%        157       10          nan          nan
max        409       52          nan          nan
3628 correct predictions
       given O  given H  pred

In [104]:
# Display the rows selected as correct predictions (a filtered view of df3).
correct_df

Unnamed: 0_level_0,given O,given H,predicted O,predicted H
genome name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ESC_AA7966AA_AS,113,21,113,21
ESC_AA7970AA_AS,157,,157,7
ESC_AA8026AA_AS,157,,157,
ESC_AA8470AA_AS,157,,157,
ESC_AA8923AA_AS,145,,145,
ESC_AA9052AA_AS,111,,111,2
ESC_AA9243AA_AS,157,,157,
ESC_AA9366AA_AS,121,,121,19
ESC_AA9419AA_AS,26,11,26,11
ESC_AA9535AA_AS,2,6,2,6
