In [1]:
import pandas as pd

In [2]:
# Load the ectyper results
json_output_file = 'all_genome_output.json'
df = pd.read_json(json_output_file)

# Tidy up the raw columns:
#   - genome name: keep the second '|'-separated field
#   - predicted O / predicted H: keep the second piece obtained when
#     splitting the value on its antigen letter
df['genome name'] = df['genome name'].str.split('|', expand=True)[1]
for antigen in ('O', 'H'):
    prediction_column = 'predicted ' + antigen
    df[prediction_column] = df[prediction_column].str.split(antigen, expand=True)[1]

# Index by genome name and keep only the two prediction columns
df = df.set_index('genome name', drop=False)[['predicted H', 'predicted O']]
df

Unnamed: 0_level_0,predicted H,predicted O
genome name,Unnamed: 1_level_1,Unnamed: 2_level_1
ESC_AA7966AA_AS,21,113
ESC_AA7970AA_AS,7,157
ESC_BA4596AA_AS,1,85
ESC_BA1890AA_AS,10,49
ESC_BA1526AA_AS,10,6
ESC_AA9715AA_AS,10,121
ESC_CA6887AA_AS,11,103
ESC_CA6032AA_AS,11,
ESC_CA5330AA_AS,11,103
ESC_CA5018AA_AS,11,26


In [3]:
# Get genome file paths as a list
import os
genome_dir = '/home/sam/Projects/MoreSerotype/temp/genomes'
genome_names = os.listdir(genome_dir)
genome_files = [os.path.join(genome_dir, genome_name) for genome_name in genome_names]

# Create serotype dataframe
from Bio import SeqIO
import re

# One pre-compiled pattern per antigen letter. Each matches the letter followed
# by 1-3 digits, provided the match is not preceded by 'Non-' and not followed
# by a further digit; group 1 holds the digits only. Raw strings avoid the
# invalid '\d' escape warning of a plain string literal.
ANTIGEN_PATTERNS = {
    antigen: re.compile(r"(?<!Non-)" + antigen + r"(\d{1,3})(?!\d)")
    for antigen in ('O', 'H')
}


def _parse_serotype_tag(serotype_tag):
    """Extract the O and H antigen numbers from a serotype tag.

    Parameters
    ----------
    serotype_tag : str
        The text to search.

    Returns
    -------
    dict
        ``{'O': digits, 'H': digits}`` where each value is the digit string of
        the first match for that antigen, or ``''`` when there is no match.
    """
    serotypes = {}
    for antigen, pattern in ANTIGEN_PATTERNS.items():
        match = pattern.search(serotype_tag)
        serotypes[antigen] = match.group(1) if match else ''
    return serotypes


serotype_data = []
empty_genome_files = []  # files that yielded no FASTA record; inspect if non-empty
for genome_file in genome_files:
    # Only the header of the first record in each file is used.
    with open(genome_file) as handle:
        record = next(SeqIO.parse(handle, "fasta"), None)
    if record is None:
        # Skip files without records instead of re-appending the previous
        # genome's row (which is what appending outside the record loop did).
        empty_genome_files.append(genome_file)
        continue

    # Header layout used here: first '|' field is the genome name, second is
    # the serotype tag. A header without '|' gets an empty tag rather than
    # raising IndexError.
    fields = record.description.split('|')
    genome_name = fields[0]
    serotype_tag = fields[1] if len(fields) > 1 else ''
    serotypes = _parse_serotype_tag(serotype_tag)

    # Named `row` (not `dict`) so the builtin is not shadowed in later cells.
    row = {
        'genome name': genome_name,
        'given O': serotypes['O'],
        'given H': serotypes['H'],
        'serotype tag': serotype_tag
    }
    serotype_data.append(row)

# Explicit columns keep the frame well-formed even when no rows were collected.
df2 = pd.DataFrame(
    serotype_data,
    columns=['genome name', 'given O', 'given H', 'serotype tag'],
)
df2.index = df2['genome name']
df2 = df2[['given O', 'given H']]
df2.describe()

In [41]:
# Combine predictions (df) with the given serotypes (df2).
# Display setting: render floats without decimals and with thousands separators.
pd.options.display.float_format = '{:,.0f}'.format

column_order = ['given O', 'given H', 'predicted O', 'predicted H']
df3 = (
    df.merge(df2, how='right', left_index=True, right_index=True)  # keep every row of df2
      .apply(lambda column: pd.to_numeric(column, downcast='signed'))
      [column_order]
)
print(df3.describe())
df3.head()

       given O  given H  predicted O  predicted H
count    6,005    3,050        4,221        4,852
mean       112       14          107           13
std         60       13           58           11
min          1        1            1            1
25%         70        7           53            7
50%        145        7          127            7
75%        157       18          157           18
max        428      252          187           56


Unnamed: 0_level_0,given O,given H,predicted O,predicted H
genome name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ESC_AA7875AA_AS,157,,,
ESC_AA7899AA_AS,157,,,
ESC_AA7929AA_AS,157,,26.0,11.0
ESC_AA7930AA_AS,157,,,
ESC_AA7942AA_AS,157,,,


In [9]:
# Classify the rows of df3 by comparing given and predicted antigens.
given_o, given_h = df3['given O'], df3['given H']
predicted_o, predicted_h = df3['predicted O'], df3['predicted H']

# Incorrect prediction: for at least one antigen, the given value and the
# prediction are both present and differ.
o_conflict = given_o.notna() & predicted_o.notna() & given_o.ne(predicted_o)
h_conflict = given_h.notna() & predicted_h.notna() & given_h.ne(predicted_h)
incorrect_df = df3[o_conflict | h_conflict]
print('%d incorrect predictions' % len(incorrect_df))
print(incorrect_df.describe())

# No prediction: both predicted antigens are missing.
no_df = df3[predicted_o.isna() & predicted_h.isna()]
print('%d no predictions' % len(no_df))
print(no_df.describe())

# Correct prediction: every antigen that has a given value is predicted
# with exactly that value (an antigen without a given value is not checked).
o_ok = given_o.isna() | given_o.eq(predicted_o)
h_ok = given_h.isna() | given_h.eq(predicted_h)
correct_df = df3[o_ok & h_ok]
print('%d correct predictions' % len(correct_df))
print(correct_df.describe())
correct_df

763 incorrect predictions
       given O  given H  predicted O  predicted H
count      746      507          626          751
mean        91       22           76           18
std         78       18           60           13
min          1        1            1            1
25%         25        9           15            7
50%         86       17           83           16
75%        146       31          127           25
max        428      252          187           56
1173 no predictions
       given O  given H  predicted O  predicted H
count    1,163      456            0            0
mean       124       10          nan          nan
std         51        8          nan          nan
min          1        1          nan          nan
25%        103        7          nan          nan
50%        157        7          nan          nan
75%        157       10          nan          nan
max        409       52          nan          nan
3628 correct predictions
       given O  given H  pred

Unnamed: 0_level_0,given O,given H,predicted O,predicted H
genome name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ESC_AA7966AA_AS,113,21,113,21
ESC_AA7970AA_AS,157,,157,7
ESC_AA8026AA_AS,157,,157,
ESC_AA8470AA_AS,157,,157,
ESC_AA8923AA_AS,145,,145,
ESC_AA9052AA_AS,111,,111,2
ESC_AA9243AA_AS,157,,157,
ESC_AA9366AA_AS,121,,121,19
ESC_AA9419AA_AS,26,11,26,11
ESC_AA9535AA_AS,2,6,2,6


In [2]:
# NOTE: this cell uses `pd` and `df3`, which it does not define itself;
# run the earlier cells first (on a fresh kernel it fails with NameError).
serotype_dict_file = '/home/sam/Projects/MoreSerotype/output/serotype_dict.json'
import json
# Use a context manager so the file handle is closed once the JSON is loaded.
with open(serotype_dict_file) as serotype_dict_handle:
    serotype_dict = json.load(serotype_dict_handle)

# Collect the text following 'part_of_' from every allele name that contains it.
added_genome = []
for alleles in serotype_dict.values():
    for allele in alleles:
        allele_name = allele['name']
        if 'part_of_' in allele_name:
            added_genome.append(allele_name.split('part_of_')[1])

# One row per distinct genome name (first occurrence kept).
added_df = pd.DataFrame(added_genome, columns=['genome name'])
added_df = added_df.drop_duplicates()

# Keep only the genomes that also appear in df3's index.
merged_added_df = added_df.merge(df3, left_on='genome name', right_index=True, how='inner')
merged_added_df.describe()

NameError: name 'pd' is not defined

In [45]:
# Classify the rows of merged_added_df by comparing given and predicted antigens.
given_o, given_h = merged_added_df['given O'], merged_added_df['given H']
predicted_o, predicted_h = merged_added_df['predicted O'], merged_added_df['predicted H']

# Incorrect prediction: for at least one antigen, the given value and the
# prediction are both present and differ.
o_conflict = given_o.notna() & predicted_o.notna() & given_o.ne(predicted_o)
h_conflict = given_h.notna() & predicted_h.notna() & given_h.ne(predicted_h)
incorrect_df = merged_added_df[o_conflict | h_conflict]
print('%d incorrect predictions' % len(incorrect_df))
print(incorrect_df.describe())
print(incorrect_df.head())

# No prediction: both predicted antigens are missing.
no_df = merged_added_df[predicted_o.isna() & predicted_h.isna()]
print('%d no predictions' % len(no_df))
print(no_df.describe())
print(no_df.head())

# Correct prediction: every antigen that has a given value is predicted
# with exactly that value (an antigen without a given value is not checked).
o_ok = given_o.isna() | given_o.eq(predicted_o)
h_ok = given_h.isna() | given_h.eq(predicted_h)
correct_df = merged_added_df[o_ok & h_ok]
print('%d correct predictions' % len(correct_df))
print(correct_df.describe())
correct_df.head()

64 incorrect predictions
       given O  given H  predicted O  predicted H
count       61       64           64           64
mean        84       26           76           21
std         69       17           60           14
min          2        1            2            1
25%         26       10           15           10
50%         75       24           86           20
75%        109       40          127           30
max        409       56          184           49
        genome name  given O  given H  predicted O  predicted H
25  ESC_HA8634AA_AS       73       10           93           10
42  ESC_GA9177AA_AS      154       11          100           11
50  ESC_GA8882AA_AS      151       11          118           11
72  ESC_HA8149AA_AS       85       12           85            1
98  ESC_HA8803AA_AS       85       16            8           16
57 no predictions
       given O  given H  predicted O  predicted H
count       56       30            0            0
mean        98       14

Unnamed: 0,genome name,given O,given H,predicted O,predicted H
0,ESC_IA2245AA_AS,4,1,4,1
0,ESC_IA2245AA_AS,4,1,4,1
1,ESC_HA8537AA_AS,83,1,83,1
2,ESC_IA0005AA_AS,2,1,2,1
3,ESC_IA2248AA_AS,6,1,6,1


In [None]:
# /home/sam/Projects/MoreSerotype/temp/genomes/ESC_LA5772AA_AS