In [1]:
import os
from os import walk

import pandas as pd
import numpy as np
import sys
import re

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from collections import Counter

%matplotlib inline

pd.set_option('display.max_rows', 9000)
pd.set_option('display.max_columns', 1500)
pd.set_option('max_colwidth', 400)

In [2]:
#Subsampling depth whose results should be analysed, given as a string (e.g. "2" for
#the 2X results). Set to False to analyse the results without subsampling.
subsampling = False

#Special case: set to True to look only at the samples with 200X coverage.
high_coverage_analysis = False

#Failsafe: the high-coverage analysis is never combined with subsampling.
if high_coverage_analysis:
    subsampling = False

In [3]:
#Root folder holding both the tool results and the gold standard data.
#NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
main_folder_path = 'C:\\Users\\nikol\\OneDrive\\DTU\\11_semester\\'

#Default: results obtained without subsampling
resultpath = main_folder_path + '1000_genomes_results\\'

#With subsampling, the results live in an 'output_<depth>X' subfolder instead.
#str() lets the depth be given either as a string ("2") or as a number (2).
if subsampling != False:
    resultpath = main_folder_path + 'output_' + str(subsampling) + 'X\\' + '1000_genomes_results\\'

gold_standard_path = main_folder_path + 'gold_standard_data\\'

In [4]:
#Choose the allele resolution. Options: "two_field", "g_group", "p_group" or "e_group"
resolution = "e_group"

# Functions for Allele conversion

In [5]:
#Functions for reducing an allele name to two-field or three-field resolution (later fields and any trailing letters are dropped)

def make_two_field(allele_high_res):
    """Reduce an HLA allele name to two-field resolution.

    Searches the name for one of the genes A, B, C, DRB1 or DQB1 followed by
    ``*`` and the first two colon-separated fields, e.g.
    ``"A*01:01:01:02N"`` -> ``"A*01:01"``. Everything after the second field
    (further fields, trailing letters) is dropped.

    Args:
        allele_high_res: Allele name at any resolution.

    Returns:
        The two-field allele name, or None when the input is not a string or
        contains no recognisable allele.
    """
    #Non-string input (e.g. None handed on from a failed conversion) cannot be
    #searched; treat it like any other unparseable name instead of raising.
    if not isinstance(allele_high_res, str):
        return None

    two_field_finder = re.search(r"(A|B|C|DRB1|DQB1)\*\d{2}:\d{2,3}", allele_high_res)

    if two_field_finder is None:
        return None

    return two_field_finder.group(0)


def make_three_field(allele_high_res):
    """Reduce an HLA allele name to three-field resolution.

    Searches the name for one of the genes A, B, C, DRB1 or DQB1 followed by
    ``*`` and the first three colon-separated fields, e.g.
    ``"A*01:01:01:02N"`` -> ``"A*01:01:01"``. Everything after the third field
    is dropped.

    Args:
        allele_high_res: Allele name at any resolution.

    Returns:
        The three-field allele name, or None when the input is not a string or
        has fewer than three fields / no recognisable allele.
    """
    #Non-string input (e.g. None handed on from a failed conversion) cannot be
    #searched; treat it like any other unparseable name instead of raising.
    if not isinstance(allele_high_res, str):
        return None

    three_field_finder = re.search(r"(A|B|C|DRB1|DQB1)\*\d{2}:\d{2,3}:\d{2,3}", allele_high_res)

    if three_field_finder is None:
        return None

    return three_field_finder.group(0)

## Function for P-group conversion - THIS DICT IS SLIGHTLY DIFFERENT
However, the function convert_to_p_group returns the same as before.

In [6]:
#Make dict for P-type conversion:
#two-field allele name (or the P group name itself) -> P group name

p_group_filepath = gold_standard_path + 'p_group_resolution.txt'

p_group_dict = dict()

with open(p_group_filepath, 'r') as infile:
    for line in infile:
        #Skip the lines starting with "#"
        if line.startswith('#'):
            continue

        gene = line.split('*')[0]

        #Only register the valid genes:
        if gene not in ['A', 'B', 'C', 'DRB1', 'DQB1']:
            continue

        fields = line.split(';')
        if "/" in line:
            #Several alleles, separated by "/": the group name is the last field.
            #rstrip (rather than dropping the last character) keeps the name
            #intact when the final line of the file has no trailing newline.
            p_group_full = gene + "*" + fields[-1].rstrip('\n')
        else:
            #Otherwise the second-to-last field is used as the group name
            p_group_full = gene + "*" + fields[-2]

        #Read the alleles and clean up the front and end part
        synonymous_alleles = line.split('/')
        synonymous_alleles[0] = synonymous_alleles[0].split(';')[1]
        synonymous_alleles[-1] = synonymous_alleles[-1].split(';')[0]

        #Convert all alleles to two-field resolution; the set removes the
        #duplicates this creates
        group_keys = {make_two_field(gene + "*" + allele) for allele in synonymous_alleles}

        #A name that could not be parsed must not become a None key, which
        #would later match every other unparseable allele
        group_keys.discard(None)

        #Add the group itself, so that tools predicting directly in P-group
        #resolution can be converted in similar fashion
        group_keys.add(p_group_full)

        #Add a key in the dict for each of the unique entries:
        for group_key in group_keys:
            p_group_dict[group_key] = p_group_full
                    

#Make P type conversion function using p_group_dict
def convert_to_p_group(allele):
    """Convert an allele name to P-group resolution.

    The name is first reduced to two-field resolution with make_two_field and
    then looked up in p_group_dict.

    Returns:
        The P group registered for the two-field name; if there is none, the
        two-field name itself (None when the name could not be parsed).
    """
    two_field_name = make_two_field(allele)

    #Fall back to the two-field name when no P group is registered for it
    return p_group_dict.get(two_field_name, two_field_name)


## Function for G-group conversion

In [7]:
#Make dict for G-type conversion:
#allele name exactly as written in the file (or the G group name itself) -> G group name

g_group_filepath = gold_standard_path + 'g_group_resolution.txt'

g_group_dict = dict()


#First pass: conversion from full allele names to G groups
with open(g_group_filepath, 'r') as infile:
    for line in infile:
        #Skip the lines starting with "#"
        if line.startswith('#'):
            continue

        gene = line.split('*')[0]

        #Only register the valid genes:
        if gene not in ['A', 'B', 'C', 'DRB1', 'DQB1']:
            continue

        #Find the full G type for the entry - differs depending on whether there
        #are one or multiple entries in the group.
        #NOTE(review): [:-1] assumes every line ends with a newline; it is kept
        #because the second pass below derives the group name the same way.
        if "/" in line:
            g_group_full = gene + "*" + line.split(';')[-1][:-1]
        else:
            g_group_full = gene + "*" + line.split(';')[-2]

        #Read the alleles and clean up the front and end part
        synonymous_alleles = line.split('/')
        synonymous_alleles[0] = synonymous_alleles[0].split(';')[1]
        synonymous_alleles[-1] = synonymous_alleles[-1].split(';')[0]

        #Add "gene*" in front of all alleles; the set removes duplicates
        group_keys = {gene + "*" + allele for allele in synonymous_alleles}

        #Add the group itself, so that tools predicting directly in G-group
        #resolution can be converted in similar fashion
        group_keys.add(g_group_full)

        #Add a key in the dict for each of the unique entries:
        for group_key in sorted(group_keys, reverse=True):
            g_group_dict[group_key] = g_group_full


                    
print("Ambiguities in two-field resolution:")
print("Allele", "Primary G-group", "Alternative group", sep = "\t")
#Second pass: add entries for two-field and three-field typings that are not already
#found in g_group_dict.
#This is done after the first pass, in order not to overwrite ambiguous G-group mappings
#such as C*02:02, which could map to C*02:02:01 or C*02:10:01G depending on the full typing.
#An existing entry is never overwritten here; a differing group is only reported.
with open(g_group_filepath, 'r') as infile:
    for line in infile:
        #Skip the lines starting with "#"
        if line[0] != '#':
            gene = line.split('*')[0]
            
            #Only register the valid genes:
            if gene in ['A', 'B', 'C', 'DRB1', 'DQB1']:                
                
                #Group name: last field (minus its final character) when the line holds
                #several "/"-separated alleles, otherwise the second-to-last field
                if "/" in line:
                    g_group_full = gene + "*" + line.split(';')[-1][:-1]
                else:
                    g_group_full = gene + "*" + line.split(';')[-2]

                #Two-field keys: read the alleles and clean up the front and end part
                synonymous_alleles_two_field = line.split('/')
                synonymous_alleles_two_field[0] = synonymous_alleles_two_field[0].split(';')[1]
                synonymous_alleles_two_field[-1] = synonymous_alleles_two_field[-1].split(';')[0]
                                
                #Convert all alleles to two-field resolution:
                #NOTE(review): unlike the three-field list below, None results of
                #make_two_field are not filtered out here - confirm none can occur.
                for i in range(len(synonymous_alleles_two_field)):
                    synonymous_alleles_two_field[i] = gene + "*" + synonymous_alleles_two_field[i]
                    synonymous_alleles_two_field[i] = make_two_field(synonymous_alleles_two_field[i])

                #Remove the duplicates created by the conversion to two-field:
                synonymous_allels_unique_two_field = sorted(list(set(synonymous_alleles_two_field)), reverse=True)
                
                #Add a key in the dict for each of the unique entries:
                for allele in synonymous_allels_unique_two_field:                    
                    
                    #Only add entries which aren't already in the dict
                    if (allele in g_group_dict.keys()):
                        
                        #Two-field ambiguity in the G-type conversion, e.g. C*02:02:02G and C*02:10:01G have the same exon sequence for exon 2 and 3.
                        #Only printed (and only for lines with several alleles); the mapping already in the dict is kept.
                        if (g_group_dict[allele] != g_group_full) and ("/" in line):
                            print("two_field_res", allele, g_group_dict[allele], g_group_full, sep=', ')
                            
                    else:
                        g_group_dict[allele] = g_group_full
                
                #Three-field keys: read the alleles and clean up the front and end part
                synonymous_alleles_three_field = line.split('/')
                synonymous_alleles_three_field[0] = synonymous_alleles_three_field[0].split(';')[1]
                synonymous_alleles_three_field[-1] = synonymous_alleles_three_field[-1].split(';')[0]
                
                #Convert all alleles to three-field resolution:
                for i in range(len(synonymous_alleles_three_field)):
                    synonymous_alleles_three_field[i] = gene + "*" + synonymous_alleles_three_field[i]
                    synonymous_alleles_three_field[i] = make_three_field(synonymous_alleles_three_field[i])
                    
                
                #Drop the alleles that have fewer than three fields (make_three_field returned None)
                synonymous_alleles_three_field = [i for i in synonymous_alleles_three_field if i != None]
                #Remove the duplicates created by the conversion to three-field:
                synonymous_allels_unique_three_field = sorted(list(set(synonymous_alleles_three_field)), reverse=True)
                
                #Add a key in the dict for each of the unique entries:
                for allele in synonymous_allels_unique_three_field:                    
                    
                    #Only add entries which aren't already in the dict
                    if (allele in g_group_dict.keys()):
                        
                        #Three-field ambiguity in the G-type conversion (same situation as for two-field above).
                        #Only printed (and only for lines with several alleles); the mapping already in the dict is kept.
                        if (g_group_dict[allele] != g_group_full) and ("/" in line):
                            print("three_field_res:", allele, g_group_dict[allele], g_group_full, sep=', ')
                            
                    else:
                        g_group_dict[allele] = g_group_full

                
                
                    
#Make G type conversion function using g_group_dict
def convert_to_g_group(allele):
    """Convert an allele name to G-group resolution.

    The name is looked up in g_group_dict exactly as given.

    Returns:
        The G group registered for the name; if there is none, the name
        reduced to two-field resolution by make_two_field.
    """
    #Unknown name: fall back to two-field resolution
    if allele not in g_group_dict.keys():
        return make_two_field(allele)

    return g_group_dict[allele]


Ambiguities in two-field resolution:
Allele	Primary G-group	Alternative group
two_field_res, A*02:01, A*02:01:01G, A*02:01:02G
two_field_res, A*23:03, A*23:03:01, A*23:03:02G
two_field_res, A*24:02, A*24:02:01G, A*24:02:115G
two_field_res, A*24:208, A*24:208:01, A*24:208:02G
two_field_res, A*29:02, A*29:02:01G, A*29:02:17G
two_field_res, A*68:01, A*68:01:01G, A*68:01:02G
two_field_res, B*15:16, B*15:16:01G, B*15:16:02G
two_field_res, B*27:05, B*27:05:02G, B*27:05:18G
two_field_res, B*38:02, B*38:02:01G, B*38:02:02G
two_field_res, B*39:02, B*39:02:01G, B*39:02:02G
two_field_res, B*39:06, B*39:06:01, B*39:06:02G
two_field_res, B*40:01, B*40:01:01G, B*40:01:03G
two_field_res, B*44:281, B*44:03:01G, B*44:03:02G
two_field_res, B*44:03, B*44:03:01G, B*44:03:02G
two_field_res, B*51:01, B*51:01:01G, B*51:01:02G
two_field_res, B*52:01, B*52:01:01G, B*52:01:02G
two_field_res, B*67:01, B*67:01:01, B*67:01:02G
two_field_res, C*02:02, C*02:02:01, C*02:02:02G
two_field_res, C*02:02, C*02:02:01, C*02

# Evaxion typing resolution
Using Evaxion typing:

In [8]:
#Make dict for E-group (Evaxion) conversion:
#two-field allele name -> last tab-separated field of its line in the E-group file

e_group_filepath = gold_standard_path + 'e_group_resolution.txt'

e_group_dict = dict()


with open(e_group_filepath, 'r') as infile:
    for line in infile:
        #Lines starting with "HLA-": the gene letter follows the prefix
        if line.startswith('HLA-'):
            #Slice instead of index, so a line consisting only of "HLA-" is skipped
            gene = line[4:5]

            if gene in ['A', 'B', 'C']:

                #The first "dd:dd(d)" pattern on the line gives the two fields
                digits_from_type = re.search(r'\d{2}:\d{2,3}', line)

                if digits_from_type is not None:
                    allele = gene + "*" + digits_from_type.group(0)

                    #rstrip (rather than dropping the last character) keeps the value
                    #intact when the final line of the file has no trailing newline
                    peptide_sequence = line.split('\t')[-1].rstrip('\n')

                    e_group_dict[allele] = peptide_sequence

        #Lines starting with "DRB1": the digits follow the first "_" without a colon
        elif line.startswith('DRB1'):
            gene = 'DRB1'

            digits_from_type_raw = line.split('_')[1].split('\t')[0]

            #Insert the colon after the first two digits
            digits_from_type = digits_from_type_raw[0:2] + ':' + digits_from_type_raw[2:]

            allele = gene + "*" + digits_from_type

            peptide_sequence = line.split('\t')[-1].rstrip('\n')

            e_group_dict[allele] = peptide_sequence
            

#Function for converting to e-group resolution:
#Make E type conversion function using e_group_dict
def convert_to_e_group(allele):
    """Convert an allele name to E-group (Evaxion) resolution.

    The name is first reduced to two-field resolution with make_two_field and
    then looked up in e_group_dict.

    Returns:
        The e_group_dict entry for the two-field name; if there is none, the
        result of convert_to_p_group for the two-field name; None when the
        name could not be parsed at all.
    """
    #Start by converting to two-field:
    allele_two_field = make_two_field(allele)

    #An unparseable name has no group. Stop here instead of handing None on to
    #convert_to_p_group, whose two-field conversion expects a string.
    if allele_two_field is None:
        return None

    #Find the corresponding e-type if it exists. If not, return the P resolution
    if allele_two_field in e_group_dict:
        return e_group_dict[allele_two_field]

    return convert_to_p_group(allele_two_field)



# Calculations 

Number of alleles:

- two field: 16621
- G group: 18483
- overlap between G group and two field: 12151 
- P group: 14001
- Evaxion group: 12433

## Number of two-field alleles

In [9]:
#Assume that g_group_dict contains all known alleles - its unique keys, converted to
#two-field resolution, are then all the known two-field alleles
g_group_keys = list(g_group_dict.keys())
len(set(make_two_field(i) for i in g_group_keys))

16621

## Number of G groups

In [10]:
#Total number of G groups, taken from the G group .txt file
#(http://hla.alleles.org/alleles/g_groups.html).
#Terms, in order: A, B and C - starting lines + DQB1 + DRB1
#NOTE(review): "starting lines" presumably means the header lines of the file - confirm.
14871-6+1228+2390


18483

In [11]:
#Cross-check against g_group_dict: number of distinct G groups it maps to
g_group_values = list(g_group_dict.values())
len(set(g_group_values))

18483

In [12]:
#Why there are more G groups than two-field alleles: these are all the G groups mapping to A*01:01
a_0101_list = ['A*01:01:01G', 'A*01:01:02', 'A*01:01:03', 'A*01:01:04', 'A*01:01:05', 'A*01:01:06', 'A*01:01:07', 'A*01:01:08', 'A*01:01:09', 'A*01:01:10', 'A*01:01:101', 'A*01:01:102', 'A*01:01:107', 'A*01:01:109', 'A*01:01:11', 'A*01:01:110', 'A*01:01:111', 'A*01:01:112', 'A*01:01:113', 'A*01:01:114', 'A*01:01:117', 'A*01:01:119', 'A*01:01:12', 'A*01:01:120', 'A*01:01:13', 'A*01:01:14', 'A*01:01:15', 'A*01:01:16', 'A*01:01:17', 'A*01:01:18', 'A*01:01:19', 'A*01:01:20', 'A*01:01:21', 'A*01:01:22', 'A*01:01:23', 'A*01:01:24', 'A*01:01:25', 'A*01:01:26', 'A*01:01:27', 'A*01:01:28', 'A*01:01:29', 'A*01:01:30', 'A*01:01:31', 'A*01:01:32', 'A*01:01:33', 'A*01:01:34', 'A*01:01:35', 'A*01:01:36', 'A*01:01:37', 'A*01:01:39', 'A*01:01:40', 'A*01:01:41', 'A*01:01:42', 'A*01:01:43', 'A*01:01:44', 'A*01:01:45', 'A*01:01:46', 'A*01:01:47', 'A*01:01:48', 'A*01:01:49', 'A*01:01:50', 'A*01:01:52', 'A*01:01:53', 'A*01:01:54', 'A*01:01:55', 'A*01:01:56', 'A*01:01:57', 'A*01:01:58', 'A*01:01:59', 'A*01:01:60', 'A*01:01:61', 'A*01:01:62', 'A*01:01:63', 'A*01:01:64', 'A*01:01:65', 'A*01:01:66', 'A*01:01:67', 'A*01:01:68', 'A*01:01:69', 'A*01:01:70', 'A*01:01:71', 'A*01:01:72', 'A*01:01:73', 'A*01:01:74', 'A*01:01:75', 'A*01:01:76', 'A*01:01:77', 'A*01:01:78', 'A*01:01:79', 'A*01:01:80', 'A*01:01:81', 'A*01:01:82', 'A*01:01:85', 'A*01:01:86', 'A*01:01:87', 'A*01:01:88', 'A*01:01:89', 'A*01:01:90', 'A*01:01:92', 'A*01:01:96', 'A*01:01:97', 'A*01:01:98', 'A*01:01:99']
len(a_0101_list)

103

In [13]:
#Alleles belonging to A*01:01:01G
a_0101G_list = ["A*01:01:01:01", "A*01:01:01:02N", "A*01:01:01:03", "A*01:01:01:04", "A*01:01:01:05", "A*01:01:01:06", "A*01:01:01:07", "A*01:01:01:08", "A*01:01:01:09", "A*01:01:01:10", "A*01:01:01:11", "A*01:01:01:12", "A*01:01:01:13", "A*01:01:01:14", "A*01:01:01:15", "A*01:01:01:16", "A*01:01:01:17", "A*01:01:01:18", "A*01:01:01:19", "A*01:01:01:20", "A*01:01:01:21", "A*01:01:01:22", "A*01:01:01:23", "A*01:01:01:24", "A*01:01:01:25", "A*01:01:01:26", "A*01:01:01:27", "A*01:01:01:28", "A*01:01:01:29", "A*01:01:01:30", "A*01:01:01:31", "A*01:01:01:32", "A*01:01:01:33", "A*01:01:01:34", "A*01:01:01:35", "A*01:01:01:36", "A*01:01:01:37", "A*01:01:01:38", "A*01:01:01:39", "A*01:01:01:40", "A*01:01:01:41", "A*01:01:01:42", "A*01:01:01:43", "A*01:01:01:44", "A*01:01:01:45", "A*01:01:01:46", "A*01:01:01:47", "A*01:01:01:48", "A*01:01:01:49", "A*01:01:01:50", "A*01:01:01:51", "A*01:01:01:52", "A*01:01:01:53", "A*01:01:01:54", "A*01:01:01:55", "A*01:01:01:56", "A*01:01:01:57", "A*01:01:01:58", "A*01:01:01:59", "A*01:01:01:60", "A*01:01:01:61", "A*01:01:01:62", "A*01:01:38L", "A*01:01:51", "A*01:01:83", "A*01:01:84", "A*01:01:91", "A*01:01:93", "A*01:01:94", "A*01:01:95", "A*01:01:100", "A*01:01:103", "A*01:01:104", "A*01:01:105", "A*01:01:106", "A*01:01:108", "A*01:01:115", "A*01:01:116", "A*01:01:118", "A*01:04:01:01N", "A*01:04:01:02N", "A*01:22N", "A*01:32", "A*01:37:01:01", "A*01:37:01:02", "A*01:45", "A*01:56N", "A*01:81", "A*01:87N", "A*01:103", "A*01:107", "A*01:109", "A*01:132", "A*01:141", "A*01:142", "A*01:155", "A*01:177", "A*01:212", "A*01:217", "A*01:234", "A*01:237", "A*01:246", "A*01:248Q", "A*01:249", "A*01:251", "A*01:252", "A*01:253", "A*01:261", "A*01:274", "A*01:276", "A*01:277", "A*01:280", "A*01:281Q", "A*01:288", "A*01:291", "A*01:295", "A*01:296", "A*01:297", "A*01:300", "A*01:305", "A*01:306", "A*01:309", "A*01:316", "A*01:317", "A*01:319", "A*01:323", "A*01:324", "A*01:325", "A*01:328N", "A*01:331N", "A*01:332", "A*01:346", "A*01:347", 
"A*01:349", "A*01:351", "A*01:353"]
len(a_0101G_list)

136

In [21]:
#Alleles belonging to A*01:01:01G, keeping only the first allele of each distinct two-field name
a_0101G_diff_from_2_field_list = ["A*01:01:01:01", "A*01:04:01:01N", "A*01:22N", "A*01:32", "A*01:37:01:01", "A*01:45", "A*01:56N", "A*01:81", "A*01:87N", "A*01:103", "A*01:107", "A*01:109", "A*01:132", "A*01:141", "A*01:142", "A*01:155", "A*01:177", "A*01:212", "A*01:217", "A*01:234", "A*01:237", "A*01:246", "A*01:248Q", "A*01:249", "A*01:251", "A*01:252", "A*01:253", "A*01:261", "A*01:274", "A*01:276", "A*01:277", "A*01:280", "A*01:281Q", "A*01:288", "A*01:291", "A*01:295", "A*01:296", "A*01:297", "A*01:300", "A*01:305", "A*01:306", "A*01:309", "A*01:316", "A*01:317", "A*01:319", "A*01:323", "A*01:324", "A*01:325", "A*01:328N", "A*01:331N", "A*01:332", "A*01:346", "A*01:347", "A*01:349", "A*01:351", "A*01:353"]
len(a_0101G_diff_from_2_field_list)

56

In [23]:
convert_to_g_group('A*01:01:11')

'A*01:01:11'

## Number of P group alleles
The P group dict was built differently from the G group dict: two-field alleles that do not map to a P group (i.e. they make up their own P group) are not included in the dict.
I therefore take all the alleles (g_group_values), run each of them through convert_to_p_group, and then take the unique values.

In [14]:
#Total number of P groups, taken from the P group file
#(http://hla.alleles.org/alleles/p_groups.html).
#Terms, in order: A, B and C - starting lines + DQB1 + DRB1
#NOTE(review): "starting lines" presumably means the header lines of the file - confirm.
10623-6+842+1744


13203

In [15]:
#Cross-check against p_group_dict: number of distinct P groups it maps to
len(set(p_group_dict.values()))

13203

In [16]:
#Convert every distinct G group to P-group resolution and keep the unique results
unique_p_alleles_set = {convert_to_p_group(allele) for allele in set(g_group_values)}

len(unique_p_alleles_set)

14001

In [17]:
#Results of the conversion above that are not P groups registered in p_group_dict:
unique_p_alleles_set - set(p_group_dict.values())

{'A*01:11',
 'A*01:123',
 'A*01:15',
 'A*01:16',
 'A*01:160',
 'A*01:162',
 'A*01:178',
 'A*01:179',
 'A*01:18',
 'A*01:186',
 'A*01:240',
 'A*01:247',
 'A*01:250',
 'A*01:258',
 'A*01:269',
 'A*01:27',
 'A*01:285',
 'A*01:290',
 'A*01:293',
 'A*01:308',
 'A*01:31',
 'A*01:320',
 'A*01:326',
 'A*01:336',
 'A*01:52',
 'A*01:53',
 'A*01:57',
 'A*02:113',
 'A*02:125',
 'A*02:222',
 'A*02:223',
 'A*02:225',
 'A*02:226',
 'A*02:227',
 'A*02:250',
 'A*02:284',
 'A*02:301',
 'A*02:314',
 'A*02:32',
 'A*02:321',
 'A*02:350',
 'A*02:366',
 'A*02:373',
 'A*02:395',
 'A*02:439',
 'A*02:468',
 'A*02:476',
 'A*02:490',
 'A*02:501',
 'A*02:514',
 'A*02:516',
 'A*02:525',
 'A*02:53',
 'A*02:540',
 'A*02:622',
 'A*02:643',
 'A*02:696',
 'A*02:715',
 'A*02:748',
 'A*02:773',
 'A*02:775',
 'A*02:788',
 'A*02:789',
 'A*02:796',
 'A*02:797',
 'A*02:803',
 'A*02:806',
 'A*02:807',
 'A*02:82',
 'A*02:831',
 'A*02:833',
 'A*02:858',
 'A*02:879',
 'A*02:88',
 'A*02:880',
 'A*02:887',
 'A*02:895',
 'A*02:94',


Using this site: https://www.ebi.ac.uk/cgi-bin/ipd/imgt/hla/allele.cgi
I found out that the alleles not included are null alleles, and that they should therefore be included when comparing to the number of two-field and G group resolutions.

## Number of evaxion group alleles:
Using the info found before (about null alleles not being included) I use this method for finding the number of e-group alleles:

In [18]:
#Convert every distinct G group to E-group resolution and keep the unique results
unique_e_alleles_set = {convert_to_e_group(allele) for allele in set(g_group_values)}

len(unique_e_alleles_set)

12433

# Overlap between groups

In [19]:
#NOTE(review): `g_group` is not defined anywhere in this notebook, so this cell raises
#NameError on a fresh run - presumably g_group_dict or g_group_values was meant; confirm or remove.
g_group

NameError: name 'g_group' is not defined

In [None]:
#Count the G groups whose G-group conversion equals their own two-field name
overlap = 0

for allele in set(g_group_values):
    allele_two_field = make_two_field(allele)
    allele_g_group = convert_to_g_group(allele)

    print(allele, allele_two_field, allele_g_group)

    if allele_g_group == allele_two_field:
        overlap = overlap + 1

print(overlap)

In [None]:
overlap

In [None]:
convert_to_g_group('C*01:02:40')

# Number of null alleles, not in a P group
Results taken from: Result parsing (null alleles)
					Plotting (number of mistakes)

In [4]:
#Denominators for the percentages in the cells below (n_tot is used for "% of total").
#NOTE(review): presumably n_tot is the total number of typed alleles and n_hlaI the
#number of HLA class I alleles - confirm against the result-parsing notebook.
n_tot = 8290
n_hlaI = 4974

In [10]:
#Kourami

#count: number of calls listed below this cell
count = 14
mistakes_p = 1352
mistakes_netmhc = 1297

#Percentage of errors:
share_of_p_mistakes = count/mistakes_p * 100
share_of_netmhc_mistakes = count/mistakes_netmhc * 100
print("P group: % of errors:", share_of_p_mistakes, "%")
print("NetMHCseq: % of errors:", share_of_netmhc_mistakes, "%")

#Percentage of total:
share_of_total = count/n_tot * 100
print("% of total:", share_of_total, "%")

P group: % of errors: 1.0355029585798818 %
NetMHCseq: % of errors: 1.0794140323824208 %
% of total: 0.16887816646562123 %


HG00463	C*08:36
HG00501	A*24:86
HG00501	A*24:86
HG00689	C*08:36
HG01061	DRB1*04:178
HG01353	A*24:86
NA06986	DQB1*02:18
NA19307	DRB1*15:17
NA19359	DRB1*15:17
NA19384	DRB1*15:17
NA19397	DRB1*15:17
NA20289	A*30:73
NA20512	DRB1*15:17
NA20534	A*30:73

In [9]:
#HLA-LA

#count: number of calls listed below this cell
count = 7
mistakes_p = 309
mistakes_netmhc = 269

#Percentage of errors:
share_of_p_mistakes = count/mistakes_p * 100
share_of_netmhc_mistakes = count/mistakes_netmhc * 100
print("P group: % of errors:", share_of_p_mistakes, "%")
print("NetMHCseq: % of errors:", share_of_netmhc_mistakes, "%")

#Percentage of total:
share_of_total = count/n_tot * 100
print("% of total:", share_of_total, "%")


P group: % of errors: 2.26537216828479 %
NetMHCseq: % of errors: 2.6022304832713754 %
% of total: 0.08443908323281062 %



HG00240	A*02:250
HG00310	A*03:03
HG00327	A*68:18
HG00407	B*40:265
HG00620	DQB1*03:118
HG01438	A*03:36
NA20799	A*03:03


Optitype

count: 0
mistakes at P group level: 67
mistakes at NetMHCseq group level: 56


In [11]:
#Hisatgenotype

#count: number of calls listed below this cell
count = 16
mistakes_p = 894
mistakes_netmhc = 837

#Percentage of errors:
share_of_p_mistakes = count/mistakes_p * 100
share_of_netmhc_mistakes = count/mistakes_netmhc * 100
print("P group: % of errors:", share_of_p_mistakes, "%")
print("NetMHCseq: % of errors:", share_of_netmhc_mistakes, "%")

#Percentage of total:
share_of_total = count/n_tot * 100
print("% of total:", share_of_total, "%")

P group: % of errors: 1.7897091722595078 %
NetMHCseq: % of errors: 1.911589008363202 %
% of total: 0.19300361881785283 %



HG00109	DQB1*06:112
HG00120	A*01:16
HG00272	A*01:16
HG00310	DQB1*06:112
HG00332	A*02:321
HG00346	DQB1*06:112
HG00384	A*03:11
HG00403	A*02:321
HG00463	DQB1*03:95
HG00500	C*01:117
HG00533	DQB1*06:112
HG00640	C*05:07
HG01061	C*04:217
NA19473	C*04:105
NA19657	A*01:16
NA20757	A*02:321

In [12]:
#STC-seq

#count: number of calls listed below this cell
count = 33
mistakes_p = 4060
mistakes_netmhc = 4000

#Percentage of errors:
share_of_p_mistakes = count/mistakes_p * 100
share_of_netmhc_mistakes = count/mistakes_netmhc * 100
print("P group: % of errors:", share_of_p_mistakes, "%")
print("NetMHCseq: % of errors:", share_of_netmhc_mistakes, "%")

#Percentage of total:
share_of_total = count/n_tot * 100
print("% of total:", share_of_total, "%")

P group: % of errors: 0.812807881773399 %
NetMHCseq: % of errors: 0.8250000000000001 %
% of total: 0.3980699638118214 %



HG00096	A*01:11
HG00127	A*26:127
HG00131	C*06:116
HG00131	C*06:116
HG00142	C*06:116
HG00245	DQB1*03:90
HG00367	C*06:116
HG00372	DQB1*06:193
HG00451	A*11:251
HG00475	DQB1*06:54
HG00557	C*06:116
HG01066	DQB1*02:18
HG01110	C*04:217
HG01110	C*06:116
HG01111	B*35:145
HG01187	C*06:116
HG01197	C*06:116
HG01256	C*07:393
NA06985	C*06:116
NA06986	DQB1*02:18
NA12046	C*06:116
NA12046	C*06:116
NA12286	C*06:116
NA12286	C*06:116
NA19309	C*06:116
NA19311	B*14:41
NA19318	B*07:251
NA19446	C*06:116
NA19463	C*06:116
NA19747	B*14:41
NA20755	B*41:45
NA20800	B*37:42
NA20800	B*37:42

# Weird G group conversion:

In [None]:
#Make dict for G-type conversion:
#allele name exactly as written in the file (or the G group name itself) -> G group name
#NOTE(review): this cell repeats the G-group construction found earlier in the notebook;
#running it rebuilds g_group_dict from scratch.

g_group_filepath = gold_standard_path + 'g_group_resolution.txt'

g_group_dict = dict()


#First pass: conversion from full allele names to G groups
with open(g_group_filepath, 'r') as infile:
    for line in infile:
        #Skip the lines starting with "#"
        if line.startswith('#'):
            continue

        gene = line.split('*')[0]

        #Only register the valid genes:
        if gene not in ['A', 'B', 'C', 'DRB1', 'DQB1']:
            continue

        #Find the full G type for the entry - differs depending on whether there
        #are one or multiple entries in the group.
        #NOTE(review): [:-1] assumes every line ends with a newline; it is kept
        #because the second pass below derives the group name the same way.
        if "/" in line:
            g_group_full = gene + "*" + line.split(';')[-1][:-1]
        else:
            g_group_full = gene + "*" + line.split(';')[-2]

        #Read the alleles and clean up the front and end part
        synonymous_alleles = line.split('/')
        synonymous_alleles[0] = synonymous_alleles[0].split(';')[1]
        synonymous_alleles[-1] = synonymous_alleles[-1].split(';')[0]

        #Add "gene*" in front of all alleles; the set removes duplicates
        group_keys = {gene + "*" + allele for allele in synonymous_alleles}

        #Add the group itself, so that tools predicting directly in G-group
        #resolution can be converted in similar fashion
        group_keys.add(g_group_full)

        #Add a key in the dict for each of the unique entries:
        for group_key in sorted(group_keys, reverse=True):
            g_group_dict[group_key] = g_group_full


                    
print("Ambiguities in two-field resolution:")
print("Allele", "Primary G-group", "Alternative group", sep = "\t")
#Second pass: add entries for two-field and three-field typings that are not already
#found in g_group_dict.
#This is done after the first pass, in order not to overwrite ambiguous G-group mappings
#such as C*02:02, which could map to C*02:02:01 or C*02:10:01G depending on the full typing.
#An existing entry is never overwritten here; a differing group is only reported.
with open(g_group_filepath, 'r') as infile:
    for line in infile:
        #Skip the lines starting with "#"
        if line[0] != '#':
            gene = line.split('*')[0]
            
            #Only register the valid genes:
            if gene in ['A', 'B', 'C', 'DRB1', 'DQB1']:                
                
                #Group name: last field (minus its final character) when the line holds
                #several "/"-separated alleles, otherwise the second-to-last field
                if "/" in line:
                    g_group_full = gene + "*" + line.split(';')[-1][:-1]
                else:
                    g_group_full = gene + "*" + line.split(';')[-2]

                #Two-field keys: read the alleles and clean up the front and end part
                synonymous_alleles_two_field = line.split('/')
                synonymous_alleles_two_field[0] = synonymous_alleles_two_field[0].split(';')[1]
                synonymous_alleles_two_field[-1] = synonymous_alleles_two_field[-1].split(';')[0]
                                
                #Convert all alleles to two-field resolution:
                #NOTE(review): unlike the three-field list below, None results of
                #make_two_field are not filtered out here - confirm none can occur.
                for i in range(len(synonymous_alleles_two_field)):
                    synonymous_alleles_two_field[i] = gene + "*" + synonymous_alleles_two_field[i]
                    synonymous_alleles_two_field[i] = make_two_field(synonymous_alleles_two_field[i])

                #Remove the duplicates created by the conversion to two-field:
                synonymous_allels_unique_two_field = sorted(list(set(synonymous_alleles_two_field)), reverse=True)
                
                #Add a key in the dict for each of the unique entries:
                for allele in synonymous_allels_unique_two_field:                    
                    
                    #Only add entries which aren't already in the dict
                    if (allele in g_group_dict.keys()):
                        
                        #Two-field ambiguity in the G-type conversion, e.g. C*02:02:02G and C*02:10:01G have the same exon sequence for exon 2 and 3.
                        #Only printed (and only for lines with several alleles); the mapping already in the dict is kept.
                        if (g_group_dict[allele] != g_group_full) and ("/" in line):
                            print("two_field_res", allele, g_group_dict[allele], g_group_full, sep=', ')
                            
                    else:
                        g_group_dict[allele] = g_group_full
                
                #Three-field keys: read the alleles and clean up the front and end part
                synonymous_alleles_three_field = line.split('/')
                synonymous_alleles_three_field[0] = synonymous_alleles_three_field[0].split(';')[1]
                synonymous_alleles_three_field[-1] = synonymous_alleles_three_field[-1].split(';')[0]
                
                #Convert all alleles to three-field resolution:
                for i in range(len(synonymous_alleles_three_field)):
                    synonymous_alleles_three_field[i] = gene + "*" + synonymous_alleles_three_field[i]
                    synonymous_alleles_three_field[i] = make_three_field(synonymous_alleles_three_field[i])
                    
                
                #Drop the alleles that have fewer than three fields (make_three_field returned None)
                synonymous_alleles_three_field = [i for i in synonymous_alleles_three_field if i != None]
                #Remove the duplicates created by the conversion to three-field:
                synonymous_allels_unique_three_field = sorted(list(set(synonymous_alleles_three_field)), reverse=True)
                
                #Add a key in the dict for each of the unique entries:
                for allele in synonymous_allels_unique_three_field:                    
                    
                    #Only add entries which aren't already in the dict
                    if (allele in g_group_dict.keys()):
                        
                        #Three-field ambiguity in the G-type conversion (same situation as for two-field above).
                        #Only printed (and only for lines with several alleles); the mapping already in the dict is kept.
                        if (g_group_dict[allele] != g_group_full) and ("/" in line):
                            print("three_field_res:", allele, g_group_dict[allele], g_group_full, sep=', ')
                            
                    else:
                        g_group_dict[allele] = g_group_full

                
                
                    
#Make G type conversion function using g_group_dict
def convert_to_g_group(allele):
    """Convert an allele name to G-group resolution.

    The name is looked up in g_group_dict exactly as given.
    NOTE(review): a function with this name is also defined earlier in the
    notebook; this definition replaces it when the cell is run.

    Returns:
        The G group registered for the name; if there is none, the name
        reduced to two-field resolution by make_two_field.
    """
    #Unknown name: fall back to two-field resolution
    if allele not in g_group_dict.keys():
        return make_two_field(allele)

    return g_group_dict[allele]
