In [1]:
import os
from os import walk

import pandas as pd
import numpy as np
import sys
import re

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

pd.set_option('display.max_rows', 9000)
pd.set_option('display.max_columns', 1500)
pd.set_option('max_colwidth', 400)

In [2]:
main_folder_path = 'C:\\Users\\nikol\\OneDrive\\DTU\\11_semester\\'

resultpath = main_folder_path + '1000_genomes_results\\'

gold_standard_path = main_folder_path + 'gold_standard_data\\'

# Functions for conversion to P-group resolution

In [3]:
def make_four_field(allele_high_res):
    four_field_finder = re.search(r"(A|B|C|DRB1|DQB1)\*\d{2}:\d{2,3}", allele_high_res)
    
    allele_four_field = four_field_finder.group(0)
    
    return allele_four_field    

In [4]:
#Make dict for G-type conversion

p_type_filepath = gold_standard_path + 'p_group_resolution.txt'

p_type_dict = dict()

#Read the important results:
with open(p_type_filepath, 'r') as infile:
    for line in infile:
        #If several alleles map to the same one, they are separated by a "/"
        if ('/' in line) and (line[0] != '#'):
            gene = line.split('*')[0]
            
            #Only register the valid alleles:
            if gene in ['A', 'B', 'C', 'DRB1', 'DBQ1']:
                        
                #Find the four field, G type resolution. The G type is found at the end of the line.
                p_type_full = gene + "*" + line.split(';')[-1]
                p_type_four_field = make_four_field(p_type_full)
            
                #Read the rest of the alleles and clean up the front and end part
                synonymous_alleles = line.split('/')
                synonymous_alleles[0] = synonymous_alleles[0].split(';')[1]
                synonymous_alleles[-1] = synonymous_alleles[-1].split(';')[0]
                

                #Convert all alleles to four field resolution
                for i in range(len(synonymous_alleles)):
                    synonymous_alleles[i] = gene + "*" + synonymous_alleles[i]
                    synonymous_alleles[i] = make_four_field(synonymous_alleles[i])

                #Remove duplicates when converting to four field:
                synonymous_allels_unique_four_field = list(set(synonymous_alleles))
                
                #Add key in dict for each of the unique entries:
                for synonymous_allele in synonymous_allels_unique_four_field:
                    
                    #Check for ambiguities in the G-type conversion e.g. C*02:02:02G and C*02:10:01G have same exon sequence for exon 2 and 3
                    if synonymous_allele in p_type_dict.keys():
                        print(synonymous_allele, p_type_full)
                    
                    p_type_dict[synonymous_allele] = p_type_four_field

In [5]:
#Make G type conversion function:
def convert_to_four_field_p_type(allele):

    #Start by converting to four field:
    allele_four_field = make_four_field(allele)
    
    #Find corresponding G-type if it exists. If not, return the four field resolution
    if allele_four_field in p_type_dict:
        if allele_four_field != p_type_dict[allele_four_field]:
            print(allele_four_field)
            print(p_type_dict[allele_four_field])
        allele_four_field_p_type = p_type_dict[allele_four_field]
    else:
        allele_four_field_p_type = allele_four_field
        
    return allele_four_field_p_type


# Typing Results:

## Kourami
Save both single and ambiguous results

In [79]:
kourami_result_filepath = resultpath + 'kourami\\'

kourami_log_filepath = resultpath + 'kourami_results_from_logfiles\\'

kourami_files = list()
kourami_logfiles = list()

#Initalize result dict for single guess and for multiple typing
kourami_results = dict()
kourami_results_ambiguous = dict()


for (dirpath, dirnames, filenames) in walk(kourami_result_filepath):
    kourami_files.extend(filenames)
    
for (dirpath, dirnames, filenames) in walk(kourami_log_filepath):
    kourami_logfiles.extend(filenames)
    

for filename in kourami_files:
    #Don't include the performance logs
    if filename.endswith('.txt'):
        
        #If file is not empty:
        if os.stat(kourami_result_filepath + filename).st_size != 0:
            temp_result_list = list()
            temp_result_list_ambiguous = list()
    
            with open(kourami_result_filepath + filename, 'r') as infile:

                for line in infile:
                    #Find the first match / allele prediction
                    allele_searcher = re.search(r"(A|B|C|DRB1|DQB1)\*\d{2}:\d{2,3}", line)
                    
                    #Find all allele predictions (ambiguous). ?: is required for non-capturing groups
                    allele_searcher_ambiguous = re.findall(r"(?:A|B|C|DRB1|DQB1)\*\d{2}:\d{2,3}", line)
                    
                    if allele_searcher is not None:
                        found_allele = allele_searcher.group(0)
                        
                        #Convert to G-type
                        found_allele_four_field_p_type = convert_to_four_field_p_type(found_allele)
                        
                        for i in range(len(allele_searcher_ambiguous)):
                            allele_searcher_ambiguous[i] = convert_to_four_field_p_type(allele_searcher_ambiguous[i])
                            
                        #Remove duplicates for loci for ambiguous                
                        found_allele_four_field_p_type_ambiguous = list(set(allele_searcher_ambiguous))
                        
                        #Add to list of predictions for this sample
                        temp_result_list += [found_allele_four_field_p_type]
                        temp_result_list_ambiguous += found_allele_four_field_p_type_ambiguous

            #Add sample prediction to dict
            kourami_results[filename[:-4]] = temp_result_list
            kourami_results_ambiguous[filename[:-4]] = temp_result_list_ambiguous
            
        #If file is empty, take results from logfiles.    
        else:
            print("Prediction failed for: ", filename, ". Grabbing result from .log-file instead" , sep = '')
            
            log_filename = (filename[:-4] + '.log')
            
            if log_filename in kourami_logfiles:
                
                temp_result_list = list()
    
                with open(kourami_log_filepath + log_filename, 'r') as infile:
            
                    for line in infile:
                        allele_searcher = re.search(r"(A|B|C|DRB1|DQB1)\*\d{2}:\d{2,3}", line)
                        
                        if allele_searcher is not None:
                            found_allele = allele_searcher.group(0)
                            
                            found_allele_two_field = convert_to_four_field_p_type(found_allele)
                            temp_result_list.append(found_allele_two_field)

                kourami_results[filename[:-4]] = temp_result_list
            
            else:
                print("Unable to load results from logfile. No prediction added for", filename)         
                
#kourami_results

C*02:10
C*02:02
C*02:10
C*02:02
A*74:21
A*02:65
A*74:21
A*02:65
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
Prediction failed for: NA19346.txt. Grabbing result from .log-file instead
Prediction failed for: NA19471.txt. Grabbing result from .log-file instead
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*06:17
C*06:02
C*06:17
C*06:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02


## HLA-LA

In [7]:
hla_la_result_filepath = resultpath + 'hla-la\\'

hla_la_files = []
for (dirpath, dirnames, filenames) in walk(hla_la_result_filepath):
    hla_la_files.extend(filenames)
    
#print(hla_la_files)


hla_la_results = dict()

for filename in hla_la_files:
    if filename.endswith('.txt'):
        temp_results_0 = pd.read_csv(hla_la_result_filepath + filename, sep = "\t")['Allele']
        temp_results = [i for i in temp_results_0 if i.startswith(('A', 'B', 'C', 'DRB1', 'DQB1'))]

        hla_la_results[filename[:-4]] = [convert_to_four_field_p_type(i) for i in temp_results]


#hla_la_results

C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02


## Optitype

In [8]:
optitype_result_filepath = resultpath + 'optitype\\'

optitype_files = []
for (dirpath, dirnames, filenames) in walk(optitype_result_filepath):
    optitype_files.extend(filenames)
    
#print(optitype_files)


optitype_results = dict()

for filename in optitype_files:
    if filename.endswith('.txt'):
        temp_results = list(pd.read_csv(optitype_result_filepath + filename, sep = "\t").iloc[0])[1:7]

        temp_results_filtered = [i for i in temp_results if isinstance(i,str)]
        optitype_results[filename[:-4]] = [convert_to_four_field_p_type(i) for i in temp_results_filtered]

#optitype_results

C*02:10
C*02:02
C*07:06
C*07:01
C*07:06
C*07:01
C*02:10
C*02:02
A*74:02
A*74:01
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*07:06
C*07:01
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02


## Hisatgenotype

In [9]:
hisatgenotype_result_filepath = resultpath + 'hisatgenotype\\'

hisatgenotype_files = []
for (dirpath, dirnames, filenames) in walk(hisatgenotype_result_filepath):
    hisatgenotype_files.extend(filenames)
    
#print(hisatgenotype_files)

#Save two predictions. One, with one guess per allele and one with the full prediction
hisatgenotype_results = dict()

for filename in hisatgenotype_files:
    if filename.endswith('.txt'):
    
        hisatgenotype_resultlist = list()
        
        hisatgenotype_resultlist_ambiguous = list()
      
        with open(hisatgenotype_result_filepath + filename) as infile:
            for line in infile:
                result = re.match(r'^\t+(1|2)\sranked (A|B|C|DRB1|DQB1)',line)
            
                if result is not None:
                    hisatgenotype_resultlist.append(line.split()[2])              
                     
            #Duplicate prediction for an allele in case of homologous case, so that each gene has two predictions.
            #In a homologous case, both result dicts only have one prediction and both needs an update.
            for allele in ['A', 'B', 'C', 'DRB1', 'DQB1']:
                allele_list = [pred for pred in hisatgenotype_resultlist if pred.startswith(allele)]

                if len(allele_list) == 1:
                    hisatgenotype_resultlist.append(allele_list[0])
                    hisatgenotype_resultlist.sort()

        hisatgenotype_results[filename[:-4]] = [convert_to_four_field_p_type(i) for i in hisatgenotype_resultlist]
                
#hisatgenotype_results           
        

DRB1*11:97
DRB1*11:01
A*03:26
A*03:01
DRB1*14:54
DRB1*14:01
C*02:10
C*02:02
A*03:26
A*03:01
DRB1*14:54
DRB1*14:01
C*07:18
C*07:01
B*44:27
B*44:02
DRB1*14:54
DRB1*14:01
C*07:18
C*07:01
A*03:26
A*03:01
C*07:18
C*07:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
A*02:69
A*02:11
B*44:27
B*44:02
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
C*07:06
C*07:01
C*07:18
C*07:01
DRB1*14:54
DRB1*14:01
A*23:17
A*23:01
C*02:10
C*02:02
DRB1*14:54
DRB1*14:01
A*74:02
A*74:01
C*12:109
C*12:03
A*11:126
A*11:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
C*07:18
C*07:01
A*24:270
A*24:02
C*15:13
C*15:02
A*23:17
A*23:01
A*23:17
A*23:01
C*02:10
C*02:02
C*17:03
C*17:01
C*02:10
C*02:02
B*07:252
B*07:02
B*51:193
B*51:01
C*07:18
C*07:01
C*07:18
C*07:01
B*51:193
B*51:01
B*07:06
B*07:05
C*17:03
C*17:01
C*02:10
C*02:02
DRB1*14:54
DRB1*14:01
C*15:13
C*15:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
C*02:10
C*02:02
DRB1*14:54
DRB1*14

## STC-seq

In [10]:
def flatten(a):
    return [item for sublist in a for item in sublist]

In [11]:
stc_seq_result_filepath = resultpath + 'stc-seq\\'

stc_seq_files = []
for (dirpath, dirnames, filenames) in walk(stc_seq_result_filepath):
    stc_seq_files.extend(filenames)
    
#print(stc_seq_files)

stc_seq_results = dict()
stc_seq_results_ambiguous = dict()

for filename in stc_seq_files:
    if filename.endswith('.txt'):
        stc_df = pd.read_csv(stc_seq_result_filepath + filename, sep = "\t").reset_index()
        stc_df.columns = stc_df.iloc[0]
        stc_list = list(stc_df[stc_df['Locus'].isin(['A', 'B', 'C','DRB1', 'DQB1'])]['Genotype'])
        stc_seq_alt_list = list(stc_df[stc_df['Locus'].isin(['A', 'B', 'C','DRB1', 'DQB1'])]['Alternative_genotype'].dropna())

        temp_results =  flatten([i.split(',') for i in stc_list])
        temp_results_ambiguous = temp_results + flatten([i.replace(';',',').split(',') for i in stc_seq_alt_list])

        stc_seq_results[filename[:-4]] = [convert_to_four_field_p_type(i) for i in temp_results]
        stc_seq_results_ambiguous[filename[:-4]] = [convert_to_four_field_p_type(i) for i in temp_results_ambiguous] 

#stc_seq_results

A*01:81
A*01:01
A*01:81
A*01:01
A*01:81
A*01:01
A*03:26
A*03:01
C*05:03
C*05:01
C*05:03
C*05:01
DRB1*08:77
DRB1*08:01
DRB1*08:77
DRB1*08:01
A*03:26
A*03:01
A*03:26
A*03:01
A*03:26
A*03:01
A*03:26
A*03:01
A*11:126
A*11:01
A*03:26
A*03:01
A*03:26
A*03:01
A*01:81
A*01:01
A*01:81
A*01:01
A*01:81
A*01:01
C*05:03
C*05:01
C*05:03
C*05:01
C*05:03
C*05:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
B*35:57
B*35:01
C*02:10
C*02:02
C*02:10
C*02:02
B*53:37
B*53:01
B*53:37
B*53:01
B*53:37
B*53:01
B*53:37
B*53:01
A*03:26
A*03:01
A*03:26
A*03:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
C*07:18
C*07:01
C*07:18
C*07:01
DRB1*08:77
DRB1*08:01
DRB1*08:77
DRB1*08:01
C*15:87
C*15:02
B*44:27
B*44:02
B*44:27
B*44:02
A*01:81
A*01:01
DRB1*08:77
DRB1*08:01
DRB1*08:77
DRB1*08:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
A*03:26
A*03:01
A*03:26
A*03:01
A*03:26
A*03:01
DRB1*08:77
DRB1*08:01
DRB1*08:77
DRB1*08:01
DRB1*08:77
DRB1*08:01
DRB1*03:124
DRB1*03:01
DRB1*03:124
DRB1*03:01
A*03:26
A*03:01
A*03:26
A*03:01

## Depth analysis

In [12]:
#load data
depth_filepath = resultpath + 'depth\\'

depth_files = []
for (dirpath, dirnames, filenames) in walk(depth_filepath):

    depth_files.extend(filenames)

depth_results = dict()

for filename in depth_files:
    
    if filename.endswith('.depth.mosdepth.summary.txt'):
        temp_results = pd.read_csv(depth_filepath + filename, sep='\t')
        
        temp_results.set_index('chrom', inplace=True)
        
        mean_region_depth = temp_results.loc['total_region', 'mean']
        depth_results[filename[0:7]] = mean_region_depth
    
#depth_results

# Load Gold standard (## 1000Genomes)

## Load 2014 dataset

Loading and preprocessing of gold-standard dataset 

In [52]:
milleg_2014_raw_filepath= gold_standard_path + '1000G_hla_diversity.txt'
milleg_2014_filepath= gold_standard_path + '1000G_hla_diversity_edited.txt'

#Remove quotes from file:
with open(milleg_2014_raw_filepath, 'r') as infile:
    newfile = ""
    for line in infile:
        newline = line.replace("\"", "")
        newfile += newline

outfile = open(milleg_2014_filepath, 'w')
outfile.write(newfile)
outfile.close()

#Make new dataframe pretty:

mille_gs_df = pd.read_csv(milleg_2014_filepath, sep = " ")
#mille_gs_df

#Change name of Utah individuals from CEPH to CEU as seen in the 1000 genomes database:
mille_gs_df.replace('CEPH','CEU', inplace=True)

#Remove samples with NaN (non typed alleles)
mille_gs_df.dropna(inplace=True)          
#Print number of subgroups in the dataset:
#mille_gs_df['sbgroup'].unique()



### Remove samples, which doesn't have exome data on 1000Genomes and create download links

In [53]:
import http.client
from urllib.parse import urlparse

def checkUrl(url):
    p = urlparse(url)
    conn = http.client.HTTPConnection(p.netloc)
    conn.request('HEAD', p.path)
    resp = conn.getresponse()
    return resp.status < 400

# if __name__ == '__main__':
#     print(checkUrl('http://www.stackoverflow.com')) # True
#     print(checkUrl('http://stackoverflow.com/notarealpage.html')) # False

In [54]:
# #Thre results of this code are saved in two files, to save time

# #Check all exome links to create a list of id's with exome data
# gold_standard_id_list = list()

# #Save the valid urls for download later
# gold_standard_url_list = list()

# i = 0

# original_sample_list = list(mille_gs_df['id'])

# for identity in original_sample_list:
#     i += 1
#     if i % 100 == 0:
#         print(i, " iterations completed")
        
#     sbgroup = mille_gs_df[mille_gs_df['id'] == identity]['sbgroup'].iloc[0]         
#     wget_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/{}/{}/exome_alignment/{}.alt_bwamem_GRCh38DH.20150826.{}.exome.cram".format(sbgroup, identity, identity, sbgroup, )
    
#     if checkUrl(wget_url):
#         gold_standard_id_list.append(identity)
#         gold_standard_url_list.append(wget_url)

# with open(gold_standard_path + 'gold_standard_url_list.txt', 'w') as outfile:
#     for entry in wget_url_list:
#         outfile.write(entry + '\n')
    
# with open(gold_standard_path + 'gold_standard_id_list.txt', 'w') as outfile:
#     for entry in valid_wes_id_list:
#         outfile.write(entry + '\n')

gold_standard_url_list = list()
with open(gold_standard_path + 'gold_standard_url_list.txt', 'r') as infile:
    for line in infile:
        gold_standard_url_list.append(line[:-1])   

gold_standard_id_list= list()
with open(gold_standard_path + 'gold_standard_id_list.txt', 'r') as infile:
    for line in infile:
        gold_standard_id_list.append(line[:-1])   

In [55]:
#10 samples analysed in ATHLATES and with results taken from 2018 dataset.
high_cov_id_list = ["HG01756", "HG01757", "HG01872", "HG01873", "HG01886", "HG01953", "HG01968", "HG02014", "HG02057", "NA20313"]


In [56]:
MG_exome_df = mille_gs_df[mille_gs_df['id'].isin(gold_standard_id_list)]

#Replace 0000 with empty string:
MG_exome_df.replace('0000', '', inplace=True)

#MG_exome_df

### Double Typing
Merge rows, where a person has been typed twice. (this was only relevant for 2014 data)

In [57]:
#Check for non-identical rows
print(len(set(list(MG_exome_df['id']))))
print(len(list(MG_exome_df['id'])))

819
822


In [58]:
non_unique = list({x for x in list(MG_exome_df['id']) if list(MG_exome_df['id']).count(x) > 1})
non_unique_df = MG_exome_df[MG_exome_df['id'].isin(non_unique)]
non_unique_df

Unnamed: 0,id,sbgroup,A,A.1,B,B.1,C,C.1,DRB1,DRB1.1,DQB1,DQB1.1
440,NA19119,YRI,23:01:01/23:07N/23:17/23:18/23:20,36:01,35:01:01/35:01:03/35:40N/35:42/35:57/35:94,49:01:01,04:01:01:01/04:01:01:02/04:01:01:03/04:09N/04:28/04:30/04:41,07:01:01/07:01:02/07:01:09/07:06/07:18/07:52,03:01:01:01/03:01:01:02,07:01:01:01/07:01:01:02,02:01:01/02:02/02:04,05:01:01
441,NA19119,YRI,23:01,36:01,35:01,49:01,04:01,07:01,,,,
480,NA19210,YRI,03:01,33:01,15:10,58:01,03:02:01/03:02:02/03:02:03,08:04,,,,
481,NA19210,YRI,03:01:01:01/03:01:01:02N/03:01:01:03/03:01:07/03:20/03:21N/03:26/03:37/03:45,33:01:01,15:10,58:01:01/58:11,03:02:01/03:02:02/03:02:03,08:04,03:01:01:01/03:01:01:02,13:01:01,03:01:01/03:01:04/03:09/03:19/03:21/03:22/03:24,06:03:01
486,NA19223,YRI,,,,,,,03:02,13:27,04:02,02:01
487,NA19223,YRI,30:01:01/30:01:02/30:24,33:03:01/33:03:03/33:15/33:25,41:04,42:01,17:01/17:02/17:03,17:01/17:02/17:03,03:02:01,13:27,02:01:01/02:02/02:04,04:02


In [59]:
clean_duplicates_df = pd.DataFrame()
for column in ['A','A.1','B','B.1','C','C.1','DRB1','DRB1.1','DQB1','DQB1.1']:
    clean_duplicates_df[column] = non_unique_df.groupby(['id','sbgroup'])[column].apply('/'.join)
    
    #Remove potentaial starting '/'
    for identity in clean_duplicates_df.index:
        entry = clean_duplicates_df.loc[identity, column]
        
        if entry.startswith('/'):
            clean_duplicates_df.at[identity, column] = entry[1:]
        if entry.endswith('/'):
            clean_duplicates_df.at[identity, column] = entry[:-1]
        
clean_duplicates_df.reset_index(inplace=True)

In [60]:
clean_duplicates_df

Unnamed: 0,id,sbgroup,A,A.1,B,B.1,C,C.1,DRB1,DRB1.1,DQB1,DQB1.1
0,NA19119,YRI,23:01:01/23:07N/23:17/23:18/23:20/23:01,36:01/36:01,35:01:01/35:01:03/35:40N/35:42/35:57/35:94/35:01,49:01:01/49:01,04:01:01:01/04:01:01:02/04:01:01:03/04:09N/04:28/04:30/04:41/04:01,07:01:01/07:01:02/07:01:09/07:06/07:18/07:52/07:01,03:01:01:01/03:01:01:02,07:01:01:01/07:01:01:02,02:01:01/02:02/02:04,05:01:01
1,NA19210,YRI,03:01/03:01:01:01/03:01:01:02N/03:01:01:03/03:01:07/03:20/03:21N/03:26/03:37/03:45,33:01/33:01:01,15:10/15:10,58:01/58:01:01/58:11,03:02:01/03:02:02/03:02:03/03:02:01/03:02:02/03:02:03,08:04/08:04,03:01:01:01/03:01:01:02,13:01:01,03:01:01/03:01:04/03:09/03:19/03:21/03:22/03:24,06:03:01
2,NA19223,YRI,30:01:01/30:01:02/30:24,33:03:01/33:03:03/33:15/33:25,41:04,42:01,17:01/17:02/17:03,17:01/17:02/17:03,03:02/03:02:01,13:27/13:27,04:02/02:01:01/02:02/02:04,02:01/04:02


In [61]:
#Remove the duplicate columns from the full dataframe"
MG_exome_df = MG_exome_df[~MG_exome_df['id'].isin(non_unique)]

#Add back the clean duplicate rows:
MG_exome_df = MG_exome_df.append(clean_duplicates_df, sort=False)

#Reset index
MG_exome_df.reset_index(inplace=True, drop = True)

#Check that only uniwue entries exist now.
print(len(set(list(MG_exome_df['id']))))
print(len(list(MG_exome_df['id'])))

819
819


### Convert all entries to 4-field resolution, P-group

In [62]:
#Set id as index
MG_exome_df.set_index('id', inplace=True)

In [63]:
#Remember entries, which are not typed
non_typed_samples = list()

#Remove 6 field resolution and duplicates when reduced to 4 field resolution.
#Furthermore - in cases, where 4 field resolution is not available, use 2018 data.
for identity in list(MG_exome_df.index):
    
    for col in MG_exome_df.columns[1:]:
        predictionlist = MG_exome_df.loc[identity,col].split('/')
        
        #Loop over entries and add 4 field resolution alleles to the gold standard dataframe.
        try:
            prediction_4_field = list(set([col.split('.')[0] + "*" + prediction.split(':')[0] + ':' + prediction.split(':')[1] for prediction in predictionlist]))
            
            for i in range(len(prediction_4_field)):
                prediction_4_field[i] = convert_to_four_field_p_type(prediction_4_field[i])
        
        except IndexError as error:
            non_typed_samples.append(identity)
            prediction_4_field = ['not_typed_in_2014']
            
        #Convert to P-type resolution:
        
 
        MG_exome_df.at[identity,col] = prediction_4_field

C*07:06
C*07:01
B*15:19
B*15:12
C*07:06
C*07:01
C*07:06
C*07:01
C*07:06
C*07:01
C*07:06
C*07:01
C*07:06
C*07:01
C*07:06
C*07:01
C*07:06
C*07:01
B*15:19
B*15:12
B*15:19
B*15:12
B*15:19
B*15:12
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
DRB1*12:06
DRB1*12:01
DRB1*12:17
DRB1*12:01
DRB1*12:10
DRB1*12:01
A*01:37
A*01:01
A*01:32
A*01:01
B*15:103
B*15:03
C*02:10
C*02:02
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*30:24
A*30:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*17:02
C*17:01
C*17:03
C*17:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
A*30:24
A*30:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*17:02
C*17:01
C*17:03
C*17:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
DRB1*12:06
DRB1*12:01
DRB1*12:17
DRB1*12:01
DRB1*12:10
DRB1*12:01
A*74:02
A*74:01
B*15:103
B*15:03
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01

A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*23:18
A*23:01
A*23:17
A*23:01
A*23:20
A*23:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
B*51:30
B*51:01
B*51:48
B*51:01
B*51:51
B*51:01
B*51:32
B*51:01
C*01:25
C*01:02
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*24:79
A*24:02
A*24:76
A*24:02
A*24:79
A*24:02
A*24:76
A*24:02
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
A*68:33
A*68:01
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*35:94
B*35:01
B*35:57
B*35:01
B*

A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*68:33
A*68:01
B*48:09
B*48:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*08:24
C*08:01
C*08:22
C*08:01
C*08:20
C*08:01
A*30:24
A*30:01
C*17:02
C*17:01
C*17:03
C*17:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*40:55
B*40:01
DRB1*12:06
DRB1*12:01
DRB1*12:17
DRB1*12:01
DRB1*12:10
DRB1*12:01
A*01:37
A*01:01
A*01:32
A*01:01
A*23:18
A*23:01
A*23:17
A*23:01
A*23:20
A*23:01
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*24:79
A*24:02
A*24:76
A*24:02
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
DRB1*14:54
DRB1*14:01
A*23:18
A*23:01
A*23:17
A*23:01
A*23:20
A*23:01
C*02:10
C*02:02
C*17:02
C*17:01
C*17:03
C*17:01
A*24:79
A*24:02
A*24:76
A*24:02
A

A*23:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*17:02
C*17:01
C*17:03
C*17:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*30:24
A*30:01
B*15:102
B*15:01
B*15:104
B*15:01
B*15:140
B*15:01
B*15:146
B*15:01
DRB1*14:54
DRB1*14:01
DRB1*14:54
DRB1*14:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
A*30:24
A*30:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
B*44:27
B*44:02
B*44:66
B*44:02
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*05:03
C*05:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*15:10

A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*51:30
B*51:01
B*51:48
B*51:01
B*51:51
B*51:01
B*51:32
B*51:01
C*05:03
C*05:01
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
A*01:37
A*01:01
A*01:32
A*01:01
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
A*24:79
A*24:02
A*24:76
A*24:02
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*01:37
A*01:01
A*01:32
A*01:01
A*68:33
A*68:01
B*44:27
B*44:02
B*44:66
B*44:02
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
C*07:11
C*07:04
A*01:37
A*01:01
A*01:32
A*01:01
A*01:37
A*01:01
A*01:32
A*01:01
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
B*27:13
B*27:05
A*01:37
A*01:01
A*01:32
A*01:01
A*01:37
A*01:01
A*01:32
A*01:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
C*07:66

A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*40:55
B*40:01
A*01:37
A*01:01
A*01:32
A*01:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*40:56
B*40:02
B*40:97
B*40:02
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A

A*01:37
A*01:01
A*01:32
A*01:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*27:13
B*27:05
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*01:37
A*01:01
A*01:32
A*01:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*27:13
B*27:05
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
C*01:25
C*01:02
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
B*15:102
B*15:01
B*15:104
B*15:01
B*15:140
B*15:01
B*

A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*24:79
A*24:02
A*24:76
A*24:02
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
B*40:55
B*40:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
B*44:27
B*44:02
B*44:66
B*44:02
C*05:03
C*05:01
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*01:37
A*01:01
A*01:32
A*01:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*15:102
B*15:01
B*15:104
B*15:01
B*15:140
B*15:01
B*15:146
B*15:01
B*39:46
B*39:01
C*07:66
C*07:02
C*07:74
C*07:

A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*33:15
A*33:03
A*33:25
A*33:03
B*40:55
B*40:01
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
DRB1*12:06
DRB1*12:01
DRB1*12:17
DRB1*12:01
DRB1*12:10
DRB1*12:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*44:27
B*44:02
B*44:66
B*44:02
B*45:07
B*45:01
C*05:03
C*05:01
DRB1*12:06
DRB1*12:01
DRB1*12:17
DRB1*12:01
DRB1*12:10
DRB1*12:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
B*51:30
B*51:01
B*51:48
B*51:01
B*51:51
B*51:01
B*51:32
B*51:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*0

C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
B*07:58
B*07:02
B*07:61
B*07:02
B*07:59
B*07:02
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*31:23
A*31:01
B*35:67
B*35:43
B*35:79
B*35:43
B*44:27
B*44:02
B*44:66
B*44:02
C*01:25
C*01:02
C*05:03
C*05:01
A*02:69
A*02:11
B*45:07
B*45:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*02:69
A*02:11
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
C*15:13
C*15:02
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
B*15:102
B*15:01
B*15:104
B*15:01
B*15:140
B*15:01
B*15:146
B*15:01
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02

A*01:37
A*01:01
A*01:32
A*01:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*02:179
A*02:05
C*07:66
C*07:02
C*07:74
C*07:02
C*07:50
C*07:02
A*30:24
A*30:01
A*02:69
A*02:11
A*24:79
A*24:02
A*24:76
A*24:02
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
B*35:94
B*35:01
B*35:57
B*35:01
B*35:42
B*35:01
C*04:41
C*04:01
C*04:28
C*04:01
C*04:30
C*04:01
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
A*03:37
A*03:01
A*03:45
A*03:01
A*03:26
A*03:01
A*03:20
A*03:01
A*24:79
A*24:02
A*24:76
A*24:02
B*35:67
B*35:43
B*35:79
B*35:43
C*01:25
C*01:02
A*02:75
A*02:01
A*02:132
A*02:01
A*02:140
A*02:01
A*02:09
A*02:01
A*02:97
A*02:01
A*02:66
A*02:01
A*02:89
A*02:01
A*02:134
A*02:01
A*31:23
A*31:01
B*35:67
B*35:43
B*35:79
B*35:43
C*01:25
C*01:02
C*07:18
C*07:01
C*07:06
C*07:01
C*07:52
C*07:01
A

## Merge Alleles in original dataframe

In [81]:
#Merge haplotypes
MG_exome_df['A_merged']= MG_exome_df[['A', 'A.1']].apply(lambda x: list(x), axis=1)
MG_exome_df['B_merged']= MG_exome_df[['B', 'B.1']].apply(lambda x: list(x), axis=1)
MG_exome_df['C_merged']= MG_exome_df[['C', 'C.1']].apply(lambda x: list(x), axis=1)
MG_exome_df['DRB1_merged']= MG_exome_df[['DRB1', 'DRB1.1']].apply(lambda x: list(x), axis=1)
MG_exome_df['DQB1_merged']= MG_exome_df[['DQB1', 'DQB1.1']].apply(lambda x: list(x), axis=1)

MG_exome_merged_df = MG_exome_df.drop(columns=['A', 'A.1', 'B', 'B.1', 'C', 'C.1', 'DRB1', 'DRB1.1','DQB1', 'DQB1.1'])

MG_exome_merged_df.rename(columns={"A_merged": "A", "B_merged": "B", "C_merged": "C", "DRB1_merged": "DRB1", "DQB1_merged": "DQB1"}, inplace = True)

## Load and convert 2018 data for error correction in 2014 dataset

This data is not used itself directly, but fills out the gaps in the 2014 dataset loaded later

In [82]:
m2018_filepath = gold_standard_path + '2018_1129_HLA_types_full_1000_Genomes_Project_panel.txt'

m2018_gs_raw_df = pd.read_csv(m2018_filepath, sep = "\t")

#Rename columns
m2018_gs_raw_df.rename(columns={'Sample ID': 'id', 'Population': 'sbgroup', 'HLA-A 1': 'A', 'HLA-A 2': 'A.1', 'HLA-B 1': 'B',
                            'HLA-B 2': 'B.1', 'HLA-C 1': 'C', 'HLA-C 2': 'C.1', 'HLA-DQB1 1': 'DQB1', 'HLA-DQB1 2': 'DQB1.1',
                            'HLA-DRB1 1': 'DRB1' , 'HLA-DRB1 2': 'DRB1.1' }, inplace=True)

#Set id as index
m2018_gs_raw_df.set_index('id', inplace=True)

#Remove samples with NaN (non typed alleles)
m2018_gs_df = m2018_gs_raw_df.dropna()          


In [83]:
#Find entries with an *, indicating a former mistake in the 2014 dataset
changed_sample_indexes = list()

for index in m2018_gs_df.index:
    for col in m2018_gs_df.columns[1:]:
        for entry in m2018_gs_df.loc[index][col]:
            if '*' in entry:              
                changed_sample_indexes.append(index)

#Include samples, which are missing as well.
changed_sample_indexes += non_typed_samples

#Include the 10 high coverage samples:
changed_sample_indexes += high_cov_id_list

In [84]:
#make relevant dataframe smaller
corrections_2018_df_raw = m2018_gs_df.loc[set(changed_sample_indexes)]

#Limit the 2018 dataframe to 2014 samples and the 10 high coverage samples 
corrections_2018_df = corrections_2018_df_raw[corrections_2018_df_raw.index.isin(list(MG_exome_df.index) + high_cov_id_list)]

In [85]:
#Make dataframe for writing into 2014 dataframe:

#Remove * from entry:
corrections_2018_df_no_asterix = corrections_2018_df.copy()
corrections_2018_df_no_asterix=corrections_2018_df_no_asterix.replace({'\*':''}, regex=True)

#Add allele name in front of the resolution:
for identity in corrections_2018_df_no_asterix.index:
    for col in corrections_2018_df_no_asterix.columns[2:]:
        entrylist = corrections_2018_df_no_asterix.loc[identity][col].split('/')       
        
        entrylist = list(set([col.split('.')[0] + "*" + i.split('*')[0] for i in entrylist]))
        
        corrections_2018_df_no_asterix.at[identity,col] = entrylist


corrections_2018_df

#Merge alleles into one column:

corrections_2018_df_no_asterix['A_merged']= corrections_2018_df_no_asterix[['A', 'A.1']].apply(lambda x: list(x), axis=1)
corrections_2018_df_no_asterix['B_merged']= corrections_2018_df_no_asterix[['B', 'B.1']].apply(lambda x: list(x), axis=1)
corrections_2018_df_no_asterix['C_merged']= corrections_2018_df_no_asterix[['C', 'C.1']].apply(lambda x: list(x), axis=1)
corrections_2018_df_no_asterix['DRB1_merged']= corrections_2018_df_no_asterix[['DRB1', 'DRB1.1']].apply(lambda x: list(x), axis=1)
corrections_2018_df_no_asterix['DQB1_merged']= corrections_2018_df_no_asterix[['DQB1', 'DQB1.1']].apply(lambda x: list(x), axis=1)

corrections_2018_merged_df = corrections_2018_df_no_asterix.drop(columns=['A', 'A.1', 'B', 'B.1', 'C', 'C.1', 'DRB1', 'DRB1.1','DQB1', 'DQB1.1'])

corrections_2018_merged_df.rename(columns={"A_merged": "A", "B_merged": "B", "C_merged": "C", "DRB1_merged": "DRB1", "DQB1_merged": "DQB1"}, inplace = True)

In [86]:
#Update samples, where typing was not performed in 2014:

for identity in non_typed_samples:
    for col in list(MG_exome_merged_df.columns)[1:]:
        entry = MG_exome_merged_df.loc[identity,col]
        if ['not_typed_in_2014'] in entry:
            
            column = col.split('.')[0]
                
            new_type = corrections_2018_merged_df.loc[identity][column]

            old_type = MG_exome_merged_df.loc[identity,column]

            corrected_typing = list()

            for new_allele in list(new_type):
                if (new_allele in list(old_type)[0]):
                    corrected_typing.append(list(old_type)[0][0])
                elif new_allele in list(old_type)[1]:
                    corrected_typing.append(list(old_type)[0][1])
                else:
                    corrected_typing.append(new_allele)

            MG_exome_merged_df.at[identity,column] = corrected_typing
            
            

In [87]:
#Perform update of samples, where mistakes have been found

changed_sample_indexes = list()


#Loop over the samples, which needs to be corrected
for identity in list(corrections_2018_df.index):
    
    #Loop over relevant columns (genes)
    for col in corrections_2018_df.columns[3:]:
        
        #Loop over alleles
        for entry in corrections_2018_df[corrections_2018_df.index == identity][col]:
            
            #If allele is corrected since 2014 - update it
            if '*' in entry:
                column = col.split('.')[0]
                
                new_type = corrections_2018_merged_df.loc[identity][column]
                
                old_type = MG_exome_merged_df.loc[identity,column]
                
                #print(old_type)
                corrected_typing = list()
                
                for new_allele in list(new_type):
                    if (new_allele in list(old_type)[0]):
                        corrected_typing.append(list(old_type)[0][0])
                    elif new_allele in list(old_type)[1]:
                        corrected_typing.append(list(old_type)[0][1])
                    else:
                        corrected_typing.append(new_allele)
                
                #Make sure, that the correction is p-type resolution:
                for i in range(len(corrected_typing)):
                    corrected_typing[i] = [convert_to_four_field_p_type(corrected_typing[i][0])]
                
                MG_exome_merged_df.at[identity,column] = corrected_typing
            
    #Add the 10 high coverage samples to MG_exome_merged
    if identity in high_cov_id_list:
        

        high_cov_prediction_row = corrections_2018_merged_df.loc[identity][['A','B','C','DRB1','DQB1']]
        
        #Make sure, that correction is p-type resolution:
        for gene in high_cov_prediction_row.index:
            for i in range(len(high_cov_prediction_row[gene])):
                high_cov_prediction_row[gene][i][0] = convert_to_four_field_p_type(high_cov_prediction_row[gene][i][0])
        
        MG_exome_merged_df = MG_exome_merged_df.append(high_cov_prediction_row)    

identity NA18508
New Allele ['DQB1*02:02']
list(old_type)[0] ['DQB1*02:01']
identity NA18508
New Allele ['DQB1*04:02']
list(old_type)[0] ['DQB1*02:01']
identity NA19099
New Allele ['DQB1*02:02']
list(old_type)[0] ['DQB1*05:02']
identity NA19099
New Allele ['DQB1*05:02']
list(old_type)[0] ['DQB1*05:02']
identity NA18522
New Allele ['C*02:10']
list(old_type)[0] ['C*02:02']
identity NA18522
New Allele ['C*07:01']
list(old_type)[0] ['C*02:02']
C*02:10
C*02:02
identity NA19240
New Allele ['C*04:01']
list(old_type)[0] ['C*04:01']
identity NA19240
New Allele ['C*18:02']
list(old_type)[0] ['C*04:01']
C*18:02
C*18:01
identity NA20805
New Allele ['C*05:37']
list(old_type)[0] ['C*05:01', 'C*05:01']
identity NA20805
New Allele ['C*16:04']
list(old_type)[0] ['C*05:01', 'C*05:01']
C*05:37
C*05:01
identity NA19116
New Allele ['B*07:02']
list(old_type)[0] ['B*07:02']
identity NA19116
New Allele ['B*39:24']
list(old_type)[0] ['B*07:02']
identity NA19116
New Allele ['DQB1*04:02']
list(old_type)[0] ['DQB

identity NA11840
New Allele ['B*27:05']
list(old_type)[0] ['B*27:03']
identity NA11840
New Allele ['B*57:01']
list(old_type)[0] ['B*27:03']
identity NA18507
New Allele ['C*02:10']
list(old_type)[0] ['C*02:02']
identity NA18507
New Allele ['C*17:01']
list(old_type)[0] ['C*02:02']
C*02:10
C*02:02
identity NA18507
New Allele ['DQB1*03:01']
list(old_type)[0] ['DQB1*06:05']
identity NA18507
New Allele ['DQB1*06:09']
list(old_type)[0] ['DQB1*06:05']
identity NA19093
New Allele ['DRB1*13:02']
list(old_type)[0] ['DRB1*15:03']
identity NA19093
New Allele ['DRB1*15:03']
list(old_type)[0] ['DRB1*15:03']


## Make URL list for 10 high coverage WES samples

In [88]:
print(high_cov_id_list) 

# high_cov_url_list = list()

# for identity in high_cov_id_list:
       
#     sbgroup = m2018_gs_df.loc[identity]['sbgroup']
#     wget_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/{}/{}/exome_alignment/{}.alt_bwamem_GRCh38DH.20150826.{}.exome.cram".format(sbgroup, identity, identity, sbgroup, )
    
#     if checkUrl(wget_url):
#         high_cov_url_list.append(wget_url)

# with open(gold_standard_path + 'high_cov_url_list.txt', 'w') as outfile:
#     for entry in high_cov_url_list:
#         outfile.write(entry + '\n')
    
# with open(gold_standard_path + 'high_cov_id_list.txt', 'w') as outfile:
#     for entry in high_cov_id_list:
#         outfile.write(entry + '\n')

high_cov_url_list = list()
with open(gold_standard_path + 'high_cov_url_list.txt', 'r') as infile:
    for line in infile:
        high_cov_url_list.append(line[:-1])   

high_cov_id_list= list()
with open(gold_standard_path + 'high_cov_id_list.txt', 'r') as infile:
    for line in infile:
        high_cov_id_list.append(line[:-1])   

['HG01756', 'HG01757', 'HG01872', 'HG01873', 'HG01886', 'HG01953', 'HG01968', 'HG02014', 'HG02057', 'NA20313']


## Write config.yaml file for Snakemake workflow (both 2014 data and high cov data)

In [89]:
full_sample_urllist = list(set(gold_standard_url_list)) + high_cov_url_list
full_sample_id_list = list(set(gold_standard_id_list)) + high_cov_id_list

configfile_1 = 'C:\\Users\\nikol\\OneDrive\\DTU\\11_semester\\Data\\config_all.yaml'

with open(configfile_1, 'w') as outfile:
    outfile.write('sample_urlist:\n')
    for entry in full_sample_urllist:
        entry_name = entry.split('/')[9]
        outfile.write("  " + entry_name + ": " +  "\"" + entry + "\"" + '\n')


In [90]:
MG_exome_merged_df

Unnamed: 0_level_0,sbgroup,A,B,C,DRB1,DQB1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NA06985,CEU,"[[A*03:01], [A*02:01]]","[[B*07:02], [B*57:01]]","[[C*07:02], [C*06:02]]","[[DRB1*15:01], [DRB1*15:01]]","[[DQB1*06:02], [DQB1*06:02]]"
NA06986,CEU,"[[A*03:01], [A*32:01]]","[[B*44:03], [B*44:03]]","[[C*04:01], [C*16:01]]","[[DRB1*07:01], [DRB1*07:01]]","[[DQB1*02:02], [DQB1*02:02]]"
NA06994,CEU,"[[A*02:01], [A*32:01]]","[[B*40:02], [B*08:01]]","[[C*02:02], [C*07:01, C*07:01]]","[[DRB1*01:01], [DRB1*04:04]]","[[DQB1*05:01], [DQB1*03:02]]"
NA07000,CEU,"[[A*02:01], [A*68:01]]","[[B*44:02], [B*40:01]]","[[C*03:03, C*03:04], [C*07:04]]","[[DRB1*03:01], [DRB1*11:01]]","[[DQB1*02:01], [DQB1*03:01]]"
NA07037,CEU,"[[A*30:01], [A*31:01]]","[[B*15:10], [B*40:01]]","[[C*03:04], [C*03:04]]","[[DRB1*04:04], [DRB1*13:02]]","[[DQB1*03:02], [DQB1*06:04]]"
NA07048,CEU,"[[A*02:01], [A*02:01]]","[[B*44:02], [B*07:02]]","[[C*05:01], [C*07:02]]","[[DRB1*04:01], [DRB1*15:01]]","[[DQB1*03:01], [DQB1*06:02]]"
NA07051,CEU,"[[A*02:01], [A*68:01]]","[[B*15:12, B*15:12, B*15:01], [B*07:02]]","[[C*03:03], [C*07:02]]","[[DRB1*04:01], [DRB1*15:01]]","[[DQB1*03:01], [DQB1*06:02]]"
NA07056,CEU,"[[A*01:01], [A*02:01]]","[[B*08:01], [B*57:01]]","[[C*07:01, C*07:01], [C*06:02]]","[[DRB1*03:01], [DRB1*07:01]]","[[DQB1*02:01], [DQB1*03:03]]"
NA07347,CEU,"[[A*30:02], [A*26:01]]","[[B*18:01], [B*44:02]]","[[C*05:01], [C*05:01]]","[[DRB1*03:01], [DRB1*11:04]]","[[DQB1*02:01], [DQB1*03:01]]"
NA07357,CEU,"[[A*01:01], [A*24:02]]","[[B*08:01], [B*39:06]]","[[C*07:01, C*07:01], [C*07:02]]","[[DRB1*03:01], [DRB1*04:04]]","[[DQB1*02:01], [DQB1*03:02]]"


# Majority Votes

In [None]:
def most_frequent(List): 
    return max(set(List), key = List.count) 

### All 5

In [None]:
def get_prediction(result_dict, subject, allele):
    try:
        result = sorted([pred for pred in result_dict[subject] if pred.startswith(allele)])
    except KeyError as error:
        result = ''
        
    return result

In [None]:
from collections import Counter

all_five_results = dict()

for subject in gold_standard_id_list:
    
    all_five_results[subject] = list()

    for allele in ['A', 'B', 'C', 'DRB1', 'DQB1']:
        #Get list of all predictions:
        kourami_pred = get_prediction(kourami_results, subject, allele)
        hisatgenotype_pred = get_prediction(hisatgenotype_results, subject, allele)
        hla_la_pred = get_prediction(hla_la_results, subject, allele)
        optitype_pred = get_prediction(optitype_results, subject, allele)
        stc_seq_pred = get_prediction(stc_seq_results, subject, allele)
        
        
        
        all_predictions_list = [kourami_pred, hisatgenotype_pred, hla_la_pred, optitype_pred, stc_seq_pred]
        
        all_predictions_list_clean = [preds for preds in all_predictions_list if (preds != []) and (preds != '')]
        
        all_predictions_list_transposed = list(map(list, zip(*all_predictions_list_clean)))
        
        preds_count_1 = Counter(all_predictions_list_transposed[0])
        preds_count_2 = Counter(all_predictions_list_transposed[1])
        
        mode_1 = preds_count_1.most_common(1)[0][0]
        mode_2 = preds_count_2.most_common(1)[0][0]
        

        all_five_results[subject].extend([mode_1, mode_2])
    

### HLA-LA, Kourami, Hisatgenotype

In [None]:
from collections import Counter

graph_tools_results = dict()

for subject in gold_standard_id_list:
    
    graph_tools_results[subject] = list()

    for allele in ['A', 'B', 'C', 'DRB1', 'DQB1']:
        #Get list of all predictions:
        kourami_pred = get_prediction(kourami_results, subject, allele)
        hisatgenotype_pred = get_prediction(hisatgenotype_results, subject, allele)
        hla_la_pred = get_prediction(hla_la_results, subject, allele)
        optitype_pred = get_prediction(optitype_results, subject, allele)
        stc_seq_pred = get_prediction(stc_seq_results, subject, allele)
        
        
        
        all_predictions_list = [kourami_pred, hla_la_pred, hisatgenotype_pred]
        
        all_predictions_list_clean = [preds for preds in all_predictions_list if (preds != []) and (preds != '')]
        
        all_predictions_list_transposed = list(map(list, zip(*all_predictions_list_clean)))
        
        preds_count_1 = Counter(all_predictions_list_transposed[0])
        preds_count_2 = Counter(all_predictions_list_transposed[1])
        
        mode_1 = preds_count_1.most_common(1)[0][0]
        mode_2 = preds_count_2.most_common(1)[0][0]
        
        graph_tools_results[subject].extend([mode_1, mode_2])
    

# Results 1000 Genome (only 2014 data)

Future work: Implement intersection over union as a performance metric

# Calculate Results

In [None]:
def check_result(correct_alleles_list, tool_result_dict, subject_key, tool_name, allele):
    try:
        hit_1 = set(correct_alleles_list[0]).intersection(tool_result_dict[subject_key])
        hit_2 = set(correct_alleles_list[1]).intersection(tool_result_dict[subject_key])
    except KeyError as error:
        hit_1 = ''
        hit_2 = ''
    
    #In the case of homozygosity
    if (correct_alleles_list[0] == correct_alleles_list[1]):
        try:
            hit_1 = [''] * tool_result_dict[subject_key].count(correct_alleles_list[0][0])
            hit_2 = ''
        except KeyError as error:
            hit_1 = ''
            hit_2 = ''            
    
    #Register mistakes, if tool didn't get both alleles correct
    if (len(hit_1)+len(hit_2) != 2):
        print(subject_key)
        print(tool_name, tool_result_dict[subject_key])
        print(correct_alleles_list)
        print("\n")
        #Only register mistakes for optitype for HLA-A, HLA-B and HLA-C
        if allele in ['A', 'B', 'C']:
            error_dataframe_hla_I.loc[subject_key, tool_name]  += 2 - (len(hit_1)+len(hit_2))
        
        elif tool_name == 'Optitype':
            pass
        
        else:
            error_dataframe_hla_II.loc[subject_key, tool_name]  += 2 - (len(hit_1)+len(hit_2))
        
#         print(subject_key)
#         print(correct_alleles_list)
#         print(tool_result_dict[subject_key])
        
        
    return len(hit_1)+len(hit_2)

#Get the number of predictions from a tool for a specific allele for a specific subject (2 or 0)
def get_count(allele, pred_dict, subject):
    try:
        output = len([pred for pred in pred_dict[subject] if pred.startswith(allele)])
    except KeyError as error:
        output = 0
        
    return output

In [None]:
idx = pd.IndexSlice

#Create dataframe over samples which the tools didn't predict right:

error_dataframe_hla_I = pd.DataFrame({'mean_depth' : depth_results})

for tool in ['Kourami', 'HLA-LA', 'Optitype', 'Hisatgenotype', 'STC-seq', 'ensemble_all', 'ensemble_graph']:
    error_dataframe_hla_I[tool] = 0

    
error_dataframe_hla_II = pd.DataFrame({'mean_depth' : depth_results})

for tool in ['Kourami', 'HLA-LA', 'Hisatgenotype', 'STC-seq', 'ensemble_all', 'ensemble_graph']:
    error_dataframe_hla_II[tool] = 0    

    
#Create score dataframe:
tool_extended_list = ['Kourami', 'Kourami', 'Kourami', 'Kourami', 'HLA-LA', 'HLA-LA',  'HLA-LA', 'HLA-LA', 'Optitype', 'Optitype', 'Optitype', 'Optitype', 'Hisatgenotype','Hisatgenotype', 'Hisatgenotype','Hisatgenotype', 'STC-seq', 'STC-seq', 'STC-seq', 'STC-seq', 'ensemble_all', 'ensemble_all', 'ensemble_all', 'ensemble_all', 'ensemble_graph', 'ensemble_graph', 'ensemble_graph', 'ensemble_graph', 'Total', 'Total', 'Total']
measure_list = ['score', 'count', 'call_rate', 'accuracy', 'score', 'count', 'call_rate', 'accuracy','score', 'count', 'call_rate', 'accuracy','score', 'count', 'call_rate', 'accuracy','score', 'count', 'call_rate', 'accuracy','score', 'count', 'call_rate', 'accuracy', 'score', 'count', 'call_rate', 'accuracy', 'count']

arrays = [tool_extended_list,measure_list]
tuples = list(zip(*arrays))

index = pd.MultiIndex.from_tuples(tuples, names=['Tool', 'Metric'])

#Make overall dataframe: (Evaxion is HLA-I + DRB1)
results_df = pd.DataFrame(0, index=['A', 'B', 'C', 'DRB1', 'DQB1', 'HLA-I', 'HLA-II', 'Evaxion', 'Total'], columns=index)

#Loop over subjects:
for subject in list(MG_exome_merged_df.index):
    
    #Only include subjects which have been typed by the tools:
    if subject not in gold_standard_id_list:
        continue
      
    #Loop over alleles
    for allele in MG_exome_merged_df.columns[1:]:
        results_df.loc[allele, idx['Total', 'count']] += 2
        
        facit = MG_exome_merged_df.loc[subject,allele]

        
        results_df.loc[allele, idx['Kourami', 'score']]  += check_result(facit, kourami_results, subject, 'Kourami', allele)
        results_df.loc[allele, idx['Kourami', 'count']]  += get_count(allele, kourami_results, subject)
      
 
        results_df.loc[allele, idx['HLA-LA', 'score']]+= check_result(facit, hla_la_results, subject, 'HLA-LA', allele)
        results_df.loc[allele, idx['HLA-LA', 'count']]+= get_count(allele, hla_la_results, subject)

    
        results_df.loc[allele, idx['Optitype', 'score']] += check_result(facit, optitype_results, subject, 'Optitype', allele)
        results_df.loc[allele, idx['Optitype', 'count']]+= get_count(allele, optitype_results, subject)
        
        
        results_df.loc[allele, idx['Hisatgenotype', 'score']] += check_result(facit, hisatgenotype_results, subject, 'Hisatgenotype', allele)
        results_df.loc[allele, idx['Hisatgenotype', 'count']]+= get_count(allele, hisatgenotype_results, subject)
        
        
        results_df.loc[allele, idx['STC-seq', 'score']] += check_result(facit, stc_seq_results, subject, 'STC-seq', allele)
        results_df.loc[allele, idx['STC-seq', 'count']]+= get_count(allele, stc_seq_results, subject)
        
        #The majority vote methods:
        
        results_df.loc[allele, idx['ensemble_all', 'score']] += check_result(facit, all_five_results, subject, 'ensemble_all', allele)
        results_df.loc[allele, idx['ensemble_all', 'count']]+= get_count(allele, all_five_results, subject)
        
        
        results_df.loc[allele, idx['ensemble_graph', 'score']] += check_result(facit, graph_tools_results, subject, 'ensemble_graph', allele)
        results_df.loc[allele, idx['ensemble_graph', 'count']]+= get_count(allele, graph_tools_results, subject)

In [None]:
#Calculate counts for Total:
#Add for Total:
results_df.loc['HLA-I', idx['Total', 'count']] += sum(results_df.loc[['A','B','C'], idx['Total', 'count']])
results_df.loc['HLA-II', idx['Total', 'count']] += sum(results_df.loc[['DRB1','DQB1'], idx['Total', 'count']])
results_df.loc['Evaxion', idx['Total', 'count']] += sum(results_df.loc[['HLA-I','DRB1'], idx['Total', 'count']])
results_df.loc['Total', idx['Total', 'count']] += sum(results_df.loc[['HLA-I','HLA-II'], idx['Total', 'count']])


#Loop over subjects:
for method in ['HLA-LA', 'Hisatgenotype', 'Kourami', 'Optitype', 'STC-seq', 'ensemble_all', 'ensemble_graph']:
    
    #Calculate scores for groups of alleles:
    results_df.loc['HLA-I', idx[method, 'score']] += sum(results_df.loc[['A','B','C'], idx[method, 'score']])
    results_df.loc['HLA-II', idx[method, 'score']] += sum(results_df.loc[['DRB1','DQB1'], idx[method, 'score']])
    results_df.loc['Evaxion', idx[method, 'score']] += sum(results_df.loc[['HLA-I','DRB1'], idx[method, 'score']])
    results_df.loc['Total', idx[method, 'score']] += sum(results_df.loc[['HLA-I','HLA-II'], idx[method, 'score']])
         
    results_df.loc['HLA-I', idx[method, 'count']] += sum(results_df.loc[['A','B','C'], idx[method, 'count']])
    results_df.loc['HLA-II', idx[method, 'count']] += sum(results_df.loc[['DRB1','DQB1'], idx[method, 'count']])
    results_df.loc['Evaxion', idx[method, 'count']] += sum(results_df.loc[['HLA-I','DRB1'], idx[method, 'count']])
    results_df.loc['Total', idx[method, 'count']] += sum(results_df.loc[['HLA-I','HLA-II'], idx[method, 'count']])
                                                        
        
    for gene in list(results_df.index):

        results_df.loc[gene, idx[method, 'call_rate']] += round(results_df.loc[gene, idx[method, 'count']] / results_df.loc[gene, idx['Total', 'count']]*100,2)
        
        results_df.loc[gene, idx[method, 'accuracy']] += round(results_df.loc[gene, idx[method, 'score']] / results_df.loc[gene, idx['Total', 'count']]*100,2)
        
        

In [None]:
results_df

# Add metadata to error dataframe

In [None]:
#Join the error dataframe with information of continent and ethnicity
errors_with_metadata_hla_I_df_1 = error_dataframe_hla_I.join(m2018_gs_raw_df[['Region', 'sbgroup']])
errors_with_metadata_hla_I_df_1.rename(columns={'sbgroup' : 'Population'}, inplace=True)

#The same for HLA-II
errors_with_metadata_hla_II_df_1 = error_dataframe_hla_II.join(m2018_gs_raw_df[['Region', 'sbgroup']])
errors_with_metadata_hla_II_df_1.rename(columns={'sbgroup' : 'Population'}, inplace=True)

In [None]:
#Datapath for metadata excelsheet:
metadata_path = gold_standard_path + '1000G_sample_info.xlsx'

#Load data for gender and translation of Population abbreviations
metadata_1_df = pd.read_excel(metadata_path, sheet_name='Sample Info')[['Sample', 'Population', 'Population Description', 'Gender']]

#load data for sequencing center:
metadata_2_df = pd.read_excel(metadata_path, sheet_name='Final Phase Sequence Data')[['Unnamed: 0', 'Exome']]

#Reset header in metadata_2
new_header = metadata_2_df.iloc[0] #grab the first row for the header
metadata_2_df = metadata_2_df[1:] #take the data less the header row
metadata_2_df.columns = new_header #set the header row as the df header

#Set "Sample" as inde in both metadata dataframes:
metadata_1_df.set_index('Sample', inplace=True)
metadata_2_df.set_index('Sample', inplace=True)

In [None]:
#Join the relevant info to the error dataframe:
errors_with_metadata_hla_I_df_2 = errors_with_metadata_hla_I_df_1.join(metadata_1_df[['Population Description', 'Gender']])

errors_metadata_hla_I_df = errors_with_metadata_hla_I_df_2.join(metadata_2_df[['Center']])

#Do the same for HLA-II
errors_with_metadata_hla_II_df_2 = errors_with_metadata_hla_II_df_1.join(metadata_1_df[['Population Description', 'Gender']])

errors_metadata_hla_II_df = errors_with_metadata_hla_II_df_2.join(metadata_2_df[['Center']])

### Add a column with the sum of errors for all but STC-seq

In [None]:
errors_metadata_hla_I_df['mean_error_graph'] = errors_metadata_hla_I_df.apply(lambda row: (row['Kourami'] + row['HLA-LA'] + row['Hisatgenotype']) / 3  , axis = 1) 

errors_metadata_hla_II_df['mean_error_graph'] = errors_metadata_hla_II_df.apply(lambda row: (row['Kourami'] + row['HLA-LA'] + row['Hisatgenotype']) / 3  , axis = 1) 


# Plots of the Performance of the tools

In [None]:
#labels in order:
labels = ['Kourami', 'HLA-LA', 'Optitype', 'Hisatgenotype', 'STC-seq', 'ensemble_all', 'ensemble_graph']

def make_plot_from_allele_list(allele_index):

    if allele_index in ('HLA-II', 'DRB1', 'DQB1'):
        accuracy = list(results_df.loc[allele_index, idx[['Kourami', 'HLA-LA', 'Hisatgenotype', 'STC-seq', 'ensemble_graph'], 'accuracy']])
        call_rate = list(results_df.loc[allele_index, idx[['Kourami', 'HLA-LA', 'Hisatgenotype', 'STC-seq', 'ensemble_graph'], 'call_rate']])
        
        x = np.arange(len(labels)-2)
        
    else:
        accuracy = list(results_df.loc[allele_index, idx[:, 'accuracy']])
        call_rate = list(results_df.loc[allele_index, idx[:, 'call_rate']])
        
        x = np.arange(len(labels))  # the label locations
        

    
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots(figsize=(10,6))
    rects1 = ax.bar(x - width/2, accuracy, width, label='accuracy', color = '#0000FF')
    rects2 = ax.bar(x + width/2, call_rate, width, label='call_rate', color = '#808080')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Score')
    ax.set_xticks(x)
    
    if allele_index in ('HLA-II', 'DRB1', 'DBQ1'):
        ax.set_xticklabels(['Kourami', 'HLA-LA', 'Hisatgenotype', 'STC-seq', 'ensemble_graph'], rotation = 45)
    else:
        ax.set_xticklabels(labels, rotation = 45)

    ax.legend()


    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, 10),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom', 
                        color="w", size = 8, weight = 800)


    autolabel(rects1)
    autolabel(rects2)

    # Major ticks every 20, minor ticks every 5
    minor_ticks = np.arange(0, 101, 5)

    ax.set_yticks(minor_ticks, minor=True)

    ax.grid(which='minor', alpha=0.4)

    # Or if you want different settings for the grids:
    plt.grid(axis = 'y')
    ax.set_axisbelow(True)

    plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
    plt.tight_layout()

    plt.title('HLA typing performance for allele: ' + allele_index)
    return plt, ax

## Overall Results

In [None]:
make_plot_from_allele_list('Total')

## Evaxion Alleles (HLA-I + DRB1)

In [None]:
make_plot_from_allele_list('Evaxion')

## HLA-I and HLA-II

In [None]:
make_plot_from_allele_list('HLA-I')

In [None]:
make_plot_from_allele_list('HLA-II')

## Individual Alleles

In [None]:
make_plot_from_allele_list('A')

In [None]:
make_plot_from_allele_list('B')

In [None]:
make_plot_from_allele_list('C')

In [None]:
make_plot_from_allele_list('DRB1')

In [None]:
make_plot_from_allele_list('DQB1')

# Analysis of Errors in relation to sequencing depth

In [None]:
kourami_length_dist_df = pd.DataFrame(columns = ['mean_depth'])
optitype_length_dist_df = pd.DataFrame(columns = ['mean_depth'])
hla_la_length_dist_df = pd.DataFrame(columns = ['mean_depth'])
hisatgenotype_length_dist_df = pd.DataFrame(columns = ['mean_depth'])
stc_seq_length_dist_df = pd.DataFrame(columns = ['mean_depth'])

for sample_id in list(error_dataframe_hla_I.index): 
    #Kourami
    n_errors = error_dataframe_hla_I.loc[sample_id, 'Kourami']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_I.loc[sample_id, 'mean_depth']
            kourami_length_dist_df = kourami_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)
    
    #Optitype
    n_errors = error_dataframe_hla_I.loc[sample_id, 'Optitype']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_I.loc[sample_id, 'mean_depth']
            optitype_length_dist_df = optitype_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)
            
    #HLA-LA
    n_errors = error_dataframe_hla_I.loc[sample_id, 'HLA-LA']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_I.loc[sample_id, 'mean_depth']
            hla_la_length_dist_df = hla_la_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)

    #Hisatgenotype
    n_errors = error_dataframe_hla_I.loc[sample_id, 'Hisatgenotype']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_I.loc[sample_id, 'mean_depth']
            hisatgenotype_length_dist_df = hisatgenotype_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)

    #STC-seq
    n_errors = error_dataframe_hla_I.loc[sample_id, 'STC-seq']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_I.loc[sample_id, 'mean_depth']
            stc_seq_length_dist_df = stc_seq_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)


In [None]:
labels_boxplot = ['Full Dataset'] + labels[:-2]

overall_coverage = error_dataframe_hla_I['mean_depth'].values
kourami_coverage = kourami_length_dist_df['mean_depth'].values
optitype_coverage = optitype_length_dist_df['mean_depth'].values
hla_la_coverage= hla_la_length_dist_df['mean_depth'].values
hisatgenotype_coverage = hisatgenotype_length_dist_df['mean_depth'].values
stc_seq_coverage = stc_seq_length_dist_df['mean_depth'].values
data = [overall_coverage, kourami_coverage, hla_la_coverage, optitype_coverage, hisatgenotype_coverage, stc_seq_coverage] 
  
    
fig = plt.figure(figsize =(10, 7)) 
  
# Creating axes instance 
ax = fig.add_axes([0, 0, 1, 1]) 
 
# Creating plot 
bp = ax.boxplot(data) 
  
# show plot 
ax.set_xticklabels(labels_boxplot, rotation = 45)

ax.set_ylim(0, max(overall_coverage)+10)
 

## Doing these calculations might not be viable, as the distributions aren't normal. Maybe Kurtosis instead or transform distribution to normal first

In [None]:
overall_mean = error_dataframe_hla_I['mean_depth'].mean()
kourami_mean = kourami_length_dist_df['mean_depth'].mean()
optitype_mean = optitype_length_dist_df['mean_depth'].mean()
hla_la_mean = hla_la_length_dist_df['mean_depth'].mean()
hisatgenotype_mean = hisatgenotype_length_dist_df['mean_depth'].mean()
stc_seq_mean = stc_seq_length_dist_df['mean_depth'].mean()

mean_values = [overall_mean, kourami_mean, optitype_mean, hla_la_mean, hisatgenotype_mean, stc_seq_mean]

overall_std = error_dataframe_hla_I['mean_depth'].std()
kourami_std = kourami_length_dist_df['mean_depth'].std()
optitype_std = optitype_length_dist_df['mean_depth'].std()
hla_la_std = hla_la_length_dist_df['mean_depth'].std()
hisatgenotype_std = hisatgenotype_length_dist_df['mean_depth'].std()
stc_seq_std = stc_seq_length_dist_df['mean_depth'].std()

std_values = [overall_std, kourami_std, optitype_std, hla_la_std, hisatgenotype_std, stc_seq_std]

plt.bar(range(len(error_dataframe_hla_I.columns[:-2])), mean_values, 
        yerr=std_values, align='center', alpha=0.5, color = '#0000FF')

plt.xticks(range(len(error_dataframe_hla_I.columns[:-2])), ['All Samples'] + list(error_dataframe_hla_I.columns[1:-2]), rotation = 45)
plt.ylabel('depth')
plt.title('Mean Sequencing coverage with std. error')
plt.show()

# The same analysis - now for HLA-II

In [None]:
kourami_length_dist_df = pd.DataFrame(columns = ['mean_depth'])
optitype_length_dist_df = pd.DataFrame(columns = ['mean_depth'])
hla_la_length_dist_df = pd.DataFrame(columns = ['mean_depth'])
hisatgenotype_length_dist_df = pd.DataFrame(columns = ['mean_depth'])
stc_seq_length_dist_df = pd.DataFrame(columns = ['mean_depth'])

for sample_id in list(error_dataframe_hla_II.index): 
    #Kourami
    n_errors = error_dataframe_hla_II.loc[sample_id, 'Kourami']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_II.loc[sample_id, 'mean_depth']
            kourami_length_dist_df = kourami_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)
    
    #HLA-LA
    n_errors = error_dataframe_hla_II.loc[sample_id, 'HLA-LA']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_II.loc[sample_id, 'mean_depth']
            hla_la_length_dist_df = hla_la_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)

    #Hisatgenotype
    n_errors = error_dataframe_hla_II.loc[sample_id, 'Hisatgenotype']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_II.loc[sample_id, 'mean_depth']
            hisatgenotype_length_dist_df = hisatgenotype_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)

    #STC-seq
    n_errors = error_dataframe_hla_II.loc[sample_id, 'STC-seq']
    if n_errors > 0:
        for n in range(n_errors):
            depth_of_error_sample = error_dataframe_hla_II.loc[sample_id, 'mean_depth']
            stc_seq_length_dist_df = stc_seq_length_dist_df.append({'mean_depth' : depth_of_error_sample}, ignore_index=True)


In [None]:
labels_boxplot = ['Full Dataset'] + labels[:-2]

overall_coverage = error_dataframe_hla_II['mean_depth'].values
kourami_coverage = kourami_length_dist_df['mean_depth'].values
hla_la_coverage= hla_la_length_dist_df['mean_depth'].values
hisatgenotype_coverage = hisatgenotype_length_dist_df['mean_depth'].values
stc_seq_coverage = stc_seq_length_dist_df['mean_depth'].values
data = [overall_coverage, kourami_coverage, hla_la_coverage, hisatgenotype_coverage, stc_seq_coverage] 
  
    
fig = plt.figure(figsize =(10, 7)) 
  
# Creating axes instance 
ax = fig.add_axes([0, 0, 1, 1]) 
 
# Creating plot 
bp = ax.boxplot(data) 
  
# show plot 
ax.set_xticklabels(labels_boxplot, rotation = 45)

ax.set_ylim(0, max(overall_coverage)+10)
 

In [None]:
overall_mean = error_dataframe_hla_II['mean_depth'].mean()
kourami_mean = kourami_length_dist_df['mean_depth'].mean()
hla_la_mean = hla_la_length_dist_df['mean_depth'].mean()
hisatgenotype_mean = hisatgenotype_length_dist_df['mean_depth'].mean()
stc_seq_mean = stc_seq_length_dist_df['mean_depth'].mean()

mean_values = [overall_mean, kourami_mean, hla_la_mean, hisatgenotype_mean, stc_seq_mean]

overall_std = error_dataframe_hla_II['mean_depth'].std()
kourami_std = kourami_length_dist_df['mean_depth'].std()
hla_la_std = hla_la_length_dist_df['mean_depth'].std()
hisatgenotype_std = hisatgenotype_length_dist_df['mean_depth'].std()
stc_seq_std = stc_seq_length_dist_df['mean_depth'].std()

std_values = [overall_std, kourami_std, hla_la_std, hisatgenotype_std, stc_seq_std]

plt.bar(range(len(error_dataframe_hla_II.columns[:-2])), mean_values, 
        yerr=std_values, align='center', alpha=0.5, color = '#0000FF')

plt.xticks(range(len(error_dataframe_hla_II.columns[:-2])), ['All Samples'] + list(error_dataframe_hla_II.columns[1:-2]), rotation = 45)
plt.ylabel('depth')
plt.title('Mean Sequencing coverage with std. error')
plt.show()

# Errors in relation to gender and ethnicity

In [None]:
labels_no_stc = ['Kourami', 'HLA-LA', 'Optitype', 'Hisatgenotype']

errors_metadata_hla_I_df[labels_no_stc+ ['mean_error_graph'] +['Region']].groupby(['Region']).agg([np.mean, np.std]) 

In [None]:
labels_no_stc_no_optitype = ['Kourami', 'HLA-LA', 'Hisatgenotype'] 

errors_metadata_hla_II_df[labels_no_stc_no_optitype + ['mean_error_graph'] + ['Region']].groupby(['Region']).agg([np.mean, np.std])

### Overall analysis - mean performance of HISAT-genotype, Kourami and HLA-LA

In [None]:
errors_metadata_hla_I_df[['mean_error_graph']+['Region']].groupby(['Region']).mean().unstack().plot(kind='bar')

In [None]:
errors_metadata_hla_II_df[['mean_error_graph']+['Region']].groupby(['Region']).mean().unstack().plot(kind='bar')

In [None]:
errors_metadata_hla_I_df[['mean_error_graph']+['Population']].groupby(['Population']).mean().unstack().plot(kind='bar')

In [None]:
errors_metadata_hla_II_df[['mean_error_graph']+['Population']].groupby(['Population']).mean().unstack().plot(kind='bar')

In [None]:
errors_metadata_hla_I_df[['mean_error_graph']+['Gender']].groupby(['Gender']).agg([np.mean, np.std])

In [None]:
errors_metadata_hla_II_df[['mean_error_graph']+['Gender']].groupby(['Gender']).agg([np.mean, np.std])

In [None]:
 errors_metadata_hla_I_df[['mean_error_graph']+['Gender']].groupby(['Gender']).mean().unstack().plot(kind='bar')

In [None]:
 errors_metadata_hla_II_df[['mean_error_graph']+['Gender']].groupby(['Gender']).mean().unstack().plot(kind='bar')

### Over Kourami, HLA-LA, Optitype and HISAT-genotype individually:

In [None]:
 errors_metadata_hla_I_df[labels_no_stc+['Region']].groupby(['Region']).mean().unstack().plot(kind='bar', figsize = (10,6))

In [None]:
 errors_metadata_hla_II_df[labels_no_stc_no_optitype+['Region']].groupby(['Region']).mean().unstack().plot(kind='bar', figsize = (10,6))

In [None]:
 errors_metadata_hla_I_df[labels_no_stc+['Population']].groupby(['Population']).mean().unstack().plot(kind='bar', figsize = (10,6))

In [None]:
 errors_metadata_hla_II_df[labels_no_stc_no_optitype+['Population']].groupby(['Population']).mean().unstack().plot(kind='bar', figsize = (10,6))

In [None]:
 errors_metadata_hla_I_df[labels_no_stc+['Gender']].groupby(['Gender']).agg([np.mean, np.std])

In [None]:
errors_metadata_hla_II_df[labels_no_stc_no_optitype+['Gender']].groupby(['Gender']).agg([np.mean, np.std])

In [None]:
 errors_metadata_hla_I_df[labels_no_stc+['Gender']].groupby(['Gender']).mean().unstack().plot(kind='bar')

In [None]:
 errors_metadata_hla_II_df[labels_no_stc_no_optitype+['Gender']].groupby(['Gender']).mean().unstack().plot(kind='bar')

# Analysis of overlap of samples 
Is there an overlap between the samples used in this study and the samples used in the development of the individual tools.

In [None]:
sample_library_filepath = 'C:\\Users\\nikol\\OneDrive\\DTU\\11_semester\\Data\\'

samples_used_in_tools_dict = dict()

for tool in ['kourami', 'hla-la', 'optitype', 'hisatgenotype']:
    full_filepath = sample_library_filepath + tool + "_samples.txt"
    
    tmp_list = list()
    
    #Read all lines with samples used in study
    with open(full_filepath, 'r') as infile:
        for line in infile:
            
            tmp_list.append(line[:-1])
    
    
    #Only include those, which are also included in the 819 samples in this study:
    filtered_tmp_list = list(set(list(MG_exome_df.index)) & set(tmp_list))
    
    samples_used_in_tools_dict[tool] = len(list(filtered_tmp_list))
        

In [None]:
already_used_samples_df = pd.DataFrame(samples_used_in_tools_dict, index=['n_samples'])
already_used_samples_df

In [None]:
plt.bar(range(len(already_used_samples_df.columns)), list(already_used_samples_df.values[0]), align='center', alpha=0.5, color = '#0000FF')

plt.xticks(range(len(already_used_samples_df.columns)), ['kourami', 'hla-la', 'optitype', 'hisatgenotype'], rotation = 45)
plt.ylim(0,100)
plt.ylabel('N samples')
plt.title('Overview of samples used both in this study\n and in the development of the individual tools')
plt.show()

# Analysis of HISAT-genotype's alternative reads


In [None]:
hisatgenotype_result_filepath = resultpath + '\\hisatgenotype\\'

hisatgenotype_files = []
for (dirpath, dirnames, filenames) in walk(hisatgenotype_result_filepath):
    hisatgenotype_files.extend(filenames)
    
#print(hisatgenotype_files)

#Save two predictions. One, with one guess per allele and one with the full prediction
hisatgenotype_results = dict()

hisatgenotype_results_ambiguous = dict()


for filename in hisatgenotype_files:
    if filename.endswith('.txt'):
    
        hisatgenotype_resultlist = list()
        
        hisatgenotype_resultlist_ambiguous = list()
        
        
        with open(hisatgenotype_result_filepath + filename) as infile:
            for line in infile:
                result = re.match(r'^\t+(1|2)\sranked (A|B|C|DRB1|DQB1)',line)
                
                ambiguous_result = re.match(r'^\t+\d\sranked (A|B|C|DRB1|DQB1)',line)

                if result is not None:
                    hisatgenotype_resultlist.append(line.split()[2])
                
                if ambiguous_result is not None:
                    hisatgenotype_resultlist_ambiguous.append(line.split()[2])

                    
            #Duplicate prediction for an allele in case of homologous case, so that each gene has two predictions.
            #In a homologous case, both result dicts only have one prediction and both needs an update.
            for allele in ['A', 'B', 'C', 'DRB1', 'DQB1']:
                allele_list = [pred for pred in hisatgenotype_resultlist if pred.startswith(allele)]

                if len(allele_list) == 1:
                    hisatgenotype_resultlist.append(allele_list[0])
                    hisatgenotype_resultlist.sort()
                    
                    hisatgenotype_resultlist_ambiguous.append(allele_list[0])
                    hisatgenotype_resultlist_ambiguous.sort()
                    
                    
                    

        hisatgenotype_results[filename[:-4]] = [i.split('*')[0] + '*' + i.split('*')[1].split(':')[0] + ':' + i.split('*')[1].split(':')[1] for i in hisatgenotype_resultlist]
        
        hisatgenotype_results_ambiguous[filename[:-4]] = [i.split('*')[0] + '*' + i.split('*')[1].split(':')[0] + ':' + i.split('*')[1].split(':')[1] for i in hisatgenotype_resultlist_ambiguous]
        
        
#hisatgenotype_results           
        

In [None]:
#Find out a percentage of how much the true allele is an alternative suggestion, when HISAT-genotype mistypes: 

# Old code

### The distribution of the depth:

In [None]:
# matplotlib histogram
plt.hist(error_dataframe_hla_I['mean_depth'], color = 'blue', edgecolor = 'black',
         bins = int(180/5))

# seaborn histogram
sns.distplot(error_dataframe_hla_I['mean_depth'], hist=True, kde=False, 
             bins=int(180/5), color = 'blue',
             hist_kws={'edgecolor':'black'})
# Add labels
plt.title('Histogram of mean sequencing depth for region')
plt.xlabel('mean sequencing depth')
plt.ylabel('Number of alleles')

In [None]:
#Can one do this - maybe, idk. Check distributions. pvalue is however not below 0.05, so no significant difference
male_mean_errors = errors_metadata_hla_I_df[errors_metadata_hla_I_df['Gender'] == 'male']['mean_error_graph']
female_mean_errors = errors_metadata_hla_I_df[errors_metadata_hla_I_df['Gender'] == 'female']['mean_error_graph']

from scipy import stats
print(stats.ttest_ind(male_mean_errors, female_mean_errors))


In [None]:
#Check for distribution of errors in males and females:
# matplotlib histogram
plt.hist(male_mean_errors, color = 'blue', edgecolor = 'black',
         bins = int(180/5), range=[0, 6])

# seaborn histogram
sns.distplot(male_mean_errors, hist=True, kde=False, 
             bins=int(180/5), color = 'blue',
             hist_kws={'edgecolor':'black'})
# Add labels
plt.title('Histogram of mean sequencing depth for region')
plt.xlabel('mean sequencing depth')
plt.ylabel('Number of alleles')

In [None]:
#Check for distribution of errors in males and females:
# matplotlib histogram
plt.hist(female_mean_errors, color = 'blue', edgecolor = 'black',
         bins = int(180/5), range=[0, 6])


# # seaborn histogram
# sns.distplot(female_mean_errors, hist=True, kde=False, 
#              bins=int(180/5), color = 'blue',
#              hist_kws={'edgecolor':'black'})
# Add labels
plt.title('Histogram of mean sequencing depth for region')
plt.xlabel('mean sequencing depth')
plt.ylabel('Number of alleles')

In [None]:

#Can one do this - maybe, idk. Check distributions. pvalue is however not below 0.05, so no significant difference
male_mean_errors = errors_metadata_hla_II_df[errors_metadata_hla_II_df['Gender'] == 'male']['mean_error_graph']
female_mean_errors = errors_metadata_hla_II_df[errors_metadata_hla_II_df['Gender'] == 'female']['mean_error_graph']

from scipy import stats
print(stats.ttest_ind(male_mean_errors, female_mean_errors))

In [None]:
#Check for distribution of errors in males and females:
# matplotlib histogram
plt.hist(male_mean_errors, color = 'blue', edgecolor = 'black',
         bins = int(180/5), range=[0, 6])

# seaborn histogram
sns.distplot(male_mean_errors, hist=True, kde=False, 
             bins=int(180/5), color = 'blue',
             hist_kws={'edgecolor':'black'})
# Add labels
plt.title('Histogram of mean sequencing depth for region')
plt.xlabel('mean sequencing depth')
plt.ylabel('Number of alleles')

In [None]:
#Check for distribution of errors in males and females:
# matplotlib histogram
plt.hist(female_mean_errors, color = 'blue', edgecolor = 'black',
         bins = int(180/5), range=[0, 6])


# # seaborn histogram
# sns.distplot(female_mean_errors, hist=True, kde=False, 
#              bins=int(180/5), color = 'blue',
#              hist_kws={'edgecolor':'black'})
# Add labels
plt.title('Histogram of mean sequencing depth for region')
plt.xlabel('mean sequencing depth')
plt.ylabel('Number of alleles')

In [None]:
errors_metadata_df[labels_no_stc+['Region','Population']].groupby(['Region','Population']).mean().unstack().plot(kind='bar', figsize= (20,10))

In [None]:
errors_metadata_df[labels_no_stc+['Region','Population']].groupby(['Region','Population']).mean().unstack().plot(kind='bar', figsize= (20,10))