In [1]:
import os
from os import walk

import pandas as pd
import numpy as np
import sys
import re

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

pd.set_option('display.max_rows', 9000)
pd.set_option('display.max_columns', 1500)
pd.set_option('max_colwidth', 400)

In [2]:
main_folder_path = 'C:\\Users\\nikol\\OneDrive\\DTU\\11_semester\\'

resultpath = main_folder_path + '1000_genomes_results\\'

gold_standard_path = main_folder_path + 'gold_standard_data\\'

In [3]:
p_group_filepath = gold_standard_path + 'p_group_resolution.txt'

g_group_filepath = gold_standard_path + 'g_group_resolution.txt'

# Functions for standard field conversion

In [4]:
#Function for converting an allele to one/two/three field resolution (disregarding any trailing letters - still unambiguous)
#For the genes A, B, C, DRB1 and DQB1 the first field always has two digits.

def convert_to_one_field(allele_high_res):
    one_field_finder = re.search(r"(A|B|C|DRB1|DQB1)\*\d{2}", allele_high_res)
    
    if one_field_finder != None:
        one_field_finder = one_field_finder.group(0)
    else:
        one_field_finder = None
    
    return one_field_finder    

    
def convert_to_two_field(allele_high_res):
    two_field_finder = re.search(r"(A|B|C|DRB1|DQB1)\*\d{2}:\d{2,3}", allele_high_res)
    
    if two_field_finder != None:
        allele_two_field = two_field_finder.group(0)
    else:
        allele_two_field = None
    
    return allele_two_field    


def convert_to_three_field(allele_high_res):
    six_field_finder = re.search(r"(A|B|C|DRB1|DQB1)\*\d{2}:\d{2,3}:\d{2,3}", allele_high_res)
    
    if six_field_finder != None:
        allele_six_field = six_field_finder.group(0)
    else:
        allele_six_field = None
    
    return allele_six_field    

# Functions for conversion to P-group resolution

Null allels are not listed in P group resolution .txt document. 
However, these should still be mapped to their proper P group - if they are part of one. 
The .txt file with G group resolution is therefore used to add these entries to p_group_dict
Synonymous mutations are not grouped in G group, and these null alleles - even though they should be belong to a P group-
are not grouped. 

In [10]:
#Make dict for P-type conversion

p_group_filepath = gold_standard_path + 'p_group_resolution.txt'

#Dict with the structure: {allele_in : allele_converted_to_p_group...}
p_group_dict = dict()

#List all null alleles not part of a G group 
null_allele_outside_g_group_list = list()


#Read the important results:
with open(p_group_filepath, 'r') as infile:
    for line in infile:
        
        #If allele doesn't belong to a P group, add it to dict as key and value
        if ('/' not in line) and (line[0] != '#'):
            gene = line.split('*')[0]
            
            #Only register the valid alleles:
            if gene in ['A', 'B', 'C', 'DRB1', 'DQB1']:
                p_group_entry = convert_to_two_field(gene + "*" + line.split(';')[-2])
                
                p_group_dict[p_group_entry] = p_group_entry
                
        
        #If several alleles map to the same one, they are separated by a "/" - this indicates a P group
        if ('/' in line) and (line[0] != '#'):
            gene = line.split('*')[0]
            
            #Only register the valid alleles:
            if gene in ['A', 'B', 'C', 'DRB1', 'DQB1']:
                        
                #Find the four field, G type resolution. The G type is found at the end of the line.
                p_group_full = gene + "*" + line.split(';')[-1][:-1]
                p_group_two_field = convert_to_two_field(p_group_full)
            
                #Read the rest of the alleles and clean up the front and end part
                synonymous_alleles = line.split('/')
                synonymous_alleles[0] = synonymous_alleles[0].split(';')[1]
                synonymous_alleles[-1] = synonymous_alleles[-1].split(';')[0]
                

                #Convert all alleles to four field resolution
                for i in range(len(synonymous_alleles)):
                    synonymous_alleles[i] = gene + "*" + synonymous_alleles[i]
                    synonymous_alleles[i] = convert_to_two_field(synonymous_alleles[i])

                #Remove duplicates when converting to four field:
                synonymous_allels_unique_two_field = list(set(synonymous_alleles))
                
                #Add key in dict for each of the unique entries:
                for synonymous_allele in synonymous_allels_unique_two_field:
                    
#                     #Check for ambiguities in the G-type conversion e.g. C*02:02:02G and C*02:10:01G have same exon sequence for exon 2 and 3
#                     if synonymous_allele in p_group_dict.keys():
#                         print(synonymous_allele, p_group_full)
                    
                    p_group_dict[synonymous_allele] = convert_to_two_field(p_group_full)
        

#Finally, add the alleles, which have been changed/updated and are included in the gold standardd/predictions manually
#e.g. A*01:34N, "Sequence shown to be expressed at low levels and renamed A*01:01:38L (March 2011)" (https://www.ebi.ac.uk/cgi-bin/ipd/imgt/hla/allele.cgi)               
p_group_dict['A*01:34'] = 'A*01:01'    
p_group_dict['C*03:99'] = 'C*01:169'
p_group_dict['A*03:260'] = 'A*03:284' 
    
    
    
#Make P type conversion function using p_group_dict
def convert_to_p_group(allele):

    #Start by converting to two field:
    allele_two_field = convert_to_two_field(allele)
    
    #Find corresponding P-type if it exists. 
    if allele_two_field in p_group_dict:
        allele_two_field_p_group = p_group_dict[allele_two_field]
    
    #Failsafe: if allele isn't found in p_group_dict (which it should be), then just use two_field_resolution
    else:
        allele_two_field_p_group = allele_two_field
        #print("Allele was not found in p_group_dict", allele_two_field)
       
        
    return allele_two_field_p_group

# NetMHCseq typing resolution
Using Evaxion typing:

In [6]:
#Make dict for evaxion-group conversion

e_group_filepath = gold_standard_path + 'e_group_resolution.txt'

e_group_dict = dict()


#Make conversion from full allele names to G groups
with open(e_group_filepath, 'r') as infile:
    
    #Add all relevant entries from e_group_filepath to e_group_dict
    for line in infile:
        if line.startswith('HLA-'):
            gene = line[4]
            
            if gene in ['A', 'B', 'C']:     
                 
                digits_from_type = re.search(r'\d{2}:\d{2,3}',line)
                
                if digits_from_type != None:
                    allele = gene + "*" + digits_from_type.group(0)

                    peptide_sequence = line.split('\t')[-1][:-1]

                    e_group_dict[allele] = peptide_sequence

            
        elif line.startswith('DRB1'):
            gene = 'DRB1'
            
            digits_from_type_raw = line.split('_')[1].split('\t')[0]
            
            digits_from_type = digits_from_type_raw[0:2] + ':' + digits_from_type_raw[2:]
            
            allele = gene + "*" + digits_from_type
                
            peptide_sequence = line.split('\t')[-1][:-1]
            
            e_group_dict[allele] = peptide_sequence

            

#Function for converting to e-group resolution:
def convert_to_e_group(allele):

    #Start by converting to P group:
    allele_p_group = convert_to_p_group(allele)
    
    #Find corresponding e-type if it exists
    if allele_p_group in e_group_dict:
        allele_e_group = e_group_dict[allele_p_group]
    
    #Else use P group
    else:
        allele_e_group = allele_p_group
        
    return allele_e_group



# Load Gold standard (## 1000Genomes)

## Load 2014 dataset

Loading and preprocessing of gold-standard dataset 

In [7]:
milleg_2014_raw_filepath= gold_standard_path + '1000G_hla_diversity.txt'
milleg_2014_filepath= gold_standard_path + '1000G_hla_diversity_edited.txt'

#Remove quotes from file:
with open(milleg_2014_raw_filepath, 'r') as infile:
    newfile = ""
    for line in infile:
        newline = line.replace("\"", "")
        newfile += newline

outfile = open(milleg_2014_filepath, 'w')
outfile.write(newfile)
outfile.close()

#Make new dataframe pretty:

mille_gs_df = pd.read_csv(milleg_2014_filepath, sep = " ")
#mille_gs_df

#Change name of Utah individuals from CEPH to CEU as seen in the 1000 genomes database:
mille_gs_df.replace('CEPH','CEU', inplace=True)

#Remove samples with NaN (non typed alleles)
mille_gs_df.dropna(inplace=True)          
#Print number of subgroups in the dataset:
#mille_gs_df['sbgroup'].unique()



### Remove samples, which doesn't have exome data on 1000Genomes and create download links

In [8]:
import http.client
from urllib.parse import urlparse

def checkUrl(url):
    p = urlparse(url)
    conn = http.client.HTTPConnection(p.netloc)
    conn.request('HEAD', p.path)
    resp = conn.getresponse()
    return resp.status < 400

# if __name__ == '__main__':
#     print(checkUrl('http://www.stackoverflow.com')) # True
#     print(checkUrl('http://stackoverflow.com/notarealpage.html')) # False

In [9]:
# #Thre results of this code are saved in two files, to save time

# #Check all exome links to create a list of id's with exome data
# gold_standard_id_list = list()

# #Save the valid urls for download later
# gold_standard_url_list = list()

# i = 0

# original_sample_list = list(mille_gs_df['id'])

# for identity in original_sample_list:
#     i += 1
#     if i % 100 == 0:
#         print(i, " iterations completed")
        
#     sbgroup = mille_gs_df[mille_gs_df['id'] == identity]['sbgroup'].iloc[0]         
#     wget_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/{}/{}/exome_alignment/{}.alt_bwamem_GRCh38DH.20150826.{}.exome.cram".format(sbgroup, identity, identity, sbgroup, )
    
#     if checkUrl(wget_url):
#         gold_standard_id_list.append(identity)
#         gold_standard_url_list.append(wget_url)

# with open(gold_standard_path + 'gold_standard_url_list.txt', 'w') as outfile:
#     for entry in wget_url_list:
#         outfile.write(entry + '\n')
    
# with open(gold_standard_path + 'gold_standard_id_list.txt', 'w') as outfile:
#     for entry in valid_wes_id_list:
#         outfile.write(entry + '\n')

gold_standard_url_list = list()
with open(gold_standard_path + 'gold_standard_url_list.txt', 'r') as infile:
    for line in infile:
        gold_standard_url_list.append(line[:-1])   

gold_standard_id_list= list()
with open(gold_standard_path + 'gold_standard_id_list.txt', 'r') as infile:
    for line in infile:
        gold_standard_id_list.append(line[:-1])   

In [10]:
MG_exome_df = mille_gs_df[mille_gs_df['id'].isin(gold_standard_id_list)].drop('sbgroup', axis = 1)

#Replace 0000 with empty string:
MG_exome_df.replace('0000', '', inplace=True)

#MG_exome_df

### Double Typing
Merge rows, where a person has been typed twice. (this was only relevant for 2014 data)

In [11]:
#Check for non-identical rows
print(len(set(list(MG_exome_df['id']))))
print(len(list(MG_exome_df['id'])))

819
822


In [12]:
non_unique = list({x for x in list(MG_exome_df['id']) if list(MG_exome_df['id']).count(x) > 1})
non_unique_df = MG_exome_df[MG_exome_df['id'].isin(non_unique)]
non_unique_df

Unnamed: 0,id,A,A.1,B,B.1,C,C.1,DRB1,DRB1.1,DQB1,DQB1.1
440,NA19119,23:01:01/23:07N/23:17/23:18/23:20,36:01,35:01:01/35:01:03/35:40N/35:42/35:57/35:94,49:01:01,04:01:01:01/04:01:01:02/04:01:01:03/04:09N/04:28/04:30/04:41,07:01:01/07:01:02/07:01:09/07:06/07:18/07:52,03:01:01:01/03:01:01:02,07:01:01:01/07:01:01:02,02:01:01/02:02/02:04,05:01:01
441,NA19119,23:01,36:01,35:01,49:01,04:01,07:01,,,,
480,NA19210,03:01,33:01,15:10,58:01,03:02:01/03:02:02/03:02:03,08:04,,,,
481,NA19210,03:01:01:01/03:01:01:02N/03:01:01:03/03:01:07/03:20/03:21N/03:26/03:37/03:45,33:01:01,15:10,58:01:01/58:11,03:02:01/03:02:02/03:02:03,08:04,03:01:01:01/03:01:01:02,13:01:01,03:01:01/03:01:04/03:09/03:19/03:21/03:22/03:24,06:03:01
486,NA19223,,,,,,,03:02,13:27,04:02,02:01
487,NA19223,30:01:01/30:01:02/30:24,33:03:01/33:03:03/33:15/33:25,41:04,42:01,17:01/17:02/17:03,17:01/17:02/17:03,03:02:01,13:27,02:01:01/02:02/02:04,04:02


In [13]:
# DQB1 and DQB1.1 have been mixed between the two entries for NA19223 - switch these around for one entry, so they match columnwise
non_unique_df.loc[486,'DQB1'] = '02:01'
non_unique_df.loc[486,'DQB1.1'] = '04:02'

non_unique_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,A,A.1,B,B.1,C,C.1,DRB1,DRB1.1,DQB1,DQB1.1
440,NA19119,23:01:01/23:07N/23:17/23:18/23:20,36:01,35:01:01/35:01:03/35:40N/35:42/35:57/35:94,49:01:01,04:01:01:01/04:01:01:02/04:01:01:03/04:09N/04:28/04:30/04:41,07:01:01/07:01:02/07:01:09/07:06/07:18/07:52,03:01:01:01/03:01:01:02,07:01:01:01/07:01:01:02,02:01:01/02:02/02:04,05:01:01
441,NA19119,23:01,36:01,35:01,49:01,04:01,07:01,,,,
480,NA19210,03:01,33:01,15:10,58:01,03:02:01/03:02:02/03:02:03,08:04,,,,
481,NA19210,03:01:01:01/03:01:01:02N/03:01:01:03/03:01:07/03:20/03:21N/03:26/03:37/03:45,33:01:01,15:10,58:01:01/58:11,03:02:01/03:02:02/03:02:03,08:04,03:01:01:01/03:01:01:02,13:01:01,03:01:01/03:01:04/03:09/03:19/03:21/03:22/03:24,06:03:01
486,NA19223,,,,,,,03:02,13:27,02:01,04:02
487,NA19223,30:01:01/30:01:02/30:24,33:03:01/33:03:03/33:15/33:25,41:04,42:01,17:01/17:02/17:03,17:01/17:02/17:03,03:02:01,13:27,02:01:01/02:02/02:04,04:02


In [14]:
clean_duplicates_df = pd.DataFrame()
for column in ['A','A.1','B','B.1','C','C.1','DRB1','DRB1.1','DQB1','DQB1.1']:
    clean_duplicates_df[column] = non_unique_df.groupby(['id'])[column].apply('/'.join)
    
    #Remove potentaial starting '/'
    for identity in clean_duplicates_df.index:
        entry = clean_duplicates_df.loc[identity, column]
        
        if entry.startswith('/'):
            clean_duplicates_df.at[identity, column] = entry[1:]
        if entry.endswith('/'):
            clean_duplicates_df.at[identity, column] = entry[:-1]
        
clean_duplicates_df.reset_index(inplace=True)

In [15]:
clean_duplicates_df

Unnamed: 0,id,A,A.1,B,B.1,C,C.1,DRB1,DRB1.1,DQB1,DQB1.1
0,NA19119,23:01:01/23:07N/23:17/23:18/23:20/23:01,36:01/36:01,35:01:01/35:01:03/35:40N/35:42/35:57/35:94/35:01,49:01:01/49:01,04:01:01:01/04:01:01:02/04:01:01:03/04:09N/04:28/04:30/04:41/04:01,07:01:01/07:01:02/07:01:09/07:06/07:18/07:52/07:01,03:01:01:01/03:01:01:02,07:01:01:01/07:01:01:02,02:01:01/02:02/02:04,05:01:01
1,NA19210,03:01/03:01:01:01/03:01:01:02N/03:01:01:03/03:01:07/03:20/03:21N/03:26/03:37/03:45,33:01/33:01:01,15:10/15:10,58:01/58:01:01/58:11,03:02:01/03:02:02/03:02:03/03:02:01/03:02:02/03:02:03,08:04/08:04,03:01:01:01/03:01:01:02,13:01:01,03:01:01/03:01:04/03:09/03:19/03:21/03:22/03:24,06:03:01
2,NA19223,30:01:01/30:01:02/30:24,33:03:01/33:03:03/33:15/33:25,41:04,42:01,17:01/17:02/17:03,17:01/17:02/17:03,03:02/03:02:01,13:27/13:27,02:01/02:01:01/02:02/02:04,04:02/04:02


In [16]:
#Remove the duplicate columns from the full dataframe"
MG_exome_df = MG_exome_df[~MG_exome_df['id'].isin(non_unique)]

#Add back the clean duplicate rows:
MG_exome_df = MG_exome_df.append(clean_duplicates_df, sort=False)

#Reset index
MG_exome_df.reset_index(inplace=True, drop = True)

#Check that only uniwue entries exist now.
print(len(set(list(MG_exome_df['id']))))
print(len(list(MG_exome_df['id'])))

819
819


In [17]:
#Set id as index
MG_exome_df.set_index('id', inplace=True)

## Separate multiple predictions to interable lists and mark untyped alleles:

In [18]:
#Remember entries, which are not typed
non_typed_samples = list()

#Furthermore - in cases, where 2 field resolution is not available, use 2018 data.
for identity in list(MG_exome_df.index):
    
    for col in MG_exome_df.columns:
        #get list of predictions
        predictions_list = MG_exome_df.loc[identity,col].split('/')
        
        #Put gene name in front of all entries in the list
        for i in range(len(predictions_list)):
            predictions_list[i] = col.split('.')[0] + "*" + predictions_list[i]
        
        #Check, that all entries in the list have at least four field resolution:
        for pred in predictions_list:
            
            #Remove all non valid entries
            if convert_to_two_field(pred) == None:
                predictions_list.remove(pred)
        
        #Note, if no proper typing was made in 2014
        if len(predictions_list) == 0:
            non_typed_samples.append(identity)
            predictions_list = ['not_typed_in_2014']
            
        #Convert to P-type resolution:
        
 
        MG_exome_df.at[identity,col] = predictions_list

In [19]:
non_typed_samples

['NA12234', 'NA12249']

In [20]:
MG_exome_df

Unnamed: 0_level_0,A,A.1,B,B.1,C,C.1,DRB1,DRB1.1,DQB1,DQB1.1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NA06985,[A*03:01],[A*02:01],[B*07:02:01],[B*57:01],[C*07:02],[C*06:02:01:01],[DRB1*15:01],[DRB1*15:01],[DQB1*06:02:01],[DQB1*06:02:01]
NA06986,[A*03:01],"[A*32:01:01, A*32:01:02]","[B*44:03:01, B*44:03:03, B*44:03:04]","[B*44:03:01, B*44:03:03, B*44:03:04]",[C*04:01],[C*16:01],[DRB1*07:01],[DRB1*07:01],[DQB1*02:01],[DQB1*02:01]
NA06994,[A*02:01],"[A*32:01:01, A*32:01:02]",[B*40:02],[B*08:01],[C*02:02:02],"[C*07:01, C*07:06]",[DRB1*01:01],[DRB1*04:04],[DQB1*05:01],[DQB1*03:02]
NA07000,[A*02:01],[A*68:01:02],[B*44:02],[B*40:01],"[C*03:03, C*03:04]",[C*07:04],[DRB1*03:01],"[DRB1*11:01:01, DRB1*11:01:08]",[DQB1*02:01],[DQB1*03:01]
NA07037,[A*30:01],[A*31:01],[B*15:10],[B*40:01],[C*03:04:02],[C*03:04],[DRB1*04:04],[DRB1*13:02],[DQB1*03:02],[DQB1*06:04]
NA07048,[A*02:01],[A*02:01],[B*44:02:01:01],[B*07:02],[C*05:01],[C*07:02],[DRB1*04:01],[DRB1*15:01],[DQB1*03:01:01],[DQB1*06:02:01]
NA07051,[A*02:01],[A*68:01:02],"[B*15:01, B*15:12, B*15:19]",[B*07:02],[C*03:03],[C*07:02],[DRB1*04:01],[DRB1*15:01],[DQB1*03:01],[DQB1*06:02]
NA07056,[A*01:01],[A*02:01],[B*08:01],[B*57:01],"[C*07:01, C*07:06]","[C*06:02:01:01, C*06:02:01:02, C*06:02:03]",[DRB1*03:01],[DRB1*07:01],[DQB1*02:01],[DQB1*03:03]
NA07347,"[A*30:02:01, A*30:02:02]",[A*26:01],[B*18:01],[B*44:02],[C*05:01],[C*05:01],[DRB1*03:01],[DRB1*11:04],[DQB1*02:01],[DQB1*03:01]
NA07357,[A*01:01],[A*24:02],[B*08:01],[B*39:06],"[C*07:01, C*07:06]",[C*07:02],[DRB1*03:01],[DRB1*04:04],[DQB1*02:01],[DQB1*03:02]


## Merge Alleles in original dataframe

In [21]:
#Merge haplotypes
MG_exome_df['A_merged']= MG_exome_df[['A', 'A.1']].apply(lambda x: list(x), axis=1)
MG_exome_df['B_merged']= MG_exome_df[['B', 'B.1']].apply(lambda x: list(x), axis=1)
MG_exome_df['C_merged']= MG_exome_df[['C', 'C.1']].apply(lambda x: list(x), axis=1)
MG_exome_df['DRB1_merged']= MG_exome_df[['DRB1', 'DRB1.1']].apply(lambda x: list(x), axis=1)
MG_exome_df['DQB1_merged']= MG_exome_df[['DQB1', 'DQB1.1']].apply(lambda x: list(x), axis=1)

MG_exome_merged_df = MG_exome_df.drop(columns=['A', 'A.1', 'B', 'B.1', 'C', 'C.1', 'DRB1', 'DRB1.1','DQB1', 'DQB1.1'])

MG_exome_merged_df.rename(columns={"A_merged": "A", "B_merged": "B", "C_merged": "C", "DRB1_merged": "DRB1", "DQB1_merged": "DQB1"}, inplace = True)

## Load and convert 2018 data for error correction in 2014 dataset

This data is not used itself directly, but fills out the gaps in the 2014 dataset loaded later

In [22]:
#10 samples analysed in ATHLATES and with results taken from 2018 dataset.
high_cov_id_list = ["HG01756", "HG01757", "HG01872", "HG01873", "HG01886", "HG01953", "HG01968", "HG02014", "HG02057", "NA20313"]

In [23]:
gs_2018_filepath = gold_standard_path + '2018_1129_HLA_types_full_1000_Genomes_Project_panel.txt'

gs_2018_raw_df = pd.read_csv(gs_2018_filepath, sep = "\t")

#Rename columns
gs_2018_raw_df.rename(columns={'Sample ID': 'id', 'Population': 'sbgroup', 'HLA-A 1': 'A', 'HLA-A 2': 'A.1', 'HLA-B 1': 'B',
                            'HLA-B 2': 'B.1', 'HLA-C 1': 'C', 'HLA-C 2': 'C.1', 'HLA-DQB1 1': 'DQB1', 'HLA-DQB1 2': 'DQB1.1',
                            'HLA-DRB1 1': 'DRB1' , 'HLA-DRB1 2': 'DRB1.1' }, inplace=True)

#Set id as index
gs_2018_raw_df.set_index('id', inplace=True)

#Remove samples with NaN (non typed alleles)
gs_2018_df = gs_2018_raw_df.dropna()          


In [24]:
#Find entries with an *, indicating a former mistake in the 2014 dataset
changed_sample_indexes = list()

for index in gs_2018_df.index:
    for col in gs_2018_df.columns[1:]:
        for entry in gs_2018_df.loc[index][col]:
            if '*' in entry:              
                changed_sample_indexes.append(index)

#Include samples, which are missing as well.
changed_sample_indexes += non_typed_samples

#Include the 10 high coverage samples:
changed_sample_indexes += high_cov_id_list

In [25]:
#make relevant dataframe smaller
corrections_2018_df_raw = gs_2018_df.loc[set(changed_sample_indexes)]

#Limit the 2018 dataframe to 2014 samples and the 10 high coverage samples 
corrections_2018_df = corrections_2018_df_raw[corrections_2018_df_raw.index.isin(list(MG_exome_df.index) + high_cov_id_list)]

In [26]:
#Make dataframe for writing into 2014 dataframe:

#Remove * from entry:
corrections_2018_df_no_asterix = corrections_2018_df.copy()
corrections_2018_df_no_asterix=corrections_2018_df_no_asterix.replace({'\*':''}, regex=True)

#Add allele name in front of the resolution:
for identity in corrections_2018_df_no_asterix.index:
    for col in corrections_2018_df_no_asterix.columns[2:]:
        entrylist = corrections_2018_df_no_asterix.loc[identity][col].split('/')       
        
        entrylist = list(set([col.split('.')[0] + "*" + i.split('*')[0] for i in entrylist]))
        
        corrections_2018_df_no_asterix.at[identity,col] = entrylist


corrections_2018_df

#Merge alleles into one column:

corrections_2018_df_no_asterix['A_merged']= corrections_2018_df_no_asterix[['A', 'A.1']].apply(lambda x: list(x), axis=1)
corrections_2018_df_no_asterix['B_merged']= corrections_2018_df_no_asterix[['B', 'B.1']].apply(lambda x: list(x), axis=1)
corrections_2018_df_no_asterix['C_merged']= corrections_2018_df_no_asterix[['C', 'C.1']].apply(lambda x: list(x), axis=1)
corrections_2018_df_no_asterix['DRB1_merged']= corrections_2018_df_no_asterix[['DRB1', 'DRB1.1']].apply(lambda x: list(x), axis=1)
corrections_2018_df_no_asterix['DQB1_merged']= corrections_2018_df_no_asterix[['DQB1', 'DQB1.1']].apply(lambda x: list(x), axis=1)

corrections_2018_merged_df = corrections_2018_df_no_asterix.drop(columns=['A', 'A.1', 'B', 'B.1', 'C', 'C.1', 'DRB1', 'DRB1.1','DQB1', 'DQB1.1'])

corrections_2018_merged_df.rename(columns={"A_merged": "A", "B_merged": "B", "C_merged": "C", "DRB1_merged": "DRB1", "DQB1_merged": "DQB1"}, inplace = True)

In [27]:
#Update samples, where typing was not performed in 2014:

for identity in non_typed_samples:
    for col in list(MG_exome_merged_df.columns)[1:]:
        entry = MG_exome_merged_df.loc[identity,col]
        if ['not_typed_in_2014'] in entry:
            
            column = col.split('.')[0]
                
            new_type = corrections_2018_merged_df.loc[identity][column]

            old_type = MG_exome_merged_df.loc[identity,column]

            corrected_typing = list()

            for new_allele in list(new_type):
                if (new_allele in list(old_type)[0]):
                    corrected_typing.append(list(old_type)[0][0])
                elif new_allele in list(old_type)[1]:
                    corrected_typing.append(list(old_type)[0][1])
                else:
                    corrected_typing.append(new_allele)

            MG_exome_merged_df.at[identity,column] = corrected_typing
            
            

In [28]:
corrections_2018_df

Unnamed: 0_level_0,Region,sbgroup,A,A.1,B,B.1,C,C.1,DQB1,DQB1.1,DRB1,DRB1.1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NA19319,AFR,LWK,02:01,36:01,15:03,53:01,02:10,04:01,03:01,03:19*,08:04,11:01
NA18504,AFR,YRI,36:01,36:01,15:03,39:10,02:10*,12:03,02:01,05:01,01:23*,03:01
NA19099,AFR,YRI,23:01,36:01,07:02,07:02,07:02,15:05,02:02*,05:02,09:01,15:03
NA19153,AFR,YRI,33:03,36:01,18:01,53:01,04:01,05:01,04:02,05:03,03:02,14:54*
NA19239,AFR,YRI,02:01,68:02,35:01,52:01,04:01,16:01,03:01,05:01,13:01,12:01*
NA11840,EUR,CEU,02:01,02:01,27:05*,57:01,02:02,06:02,03:02,03:03,04:04,07:01
HG01886,AFR,ACB,30:02,74:01,15:03,57:03,02:10,07:01,05:02,06:09,11:01,13:02
HG02057,EAS,KHV,02:03,31:01,48:01,13:01,03:03,03:04,03:01,03:01,11:01,13:12
NA11830,EUR,CEU,02:01,02:01,14:01,14:02,08:02,08:02,02:02*,03:01,13:03,07:01
NA19172,AFR,YRI,30:02,68:01,35:01,44:03,04:01,04:01,02:02*,02:02*,07:01,15:03


In [29]:
len(corrections_2018_df)

67

In [30]:
#Perform update of samples, where mistakes have been found

changed_sample_indexes = list()


#Loop over the samples, which needs to be corrected
for identity in list(corrections_2018_df.index):
    
    #Loop over relevant columns (genes)
    for col in corrections_2018_df.columns[3:]:
        
        #Loop over alleles
        for entry in corrections_2018_df[corrections_2018_df.index == identity][col]:
            
            #If allele is corrected since 2014 - update it
            if '*' in entry:
                column = col.split('.')[0]
                
                new_type = corrections_2018_merged_df.loc[identity][column]
                
                old_type = MG_exome_merged_df.loc[identity,column]
                
                #print(old_type)
                corrected_typing = list()
                
                for new_allele in list(new_type):
                    if (new_allele in list(old_type)[0]):
                        corrected_typing.append(list(old_type)[0][0])
                    elif new_allele in list(old_type)[1]:
                        corrected_typing.append(list(old_type)[0][1])
                    else:
                        corrected_typing.append(new_allele)
                
#                 #Make sure, that the correction is p-type resolution:
#                 for i in range(len(corrected_typing)):
#                     corrected_typing[i] = [convert_to_two_field_p_group(corrected_typing[i][0])]
                
                MG_exome_merged_df.at[identity,column] = corrected_typing
            
    #Add the 10 high coverage samples to MG_exome_merged
    if identity in high_cov_id_list:
        

        high_cov_prediction_row = corrections_2018_merged_df.loc[identity][['A','B','C','DRB1','DQB1']]
        
#         #Make sure, that correction is p-type resolution:
#         for gene in high_cov_prediction_row.index:
#             for i in range(len(high_cov_prediction_row[gene])):
#                 high_cov_prediction_row[gene][i][0] = convert_to_two_field_p_group(high_cov_prediction_row[gene][i][0])
        
        MG_exome_merged_df = MG_exome_merged_df.append(high_cov_prediction_row)    

## Make URL list for 10 high coverage WES samples

In [31]:
print(high_cov_id_list) 

# high_cov_url_list = list()

# for identity in high_cov_id_list:
       
#     sbgroup = m2018_gs_df.loc[identity]['sbgroup']
#     wget_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/{}/{}/exome_alignment/{}.alt_bwamem_GRCh38DH.20150826.{}.exome.cram".format(sbgroup, identity, identity, sbgroup, )
    
#     if checkUrl(wget_url):
#         high_cov_url_list.append(wget_url)

# with open(gold_standard_path + 'high_cov_url_list.txt', 'w') as outfile:
#     for entry in high_cov_url_list:
#         outfile.write(entry + '\n')
    
# with open(gold_standard_path + 'high_cov_id_list.txt', 'w') as outfile:
#     for entry in high_cov_id_list:
#         outfile.write(entry + '\n')

high_cov_url_list = list()
with open(gold_standard_path + 'high_cov_url_list.txt', 'r') as infile:
    for line in infile:
        high_cov_url_list.append(line[:-1])   

high_cov_id_list= list()
with open(gold_standard_path + 'high_cov_id_list.txt', 'r') as infile:
    for line in infile:
        high_cov_id_list.append(line[:-1])   

['HG01756', 'HG01757', 'HG01872', 'HG01873', 'HG01886', 'HG01953', 'HG01968', 'HG02014', 'HG02057', 'NA20313']


## Write config.yaml file for Snakemake workflow (both 2014 data and high cov data)

In [32]:
full_sample_urllist = list(set(gold_standard_url_list)) + high_cov_url_list
full_sample_id_list = list(set(gold_standard_id_list)) + high_cov_id_list

configfile_1 = 'C:\\Users\\nikol\\OneDrive\\DTU\\11_semester\\Data\\config_all.yaml'

with open(configfile_1, 'w') as outfile:
    outfile.write('sample_urlist:\n')
    for entry in full_sample_urllist:
        entry_name = entry.split('/')[9]
        outfile.write("  " + entry_name + ": " +  "\"" + entry + "\"" + '\n')


In [33]:
len(gold_standard_id_list)

822

In [34]:
len(MG_exome_merged_df.index)

829

# Make three (four) types of old standard based on MG_exome_merged_df

1) Four field resolution
2) G-group resolution
3) P-group resolution
4) Evaxion-group resolution

In [35]:
gs_one_field_df = MG_exome_merged_df.copy()
gs_two_field_df = MG_exome_merged_df.copy()
gs_p_group_df = MG_exome_merged_df.copy()
gs_e_group_df = MG_exome_merged_df.copy()

#Loop over all entries and update the three gold standard dataframes, so they fit the idividual resolution
for identity in list(MG_exome_merged_df.index):
    for gene in list(MG_exome_merged_df.columns):
        
        old_pred_list = MG_exome_merged_df.loc[identity,gene]
        
        gene_pred_one_field = list()
        gene_pred_two_field = list()
        gene_pred_p_group = list()
        gene_pred_e_group = list()
    
    
        #Loop over the two alleles
        for allele_list in old_pred_list:
            
            allele_pred_one_field = list()            
            allele_pred_two_field = list()
            allele_pred_p_group = list()
            allele_pred_e_group = list()
            
            #Convert each prediction to it's respective correct format:
            for allele in allele_list:
                
                allele_pred_one_field.append(convert_to_one_field(allele))
                allele_pred_two_field.append(convert_to_two_field(allele))
                allele_pred_p_group.append(convert_to_p_group(allele))
                allele_pred_e_group.append(convert_to_e_group(allele))
                
                if convert_to_two_field(allele) in null_allele_outside_g_group_list:
                    print(identity, allele, allele_list)
                
        
            #Merge the two alleles for each gene in a list
            gene_pred_one_field.append(list(set(allele_pred_one_field)))
            gene_pred_two_field.append(list(set(allele_pred_two_field)))
            gene_pred_p_group.append(list(set(allele_pred_p_group)))
            gene_pred_e_group.append(list(set(allele_pred_e_group)))
        
        #Update dataframes wit0h the new predictions
        gs_one_field_df.at[identity,gene] = gene_pred_one_field
        gs_two_field_df.at[identity,gene] = gene_pred_two_field
        gs_p_group_df.at[identity,gene] = gene_pred_p_group
        gs_e_group_df.at[identity,gene] = gene_pred_e_group

NA12287 B*15:26 ['B*15:01', 'B*15:26', 'B*15:12', 'B*15:19']


# Save Dataframes as pickle objects

In [36]:
MG_exome_merged_df.to_pickle(gold_standard_path + "MG_exome_merged_df.pkl")

gs_one_field_df.to_pickle(gold_standard_path + "gs_one_field_df.pkl")

gs_two_field_df.to_pickle(gold_standard_path + "gs_two_field_df.pkl")

gs_p_group_df.to_pickle(gold_standard_path + "gs_p_group_df.pkl")

gs_e_group_df.to_pickle(gold_standard_path + "gs_e_group_df.pkl")

In [38]:
len(MG_exome_merged_df)

829