In [1]:
import pandas as pd
import numpy as np

### data load

In [2]:
# path 
import os
os.chdir('/home/midan/_midanniiii/IMGTHLA/fasta')

In [3]:
import Bio

In [4]:
# for HLA-C
from Bio import SeqIO, Seq
input_file = 'C_prot.fasta'
records = SeqIO.parse(input_file, 'fasta')
records = list(records) # make a copy, otherwise our generator is exhausted after calculating maxlen
maxlen = max(len(record.seq) for record in records)

# pad sequences so that they all have the same length
for record in records:
    if len(record.seq) != maxlen:
        sequence = str(record.seq).ljust(maxlen, '.')
        record.seq = Seq.Seq(sequence)
assert all(len(record.seq) == maxlen for record in records)

In [5]:
des = []
for record in records:
    des.append(record.description)

In [6]:
des[0]

'HLA:HLA00401 C*01:02:01:01 366 bp'

In [7]:
import pandas as pd

In [8]:
ids = []
full_ids = []
delete = []
name = []
for i in range(len(records)):
    des = records[i].description[13:] # A*01:01:01:01 365 bp
    if des[0] == 'C':
        full_ids.append(des[2:des.find('bp')-5])
        name.append(records[i].name)
        if des.count(':')<2:
            ids.append(des[2:des.find('bp')-4])
        else:
            ids.append(des[2:des.find(':', des.find(':')+1)])

In [9]:
df = pd.DataFrame(data = {'ids': ids, 'full_id': full_ids, 'name':name})

In [10]:
df

Unnamed: 0,ids,full_id,name
0,01:02,01:02:01:01,HLA:HLA00401
1,01:02,01:02:01:02,HLA:HLA16697
2,01:02,01:02:01:03,HLA:HLA16866
3,01:02,01:02:01:04,HLA:HLA17051
4,01:02,01:02:01:05,HLA:HLA17299
...,...,...,...
5704,18:10,18:10,HLA:HLA14214
5705,18:11,18:11,HLA:HLA19746
5706,18:12,18:12,HLA:HLA21586
5707,18:13,18:13,HLA:HLA23551


### delete overlapped

In [11]:
uniques = list(df['ids'].unique())

In [12]:
len(uniques)

3704

In [13]:
overlap = [] # alleles of overlapped
not_overlap = [] # alleles of not overlapped
for uniq in uniques:
    if sum(df.ids == uniq) != 1:
        overlap.append(uniq) 
    else:
        not_overlap.append(uniq)

In [14]:
len(not_overlap)

3477

In [15]:
only = [] # delete overlapped alleles
for ovl in overlap:
    ovls = []
    for i in range(len(df)):
        if df.ids[i] == ovl:
            ovls.append(i)
        elif df.ids[i] in not_overlap:
            only.append(i)
    only.append(ovls[0])
only = sorted(list(set(only)))

In [16]:
len(only)

3704

In [17]:
import numpy as np

In [18]:
data = np.asarray(df)[only]

In [19]:
hla_a = []
a_name = []
for i in range(len(data)):
    hla_a.append('HLA-C-' + data[i][0][0:2] + data[i][0][3:5])
    a_name.append(data[i][2])

In [20]:
pd.DataFrame(data={'alleles':hla_a, 'name':a_name})

Unnamed: 0,alleles,name
0,HLA-C-0102,HLA:HLA00401
1,HLA-C-0103,HLA:HLA00402
2,HLA-C-0104,HLA:HLA01075
3,HLA-C-0105,HLA:HLA01434
4,HLA-C-0106,HLA:HLA01555
...,...,...
3699,HLA-C-1810,HLA:HLA14214
3700,HLA-C-1811,HLA:HLA19746
3701,HLA-C-1812,HLA:HLA21586
3702,HLA-C-1813,HLA:HLA23551


In [21]:
from Bio import Align
from Bio.Align import Applications
from Bio.Align.Applications import MuscleCommandline

In [23]:
from Bio import SeqIO
records = (r for r in SeqIO.parse('C_prot.fasta', 'fasta') if len(r)<900)

In [24]:
from io import StringIO
handle = StringIO()
SeqIO.write(records, handle, 'fasta')
data = handle.getvalue()

In [25]:
muscle_exe = '/home/midan/_midanniiii/muscle3.8.31_i86linux64'
muscle_cline = MuscleCommandline(muscle_exe)
stdout, stderr = muscle_cline(stdin=data)

In [26]:
empty = []
new = []
for i in range(len(stdout)):
    if stdout[i] == '>':
        new.append(empty)
        empty = []
    else:
        empty.append(stdout[i])

In [27]:
align = []
for i in range(len(new)):
    align.append((','.join(new[i])).replace(',',''))

In [28]:
# alignment & name
new_align = []
names = []
for i in range(len(align)):
    a = align[i]
    p = a.find('p')
    new_align.append(align[i][p+2:])
    names.append(align[i][:12])

In [29]:
names

['',
 'HLA:HLA22272',
 'HLA:HLA15623',
 'HLA:HLA14818',
 'HLA:HLA23310',
 'HLA:HLA22038',
 'HLA:HLA23892',
 'HLA:HLA14110',
 'HLA:HLA12203',
 'HLA:HLA22357',
 'HLA:HLA11345',
 'HLA:HLA07030',
 'HLA:HLA12513',
 'HLA:HLA12349',
 'HLA:HLA16952',
 'HLA:HLA08823',
 'HLA:HLA10588',
 'HLA:HLA22100',
 'HLA:HLA22667',
 'HLA:HLA23040',
 'HLA:HLA19868',
 'HLA:HLA21609',
 'HLA:HLA06422',
 'HLA:HLA08174',
 'HLA:HLA14344',
 'HLA:HLA13675',
 'HLA:HLA07972',
 'HLA:HLA07243',
 'HLA:HLA11346',
 'HLA:HLA11670',
 'HLA:HLA12728',
 'HLA:HLA14033',
 'HLA:HLA17518',
 'HLA:HLA19659',
 'HLA:HLA20501',
 'HLA:HLA21678',
 'HLA:HLA22465',
 'HLA:HLA07009',
 'HLA:HLA15144',
 'HLA:HLA15142',
 'HLA:HLA11583',
 'HLA:HLA14032',
 'HLA:HLA04696',
 'HLA:HLA01711',
 'HLA:HLA03437',
 'HLA:HLA03077',
 'HLA:HLA15677',
 'HLA:HLA22224',
 'HLA:HLA05739',
 'HLA:HLA07017',
 'HLA:HLA14031',
 'HLA:HLA08825',
 'HLA:HLA04617',
 'HLA:HLA14034',
 'HLA:HLA14757',
 'HLA:HLA06436',
 'HLA:HLA02831',
 'HLA:HLA20060',
 'HLA:HLA06403',
 'HLA:HLA

In [31]:
for i in range(len(new_align)):
    new_align[i] = new_align[i].replace('\n','')
new_align = new_align[1:]
names = names[1:]

In [32]:
m = new_align[0].find('M')
for i in range(len(new_align)):
    new_align[i] = new_align[i][m:]

In [33]:
new_align

['MRYFSTSVSX--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------',
 'MRYFSTSVSWPGRGEPRFIAVGYVDDTQFVRFDSDAAESKRGA-------------AGA--------------------------------------V---------G----GAGGAGVLGPGDTEVQAPGTGX----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------',
 'MRYFSTSVSWPGRGX-------------------------------------------------------------------------------------

In [34]:
df_a = pd.DataFrame(data = {'align' : new_align, 'name':names})
a = pd.DataFrame(data = {'name': a_name, 'alleles': hla_a})
hla_a = pd.merge(a,df_a)
hla_c = hla_a.drop(['name'], axis=1)

In [35]:
hla_c

Unnamed: 0,alleles,align
0,HLA-C-0102,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEP-RA...
1,HLA-C-0103,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEP-RA...
2,HLA-C-0104,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEP-RA...
3,HLA-C-0105,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEP-RA...
4,HLA-C-0106,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEP-RA...
...,...,...
3698,HLA-C-1810,MRYFDTAVSRPGRGEPRFISVGYVDDTQFVRFDSDPASPRGEP-RA...
3699,HLA-C-1811,MRYFDTAVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEP-RA...
3700,HLA-C-1812,MRYFDTAVSWPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEP-RA...
3701,HLA-C-1813,MRYFDTAVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEP-RA...


In [36]:
aligns = list(hla_a['align'])
align = aligns[0]
indices = []
for i in range(len(align)):
    if align[i] =='-':
        indices.append(i)

In [37]:
new_align = []
for i in range(len(aligns)):
    align = aligns[i]
    a = []
    for j in range(len(align)):
        if j not in indices:
            a.append(align[j])
    new_align.append(('').join(a))
    new_align[i] = new_align[i].replace('-','*')

In [38]:
new_align

['MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAPWVEQEGPEYWDRETQKYKRQAQTDRVSLRNLRGYYNQSEAGSHTLQWMCGCDLGPDGRLLRGYDQYAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQRRAYLEGTCVEWLRRYLENGKETLQRAEHPKTHVTHHPVSDHEATLRCWALGFYPAEITLTWQWDGEDQTQDTELVETRPAGDGTFQKWAAVMVPSGEEQRYTCHVQHEGLPEPLTLRWEPSSQPTIPIVGIVAGLAVLAVLAVLGAVVAVVMCRRKSSGGKGGSCSQAASSNSAQGSDESLIACKA',
 'MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAPWVEQEGPEYWDRETQKYKRQAQTDRVSLRNLRGYYNQSEAGSHTLQWMCGCDLGPDGRLLRGYNQFAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQRRAYLEGTCVEWLRRYLENGKETLQRAEHPKTHVTHHPVSDHEATLRCWALGFYPLTWQWD*****EDQTQDTELVETRPAGDGTFQKWAAVMVPSGEEQRYTCHVQHEGLPEPLTLRWEPSSQPTIPIVGIVAGLAVLAVLAVLGAVVAVVMCRRKSSGGKGGSCSQAASSNSAQGSDESLIACKA',
 'MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAPWVEQEGPEYWDRETQKYKRQAQTDRVSLRNLRGYYNQSEAGSHTLQWMCGCDLGPDGRLLRGYDQSAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQWRAYLEGTCVEWLRRYLENGKETLQRAEHPKTHVTHHPVSDHEATLRCWALGFYPLTWQRD*****EDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPEPLTLRWEPSSQPTIPIVGIVAGLAVLAVLAVLGAVMAVVMCRRKSSGG

In [39]:
hla_c = pd.DataFrame(data = {'alleles':list(hla_a['alleles']), 'align': new_align})

In [40]:
hla_c

Unnamed: 0,alleles,align
0,HLA-C-0102,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAP...
1,HLA-C-0103,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAP...
2,HLA-C-0104,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAP...
3,HLA-C-0105,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAP...
4,HLA-C-0106,MKYFFTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAP...
...,...,...
3698,HLA-C-1810,MRYFDTAVSRPGRGEPRFISVGYVDDTQFVRFDSDPASPRGEPRAP...
3699,HLA-C-1811,MRYFDTAVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAP...
3700,HLA-C-1812,MRYFDTAVSWPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAP...
3701,HLA-C-1813,MRYFDTAVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRGEPRAP...


In [41]:
hla_c.to_csv('/home/midan/_midanniiii/HLA_C_prot.txt', index=False, header=None, sep='\t')