In [46]:
import os
import json
import pandas as pd

from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP

In [47]:
# Directory layout (all paths are relative to the notebook's working directory).
meta_dir, res_dir = 'metadata', 'results'
annotations_dir = os.path.join('annotation', 'prokka')

# Input files.
targets_file = os.path.join(meta_dir, 'collect_this.tsv')
gff_file = os.path.join(res_dir, 'upstreams_clusters_domains.gff')
assembly_info = os.path.join('ncbi_dataset', 'data', 'assembly_data_report.jsonl')

# Output table.
out_file = os.path.join('selected_variants_cloning', 'concatenated_info.tsv')

Read targets file:

In [48]:
# Tab-separated table read without a header row; column names are supplied here.
targets = pd.read_csv(
    targets_file,
    sep='\t',
    names=['cluster_num', 'seq_id'],
)

In [49]:
targets.head()

Unnamed: 0,cluster_num,seq_id
0,6,MW074885.1
1,7,NC_048064.1
2,8,NC_048071.1
3,9,NC_047777.1
4,10,NC_047895.1


Read gff:

In [50]:
# Nine GFF columns; only seq_id, coordinates, strand and the attribute string are kept.
colnames = ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attribute']
# skiprows=1 drops the first line of the file (presumably the GFF header line — confirm).
gff_data = pd.read_csv(gff_file, sep='\t', skiprows=1, names=colnames)
gff_data = gff_data.drop(columns=['source', 'type', 'score', 'phase'])

gff_data['attribute'] = gff_data['attribute'].apply(lambda x: x.replace('=;', '=NaN;'))  # fix a data artefact
# Parse 'key=value;key=value' into a dict.
# Split on the FIRST '=' only so that values which themselves contain '=' are
# kept whole, and skip fragments without '=' (e.g. the empty string left by a
# trailing ';'), which would otherwise raise.
gff_data['attribute_dict'] = gff_data['attribute'].apply(
    lambda x: dict(pair.split('=', 1) for pair in x.split(';') if '=' in pair)
)
gff_data.head()

Unnamed: 0,seq_id,start,end,strand,attribute,attribute_dict
0,AY264775.1,901,1254,+,ID=GCA_002600005.1_00001;file_id=upstreams_wit...,"{'ID': 'GCA_002600005.1_00001', 'file_id': 'up..."
1,AY264775.1,1254,1409,+,ID=GCA_002600005.1_00002;file_id=upstreams_wit...,"{'ID': 'GCA_002600005.1_00002', 'file_id': 'up..."
2,AY264775.1,1478,1615,+,ID=GCA_002600005.1_00003;file_id=upstreams_wit...,"{'ID': 'GCA_002600005.1_00003', 'file_id': 'up..."
3,AY264775.1,1612,2256,+,ID=GCA_002600005.1_00004;file_id=upstreams_wit...,"{'ID': 'GCA_002600005.1_00004', 'file_id': 'up..."
4,AY264775.1,2327,4978,+,ID=GCA_002600005.1_00005;Name=1;file_id=upstre...,"{'ID': 'GCA_002600005.1_00005', 'Name': '1', '..."


Create separate columns from attribute dict:

In [51]:
# Expand the per-row attribute dicts into one column per attribute key.
norm_attribute = pd.json_normalize(gff_data['attribute_dict'])
# Attach the new columns side by side, then discard the two helper columns
# that were only needed for parsing.
gff_data = (
    pd.concat([gff_data, norm_attribute], axis=1)
    .drop(columns=['attribute', 'attribute_dict'])
)
gff_data.head()

Unnamed: 0,seq_id,start,end,strand,ID,file_id,inference,locus_tag,product,cluster_num,...,pfam_annotation,pfam,e_c_number,Name,gene,cdd,cdd_name,tigr,tigrfam_name,conj_pl_domains
0,AY264775.1,901,1254,+,GCA_002600005.1_00001,upstreams_with_clusters,"ab initio prediction:Prodigal:2.6,similar to A...",GCA_002600005.1_00001,Overcome classical restriction gp0.3,2,...,ocr,PF08684.13,,,,,,,,
1,AY264775.1,1254,1409,+,GCA_002600005.1_00002,upstreams_with_clusters,"ab initio prediction:Prodigal:2.6,similar to A...",GCA_002600005.1_00002,Gene 0.4 protein,11,...,,,,,,,,,,
2,AY264775.1,1478,1615,+,GCA_002600005.1_00003,upstreams_with_clusters,ab initio prediction:Prodigal:2.6,GCA_002600005.1_00003,hypothetical protein,19,...,,,,,,,,,,
3,AY264775.1,1612,2256,+,GCA_002600005.1_00004,upstreams_with_clusters,"ab initio prediction:Prodigal:2.6,similar to A...",GCA_002600005.1_00004,Protein kinase 0.7,124,...,,,2.7.11.1,,,,,,,
4,AY264775.1,2327,4978,+,GCA_002600005.1_00005,upstreams_with_clusters,"ab initio prediction:Prodigal:2.6,similar to A...",GCA_002600005.1_00005,T7 RNA polymerase,0,...,"RNA_pol,RPOL_N","PF00940.22,PF14700.9",2.7.7.6,1.0,1.0,,,,,


In [52]:
# Everything except seq_id, ID and cluster_num is irrelevant for matching the
# targets, so those columns are removed here.
columns_to_discard = [
    'file_id', 'inference', 'locus_tag', 'product', 'cluster_main_prod',
    'start', 'end', 'strand',
    'phrOG', 'phrOG_annotation', 'phrOG_category',
    'pfam_annotation', 'pfam',
    'e_c_number', 'Name', 'gene',
    'cdd', 'cdd_name',
    'tigr', 'tigrfam_name',
    'conj_pl_domains',
]
gff_data = gff_data.drop(columns=columns_to_discard)
gff_data.head()

Unnamed: 0,seq_id,ID,cluster_num
0,AY264775.1,GCA_002600005.1_00001,2
1,AY264775.1,GCA_002600005.1_00002,11
2,AY264775.1,GCA_002600005.1_00003,19
3,AY264775.1,GCA_002600005.1_00004,124
4,AY264775.1,GCA_002600005.1_00005,0


Change cluster number to 1-based.

In [53]:
# Convert the parsed cluster numbers to int and shift them by one (0-based -> 1-based).
# NOTE: not idempotent — re-running this cell shifts the numbers again.
gff_data['cluster_num'] = gff_data['cluster_num'].map(int) + 1

In [54]:
gff_data.head()

Unnamed: 0,seq_id,ID,cluster_num
0,AY264775.1,GCA_002600005.1_00001,3
1,AY264775.1,GCA_002600005.1_00002,12
2,AY264775.1,GCA_002600005.1_00003,20
3,AY264775.1,GCA_002600005.1_00004,125
4,AY264775.1,GCA_002600005.1_00005,1


In [55]:
gff_data[gff_data["ID"] == "GCF_002611705.1_00005"]

Unnamed: 0,seq_id,ID,cluster_num
2585,NC_047769.1,GCF_002611705.1_00005,17


In [56]:
# Inner join on the two columns the tables have in common, so each target
# (cluster_num, seq_id) pair picks up the matching feature ID.
df = targets.merge(gff_data, how='inner', on=['cluster_num', 'seq_id'])
df

Unnamed: 0,cluster_num,seq_id,ID
0,6,MW074885.1,GCA_015296585.1_00042
1,7,NC_048064.1,GCF_003575725.1_00008
2,8,NC_048071.1,GCF_003601515.1_00005
3,9,NC_047777.1,GCF_002613645.1_00005
4,10,NC_047895.1,GCF_002957255.1_00004
5,11,NC_047808.1,GCF_002618545.1_00011
6,17,NC_047769.1,GCF_002611705.1_00005
7,19,NC_047766.1,GCF_002610185.1_00017
8,22,MW876471.1,GCA_018612625.1_00036
9,25,MZ234024.1,GCA_020869015.1_00009


Function to extract the nucleotide or protein sequence of a feature from the Prokka annotation:

In [57]:
def extract_seq(prot_id: str, type: str, ann_path = annotations_dir) -> str:
    """Return the sequence of one annotated feature from a per-genome FASTA file.

    The genome identifier is the first two '_'-separated fields of ``prot_id``
    (``GCA_002600005.1`` for ``GCA_002600005.1_00001``). The record is searched
    in ``<ann_path>/<genome_id>/<genome_id>.faa`` (``type='prot'``) or
    ``<ann_path>/<genome_id>/<genome_id>.ffn`` (``type='nucl'``); a record
    matches when the first space-separated token of its header equals ``prot_id``.

    Parameters
    ----------
    prot_id : str
        Feature identifier. Missing values (non-strings, empty strings, or any
        capitalisation of 'nan' such as the 'NaN' placeholder) yield ``None``.
    type : str
        ``'prot'`` or ``'nucl'``.
    ann_path : str
        Root directory holding one sub-directory per genome.

    Returns
    -------
    str or None
        The sequence (with a trailing ``'*'`` appended for ``type='prot'``),
        or ``None`` when the identifier is missing or no record matches.

    Raises
    ------
    KeyError
        If ``type`` is neither ``'prot'`` nor ``'nucl'``.
    FileNotFoundError
        If the genome's FASTA file does not exist.
    """
    extensions = {'prot': 'faa', 'nucl': 'ffn'}
    ext = extensions[type]

    # The original guard compared against 'Nan' only, which never matched the
    # 'NaN' placeholder; compare case-insensitively and also tolerate
    # non-string missing values (None, float nan).
    if not isinstance(prot_id, str) or prot_id.strip().lower() in ('', 'nan'):
        return None

    genome_id = '_'.join(prot_id.split('_')[:2])
    fasta_path = os.path.join(ann_path, genome_id, f'{genome_id}.{ext}')
    with open(fasta_path, 'rt') as fasta_file:
        for title, sequence in SimpleFastaParser(fasta_file):
            if title.split(' ')[0] == prot_id:
                # Protein sequences get an explicit stop symbol appended.
                return (sequence + '*') if type == 'prot' else sequence
    # No record with this identifier in the file.
    return None

Apply function to dataframe to find sequences of targets.

In [58]:
# Fetch the protein (.faa) and nucleotide (.ffn) sequence for every target ID;
# the `type` keyword is forwarded to extract_seq by Series.apply.
df['protein_sequence'] = df['ID'].apply(extract_seq, type='prot')
df['nucleotide_sequence'] = df['ID'].apply(extract_seq, type='nucl')

Manually add the 0.6B and 0.65 entries (their sequences are hard-coded in the cell below):

In [59]:
# Two hand-curated entries with hard-coded sequences, one record per entry.
# NOTE: re-running this cell appends the two rows to `df` again.
df2 = pd.DataFrame([
    {
        'cluster_num': 32,
        'seq_id': 'NC_003298.1',
        'protein_sequence': 'MLRTNSKHVKTALYAMAYGASKRKVKRILTRHRKMTARQAASAVKWAEFTLYSYR*',
        'nucleotide_sequence': 'TTGCTACGCACTAACTCAAAGCACGTAAAGACCGCACTGTATGCGATGGCGTATGGTGCATCGAAACGCAAGGTCAAACGCATCTTAACGAGACACCGCAAGATGACCGCACGACAGGCTGCAAGCGCTGTCAAATGGGCTGAATTTACTCTTTACTCTTACAGATAA',
        'ID': 'GCF_000841665.1_00003.5',
    },
    {
        'cluster_num': 38,
        'seq_id': 'NC_001604.1',
        'protein_sequence': 'MTERTDGLKKGYMPNGTLYAANRRIVRTWRENNLERRKDKRGRRGIDERKRLKPRNSPHLNRH*',
        # Stored in lower case in the notebook; upper-cased to match the other rows.
        'nucleotide_sequence': 'atgacagaacgcactgatggcttaaagaaaggttatatgcccaatggcacactatacgctgcaaatcggcgaatagtgagaacttggcgagagaacaacctcgaacgccgcaaggacaagagagggcggcgtggcatagacgaaaggaaaaggttaaagccaagaaactcgccgcacttgaacaggcactag'.upper(),
        'ID': 'GCF_000844825.1_00003.5',
    },
])
df = pd.concat([df, df2], ignore_index = True)
df.tail()

Unnamed: 0,cluster_num,seq_id,ID,protein_sequence,nucleotide_sequence
32,20,NC_001604.1,GCF_000844825.1_00003,MYMLTIGLLTALGLAVGASFGKALGVAVGSYFTACIIIGIIKGALRK*,ATGTATATGCTTACTATCGGTCTACTCACCGCTCTAGGTCTAGCTG...
33,5,NC_047942.1,GCF_002997865.1_00001,MIFTKEPANVFYVLVSAFRSNLDDAENMSRHRHMVSTLRAAEGLYG...,ATGATTTTCACTAAAGAACCTGCTAACGTCTTCTATGTGCTGGTCT...
34,3,NC_047942.1,GCF_002997865.1_00003,MKRNANAYYELLAAAVEAFNVRIQEDQLTEHHDYHAALHEVVDKMV...,ATGAAACGCAATGCTAACGCTTATTATGAACTGCTGGCCGCTGCCG...
35,32,NC_003298.1,GCF_000841665.1_00003.5,MLRTNSKHVKTALYAMAYGASKRKVKRILTRHRKMTARQAASAVKW...,TTGCTACGCACTAACTCAAAGCACGTAAAGACCGCACTGTATGCGA...
36,38,NC_001604.1,GCF_000844825.1_00003.5,MTERTDGLKKGYMPNGTLYAANRRIVRTWRENNLERRKDKRGRRGI...,ATGACAGAACGCACTGATGGCTTAAAGAAAGGTTATATGCCCAATG...


Add length calculation:

In [60]:
# String lengths of the stored sequences. The protein strings carry a trailing
# '*', which is included in len_protein.
df['len_protein'] = df['protein_sequence'].map(len)
df['len_nucl'] = df['nucleotide_sequence'].map(len)

Add pI calculation:

In [61]:
# Isoelectric point, rounded to 2 decimals.
# The stored protein strings end with a '*' stop marker, which is not a residue.
# Biopython's IsoelectricPoint chooses terminal pK values from the first and last
# character of the string it receives, so the marker is stripped first to make
# the real C-terminal residue visible to the calculation.
df['pI'] = df['protein_sequence'].apply(lambda x: round(IP(x.rstrip('*')).pi(), 2))

In [62]:
df.tail()

Unnamed: 0,cluster_num,seq_id,ID,protein_sequence,nucleotide_sequence,len_protein,len_nucl,pI
32,20,NC_001604.1,GCF_000844825.1_00003,MYMLTIGLLTALGLAVGASFGKALGVAVGSYFTACIIIGIIKGALRK*,ATGTATATGCTTACTATCGGTCTACTCACCGCTCTAGGTCTAGCTG...,48,144,9.86
33,5,NC_047942.1,GCF_002997865.1_00001,MIFTKEPANVFYVLVSAFRSNLDDAENMSRHRHMVSTLRAAEGLYG...,ATGATTTTCACTAAAGAACCTGCTAACGTCTTCTATGTGCTGGTCT...,152,456,5.93
34,3,NC_047942.1,GCF_002997865.1_00003,MKRNANAYYELLAAAVEAFNVRIQEDQLTEHHDYHAALHEVVDKMV...,ATGAAACGCAATGCTAACGCTTATTATGAACTGCTGGCCGCTGCCG...,175,525,4.15
35,32,NC_003298.1,GCF_000841665.1_00003.5,MLRTNSKHVKTALYAMAYGASKRKVKRILTRHRKMTARQAASAVKW...,TTGCTACGCACTAACTCAAAGCACGTAAAGACCGCACTGTATGCGA...,56,168,11.4
36,38,NC_001604.1,GCF_000844825.1_00003.5,MTERTDGLKKGYMPNGTLYAANRRIVRTWRENNLERRKDKRGRRGI...,ATGACAGAACGCACTGATGGCTTAAAGAAAGGTTATATGCCCAATG...,64,192,11.63


Add phage name:

In [63]:
def extract_phage_ncbi_name(prot_id: str, assembly_info_path=assembly_info) -> str:
    """Look up the organism name of the genome a feature belongs to.

    The genome accession is the first two '_'-separated fields of ``prot_id``.
    The JSON-lines file at ``assembly_info_path`` is scanned line by line and
    the ``organism.organismName`` of the first record whose ``accession``
    equals that genome accession is returned.

    Parameters
    ----------
    prot_id : str
        Feature identifier such as ``GCF_002611705.1_00005``.
    assembly_info_path : str
        Path to the JSON-lines assembly report to search.

    Returns
    -------
    str or None
        The organism name, or ``None`` if no record matches.
    """
    genome_id = '_'.join(prot_id.split('_')[:2])
    # Open the path that was passed in; the original body ignored this
    # parameter and always read the module-level `assembly_info` file.
    with open(assembly_info_path, 'rt') as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                # Blank lines are not valid JSON documents; skip them.
                continue
            info = json.loads(line)
            if info['accession'] == genome_id:
                return info['organism']['organismName']
    # No assembly record for this genome.
    return None

In [64]:
# Organism name for every row, resolved from the feature ID's genome accession.
df['OrganismName'] = df['ID'].map(extract_phage_ncbi_name)

In [65]:
# Create the output directory if needed so the write does not fail on a fresh
# checkout; a bare file name (empty dirname) needs no directory.
out_dir = os.path.dirname(out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(out_file, sep='\t', index=False)

### RothH1 (external)

In [29]:
# Print the RothH1_CDS_0054 protein as one line: grep keeps the header plus the
# 3 lines after it, header lines are removed, and newlines are stripped.
# NOTE: the -A window is fixed; it has to cover exactly this record's sequence lines.
! cat /home/oxalotl/work_dir/klebsiella_phages/pharokka/RothH1/phanotate.faa | grep -A 3 RothH1_CDS_0054 | \
grep -v ">" | tr -d "\n"

MSTELNQPAITARIIPNNRRMAFLPRLFGAWYLTGEAGVYNLARSLCVDYQGGSWEFVELSCGSGFMYPLSAERFTVSVSGNWFEGELSAEATGIVLTLFTLNHMIWHAHDEGYQHICDMLITQQEKLKLYADQHAEAGLIYRAID

In [32]:
# Join the sequence lines of the RothH1_CDS_0054 protein record (header plus 3
# following lines, headers removed) and count the resulting characters with wc -c.
! cat /home/oxalotl/work_dir/klebsiella_phages/pharokka/RothH1/phanotate.faa | grep -A 3 RothH1_CDS_0054 | \
grep -v ">" | tr -d "\n" | wc -c

146


In [31]:
# Print the RothH1_CDS_0054 nucleotide sequence as one line: grep keeps the header
# plus the 9 lines after it, header lines are removed, and newlines are stripped.
# NOTE: the -A window is fixed; it has to cover exactly this record's sequence lines.
! cat /home/oxalotl/work_dir/klebsiella_phages/pharokka/RothH1/phanotate.ffn | grep -A 9 RothH1_CDS_0054 | \
grep -v ">" | tr -d "\n"

ATGTCTACTGAACTGAATCAACCCGCCATCACTGCACGCATTATCCCCAATAACCGCCGCATGGCCTTTCTGCCACGTCTGTTCGGCGCCTGGTATCTCACAGGTGAGGCCGGTGTCTACAACCTCGCACGCTCTCTCTGCGTCGATTATCAAGGCGGTTCCTGGGAGTTCGTCGAACTGTCTTGCGGTAGCGGCTTCATGTACCCACTGAGCGCAGAGCGCTTCACGGTATCGGTTAGCGGTAACTGGTTCGAAGGGGAACTGTCCGCAGAAGCTACCGGTATCGTGCTGACCCTGTTCACCCTGAACCACATGATCTGGCACGCACATGATGAAGGCTATCAGCACATCTGCGACATGCTGATCACCCAGCAGGAAAAGCTCAAATTATATGCCGACCAGCACGCCGAAGCCGGTTTGATTTACCGCGCGATTGACTGA

In [33]:
# Join the sequence lines of the RothH1_CDS_0054 nucleotide record (header plus 9
# following lines, headers removed) and count the resulting characters with wc -c.
! cat /home/oxalotl/work_dir/klebsiella_phages/pharokka/RothH1/phanotate.ffn | grep -A 9 RothH1_CDS_0054 | \
grep -v ">" | tr -d "\n" | wc -c

441


In [35]:
# Protein sequence entered by hand (no trailing stop symbol).
seq = (
    'MSTELNQPAITARIIPNNRRMAFLPRLFGAWYLTGEAGVYNLARSLCVDYQGGSWEFVELSCGSGFMYPLSAERFTVSVSGNWFEGELSAEATGIVLTLFTLNHMIWHAHDEGYQHICDMLITQQEKLKLYADQHAEAGLIYRAID'
)
# Isoelectric point, shown rounded to two decimals.
isoelectric_point = IP(seq).pi()
round(isoelectric_point, 2)

5.11

# Extract GNATs

In [19]:
# Re-load the GFF, this time keeping coordinates, strand and the cluster's main product.
colnames = ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attribute']
gff_data = pd.read_csv(gff_file, sep='\t', skiprows=1, names=colnames)
gff_data = gff_data.drop(columns=['source', 'type', 'score', 'phase'])

gff_data['attribute'] = gff_data['attribute'].apply(lambda x: x.replace('=;', '=NaN;'))  # fix a data artefact
# Parse 'key=value;key=value' into a dict. Split on the FIRST '=' only so values
# containing '=' stay intact, and skip fragments without '=' (e.g. the empty
# string left by a trailing ';'), which would otherwise raise.
gff_data['attribute_dict'] = gff_data['attribute'].apply(
    lambda x: dict(pair.split('=', 1) for pair in x.split(';') if '=' in pair)
)

# One column per attribute key, attached next to the coordinate columns.
norm_attribute = pd.json_normalize(gff_data.attribute_dict)
gff_data = pd.concat([gff_data, norm_attribute], axis=1)
# remove temp columns:
gff_data = gff_data.drop(columns=['attribute', 'attribute_dict'])
# Shift cluster numbers by one (0-based -> 1-based).
gff_data['cluster_num'] = gff_data['cluster_num'].apply(lambda row: int(row) + 1)

gff_data = gff_data.drop(columns = ['inference', 'phrOG',
       'phrOG_annotation', 'phrOG_category', 'pfam_annotation', 'pfam',
       'e_c_number', 'Name', 'gene', 'cdd', 'cdd_name', 'tigr', 'tigrfam_name',
       'conj_pl_domains', 'locus_tag', 'product', 'file_id'])
gff_data.head()

Unnamed: 0,seq_id,start,end,strand,ID,cluster_num,cluster_main_prod
0,AY264775.1,901,1254,+,GCA_002600005.1_00001,3,Overcome classical restriction gp0.3
1,AY264775.1,1254,1409,+,GCA_002600005.1_00002,12,Gene 0.4 protein
2,AY264775.1,1478,1615,+,GCA_002600005.1_00003,20,hypothetical protein
3,AY264775.1,1612,2256,+,GCA_002600005.1_00004,125,protein kinase
4,AY264775.1,2327,4978,+,GCA_002600005.1_00005,1,RNA polymerase


In [57]:
# For every cluster-55 row, collect the row label itself together with the
# labels directly before and after it (order: previous, own, next).
cluster_55_labels = gff_data.index[gff_data.cluster_num == 55]
indicies = [label + offset for label in cluster_55_labels for offset in (-1, 0, 1)]

# Three hand-picked rows are excluded from the selection.
# NOTE(review): presumably neighbours lying on a different sequence than the
# cluster-55 gene — confirm.
gnat_loci = gff_data.loc[indicies].drop(index=[961, 1822, 2734])

In [58]:
# Protein and nucleotide sequence for every selected feature; the `type`
# keyword is forwarded to extract_seq by Series.apply.
gnat_loci['protein_sequence'] = gnat_loci['ID'].apply(extract_seq, type='prot')
gnat_loci['nucleotide_sequence'] = gnat_loci['ID'].apply(extract_seq, type='nucl')

In [62]:
# Overwrite the product label of all cluster-7 rows with 'hypothetical protein'.
gnat_loci.loc[gnat_loci.cluster_num == 7, 'cluster_main_prod'] = 'hypothetical protein'
# Build one FASTA header per row: '>seq_id:start-end(strand), seq.cluster_num=<n>'.
# The '=' f-string specifier emits the expression text as well as its value, so
# the literal 'seq.cluster_num=' in the header depends on the lambda argument name.
gnat_loci['seq_name'] =  gnat_loci.apply(lambda seq: f'>{seq.seq_id}:{seq.start}-{seq.end}({seq.strand}), {seq.cluster_num=}', axis=1)
# Keep only what is needed for writing FASTA: the header and both sequences.
gnat_loci_seq = gnat_loci.drop(columns=['cluster_main_prod', 'ID', 'seq_id', 'start', 'end', 'strand', 'cluster_num'])

In [71]:
gnat_loci_seq.head()

Unnamed: 0,protein_sequence,nucleotide_sequence,seq_name
736,MTIENTNAAELSAITRMDEIVLEIQSSLDSIHTESVKVGLLLQEAN...,ATGACTATCGAAAACACCAACGCCGCCGAACTGTCTGCTATCACTC...,">MT941682.1:7741-8601(-), seq.cluster_num=7"
737,MPSIIMQCADQTAFYFALNHARRTNKNGAALTSAEAVTIEIAEAFR...,ATGCCTTCAATCATCATGCAATGCGCAGACCAAACAGCCTTCTACT...,">MT941682.1:8798-9268(-), seq.cluster_num=55"
738,MSAANIAGLGGDKATVKYNKEYEEWVVKFWRAGVYQKAADYHTDDR...,ATGAGTGCCGCTAACATAGCGGGCTTAGGTGGTGACAAAGCAACCG...,">MT941682.1:9445-9654(-), seq.cluster_num=33"
962,MASVIVAASLKNNGMDYLSNLGLAGRTLKHGHSLSTPTEIAGSTAE...,ATGGCATCTGTTATCGTCGCTGCATCGCTGAAGAACAATGGCATGG...,">OK094519.1:774-1253(+), seq.cluster_num=55"
963,MSTVVHVDAPDGYGARVYHNGYCLTLHGRALSQADVMSLRGLAFTE...,ATGAGTACCGTGGTTCATGTGGATGCACCTGATGGCTACGGCGCTC...,">OK094519.1:1250-1474(+), seq.cluster_num=56"


In [73]:
gnat_loci.head()

Unnamed: 0,seq_id,start,end,strand,ID,cluster_num,cluster_main_prod,protein_sequence,nucleotide_sequence,seq_name
736,MT941682.1,7741,8601,-,GCA_015502145.1_00010,7,hypothetical protein,MTIENTNAAELSAITRMDEIVLEIQSSLDSIHTESVKVGLLLQEAN...,ATGACTATCGAAAACACCAACGCCGCCGAACTGTCTGCTATCACTC...,">MT941682.1:7741-8601(-), seq.cluster_num=7"
737,MT941682.1,8798,9268,-,GCA_015502145.1_00011,55,GNAT family N-acetyltransferase,MPSIIMQCADQTAFYFALNHARRTNKNGAALTSAEAVTIEIAEAFR...,ATGCCTTCAATCATCATGCAATGCGCAGACCAAACAGCCTTCTACT...,">MT941682.1:8798-9268(-), seq.cluster_num=55"
738,MT941682.1,9445,9654,-,GCA_015502145.1_00012,33,hypothetical protein,MSAANIAGLGGDKATVKYNKEYEEWVVKFWRAGVYQKAADYHTDDR...,ATGAGTGCCGCTAACATAGCGGGCTTAGGTGGTGACAAAGCAACCG...,">MT941682.1:9445-9654(-), seq.cluster_num=33"
962,OK094519.1,774,1253,+,GCA_020489725.1_00001,55,GNAT family N-acetyltransferase,MASVIVAASLKNNGMDYLSNLGLAGRTLKHGHSLSTPTEIAGSTAE...,ATGGCATCTGTTATCGTCGCTGCATCGCTGAAGAACAATGGCATGG...,">OK094519.1:774-1253(+), seq.cluster_num=55"
963,OK094519.1,1250,1474,+,GCA_020489725.1_00002,56,hypothetical protein,MSTVVHVDAPDGYGARVYHNGYCLTLHGRALSQADVMSLRGLAFTE...,ATGAGTACCGTGGTTCATGTGGATGCACCTGATGGCTACGGCGCTC...,">OK094519.1:1250-1474(+), seq.cluster_num=56"


In [72]:
# Write the selected loci as two FASTA files: nucleotide (.fna) and protein (.faa).
# Rows are paired positionally (header, sequence) instead of being looked up by
# index label: a label-based lookup returns several rows when a label occurs
# more than once in the index, and the file would then contain a Series repr
# rather than a sequence.
for seq_column, fasta_path in (
    ('nucleotide_sequence', 'selected_variants_cloning/gnat_surrounging.fna'),
    ('protein_sequence', 'selected_variants_cloning/gnat_surrounging.faa'),
):
    with open(fasta_path, 'wt') as out_f:
        for seq_name, sequence in zip(gnat_loci_seq['seq_name'], gnat_loci_seq[seq_column]):
            # seq_name already starts with '>'.
            out_f.write(f'{seq_name}\n')
            out_f.write(f'{sequence}\n')


In [13]:
from Bio.Seq import Seq

# Print each record's name followed by the translation of its nucleotide sequence.
with open('selected_variants_cloning/optimized.fna', 'rt') as in_fasta:
    for record_name, nucleotides in SimpleFastaParser(in_fasta):
        translated = Seq(nucleotides).translate()
        print(record_name, translated)

26 MFISTADNKFTVIASPFRANMSMQTNMMRLEAAMNRLTMCFNITNMKTVLGVYQEEGQEAATREVSIKMDGLCWPEVKNLCTLFCEDFEQDCILVINNENGRCALWSNSWSEELGFWTQVTAEEAHEAGIYTLDTNFTYWLAK*
29 MLQRIDNRVYRDKHGFVIVHIVKDDCYVYADTCESLPYYPSANTRKTWERIGTNLKFTD*
30 MEIVMQALNHGVIMTTARDYTGATKYMVQYGLQFTVFDSFREALQDYTDLRHPFRKSVGTSG*
43 MIQNQGNDHKVKEGDVVNWRDEYGDIHCDKVDHIYVSAMGYECVSLYTGKRLKMNELL*
100 MERNANAYYELLAAAVEAMNERIQDDRIESEDDCGFSDALHEVVDGQVPHYYHEIFTVMAADGIDHEFEDSGLIPETKDVTRILQARIYEALYNDVLNHSDVVWFEAEESDDEEADEYWVVDAKTGVFIEQAVSLDVATACAKDHYAIGRHLKVEDINDNVVFDPAAAEEDCE*
77 MFHLKDTEVAPPQTTNRKGLTMMNKRYNLNARVLGNTAQAYADRLKTDKKYTIAQTLNTWNMIVEREGKSGTYSFPTTGYTRKAQLIADLESCAAQLRELAECNDTVYDVEQAHKEALMANMPIDEQTRSRLTREEAPVVATAPVHNKTHFVDITVGSSRVKVKADFQMKLYPRDMSKSRLEESVSCLNEFIKNEFKFTVVAVVTTMGDTHGDLKAIVTVK*
513 MASIINMVSKAAADMSVTENAVCVRFYGINEGCDFWIDADEIGDAGGLRVWLETYTELSISEVDEILERDYLVIGHEGELVGQCVEGEVFDWELYNEARELVDNYYSEEVVIAAMKLGITLERLEDSYSGSFDSDEEFAQEMWENYGYLDNLPDHMQSYIDWEKVARDLMFEYVEQDGHYFNNNW*
95 MKTQYLVNKEAARKLFEMSTASKFEGEVSILARRWVDSYGNTYHHVYIDVLFPGESTYT