In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import abc
import pickle
import math

import wandb

import torch

from tqdm import tqdm
from functools import partial
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW, SGD
from collections import namedtuple

import proteinbert_gen.constants as consts
import proteinbert_gen.mask_diffusion as mask_diffusion

from proteinbert_gen.debugging import print2
from proteinbert_gen.proteinbert import ProteinBERT, load_pretrained_weights
from proteinbert_gen.word_freq import create_word_freq_tensor
from proteinbert_gen.tokenizer import ProteinTokenizer
from proteinbert_gen.dataset import SwissProtDataset, calculate_splits

In [3]:
Hyperparameters = namedtuple(
    "Hyperparameters",
    [
        "batch_size",
        "epochs",
        "num_steps",
        "word_freq_lambda",
        "device",
        "hybrid_lambda",
        "lr",
        "logging_steps",
        "eval_step_size",
        "clip_grad",
        "clip_grad_val",
        "warmup_scheduler",
        "optimizer_cls",
        "warmup_steps",
        "data_csvfile",
        "word_freq_dict_pkl",
        "num_blocks",
        "d_local",
        "d_global"
    ]
)

args = Hyperparameters(
    batch_size=64,
    epochs=100,
    num_steps=4096,
    word_freq_lambda=0.3,
    device="cuda",
    hybrid_lambda=1e-3,
    lr=5e-4,
    logging_steps=25,
    eval_step_size=4,
    clip_grad_val=10,
    clip_grad=False,
    warmup_scheduler=True,
    optimizer_cls=AdamW,
    warmup_steps=50000,
    num_blocks=12,
    d_local=256,
    d_global=1024,
    data_csvfile="../data/uniprot_sprot_1m_1024.csv",
    word_freq_dict_pkl="../data/uniprot_sprot_1m_1024_word_freq_dict.pkl"
)

run = wandb.init(
    project="proteinbert_gen",
    config={k:str(v) for k, v in args._asdict().items()},
    # mode="disabled"
)

[34m[1mwandb[0m: Currently logged in as: [33mmattfeng[0m ([33mkaiogenbio[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
sprot_all = SwissProtDataset(args.data_csvfile)
sprot_train, sprot_val, sprot_test = torch.utils.data.random_split(
        sprot_all,
        calculate_splits(0.7, 0.2, 0.1, len(sprot_all)),
        generator=torch.Generator().manual_seed(42)
        )

In [5]:
class SampleClassBase(abc.ABC):
    def sample(self, logits, x_0):
        raise NotImplementedError

    def post_process_sample_in_prediction(self, sample, x_0):
        return sample


class Categorical(SampleClassBase):
    def sample(self, logits, x_0):
        return torch.distributions.categorical.Categorical(logits=logits).sample()

In [6]:
def word_freq_preprocess_fn(wf):
    wf = wf + 1
    wf = wf.log()
    wf = wf / wf.max()

    # range: 0 - 1
    return wf

def process_fn_in_collate(wf):
    return wf - wf.mean()


tokenizer = ProteinTokenizer()
wf_tensor = create_word_freq_tensor(args.word_freq_dict_pkl, tokenizer.ALL_TOKENS)
# wf_tensor[tokenizer.mask_token_id] = 0
wf_tensor[tokenizer.pad_token_id] = 0
wf_tensor = word_freq_preprocess_fn(wf_tensor)
wf_tensor

tensor([0.9930, 0.8760, 0.9626, 0.9751, 0.9424, 0.9824, 0.9082, 0.9710, 0.9673,
        1.0000, 0.9151, 0.9418, 0.9518, 0.9393, 0.9652, 0.9708, 0.9611, 0.3411,
        0.9802, 0.8615, 0.5221, 0.9240, 0.0000, 0.0000, 0.0000, 0.0000])

In [7]:
def collate(batch_input, *, tokenizer, word_freq: torch.Tensor):
    input_ids = []
    attention_mask = []
    word_freq_logits = []
    
    for item in batch_input:
        seq = item["seq"]
        ids = torch.tensor(tokenizer.tokenize(seq))
        mask = torch.ones_like(ids)
        logits = process_fn_in_collate(
            word_freq.gather(0, ids)
        )
        
        input_ids.append(ids)
        attention_mask.append(mask)
        word_freq_logits.append(logits)

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True)
    word_freq_logits = pad_sequence(word_freq_logits, batch_first=True)
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "word_freq_logits": word_freq_logits
    }

collate_fn = partial(collate, tokenizer=tokenizer, word_freq=wf_tensor)

In [8]:
train_loader = torch.utils.data.DataLoader(
    sprot_train,
    batch_size=args.batch_size,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True
)

In [9]:
sample_batch = next(iter(train_loader))
print(sample_batch)
print(sample_batch["input_ids"].size())

{'input_ids': tensor([[23, 10, 19,  ..., 25, 25, 25],
        [23, 10,  8,  ..., 25, 25, 25],
        [23, 10,  8,  ..., 25, 25, 25],
        ...,
        [23, 10, 15,  ..., 25, 25, 25],
        [23, 10,  2,  ..., 25, 25, 25],
        [23, 10, 12,  ..., 25, 25, 25]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'word_freq_logits': tensor([[-9.6086e-01, -4.5732e-02, -9.9333e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-9.6000e-01, -4.4873e-02,  7.3356e-03,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-9.5968e-01, -4.4548e-02,  7.6612e-03,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [-9.5636e-01, -4.1228e-02,  1.4481e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-9.6258e-01, -4.7453e-02,  5.8234e-05,  

In [10]:
def denoise(targets, timestep, attention_mask, *, model):
    ret = model(targets)
    # ret = model(targets, attention_mask=attention_mask)
    # print("denoise output:", ret.shape)
    return ret

with open("../weights/epoch_92400_sample_23500000.pkl", "rb") as f:
    _, pretrained_model_weights, _ = pickle.load(f)

model = ProteinBERT(
    tokenizer.vocab_size,
    consts.GO_ANN_SIZE,
    d_local=256,
    d_global=1024,
    num_blocks=12
)
print(model)

# trainable_params = load_pretrained_weights(model, pretrained_model_weights)
trainable_params = list(model.parameters())
model = model.to(args.device)
denoise_fn = partial(denoise, model=model)

ProteinBERT(
  (embed_local): Embedding(26, 256)
  (embed_global): Sequential(
    (0): Linear(in_features=8943, out_features=1024, bias=True)
    (1): GELU(approximate='none')
  )
  (blocks): ModuleList(
    (0-11): 12 x TransformerLikeBlock(
      (wide_and_narrow_conv1d): ConvBlock(
        (conv_narrow): Sequential(
          (0): Rearrange('b l d -> b d l')
          (1): Conv1d(256, 256, kernel_size=(9,), stride=(1,), padding=same)
          (2): GELU(approximate='none')
          (3): Rearrange('b d l -> b l d')
        )
        (conv_wide): Sequential(
          (0): Rearrange('b l d -> b d l')
          (1): Conv1d(256, 256, kernel_size=(9,), stride=(1,), padding=same, dilation=(5,))
          (2): GELU(approximate='none')
          (3): Rearrange('b d l -> b l d')
        )
      )
      (dense_and_broadcast): Sequential(
        (0): Linear(in_features=1024, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Rearrange('b d -> b () d')
      )
  

In [11]:
optimizer = args.optimizer_cls(trainable_params, lr=args.lr)
if args.warmup_scheduler:
    warmup_scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda n: n / args.warmup_steps + 1e-3 if n < args.warmup_steps else math.sqrt(args.warmup_steps) / math.sqrt(n)
    )

In [12]:
sample_cls = Categorical()

diffusion_schedule = mask_diffusion.create_discrete_diffusion_schedule(num_steps=args.num_steps)
diffusion_instance = mask_diffusion.MaskDiffusion(
    dim=tokenizer.vocab_size,
    schedule=diffusion_schedule,
    tokenizer=tokenizer,
    sample_cls=sample_cls,
    word_freq_lambda=args.word_freq_lambda,
    device=args.device
)

using standard schedule with num_steps: 4096.


In [None]:
train_loss = 0.
has_nan_log = 0
nan_count = 0

# torch.autograd.set_detect_anomaly(True)

# def _save_output(module, grad_input, grad_output):
#     print(module, grad_output)
#     print(torch.isnan(grad_output[0]).any())
#     print()

# for name, module in model.named_modules():
#     if str(type(module)).find("LayerNorm") != -1:
#         print(name)
#         module.register_full_backward_hook(_save_output)

for epoch in range(args.epochs):
    for i, batch in enumerate(tqdm(train_loader)):
        run.log({"epoch": epoch, "minibatch": i}, commit=False)
        
        optimizer.zero_grad()
        diffusion_t = diffusion_instance.sample_t()
        # print(diffusion_t)

        metrics = mask_diffusion.compute_kl_reverse_process(
            batch["input_ids"].to(args.device),
            diffusion_t,
            denoise_fn=denoise_fn,
            diffusion=diffusion_instance,
            target_mask=batch["attention_mask"].to(args.device),
            hybrid_lambda=args.hybrid_lambda,
            predict_x0=True, # False,
            word_freq_logits=batch["word_freq_logits"].to(args.device),
            device=args.device
        )

        # print(metrics)

        loss = metrics["loss"] / args.batch_size / batch["input_ids"].size(1)

        if loss.isnan():
            nan_count += 1
            if i % args.logging_steps == args.logging_steps - 1:
                run.log({"nan_count": nan_count})
            continue
            
        train_loss += loss.item()
        loss.backward()
        if args.clip_grad:
            torch.nn.utils.clip_grad_value_(trainable_params, args.clip_grad_val)
        
        has_nan = 0
        for param in trainable_params:
            if param.grad is not None:
                if torch.isnan(param.grad).any():
                    param.grad = torch.nan_to_num(param.grad, nan=0.0)
                    has_nan = 1

        has_nan_log += has_nan
        
        optimizer.step()
        if args.warmup_scheduler:
            warmup_scheduler.step()

        if i % args.logging_steps == args.logging_steps - 1:
            run.log(metrics, commit=False)
            if args.warmup_scheduler:
                run.log({"last_lr": warmup_scheduler.get_last_lr()[0]}, commit=False)
            run.log({"nan_count": nan_count, "nan -> zero": has_nan_log})
            has_nan_log = 0

    # generate some proteins
    print(f"Post epoch {epoch}")
    
    generated_final_states = []
    for length in (200, 500, 800):
        generated = mask_diffusion.discrete_diffusion_predict_fn((4, length), denoise_fn, diffusion_instance, topp=1.0)
        generated_final_states.extend(generated["final_state"].tolist())
    
    generated_table = wandb.Table(columns=["gen_id", "seq"])
    for j, genseq in enumerate(generated_final_states):
        genprot = tokenizer.untokenize(genseq)
        generated_table.add_data(j, genprot)
        print(genprot)
    run.log({"generated_proteins": generated_table})

    torch.save(model.state_dict(), f"../checkpoints/{run.name}-postepoch-{epoch}.pt")
    torch.save(optimizer.state_dict(), f"../checkpoints/{run.name}-postepoch-{epoch}-optimizer.pt")


100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [40:05<00:00,  2.51it/s]


Post epoch 0
^MILRGIVCCCCCCCCCCCCCCCCCCCGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCKCCCCCCCCCCCCCDCCCCCCCCCACCCCCCCCCCCCPCCCCCCCCCCCCCCCCCCGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCSCCCCCCCCCCGCCGCCGGCGRFGR
SPCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCPCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
LCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCCCCCCCCCLCCCCCCGCCCCACCCCKCCCCCCCCCCCCCCCCCCCCCCGCCCCCSCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
^MAKKKRPVDEKKNLDEEVKKLMKEIELQEPDSAAKLEIEEAMSKEEDELKIDDEAEEEEKKSLEEEEEKEEELKEKKEEKELNVFDKMNELDKEDESIAKAVKKIKDTVEKKLMEVDVNVLYQDETLERKMDELKEFDEMDINEILKQEETGKLMEYLVLKEIDNKNNELGANVEKGFLKEEKSYLLDLEEVNKIDEDK
PMSRCRCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCNGCCCCCCCCCCCCCCACCCCCCCCCCCGCCCCCCCCCCCCCCCCCCCCCCCCCCCC

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [40:00<00:00,  2.51it/s]


Post epoch 1
^MLEELDRGPVCWLGPDRFAWATHLGRGRVKFNAAHRCGAYRAVVLSKAHRNFVNITLSGMNGILRAKLDGYSVTALLAAIFKTLISHYPSSLCKQPCVRLKTSLKLGEPDELFPTYSLGALMHGEIRMAFVDLEEDKYKEAKIKDTQLNFRQAMTPDTSGGKTSHDLLERQGVVRADGNRQAAKDDVCNRPIINFQFWL
^MTEVGAAGYRETTERREAGARPLFHGQEVAFACSVLGSEAERRDDLQDYLETEQQNNGTRIIKDDMVSGAWAGLVCLRSIVTEYMINFNYQYPMYQVKGVYGRFAYVSYKNVFEGPRSAQGKGAGDRVVAQTTKKPLASRNFKNKKDLTNPACVSDDVWANTESPVLSLLGLKFKLGLGYIVQHASNAVAYAVDDADME
^MDKIQSITNLKPVQNKGALLINSCNLLENCESPHNLAGDSEFGLEDEINASVLSLAAIDKVGGDIGISLGSGTADSVAYIKEAYMSVSRGREVSSYYLITTKTMNEEKLNINFIDIEPATGVLLRELVPEKLAAHPVARAALHILVSIKNLHHFLDGGRLLMKNLGIVYIRGAKVPFFYELNDLQAIKETTVCGSEDKV
^MSKIGTVFARSFSLRLKHARVPIDLTQARSEQYMSDAGPQVIRGLIVIKSVLKRVKEARVVSYTAIHLPDVPRTKGHLLSPENSACESADPANDYDPYRLVKINFNFFGEIEVVGSSSSMAHGYGTIYSVLGNKAPTAANPALKGTDREMGDRVTEHRDHAFGLIGPALTDPAVREHAII$ERCSQIIRAKRSVYRSRV
^MSTYESTLAATQKNTAIIRSISLVNFKIAREEAVDQWVVMFGVTLDVGHEENASIDLLHQDINLQAVGPFRFILKVASGGAAHWYSARPPVTGMFRIDTQLQKGAFSRARAPSPDAEGGNALGDRDKLYQFQAVAALPNLFGWLLGPKKMGIKLLARMRGYLMGMGQSNVTLVNDNMERKIP

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:45<00:00,  2.53it/s]


Post epoch 2
^^P$CFN$$IHAA$$$$DERPRGT$ESSHH$$S$RGKSTE$L$$N$$N$HESELD$RGPN$$$AQVLAIHPALGL$QNFFDTS$$$IA$$$$$$TA$D$ELAPTV$$MTRRAIE$$KTNM$GAAACTKC$EASNNM$RKI$$I$$RRFLARQADKY$FFAHHKE$NS$ILVSPDILEKKLTRAFRGDNKLAKA$$D$RTA
^^LA$$$L$E$LVRLRSD$ANKWDAKDVEAT$HVIIKDPREAHDIMIATLKF$$$$RI$EYTLETRIR$$$$H$$$$S$VF$$VYE$AA$A$$VGAIQLQMGSTKQSK$EKVLLCSLE$$$$$$SV$Y$$VL$$DAVQ$$DVFQAADR$$A$PSMQRM$K$E$ALFGL$GY$$$$$IL$A$$P$GGCRDEEGGIAVFM$I
^TSKEKSRL$$$$$$NP$$A$GIM$FR$IAMDT$$IMS$Y$$K$NLM$SH$V$TASSKIPK$LHRMKGE$CVKTNYEKTS$$$$PL$$R$$ISVG$KKEKDVDHVSALSKTDLRAAACNV$QTVAG$$$$K$$$NW$P$YAS$GKWHNFG$$$$$LK$$Y$AV$P$LEIG$VTARP$IVETGIKKWIEEGRKYSKNRYET
^SEAKKGG$$NSGF$$$CSSLPQVRCKL$K$KPALEYVEA$E$$$Q$$$KRFAYFEP$GSSAQ$ILTEES$$$$$$FCK$KNK$KGPE$$RRLSEK$$QQ$$G$$IGTCDQ$PRSLRKT$$$F$GEVF$ER$$$SVGTFRWIIDETIFR$ALSRP$$$$$$N$$Y$$L$L$K$GNYGEDDAHYSD$DVDRDYMLCKQRPL
^SKSLF$$$$KKG$SAN$$FFCNN$$$A$$V$$R$PCVRAVKEINLDDRYIR$$PKG$FY$$KLSEPNEV$G$$$$$EDM$$VA$IILSRKDRERI$CAAWGR$VKDDFLLSELEAYS$GRSFEAL$FIC$$$KN$$$$GI$$$$P$$TWH$K$NARI$VK$$V$$EE$L$T$AYP$$$$$R$

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:39<00:00,  2.54it/s]


Post epoch 3
^MTMKSPYGPDCLITAQELLFAVLRRGGITFLPGDLTGHLLESANLDRKAIIVVSEVRLPESDSEPLSRLPDLANRLRIINPTSETTAQVDVEPIELYKKRGQVLYLQANVQNTVLAVKAVNYGCVNMLITAEIHAIHEFDIKLVVEIEDRPAGPINSMLQILEEASCRTRLTPNGSFAMSSRYYETTGVSGKNEWQTR$
^MMRRAAQLLEALEFVQQPTIFVSKASLDREKLDGMWQLASHSVKTYDHASIDGHEVLAVNSIQMYFNHNQGGRQSAKNEALDRKDDVIAMYATDSEKHVYSAYCLPSIALMSEYKGTF$STGMLVDLPMVRGTRWIGRNHHNADLSEDTVLYPVDAVGNSGSLVDCLSDLHTKEDAVIAPKVNESGFVKKLNGIKRLK$
^MMTDESIPFAAAANLKESNVRPSLDSMVKGGPHSIKIAQLQSKAYPRNLKTPLLAEVTEPSMGEARSVFMSESKAGVCLTEAATENQRLNNPRRNAFYDLKPAYVPVQAELQDTSSSAGFGFLKTTLELMRKKHKSWQGLGISWIAIASIVEDKSQLAAQIILLQTVFAIIIFRITILGATAPGGGTSVSDRIIDVEA$
^MAKLKSCSAAKRLNVGETAVSYWWSEGEYATMVQVEPTDPIRAAAHIIRGTVWVDRENEEHHVATRNAYGPAESLDLRHRLVDAVIKRKDAAGLGIQCSPRNPSSTDAAIVQIVVVYRTIAAVLLLLAKIQIQLRVNQFYPVIPSANLTSVYAHLVYLTGSQREVDSALTVGSKYGKPGAVFNAAFKIIIDSAGKDM$$
^MMMQQDLLFVLYGNHRLDTLHFGATLTALWIPWFTISALRAKCIRVRHDLDQPFYGVAQSLVLAKGAHLIKPPKGGFEKLPTKVSPECEDYPENFQMEGIGRPLLLSVQMQMALRVWYRVVIKKDFQEPQFAIYLQSNGCGYYCYAPDNNIPAVYPLVADAIQYEKGNLAVCEYGAILYSGL

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:36<00:00,  2.54it/s]


Post epoch 4
^MTKRVDEIAGLDLGNAIAAKLYRSAEQIPFEIGNRFDQFIINRVDIADSMVTSAAPFTGFMVFSNGHVVGPGIQAIRSAAAIAPNIVGSGSGETTSHVRIVSQVGPDEAAFKMKVDAVFNPAYLKLRIALRLTDLEKMGVLVFTMSRAHAKGYHKWDVEPSPFTPICKGKMPLCQVSTSGNIGTQGNGNVMVGSDVGP$
^MQAEQNVQYHPSQKTVGISRPAAVARNVKKLLSQLGWVEFHADILGGSRTPEHQNTAITLALTDLHNMGVALSNGYLIAHLLHHEERGKADHGVEVVGESVRDCLLDRHAKQGNVDRGKQELTGHVHKGNMFANGNFREGKVGADKVGKIATSSSPIEAGGLRTVKRDDVETELHGANILGIYIERA$SLAEHEFRHF$
^MMDIDIKLREVMFNLSLQKVRTYDLPVTVLIAFISGVVDQDRQCSDGLTEEGQTPSAGRS$LKAEQMRVKGRAPADIVGRSISRDILRDVIVELRRGLQANERSKKAWTKMENQFISEELIWDNAIGILLMAKTKLAPKEANKEVTVGPLGEQSPSHVKEGANGDGDASVVSDAVKVL$GILSELVQNKHNEADSSNT$
^MTIAIEFKKQGGDIPVLSVFAMAVHECRATCLPPRMGQHAVPQHVDRSKPGQSLVSTFRFPAYQRMVGEYPSLLAGDHSEHWIDALKEPGHSGQMDCEGVMTIN$GIVQQRLAAVNLRVTGPAGQITFRLLRIDAFGKPVWYWGEIQLRHVSNPAPDPQIGDVNLSRDFSIPSEAKFETEPILWLEQMFPNPKEERGS$
^MAANVEIPVCTGQQVTNISALMTTNTVGERLLRPTFLTYPGNQGKATFIKAKAIVSVEGGSAKQVSCCHVFGVIPFNESASKHKISGLADLSAVKEFLEDTKAMSKTFWKFVYGPARGDIVQVVANCNLAGTWPKDLLTGTSIEHALAYAVLQMVDQGAEKFVV$LGQVKDRQGDANILVVG

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:39<00:00,  2.54it/s]


Post epoch 5
^MLNKGDLKAASSDNAISALVIVPIHEAKVVKPPIWAALSLGIDQPWQAAHKGQSGIILVSRSAVTSASYANPLDANPLLIMIRAMTQFEELKRIFRASAMVLVHGRLNASFFDTPAAAGVGEQVEGFPEDETKYIDESVYLIFTHLEYEGSVRTTLVVKRGEDPEIPGLKLKRRLRRKYLSIKMSREGIPDDEYDHAT$
^MQLTFLVAAKTLSDWTILILLNWGLAPPSVAEGHLNRSWGYLGALGILGVVPSVDSIFQRKYDVTGSVKGTEIVYEYRDRCVWFILAPEPLNQMGKNVTSTVFVKHMASGFKTGYLLNASKADYPLSRRNQLDEDYTLHNEVRDTELNFIIQRDAIDVMPIDTNLSRKNVALMHELRVIRAPLAPLFP$VVTLMQKLI$
^MNRPRCLEARLLYGNNSGNFIHKLGPMMKESQAVCQLRAGTTLSFTPDSFLEKRVVIIIAEGIAVTDSAATATVGPKEPYSPLEVDYLPFKLNGGALRAIKVSHKVPKTQILYRNYEISIYAELFILGSFFNAGYVDISGLSAATRIDYFQTFCLQFSATGNFQPLTSFQRGVFLLRKAVDTLFQRPLGIKRAFIHSQ$
^MDPFQRNVQATGEYDQGTKSNVAIRALQSNAVPDGGGSQAEQINLGFYSGALAYAKLPVLVVDGATIQPIIDLRPMLHSITHSVTKQQPMVNLDTFVVITEYENRIIQKISVVSDKLLHFHIDVIYINFMRKLDGEKILDARTVVRRLMVSGGELRMERVATELTEDDKAVVFTFGFALIIVASVAKSQAQLKACDGL$
^MLYSHLFIRAPGSARARTTHALIGVSLSNPIAVDEQNPPMLLRVAVPIGSEDILSLDWVFAQAAYKASPHLAERMVWVLTRPGKLELNNYTPESAYPVLKETQCESVSGGPYLIPLRAIDDAPVLFVDYQHFTRIIEQGGSPFTREQGATSHVKGGLRCDAVLDFYAFTSMKSGKNLAALVI

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:40<00:00,  2.54it/s]


Post epoch 6
^MITLPKDTLLAPQTYIIRRRQDILSDLYELTSYELSGPPNVRTISGMQVMCLVADLREGVQYVTALLAINLVELQPSELAYSPGWEHEDQGKSRGWCIFMTTGILTINPDEYASVLSQLPAMAKSLMIENDAKHPKYDSVPECRIPEMDDFVQILEDAILDESSVRGAGIEIATSQTARNETDKRVRASFTAFLELTE$
^MEAESFVSAGKVEAASIEHLMVPKGEVFVAPFASGEETFGAEAVLRMRLGITTFLGLDVREIEDRDSLRLREIIKQHDLECGFDPGYHLVGFGSHVQFPGFCLAENRIIDYHGVVTTAISVIEGQKSYRPPEPQVYRMTVHKIKLVRAFVSIRRHFGPRGQHVDFLPPYHADGLNIKEPMLIKHDALTAAVSNGKRLF$
^MPTTGGKRGMRRLPAPAVALPTDEKDRDLAIHATPMPFKNAAPSKNKNDSGTGYLLMARGVGNTDTRRIYINTAADLSCLITFIEKATDEQSLSAFIRQKNQGSFNSPIAAGPAEAFDLEREASSTTRYGRRRRSVRRKATDYIEGLEERSGLDSATPKFYSMPKMRLAWRSDLEMEPWSEGGDIYSYFKADIECLTS$
^MAKSLVQRQKEFSLLNMLQWGDKTYVHSGAPILLRIHFAQPIHRATKNVKYGTHKEGLYGIHIMIMEGETDMSQGDVADIQLDVLVSGRPYIYICFNAATGNTTHNSVGKDFQVKDEHIYGVKGNPCKVDVEVIPDSTPGNGALYAVMLDTVEPIIFHQAKIQFEIDMYPRQPVAFVLNKSELVNRCAFVGYFISASE$
^MRKLLQAVISPMTFVGLSLNSRVEYGKPVMICLAAYTTAQDRFLSADEGPIDFGKWYDAQEEATHLCNPFLSSRFGLVGISTSLSFDEKMSQIQPLWNFQRENLAAVCAKLNHFESNSHDGPPEGSFLKIQYGRYFLTFPTHWGLYDAKVKAGVHLTKTYDSVSLKTRLTVGKPQHIQQRRT

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:37<00:00,  2.54it/s]


Post epoch 7
^MGRWRLVAYVNSVLLIMVYEAPGCVLPEVRFLMGAPRVGVARNLAVTYDGDLFTVLQNKFATLISEQSLISTADWKRRNLKIKSNITMLDNIPKFADANGLLVGSSLLESIGAQITTRVMTLIIEGILIEDTFVISIVEEAQGKVIQFILFGIIGVENKITENNKSYLSPFLSGAYLEFYRKGTLLKNSITDKRASKS$
^MATIKIGKIPKHAHADSVYASCVGNIITSPFESVSLYGIDRWKAAFGLARVVMKGELVQYVSWQIDERLIDGRCNQNREVERLHGPGDFELRFVLLADLIKKLAQAPDSPEIFHAAQALKWGRARAAEDLIAIGQIRDTKDSSAEEDECQIEQEGYLEQRAINGPTVV$HERCVEHGASALNMPSEIFFRGRRTGSSA$
^MRTNRLNTLRRRAAFVDEFNTQVSKAQLNKKFILFLAHSTRFRFFVADTSGVHICKYAVMRERVAQHERVISTNDKYLLNIAGGFSTTLGADSDVWIEGGGTRIITSSYFFLEKLMGVGAFKGDSMENVENDDNLALQLTHKGTSHNLLKTPSKTIANIAEIFFRGLTEEADNLCKQNSSAEFRKATGARPIIEIVSI$
^MFFFKSVSPKELNINKINIIVQAIDSVTTLSNIDDIPYRSTSPYTMADSSLTYRVFVAYGFRVRLCCHLSNDVWGISIILRDHIYSHKIYMYYKYDFVFIEELISAWSIPLNVGTGQLRVSSAILGTCTAHILEHTASKAFVTVGFTFGDFKVEIMVGKVPNESLTVRRDSRVDSQKSGDDYAMDASIKDHGGYMSLN$
^MRNLYFSTNFLKNRTRLAHLLDSTSNFFLHTYVRLGRQKDESLLAILQIKHATIGAADENNGNIGTSHARETYWYDQRLAAAMLSAGQTIRMCCHQ$VDPDVLELKFDLRDQRRVTQPGSVILRSGETRIRSGAMCQGEQIHLIGRKIRRLPAHAATGTFEVTLVPISSTPAGYASQQYYFY

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:34<00:00,  2.54it/s]


Post epoch 8
^MATAGPQRKEYEYDITLATEMKLKPRDDSRLRYSAPANRAFATDADSTAHGTNIPCNAMNRVCYHKCGKMRLADSIAAIQIYGLMPIDKAAQPKLQEYDVELFFYSMAIMGEEEFVDTEPSLTFRPATYTMSSCLSGGYPNVATSSSPRTESFADADNAEEIR$EVAMVYFVKEKLSCVLDVSEPSVALFFGIGYKAR$
^MKIIFQTTATILTSKLHTCLKELPIVSKAGGMMKQVGANYPSNSTGALKVESQTTIAQAASTLYLTAVSALVLAVVNDSDLDWKAGVGPRYIAMPMSMTEASRFRKKDGPSSYQAESTKKIVGSASGVRIGNTVLQGEGYGQKLPSEIGDFETRKGSTNHLRNLAILIP$HRNERYRNRDTMTAMMFCHRVYLKVRGI$
^MTCETKDQDLGKMVLEVTNTGEVDITLRTSTASTRTSFLKVHIRTRASGLILRRKDSDFARETDAAFATLQSKRIKRWFSGFTDVMQDGLDKHKVSTPGEIDNRYEDADQPKPIMMRKGVGYYRDTGAAYDSVSRCLVAETLEVVCIMGSDVGSFRASQGVKATEDPCRQAEGFENSIYPNIVYDASMSGIATWITIT$
^MSFWTRLARGLPSFGLREYLGGDQKYADPAVGADSRVAGIGQSSGSSARQLRQAGWISKVSPSDLR$LCVENLFLMSKRPRCLTPAAKRKARAKLLRDPHPDFGIIFHDKPGRCSTESKAAAKNNI$ESVKKESMTEDIKKGSANFRFSNYAEDVPASIEDRVVLTSTDRVFGLLPSEGRATQILQPENNILSCWSKI$
^MAQSASPPSEAALKPVMVVTDGGREVSLFRANKAKLTLAASTAFVRNAQGPILLDLNILQVSDTDNGSMRRALIIPKKPDRFLILKLYPDNPVDAPGKIVQGSFFMAEVYGGPSIPFEMSDQIENGLVRAALEPGIELEDQLLKYYMWTQWIPHTGGELSSHSTSCFQGLAYVLTLAAPGLS

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:35<00:00,  2.54it/s]


Post epoch 9
^MSDLLGSVHVRHRVDIALCLVLFRKEPSIVRTVSRAPTVRGIIWNLMKAEDLSGSTKKANLIDNTTILDGLYFLTPPLALGFPDGLGHMKKTRLVNSPVNDSGTIREVELKINPWEFVATLAIDPEQYPPQGDTGGAREYEKRERIAPGENVERKQSWSARARPVYMYFRYLEKQSPDPEAVSPNNASPFAEAGIHRW$
^MASLILQLAQRVIITYLSALANAQAVCWYSILICVGQITNTLVQMVLYAPHIPLDVYPMKVTLVGLGAPDEIIVKIYVVTCPMRLVTKIFFHMVDGTAAKVPQHPEQLGTCIDGIPALSIGVLEREDTVEETSSPSCAIASRFASMFPLDDIAWGNSSPFDILRQSNNKQAVYQASGPSEQYVGDYAGLAQGCIEPYS$
^MSPDATPLYLIDARLVTFGQEPETGSGNRERLETPLHPLFFLNGIGLSVVAIVKASPVGEGWTCSDILMYDLEAKPLVNKELQIQSNPLDEPNIEAEECVLNWNQESLYSVLEDEEYDKINGSILYVPENTDHISIQSSCSAMLQKRKQEEEYGAPGRYCLNNSLAKVSTECALAYKLGACRVRDHEKHNKDRLSSKQ$
^MQRHNDRLIAASLSFGWINSPSEAGMTPPTVHAGARPLCTKPIIHACRDSAVGLKIQMETMLVALNGTYLGHEKTDVKTSPGRALHLRFPLPTLGRYTVTTALQAADSLMEYRQHGAGTIKASSTSRKGLPQAYEQQLAKEKISSANPNSAIEHCGNRKKTYTVAWVGVSQPIA$$AVKLPV$QDTQEKFAPVDTEWL$
^MNNSNLDRSLDKIHGIAVLGDTIQGPAGADRVRIKDVTSEANKLDLYKGAQKRIQAVGKGAPSGHAVPTLEKMAAIWGIIAKSPLAKYIDPKTNSHEPLSTLLLESWASVLGRTHGSLQATDGRESEM$MNNVDNVVVFNPLEQVPLGIKLSQALTAQVSKLVLAFDRTETGQIALSHNGYE

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:35<00:00,  2.54it/s]


Post epoch 10
^MTQSGQKRLYESSTFYLVECVHEKYTKLASGQAELALRTGKVHISTLKLIVIKVYGLRPGLAQACAVGSDPLSILFKIKSSQLEKNKRFAGGFKETGALMLNGTEEVKFEGYWGRGPWHKSPEKALSG$ISTPRCTPLRNAVVKSRDAVMSSQVEWKMRTVSHSVKKSRVNEVTRGPAYMGDRETHQGFKLAPNNRWL$
^MLIWLIFDVFKVSTKCLATFGPNASVKFLPNILWAGELLSDVDLSESLKSKHGGYIGKMQEALWEDLVIGCRSSLEIPVANDDIWNHGVDERSDFQVFTSNGRLLEAAEDKVQSVYYKEEIASFALAWLVRERYAACPCGEISPATYGATKGPQLDGTSKSVGEHSQDRERPRSSSFAMIKNGPNIYKPAKIKKIREE$
^MALMLDSRGLLSGGGSVLSLDNGTMPYLSKPGARMIFYLKRGTLLTMVVGDTILHMAYQGDVGFTPKLAFRHPMVETILKCGDKTQTELKIFHNSKRVKLVERVEGDESALLEMELLAARHKRIESPLKTVVRLANPAPKLYADAMTRHWATIEDKVLTSYLGIACAQFGLNLTALMPNVSFVGLSNGLMEVFKNAAF$
^MKAKLFHLRAELRLNAALDKSPFRIFKSAKMPVSEVVESDYNSIEPFKIILTHRPSEFAKIDHEIGGQKQRDHGKTYTEEKKNAAFRVYMEGTLLPEPNFLFEPIEDIY$PRVCIGHIQVNRLRTLTWVAIHMLCNESMCLRSTAPQAEMEVQFLGQHDKQVVYTANIKIHLAESLRIYNSFNVN$LLLFETFEGEEL$
^MENAFQRQANTDALMASKVASVFARGESLYGAYDVQAGDEQSELYGLAAPPGPEPPQQVKLKGIDVFDLKVAASAGIEVNSGVVMDTGAYWMLQLESGEVMRTRSTGVVYFMVDNAEELEVFEGEALDVGNGNTAGLPELRAHGMVDRFEDKELQNRGRGVIKLCLKPQLANPIAMGLGVG

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:37<00:00,  2.54it/s]


Post epoch 11
^MVKQELSRSTAGLYPTGLSIVDRYGKGFSDFTVAGLVSTDSTEVTVLRVGYEWHSDKRLRAIGHIVAVNYRCDVGNFEPFHGSKAWTSISADAFDLNGNLGLDKPEDEGDSLRGRVPNSYDQNPDILEPTITTVAQSAQTPRGPQAAMWTMRSRTHLALLNDQFAHIGHSILIEYTCPPALLEVPSQATALALLEKSK$
^MTDYIQNVPALPLAPPSVRHRVLGWVPQISCLDYITEQFGSRYSTQDQNKVEYPTIGKRSSSDKAGVALLDLRVRGAMNVMNPLLQENRAAALYSEGIAKVEIPPANPTIYKSDELLEQLPSCLNLTSQRKTAEGAFKDAQGALMIKMRRLGGESINYSKNQYEFQAIKKQVIGETLIWKMVVEMREAYIYKKIAERN$
^MITSLRMAQASKTDLYRLPKQAAGDMAIRFGEKNTVGGLLKDSDFYRWPKAREIVNSAYKLIATGLLFYDLGANRLMSAQDFADQGALYTDADDIERSERNHYFALDVALKCNDHLCAQSVTISLVAIYLEPTSVEGLEVYAPKVLKQGDPKKPNNLYGLFKTSACAILICSSRNLGTANIRDGRPGTSEGKKPNVKV$
^MAEVVKAQCPLLALLFGRTLQGLLHELPFETQGVAQIADTKKANNSITCIGLSQNRALYDCTGDALEKIGMRMLTAAISLPLAEPMGKNAGTHRWEPKNFSCALQAHHDNSAGMQCTRSNREHDAPEYDDNPVFYNAQEFKGLDTSLLQKVVQQKQNGIQDPHVGIGMLVYYALKLTAYLSPRGAVEKILGKLLRTAF$
^MKPLAIYDFRESPQYRLQGGRIAIKGMKRTPRRGRTIKIFPQNVAVENCLVIFDLTEVLEWIVMLIATYGSPCRENIEIGPERNRKYKGSPGPSFKSWSALAIEGDHANIAITKAFPGSDVHGKPDPFIGGIIDSQSEYIDGRDSRGHTIQGILQATTRLHLITLHSIYSDKEEIQINCEA

100%|██████████████████████████████████████████████████████████████████████████████████| 6036/6036 [39:40<00:00,  2.54it/s]


Post epoch 12
^MKIKSFLELAVSSPVGFEAGSSEVRPRGRRHFGHSVYDLMGAFVGLAARMMILVAREAQLSGTKNLIKCQSVRLGGGRTPTGAFLNERTLVGGIDSPVFTSNLDDVISEISTAALIHTIMQGSAVGSLIHLETCMMLDGGTDKGKNRSDLGLMQRGRHKQRSARKWPYSQVPPYHDTLGAVRASMFLAPPSPHLDLELM
^MAACIRSITIAERAEAKEACDWEGTLDVYTEVRPYTGQKDTRQNEYNGGDTKVEVLKNKGQVCDFALKGAGDPQIVQDSSWADLDGVSAPLQRQLVVNLQHVQAKVGSITRIPYYSDWLLEVVLAEYVPAAHAIVFPNWMNLIEIDMAMKISGPDVNFVACPTQLENETEIRLHMAWRVNQYDAAKKTMKELSENQREA
^MDFDCDHAQIAKNMMSEQWVLFRVWQDAKYLVPFKRLPALFDIPISLANATKTRDLLDAAGVTSVAEPKSGFEDGLATVDDHLNTLPVEGVGMIKMAKVSELQLHLALDASIFDLVNYTRYIRVYLLPVLGSTSDEQAALPRAARQLRAGYAALRLRVTPANTIPKKNALPFIRGADEDHGFDLAYKVVQMQPTLAVT$
^MFTTDSVIEGVIQSILQNVIETMGALPTLPIWLVNLSLKRKDVKPMPLRDYFSTLKLKSGVGNLQQANLAKERYYPITSLVSNKITHISSQLDVAKSVTQKYTFLIEALQAISHSNSSALNAEELAVGTEAYRKNPGERGFSSRSTTFMKSKRLFLEES$TAKNSNAWLNDAVHNGLAQIQLKIEVKSSDFEMAKEDVE
^MTNLQPPGVQSREPVNEENGPSGGQDKPILLRQLNLDGLSVDFCDATVTLPWARVAASMSVRNLADSVGLPNSVVGGFQVKVMKVTRTVWLADQLAWARSRNAAQDILSEPNCIILFTAEHRSFGEPSTYCFAAFSRKDRQLHPVLYNTHGERNDFARSQDLNYFDKSSSMVKPTTLMLAL

  6%|████▉                                                                              | 356/6036 [02:19<34:59,  2.71it/s]