In [1]:
# Root directory that all data paths in this notebook are built from.
local_path = '../'

# ## Prepare fastai
from fastai import *
from fastai.text import *
from fastai.metrics import *

# Make CUDA device index 2 the current device.
# NOTE(review): the index is hard-coded; this presumably fails on hosts with
# fewer than three GPUs -- confirm before running elsewhere.
torch.cuda.set_device(2)

# Seed both NumPy and torch: NumPy alone does not cover torch's own random
# number generator.
np.random.seed(0)
torch.manual_seed(0)

# ## Prepare Dataset
local_project_path = local_path + 'data/proteinnet/'
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(local_project_path, exist_ok=True)
print('local_project_path:', local_project_path)

# ## Create Language Model
class dna_tokenizer(BaseTokenizer):
    """Tokenizer that splits the sequence token of a text into single characters.

    The input is split on single spaces. The second-to-last token is treated
    as the sequence and expanded into one token per character; every other
    token (anything before the sequence, and the final token) is kept whole.
    """

    def tokenizer(self, t):
        """Tokenize `t` into a list of string tokens.

        Args:
            t: text of the form ``"[prefix tokens ...] <sequence> <last token>"``,
               separated by single spaces.

        Returns:
            The prefix tokens unchanged, followed by one token per character of
            the sequence, followed by the last token. When `t` contains no
            space (a single token, or the empty string) there is no separate
            last token, so the whole text is treated as the sequence and
            returned as its characters instead of raising IndexError.
        """
        tokens = t.split(' ')
        if len(tokens) < 2:
            # str.split always yields at least one element, so tokens[0] is safe.
            return list(tokens[0])

        *before_seq, seq, eos = tokens
        return [*before_seq, *seq, eos]
# fastai Tokenizer driven by the custom dna_tokenizer class, constructed with
# empty pre-rule, post-rule and special-case lists.
tokenizer = Tokenizer(
    tok_func=dna_tokenizer,
    pre_rules=[],
    post_rules=[],
    special_cases=[],
)
# Processor pipeline: tokenize with BOS/EOS markers requested, then
# numericalize with max_vocab=30000.
# NOTE(review): neither name is referenced by later cells visible here, apart
# from commented-out code -- confirm whether this is still needed.
processor = [
    TokenizeProcessor(tokenizer=tokenizer, include_bos=True, include_eos=True),
    NumericalizeProcessor(max_vocab=30000),
]

local_project_path: ../data/proteinnet/


In [2]:
# Batch size passed to the databunch() calls in this notebook.
bs = 512

# Read the 'seq' column of test.csv, process it with OpenFileProcessor and
# SPProcessor, hold out a random 1% for validation, label the items for
# language modelling and wrap the result in a databunch.
data_cls = (
    TextList.from_csv(
        local_project_path,
        'test.csv',
        cols='seq',
        processor=[OpenFileProcessor(), SPProcessor()],
    )
    .split_by_rand_pct(0.01)
    .label_for_lm()
    .databunch(bs=bs)
)

In [3]:
# Alias the databunch under a language-model name: despite the `_cls` suffix,
# `data_cls` is built with label_for_lm().
data_lm = data_cls
# Earlier alternative, kept for reference: build the databunch with
# TextLMDataBunch.from_csv and the custom `tokenizer` instead.
# batch size
# bs = 512
# data_lm = TextLMDataBunch.from_csv(local_project_path, 'test.csv',
#                                    text_cols ='seq', valid_pct= 0.1,
#                                    tokenizer=tokenizer,
#                                    include_bos= True, include_eos=True, bs=bs)
# Report the number of items in the training and validation datasets.
print('data_cls Training set size', len(data_lm.train_ds))
print('data_cls Validation set size', len(data_lm.valid_ds))

data_cls Training set size 109899
data_cls Validation set size 1110


In [4]:
# Display a sample batch of text from the databunch as a sanity check.
data_lm.show_batch()

idx,text
0,l ila nken vhw tty mdt ff rtsp mvi attd mqn ▁xxbos ▁xxup ▁mm tpen deeq tsv fsat vyg dki qgk nkrk rvig l cir ism vis l l smi tms af l ivr l nqc msa nea ait daav avaaa sst hrk vass ttq ydh kes cng l yyq gsc yi l hsd yq l fsd aka nct aes stlp nksd vlit wli dyve dtw gsdg npi tktt sdy
1,tsk ylnn l ieq dhr hik vrk try qsi nta knt l kgie ciy a l ykkn rrs l qi ygf spc hei sim l as ▁xxbos ▁xxup ▁m ynp fdea yhg l ce eile ign rrdd rth tgti skf ghq l rfd l tk gfp l l tt kkv sfk l va te l l wfi kgd tni qy l l ky nnni wne waf eny vqs ddy hgp
2,l aw div dgvf ppd l l ddm l eay vvf l rr l tee pwg eq vrc slpp aqlea ras ana tna l l seh tlh glf aar veq l pmq l avvs ark tlt yee l s rrsrr l gar lreqg arp ntlv avvm ekg weq vvav l avle sgaa yvp ida dlpa eri hy l l dh gevk l vlt qpw l dgk l sw ppg iqr
3,qg wge ndr gvs ftf gadvv skf l nr hdld l ic rahq vved gye ffa krq l vt l fsa pny cge fdn aggm msv det l mcs fqi l kpse kkak yqy ggln sgrp vtp prt anpp kkr ▁xxbos ▁xxup ▁man yef sqvs gdr pgc rlsr kaq i glgvg l l vlia l vvgi vvi l l rprs l l vwt gep ttk hfs dif l grc l
4,niy va qay nsn hqm l l vdn ak elaek l kke grpvr l ii vds l msh fra eyv grg tlad rqq klnr hlh dlmk fge l yna aivv ▁xxbos ▁xxup ▁mas kem fed tvee rvi nee yki wkk ntp fly dlv mt halq wps l tvq wlp evt kpe gkdy alh wlv l gtht sdeq nh l vva rvh ipn dda qfd as hcd sdk gef ggfgs vtgk


In [5]:
# Inspect the first item of the training set inputs.
data_lm.train_ds.x[0]

Text ▁xxbos ▁xxup ▁mg aaas iqt tvn tlse riss kleqe anas aqt kc diei gnf yir qnh gcn l tv knm csa dada qlda vlsa ate tysg l tpeq kay vpa mft aaln iqt svn tvv rdf eny vkqt cnss avvd nklk iqn vii dec yga pgsp tnle fint gssk gnc aika l mq l ttk att qia pkq vagt gvq fym ivig vii l aalf myy akr mlf tstn dkik l ila nken vhw tty mdt ff rtsp mvi attd mqn

In [6]:
# Vocabulary size: number of entries in the vocab's `itos` list.
len(data_lm.vocab.itos)

30000

In [7]:
# Build a language-model learner on data_lm with the AWD_LSTM architecture,
# no pretrained weights (pretrained=False) and drop_mult=0.5, then convert it
# with to_fp16().
learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5, pretrained=False).to_fp16()

In [8]:
# Unfreeze the learner before training.
# NOTE(review): the learner was created with pretrained=False, so this is
# presumably a no-op -- confirm.
learn_lm.unfreeze()

In [9]:
# Learning rate passed to the fit_one_cycle call that follows.
lr = 3e-3
# Disabled: scaling of the learning rate by batch size relative to 48.
# lr *= bs/48
# Bare expression so the notebook displays the value in effect.
lr

0.003

In [None]:
# Run the learning-rate finder, then plot its recorded results, passing
# skip_end = 15 to the plot.
learn_lm.lr_find()
learn_lm.recorder.plot(skip_end = 15)

In [None]:
# Plot the losses held by the learner's recorder.
# NOTE(review): in notebook order this cell sits after lr_find() and before
# the first fit_one_cycle() -- confirm which run it is meant to plot.
learn_lm.recorder.plot_losses()

In [12]:
# Train for 10 epochs with the one-cycle policy at the current `lr`,
# with moms=(0.8,0.7).
learn_lm.fit_one_cycle(10, lr, moms=(0.8,0.7)) #bs = 512

epoch,train_loss,valid_loss,accuracy,time
0,7.128744,7.440886,0.224742,01:36
1,7.145301,7.440466,0.22477,01:37
2,7.167376,7.424616,0.225398,01:37
3,7.09363,7.406709,0.227441,01:37
4,7.060319,7.385869,0.228823,01:37
5,7.006532,7.358196,0.231083,01:37
6,6.937879,7.343986,0.23238,01:37
7,6.893778,7.334908,0.233949,01:37
8,6.864722,7.320726,0.234759,01:37
9,6.830932,7.319204,0.235345,01:37


In [None]:
# Lower the learning rate to 1e-3 and train for one more one-cycle epoch.
lr = 1e-3
learn_lm.fit_one_cycle(1, lr, moms=(0.8,0.7)) #bs = 512

epoch,train_loss,valid_loss,accuracy,time


In [10]:
# Load the saved checkpoint named 'lm-gpu2-sp-40M-v1' into the learner
# (the trailing `;` suppresses the cell's output).
# NOTE(review): this cell's execution count (In [10]) is lower than that of
# the training cell above it (In [12]), so it was run out of order; the
# checkpoint name is also not saved by any cell visible in this notebook.
# Confirm where the checkpoint comes from before a Restart & Run All.
learn_lm.load('lm-gpu2-sp-40M-v1');

In [None]:
# Build the databunch for the larger 'test-400M.csv' corpus so that training
# can continue on it with the same learner.
#
# The SentencePiece model fitted while building `data_cls` is reused via
# SPProcessor.load(), which reads <path>/tmp/spm.model and spm.vocab. A fresh
# SPProcessor() would instead fit a new SentencePiece model on this corpus,
# written to the same tmp/ cache, and replace the vocabulary passed through
# `vocab=`. Token ids would then no longer match the ones the weights in
# `learn_lm` were trained with.
data_cls2 = (
    TextList.from_csv(
        local_project_path,
        'test-400M.csv',
        cols='seq',
        vocab=data_cls.vocab,
        processor=[OpenFileProcessor(), SPProcessor.load(local_project_path)],
    )
    .split_by_rand_pct(0.01)
    .label_for_lm()
    .databunch(bs=bs)
)

In [None]:
# Point the existing learner at the second databunch so that subsequent
# training uses it.
learn_lm.data = data_cls2

In [None]:
# Train for 10 one-cycle epochs at the current `lr`.
learn_lm.fit_one_cycle(10, lr, moms=(0.8,0.7)) #bs = 512

In [None]:
# Run a further 10-epoch one-cycle schedule with the same settings.
learn_lm.fit_one_cycle(10, lr, moms=(0.8,0.7)) #bs = 512

In [14]:
# Save the learner's current state under the name 'lm-gpu2-sp-40M-v3'
# (the trailing `;` suppresses the cell's output).
learn_lm.save('lm-gpu2-sp-40M-v3');

In [None]:
# Set the learning rate to 1e-3 and train for 10 more one-cycle epochs.
lr = 1e-3
learn_lm.fit_one_cycle(10, lr, moms=(0.8,0.7)) #bs = 512

In [None]:
# Evaluate the learner, requesting the accuracy metric.
learn_lm.validate(metrics=[accuracy])

In [None]:
# Save the learner's current state under the name 'lm-v1-loss2.46'.
# NOTE(review): the name hard-codes "2.46", presumably the loss reported by
# an earlier run -- confirm it matches the model actually being saved.
learn_lm.save('lm-v1-loss2.46')

In [None]:
# Free GPU memory at the end of the session.
# Uncomment to drop the learner first; while `learn_lm` is still referenced,
# the memory it holds cannot be released.
# del learn_lm
import gc

# Collect unreachable Python objects first, so that the tensors they own are
# freed before the caching allocator is asked to give unused memory back.
gc.collect()
torch.cuda.empty_cache()