In [1]:
# Base directory (relative to the working directory) that the project's
# data folder path is built from further down in this cell.
local_path = './'

"""## Prepare fastai"""
from fastai import *
from fastai.text import *
from fastai.metrics import *
from fastai.callbacks.tensorboard import LearnerTensorboardWriter
from fastai.callbacks.misc import StopAfterNBatches
import datetime
from pytz import timezone

# Select CUDA device index 1 for all subsequent GPU work.
# NOTE(review): this raises on hosts with fewer than two GPUs -- adjust the index if needed.
torch.cuda.set_device(1)
# Seed both NumPy (used for the random train/valid splits) and PyTorch
# (weight initialisation, shuffling) so that a fresh "Run All" is repeatable.
np.random.seed(0)
torch.manual_seed(0)
"""## Prepare Dataset"""
local_project_path = local_path + 'data/cafa3/'
# exist_ok=True creates the folder only when missing, without the separate
# exists() check that could race with another process creating it.
os.makedirs(local_project_path, exist_ok=True)
print('local_project_path:', local_project_path)

"""## Create Language Model"""
class dna_tokenizer(BaseTokenizer):
    """Tokenizer that splits the body of a marker-wrapped sequence into characters."""

    def tokenizer(self, t):
        """Turn the string `t` into a list of tokens.

        If `t` has exactly three space-separated fields, the first and last
        fields are kept whole and the middle field is split into single
        characters. Any other input is returned as its space-separated fields.
        """
        fields = t.split(' ')
        if len(fields) != 3:
            # Not the "<first> <sequence> <last>" layout: keep the fields as-is.
            return list(fields)
        first, middle, last = fields
        # One token per character of the middle field, framed by the outer fields.
        return [first, *middle, last]
# Tokenizer wrapper around dna_tokenizer with every pre/post rule and
# special case switched off (all passed as empty lists).
tokenizer = Tokenizer(
    tok_func=dna_tokenizer,
    pre_rules=[],
    post_rules=[],
    special_cases=[],
)
# Two-step processing pipeline: tokenize, then numericalize.
processor = [
    TokenizeProcessor(tokenizer=tokenizer, include_bos=True, include_eos=True),
    NumericalizeProcessor(max_vocab=30000),
]

local_project_path: ./data/cafa3/


In [None]:
# Batch size used for the DataBunches built in this notebook.
bs = 8
# Language-model DataBunch built from the 'seq' column of combined.csv,
# holding out 10% of the rows for validation.
data_lm = TextLMDataBunch.from_csv('bp_deepred/', 'combined.csv',
                                   text_cols ='seq', valid_pct= 0.1, tokenizer=tokenizer,
                                   include_bos= True, include_eos=True, bs=bs)
# Report the split sizes of the language-model data (labelled as data_lm,
# since these are not the classifier splits).
print('data_lm Training set size', len(data_lm.train_ds))
print('data_lm Validation set size', len(data_lm.valid_ds))

In [None]:
# Classifier DataBunch: multi-label targets come from the space-delimited
# 'labels' column, with a random 10% validation split.
#
# The numericalize step is given the language-model vocabulary explicitly.
# A NumericalizeProcessor constructed without `vocab` builds a new vocabulary
# from this dataset and overwrites the list's `vocab`, so token ids could
# differ from the ones the pretrained encoder was trained with.
data_cls = (TextList.from_csv(local_project_path, 'uniprot_sprot_exp_go_all.csv', cols='seq',
                              vocab=data_lm.vocab,
                              processor=[TokenizeProcessor(tokenizer=tokenizer, include_bos=True, include_eos=True),
                                         NumericalizeProcessor(vocab=data_lm.vocab)])
                    .split_by_rand_pct(0.10)
                   .label_from_df(cols='labels', label_delim=' ')
                   .databunch(bs=bs))

In [None]:
# Training-set size on the first line, validation-set size on the second.
print(len(data_cls.train_ds), len(data_cls.valid_ds), sep='\n')

In [None]:
# Spot-check the label entry of a single training example.
# NOTE(review): 10400 is a hard-coded index; this raises if the training
# split has fewer than 10401 items.
data_cls.train_ds.y[10400]

In [None]:
# # batch size
# bs = 256
# data_cls = TextClasDataBunch.from_csv(local_project_path, 'uniprot_sprot_exp_go_F.csv',
#                                    text_cols ='seq', valid_pct= 0.1, tokenizer=tokenizer,
#                                    include_bos= True, include_eos=True, classes='labels', bs=bs)
# print('data_cls Training set size', len(data_lm.train_ds))
# print('data_cls Validation set size', len(data_lm.valid_ds)) 

In [None]:
# Display a sample batch from the classifier DataBunch for a visual sanity check.
data_cls.show_batch()

In [None]:
# Number of entries in the language-model vocabulary (index-to-string table).
len(data_lm.vocab.itos)

In [None]:
# Thresholded accuracy with the cut-off fixed at 0.5.
# NOTE(review): the name `acc_02` suggests a 0.2 threshold, but thresh is 0.5 -- confirm which is intended.
acc_02 = partial(accuracy_thresh, thresh=0.5)
# F-beta with beta=1 (i.e. F1), also using a 0.5 cut-off.
f_score = partial(fbeta, thresh=0.5, beta=1)

In [None]:
# Build an AWD_LSTM text classifier on data_cls without pretrained weights
# (pretrained=False), scale dropout with drop_mult=0.05, track the two
# thresholded metrics, and convert the learner with to_fp16().
learn_cls = text_classifier_learner(data_cls, AWD_LSTM, drop_mult=0.05, pretrained=False, metrics =[acc_02, f_score]).to_fp16()

In [None]:
# Assigns the same metric list that was already passed when the learner was
# created; presumably kept so metrics can be swapped without rebuilding it.
learn_cls.metrics =[acc_02, f_score]

In [None]:
# Load saved encoder weights into the classifier.
# NOTE(review): the path is relative -- presumably resolved against the
# learner's model directory; verify it points at the language-model run.
# The trailing ';' suppresses the cell's output.
learn_cls.load_encoder('../../../bp_deepred/models/lm2-v2-21_enc');

In [None]:
# Change the batch size on the classifier learner's data. This notebook only
# ever creates `learn_cls`; no `learn_lm` is defined here, so referring to it
# would raise NameError on a fresh kernel.
# NOTE(review): 256 is much larger than the bs=8 used to build the DataBunch -- confirm it fits in GPU memory.
learn_cls.data.batch_size = 256

In [None]:
def add_tensorboard_callback(learn_lm):
    """Attach a TensorBoard writer with a fresh, timestamped log directory.

    Any writer registered earlier is first removed via
    `remove_tensorboard_callback`; then a `LearnerTensorboardWriter` factory
    is appended to `learn_lm.callback_fns`.

    Args:
        learn_lm: learner whose `callback_fns` list is modified in place.
    """
    # Drop the previous writer first.
    remove_tensorboard_callback(learn_lm)

    # A per-second timestamp (US/Eastern) gives every call its own log folder.
    now = datetime.datetime.now().astimezone(timezone('US/Eastern'))
    run_dir = Path('log/' + ('cafa' + f'{now.year}-{now.month}-{now.day}-{now.hour}-{now.minute}-{now.second}'))

    writer_factory = partial(LearnerTensorboardWriter, base_dir=run_dir, name='CafaLearner')
    learn_lm.callback_fns.append(writer_factory)

def remove_tensorboard_callback(learn_lm):
    """Remove every TensorBoard writer entry from `learn_lm.callback_fns`.

    Only entries that are `LearnerTensorboardWriter` (directly or wrapped in
    a `functools.partial`) are removed; all other callback factories keep
    their place and order. Calling this when no writer is registered is a
    no-op.

    Args:
        learn_lm: learner whose `callback_fns` list is filtered in place.
    """
    def _is_tensorboard_writer(cb_fn):
        # Unwrap functools.partial (it exposes the wrapped callable as .func).
        target = getattr(cb_fn, 'func', cb_fn)
        # Compare by class name so the check still works if the fastai module
        # was reloaded and the class object identity changed.
        return getattr(target, '__name__', None) == 'LearnerTensorboardWriter'

    # Slice-assign so the same list object is updated.
    learn_lm.callback_fns[:] = [fn for fn in learn_lm.callback_fns
                                if not _is_tensorboard_writer(fn)]


In [None]:
# Register a TensorBoard writer on the classifier learner.
add_tensorboard_callback(learn_cls)

In [None]:
# Detach the TensorBoard writer while the learning-rate finder runs, then
# attach a new one (with a new timestamped log directory) afterwards.
remove_tensorboard_callback(learn_cls)
learn_cls.lr_find()
add_tensorboard_callback(learn_cls)

In [None]:
# Plot the recorded learning-rate finder curve, skipping 20 points at each
# end, and ask for a suggested learning rate.
learn_cls.recorder.plot(skip_start=20, skip_end=20, suggestion = True)

In [None]:
# Plot the recorded losses of the classifier learner (`learn_cls` is the only
# learner this notebook creates; `learn_lm` is never defined here).
# NOTE(review): in top-to-bottom order this cell runs before the first
# fit_one_cycle call -- consider moving it after training.
learn_cls.recorder.plot_losses()

In [None]:
# Plot the recorded learning-rate (and momentum) schedule of the classifier
# learner (`learn_cls` is the only learner this notebook creates).
# NOTE(review): in top-to-bottom order this cell runs before the first
# fit_one_cycle call -- consider moving it after training.
learn_cls.recorder.plot_lr(show_moms=True)

In [None]:
# Plot the recorded validation metrics of the classifier learner
# (`learn_cls` is the only learner this notebook creates).
# NOTE(review): this presumably needs a completed fit with metrics recorded;
# in top-to-bottom order it runs before the first fit_one_cycle call.
learn_cls.recorder.plot_metrics()

In [None]:
# First training stage: one one-cycle epoch with 1e-2 as the slice stop
# value and momentum range (0.8, 0.7).
# NOTE(review): slice(1e-2) only sets `stop` -- confirm how fastai spreads it across layer groups.
learn_cls.fit_one_cycle(1, slice(1e-2), moms=(0.8,0.7))

In [None]:
# Second stage: ten more one-cycle epochs at a lower rate (1e-3), run before
# the learner is unfrozen in the next cell.
learn_cls.fit_one_cycle(10, slice(1e-3), moms=(0.8,0.7))

In [None]:
# Unfreeze the classifier so the following fits are no longer restricted to
# the layers that were trainable so far.
learn_cls.unfreeze()

In [None]:
# First fit after unfreezing: ten one-cycle epochs at 1e-3.
learn_cls.fit_one_cycle(10, slice(1e-3), moms=(0.8,0.7))

In [None]:
# Continue fine-tuning with a ten times smaller rate (1e-4) for ten epochs.
learn_cls.fit_one_cycle(10, slice(1e-4), moms=(0.8,0.7))

In [None]:
# Another ten epochs with the same settings as the previous cell (1e-4).
learn_cls.fit_one_cycle(10, slice(1e-4), moms=(0.8,0.7))

In [None]:
# Unfreeze the classifier learner. This notebook never defines `learn_lm`,
# so the call targets `learn_cls`, the only learner created here.
learn_cls.unfreeze()

In [None]:
# Evaluate the classifier with thresholded accuracy, F1 (both at a 0.5
# cut-off) and top-k accuracy.
# NOTE(review): labels are built with label_delim (multi-label); confirm that
# top_k_accuracy accepts multi-hot targets -- it may fail or be meaningless here.
learn_cls.validate(metrics=[partial(accuracy_thresh, thresh=0.5), partial(fbeta, thresh=0.5, beta = 1), top_k_accuracy])

In [None]:
# Collect model predictions with default arguments.
# NOTE(review): presumably computed on the validation set and returned
# together with the targets -- confirm against the fastai docs.
pred = learn_cls.get_preds()

In [None]:
# Show the learner's model summary.
learn_cls.summary()

In [None]:
# Display the underlying model object (its module structure).
learn_cls.model

In [None]:
# Build an interpretation object from the trained classifier.
# NOTE(review): the targets are multi-label (label_delim above); confirm that
# ClassificationInterpretation and the plots below support that setting.
interp = ClassificationInterpretation.from_learner(learn_cls)

In [None]:
# Per-example losses and the matching example indices; presumably sorted
# from largest loss to smallest -- verify.
losses,idxs = interp.top_losses()

In [None]:
# Sanity check: one loss and one index per validation example (expects True).
len(data_cls.valid_ds)==len(losses)==len(idxs)

In [None]:
# Show the 9 examples with the highest loss.
# NOTE(review): confirm plot_top_losses is available for text learners in the
# installed fastai version.
interp.plot_top_losses(9, figsize=(15,11))

In [None]:
# Draw the confusion matrix as a 15x15 inch figure at 120 dpi.
# NOTE(review): a confusion matrix assumes single-label targets -- confirm it
# is meaningful for this multi-label task.
interp.plot_confusion_matrix(figsize=(15,15), dpi=120)

In [None]:
# List the most frequently confused class pairs, using min_val=2 as the
# minimum count cut-off.
interp.most_confused(min_val=2)

In [None]:
# Count how many examples have a loss greater than 1.
(losses > 1).sum()

In [None]:
# Total number of per-example losses, for comparison with the count above.
len(losses)

In [None]:
# Number of distinct label classes in the classifier DataBunch.
len(data_cls.classes)