In [1]:
import warnings
warnings.filterwarnings("ignore")
from pyarrow.feather import *
import pandas as pd
import numpy as np
from fastai import *
from fastai.lm_rnn import *
from fastai.text import *
import pickle
import json
import collections

  from numpy.core.umath_tests import inner1d


In [110]:
# Padded token-index sequences (298 tokens max), one 2-D array per split.
# The files are read in the order listed; the test split is currently disabled.
trn_clas, val_clas, all_clas = (
    np.array(pd.read_feather(seq_path))
    for seq_path in (
        'SEQ_TRN_TRIAL_N.feather',
        'SEQ_VAL_TRIAL_N.feather',
        # 'seq_test.feather',  (would populate test_clas)
        'SEQ_ALL_TRIAL_N.feather',
    )
)

# SubCode label frames, squeezed so that each split becomes a single flat sequence of labels.
trn_labels, val_labels, all_labels = (
    np.squeeze(pd.read_feather(lbl_path))
    for lbl_path in (
        'LBL_TRN_TRIAL_N.feather',
        'LBL_VAL_TRIAL_N.feather',
        # 'lbl_test.feather',  (would populate test_labels)
        'LBL_ALL_TRIAL_N.feather',
    )
)

In [2]:
# Vocabulary, int -> str: the position of a term in the 'term' column is its token id
itos = pd.read_feather('VOCAB_TRIAL_N.feather')['term']
# Vocab size
vs = len(itos)
# Vocabulary, str -> int: any term missing from the vocabulary resolves to id 0
stoi = collections.defaultdict(lambda: 0)
stoi.update((term, idx) for idx, term in enumerate(itos))

# Labels, int -> label: the position of a SubCode in the 'lbl' column is its class index
itol = pd.read_feather('52SubCodes.feather')['lbl']
# Labels, label -> int: any label missing from the table resolves to class 0 (no error is raised)
ltoi = collections.defaultdict(lambda: 0)
ltoi.update((lbl, idx) for idx, lbl in enumerate(itol))
# Number of nominal classes (hard-coded; len(itol) is the commented-out alternative)
n_clas = 52 #len(itol)

In [33]:
# Mapping every SubCode label to its class index (0 to 51)
# "000" --> 0; "040" --> 1; ...; "999" --> 51
# NOTE: this cell rebinds the *_labels names to the converted arrays, so it is not
# idempotent. ltoi is a defaultdict with default 0, so running the cell a second time
# would look up the already-converted integers and presumably turn every label into 0.
# Re-run the data-loading cell first if this cell has to be executed again.
trn_labels = np.array([ltoi[lbl] for lbl in trn_labels])
val_labels = np.array([ltoi[lbl] for lbl in val_labels])
#test_labels = np.array([ltoi[lbl] for lbl in test_labels])
all_labels = np.array([ltoi[lbl] for lbl in all_labels])

In [32]:
# Mini-batch size
bs = 128
# Back propagation through time (BPTT) length; handed to MultiBatchRNN as its first
# argument when the classifier is built (presumably tokens per BPTT chunk, confirm)
bptt = 70
# Encoder shape: embedding size, hidden activations per layer, number of layers
em_sz, nh, nl = 400, 1150, 3
# Optimiser factory (Adam with custom betas).
# NOTE: opt_fn is rebound with betas=(0.7, 0.99) in a later cell, before the learner is created.
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [34]:
# NOTE(review): both sampler keys are len(row) of the index arrays. The loading cell
# describes those arrays as padded, so the keys may all be equal. Confirm this is intended.

# Training split: batches of bs//2, drawn through a SortishSampler
trn_ds = TextDataset(trn_clas, trn_labels)
trn_samp = SortishSampler(
    trn_clas,
    key=lambda row: len(trn_clas[row]),
    bs=bs//2,
)
trn_dl = DataLoader(
    trn_ds, bs//2,
    transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp,
)

# Validation split: full-size batches, drawn through a SortSampler
val_ds = TextDataset(val_clas, val_labels)
val_samp = SortSampler(
    val_clas,
    key=lambda row: len(val_clas[row]),
)
val_dl = DataLoader(
    val_ds, bs,
    transpose=True, num_workers=1, pad_idx=1, sampler=val_samp,
)

# The dedicated test split is disabled:
#test_ds = TextDataset(test_clas, test_labels)
#test_dl = DataLoader(test_ds, bs, transpose=True, num_workers=1, pad_idx=1)

# "All" split (train, valid and test together): no sampler is passed to its loader.
all_ds = TextDataset(all_clas, all_labels)
all_dl = DataLoader(
    all_ds, bs,
    transpose=True, num_workers=1, pad_idx=1,
)

# all_dl is passed as the third loader of ModelData, i.e. in the test-loader position
md = ModelData('', trn_dl, val_dl, all_dl)

In [35]:
# Dropout rates; the classifier-building cell consumes them by index as annotated
dps = np.array([
    0.4,   # [0] -> dropouti
    0.5,   # [1] -> wdrop
    0.05,  # [2] -> dropoute
    0.3,   # [3] -> dropouth
    0.1,   # [4] -> first entry of the classifier head's `drops`
])

In [36]:
def get_rnn_classifier(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,
                      dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, qrnn=False):
    """Build a text classifier: a MultiBatchRNN encoder followed by a PoolingLinearClassifier head.

    Args:
        bptt, max_seq, n_tok, emb_sz, n_hid, n_layers: forwarded positionally, in this
            order, to MultiBatchRNN.
        n_class: accepted for signature compatibility; it is not used in the body.
        pad_token, bidir, dropouth, dropouti, dropoute, wdrop: forwarded to
            MultiBatchRNN as keyword arguments.
        layers, drops: forwarded to PoolingLinearClassifier.
        qrnn: when truthy, forwarded to MultiBatchRNN as ``qrnn=``. When falsy (the
            default) the keyword is omitted, so the encoder call is exactly what it
            was before this parameter was wired through.
            NOTE(review): assumes the installed MultiBatchRNN accepts ``qrnn``; if it
            does not, a TypeError is raised instead of silently ignoring the request.

    Returns:
        A SequentialRNN wrapping the encoder and the classifier head.
    """
    enc_kwargs = dict(pad_token=pad_token, bidir=bidir, dropouth=dropouth,
                      dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    # Previously `qrnn` was accepted but never reached the encoder.
    if qrnn:
        enc_kwargs['qrnn'] = qrnn
    rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, **enc_kwargs)
    return SequentialRNN(rnn_enc, PoolingLinearClassifier(layers, drops))

In [37]:
# Defining the RNN model with the hyperparameters set above.
# The classifier head layers are [em_sz*3, 50, n_clas]. The last width was previously the
# name `c`, which is not defined anywhere in this notebook and raises NameError on a
# fresh kernel; n_clas (the number of classes) is the value that belongs there.
m = get_rnn_classifier(bptt, 20*70, n_class=n_clas, n_tok=vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
          layers=[em_sz*3, 50, n_clas], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])

In [38]:
# Optimising function
# NOTE: this rebinds opt_fn, replacing the earlier definition that used betas=(0.8, 0.99).
# The value set here is the one passed to RNN_Learner when the learner is created.
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [39]:
# Wrap the model (moved with to_gpu) and the data in a learner that uses the Adam factory opt_fn
learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
# Regularisation hook: seq2seq_reg with alpha=2, beta=1
learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# Clipping threshold (presumably gradient clipping; confirm against the installed fastai)
learn.clip = 25.
learn.metrics = [accuracy]
# Class-weighted cross-entropy loss.
# np.array(DataFrame) is always 2-D (rows, columns), while CrossEntropyLoss documents
# `weight` as a 1-D tensor with one entry per class, so the array is flattened first.
# NOTE(review): assumes the file holds one weight per class in class-index order (confirm).
# torch.cuda.FloatTensor means this line needs a CUDA device.
learn.crit = torch.nn.CrossEntropyLoss(
    weight=torch.from_numpy(
        np.array(pd.read_feather('LOSS_WGT_SQRT.feather')).ravel()
    ).type(torch.cuda.FloatTensor)
)

In [12]:
# Base learning rate (the last and largest entry of lrs)
lr = 3e-3
# Divisor between consecutive entries; optimal multiplier based on the developers
lrm = 2.6
# Five learning rates, from lr/lrm**4 up to lr, each entry lrm times the previous one
lrs = np.array([lr / lrm ** power for power in (4, 3, 2)] + [lr / lrm, lr])

In [14]:
# Weight decay (passed as wds= to every learn.fit call in this notebook)
wd = 1e-7
# Loading the encoder from previously tuned LM
# (the checkpoint name 'LM1_ENC_TRIAL_N' is resolved by the learner; the location is not shown in this notebook)
learn.load_encoder('LM1_ENC_TRIAL_N')

In [16]:
# Stage 0 of gradual unfreezing.
# NOTE(review): freeze_to(-1) presumably leaves only the last layer group trainable; confirm against the installed fastai.
learn.freeze_to(-1)

In [None]:
# Learning-rate finder, started from one thousandth of lrs, followed by a plot of what
# the scheduler recorded (presumably loss against learning rate; confirm).
learn.lr_find(lrs/1000)
learn.sched.plot()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

  0%|          | 0/17034 [00:00<?, ?it/s]

In [None]:
# Stage 0 training: one cycle of length 1 with learning rates lrs and weight decay wd.
# NOTE(review): use_clr=(8,3) presumably configures fastai's cyclical LR schedule; confirm.
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))

In [None]:
# Checkpoint the classifier after stage 0 under the name 'CLAS_0_TRIAL_N'
learn.save('CLAS_0_TRIAL_N')

In [None]:
# cleaning cache memory of GPU before the next stage
torch.cuda.empty_cache()

In [None]:
# Restore the stage 0 checkpoint, so stage 1 can be started from here without re-running stage 0
learn.load('CLAS_0_TRIAL_N')

In [None]:
# Stage 1 of gradual unfreezing.
# NOTE(review): freeze_to(-2) presumably leaves the last two layer groups trainable; confirm against the installed fastai.
learn.freeze_to(-2)

In [None]:
# Stage 1 training: same call as stage 0 (one cycle of length 1, learning rates lrs, weight decay wd, use_clr=(8,3))
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))

In [None]:
# Checkpoint the classifier after stage 1 under the name 'CLAS_1_TRIAL_N'
learn.save('CLAS_1_TRIAL_N')

In [None]:
# cleaning cache memory of GPU before the final stage
torch.cuda.empty_cache()

In [None]:
# Restore the stage 1 checkpoint (the fine-tuning loop below also loads it on its first iteration)
learn.load('CLAS_1_TRIAL_N')

In [None]:
# Final stage: unfreeze the model (presumably makes every layer group trainable; confirm against the installed fastai)
learn.unfreeze()

In [None]:
# Ten rounds of learn.fit, each one cycle of length 1, checkpointing after every round.
for i in range(10):
    # The first round starts from the stage 1 checkpoint; later rounds reload the
    # checkpoint written at the end of the previous round.
    learn.load('CLAS_1_TRIAL_N' if i == 0 else 'CLAS_2_TRIAL_N')
    learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(32,10))
    learn.save('CLAS_2_TRIAL_N')
    # cleaning cache memory of GPU
    torch.cuda.empty_cache()

In [40]:
# Restore the checkpoint written by the fine-tuning loop before predicting
learn.load('CLAS_2_TRIAL_N')

In [41]:
# Output scores for every observation served by the loader in ModelData's test slot
# (all_dl in this notebook)
all_scores = learn.predict(is_test=True)
# Persisting the scores for further diagnosis: one column per label, named after the
# label's string form. The 'RESULTS' directory is assumed to exist already.
pd.DataFrame(
    data=all_scores,
    columns=itol.values.astype('str'),
).to_feather('RESULTS/ALL_SCORES_TRIAL_N.feather')

KeyboardInterrupt: 