In [1]:
import warnings
warnings.filterwarnings("ignore")
from pyarrow.feather import *
import pandas as pd
import numpy as np
from fastai import *
from fastai.lm_rnn import *
from fastai.text import *
import pickle
import json
import collections

  from numpy.core.umath_tests import inner1d


In [17]:
# Training inputs: padded token-index sequences (298 tokens max), e.g.
# "empty container" --> 401 16 1 1 1... 1
trn_clas = pd.read_feather('seq_trn_trial3.feather')
trn_clas = np.array(trn_clas)
# Validation / test inputs are currently disabled:
#val_clas = np.array(pd.read_feather('seq_val_trial3.feather'))
#test_clas = np.array(pd.read_feather('seq_test_trial3.feather'))

# Training targets: the SubCodes dataframe, turned into an array and squeezed
# so that any length-1 axis is dropped
trn_labels = pd.read_feather('lbl_trn_trial3.feather')
trn_labels = np.squeeze(np.array(trn_labels))
# Validation / test targets are currently disabled:
#val_labels = np.squeeze(np.array(pd.read_feather('lbl_val_trial3.feather')))
#test_labels = np.squeeze(np.array(pd.read_feather('lbl_test_trial3.feather')))

In [3]:
# Vocabulary: position -> term (int to str)
itos = pd.read_feather('vocab_EN10_CN.feather')['term']
# Vocabulary: term -> position (str to int); any unknown term resolves to 0
stoi = collections.defaultdict(lambda: 0)
stoi.update((term, idx) for idx, term in enumerate(itos))

# Labels: position -> SubCode (int to label)
itol = pd.read_feather('52SubCodes.feather')['lbl']
# Labels: SubCode -> position (label to int); any unknown label resolves to 0
ltoi = collections.defaultdict(lambda: 0)
ltoi.update((code, idx) for idx, code in enumerate(itol))

In [19]:
# Replace every SubCode by its class index (0 to 51) through ltoi:
# "000" --> 0; "040" --> 1; ...; "999" --> 51
# NOTE(review): trn_labels is rebound in place and ltoi falls back to 0 for
# unknown keys, so re-running this cell without re-reading the labels would
# presumably turn every label into 0 - confirm before re-executing.
trn_labels = np.array([ltoi[code] for code in trn_labels])
#val_labels = np.array([ltoi[val_labels[i]] for i in range(len(val_labels))])
#test_labels = np.array([ltoi[test_labels[i]] for i in range(len(test_labels))])

# Number of nominal classes
c = 52

In [4]:
# Network dimensions
bptt = 70     # back propagation through time
em_sz = 400   # embedding size
nh = 1150     # hidden units
nl = 3        # hidden layers
# Vocab size (number of entries in itos)
vs = len(itos)
# Optimising function (note: opt_fn is assigned again, with other betas,
# in a later cell before the learner is built)
opt_fn = partial(
    optim.Adam,
    betas=(0.8, 0.99),
)
# Batch size
bs = 128

In [21]:
# Dataset pairing each training sequence with its label index
trn_ds = TextDataset(trn_clas, trn_labels)
#val_ds = TextDataset(val_clas, val_labels)
#test_ds = TextDataset(test_clas, test_labels)

# Sampler keyed on the length of each training sequence; it is given the same
# half batch size (bs//2) as the training loader below
trn_samp = SortishSampler(
    trn_clas,
    key=lambda idx: len(trn_clas[idx]),
    bs=bs//2,
)
#val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))

# Training loader: half batch size, pad index 1, driven by trn_samp
trn_dl = DataLoader(
    trn_ds,
    bs//2,
    sampler=trn_samp,
    pad_idx=1,
    transpose=True,
    num_workers=1,
)
#val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)
#test_dl = DataLoader(test_ds, bs, transpose=True, num_workers=1, pad_idx=1)

# Only the training loader is registered; the validation slot is None
md = ModelData('', trn_dl, None)#trn_dl, val_dl)#, test_dl)

In [6]:
# Dropout rates, in the order the model-definition cell indexes them
dps = np.array([
    0.4,    # dps[0] -> dropouti
    0.5,    # dps[1] -> wdrop
    0.05,   # dps[2] -> dropoute
    0.3,    # dps[3] -> dropouth
    0.1,    # dps[4] -> first entry of the classifier head's drops
])

In [7]:
def get_rnn_classifier(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,
                      dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, qrnn=False):
    """Build an RNN text classifier: a MultiBatchRNN encoder followed by a pooling linear head.

    Parameters
    ----------
    bptt, max_seq, n_tok, emb_sz, n_hid, n_layers :
        Passed positionally, in that order, to ``MultiBatchRNN``.
    n_class :
        Accepted for signature compatibility; this function does not use it.
        NOTE(review): the output width presumably comes from the last entry of
        ``layers`` - confirm against ``PoolingLinearClassifier``.
    pad_token, bidir, dropouth, dropouti, dropoute, wdrop :
        Passed as keyword arguments to ``MultiBatchRNN``.
    layers, drops :
        Passed to ``PoolingLinearClassifier(layers, drops)``.
    qrnn :
        When truthy it is forwarded to ``MultiBatchRNN`` as ``qrnn=...``.
        It used to be accepted and silently dropped, so asking for a QRNN
        encoder had no effect.

    Returns
    -------
    ``SequentialRNN(encoder, PoolingLinearClassifier(layers, drops))``
    """
    enc_kwargs = dict(pad_token=pad_token, bidir=bidir,
                      dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    if qrnn:
        # Forward the flag only when requested: the default call stays exactly
        # as before, and an encoder without QRNN support fails loudly instead
        # of quietly building a non-QRNN model.
        enc_kwargs['qrnn'] = qrnn
    rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, **enc_kwargs)
    return SequentialRNN(rnn_enc, PoolingLinearClassifier(layers, drops))

In [8]:
# Defining the RNN model: the encoder is sized from the hyper-parameters above,
# and the classifier head goes em_sz*3 -> 50 -> 52 with drops [dps[4], 0.1]
m = get_rnn_classifier(
    bptt=bptt,
    max_seq=20*70,
    n_class=52,
    n_tok=vs,
    emb_sz=em_sz,
    n_hid=nh,
    n_layers=nl,
    pad_token=1,
    layers=[em_sz*3, 50, 52],
    drops=[dps[4], 0.1],
    dropouth=dps[3],
    dropouti=dps[0],
    dropoute=dps[2],
    wdrop=dps[1],
)

In [9]:
# Optimising function handed to the learner: Adam with betas=(0.7, 0.99).
# This replaces the opt_fn assigned in the hyper-parameter cell.
opt_fn = partial(
    optim.Adam,
    betas=(0.7, 0.99),
)

In [22]:
# Wrap the model (moved with to_gpu) and the model data into a learner
learn = RNN_Learner(
    md,
    TextModel(to_gpu(m)),
    opt_fn=opt_fn,
)
# Regularisation function: seq2seq_reg with alpha=2, beta=1
learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# NOTE(review): presumably the gradient-clipping threshold - confirm
learn.clip = 25.0
# Metric reported during training
learn.metrics = [accuracy]

In [34]:
# Preview of the custom weight array that is given to the loss function
torch.from_numpy(
    np.array(pd.read_feather('loss_wgt.feather'))
)


   0.5291
   1.2135
   0.7796
  12.2349
   0.5780
   2.1629
   8.2946
   0.7379
   0.8337
   2.2457
   3.9084
   0.8359
   7.5712
   0.9132
  69.7656
   6.3712
 252.5851
   6.8233
   1.4222
   0.9846
   5.5943
   0.2328
   0.5485
   1.4861
   0.8234
   0.7232
  28.6206
   0.8360
   1.7174
   2.5297
   8.5309
   0.7402
   1.1919
   0.6126
   1.3018
   0.8539
   0.1695
   1.4689
   1.2719
  51.8925
   0.8109
   2.2017
   1.0758
   1.5498
   0.2823
   1.1277
   3.3827
   4.3563
   1.3638
   0.2594
  79.2611
  31.2671
[torch.DoubleTensor of size 52x1]

In [35]:
# Class-weighted cross-entropy.
# Read straight from the feather file, the weights come out as a 52x1
# DoubleTensor on the CPU (see the tensor displayed by the preview cell).
# CrossEntropyLoss expects a 1-D weight with one entry per class, in the dtype
# and on the device of the model output, so flatten to 1-D, cast to float32
# and move it with to_gpu, as is done for the model itself.
class_wgt = np.array(pd.read_feather('loss_wgt.feather'), dtype=np.float32).reshape(-1)
learn.crit = torch.nn.CrossEntropyLoss(weight=to_gpu(torch.from_numpy(class_wgt)))

In [36]:
# Defining learning rates: a base rate lr, and four more obtained by dividing
# it by successive powers of lrm (smallest rate first, lr itself last)
lr = 3e-3
lrm = 2.6
lrs = lr / np.array([lrm**4, lrm**3, lrm**2, lrm, 1.0])

In [37]:
# Weight decay, passed to learn.fit through its wds argument
wd = 1e-7

In [38]:
# Restore the saved encoder 'lm1_enc_EN10_CN' ...
learn.load_encoder(
    'lm1_enc_EN10_CN'
)
# ... then load the saved classifier state 'clas_2_trial3'
learn.load(
    'clas_2_trial3'
)

In [None]:
# First continuation run from the loaded 'clas_2_trial3' state
learn.fit(
    lrs,
    1,
    wds=wd,
    cycle_len=1,
    use_clr=(32, 10),
)

In [None]:
# Checkpoint after the first continuation run
learn.save(
    'clas_2_trial3_cont_1'
)

In [None]:
# Hand unused cached GPU memory back before the next step
torch.cuda.empty_cache()

In [None]:
# Resume from the checkpoint written after the first continuation run
learn.load(
    'clas_2_trial3_cont_1'
)

In [None]:
# Second continuation run, with the same settings as the first
learn.fit(
    lrs,
    1,
    wds=wd,
    cycle_len=1,
    use_clr=(32, 10),
)

In [None]:
# Checkpoint after the second continuation run (loaded again for inference)
learn.save(
    'clas_2_trial3_cont_2'
)

In [None]:
# Hand unused cached GPU memory back before building the inference learner
torch.cuda.empty_cache()

In [None]:
# Inference inputs: the full set of token-index sequences and their labels.
# (Fixed: the reader was called through the undefined name `d`, which raised
# NameError; pandas is imported as `pd`.)
all_clas = np.array(pd.read_feather('t2s_df_Aug14.feather'))
# NOTE(review): unlike trn_labels, these labels are not mapped through ltoi
# before being handed to TextDataset - confirm that is intended.
all_labels = np.squeeze(np.array(pd.read_feather('lbl_all_trial3.feather')))

# Plain (unsampled) loader over everything, full batch size, pad index 1
all_ds = TextDataset(all_clas, all_labels)
all_dl = DataLoader(all_ds, bs, transpose=True, num_workers=1, pad_idx=1)

# Only the test slot is populated; training and validation loaders are None
md = ModelData('', None, None, all_dl)

# Fresh learner around the same model object, restored from the last checkpoint
learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
learn.load("clas_2_trial3_cont_2")

In [None]:
# One score vector (52 elements) per observation, computed on the test loader
all_scores = learn.predict(is_test=True)
# Probabilities (all_prob) follow by applying the softmax function to the scores;
# predicted classes follow from np.argmax()
# Persist the raw scores as a dataframe for further diagnosis, with one column
# per label taken from itol (cast to str)
score_cols = itol.values.astype('str')
scores_df = pd.DataFrame(all_scores, columns=score_cols)
scores_df.to_feather('Results/all_scores_trial3_cont.feather')