In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from fastai import *
from fastai.lm_rnn import *
from fastai.text import *
import pickle
import json
import collections

  from numpy.core.umath_tests import inner1d


In [3]:
# Descriptions as token indices (stored before padding): one feather file per
# split, each converted to a NumPy array
trn_idx, val_idx = (
    np.array(pd.read_feather(feather_path))
    for feather_path in ('TRN_LM_TRIAL_N.feather', 'VAL_LM_TRIAL_N.feather')
)
# Vocabulary table
vocab = pd.read_feather('VOCAB_TRIAL_N.feather')

In [4]:
# index -> term: the 'term' column of the vocab table
itos = vocab['term']
# Vocab size
vs = len(itos)
# term -> index; any term that is not in the vocab resolves to index 0
stoi = collections.defaultdict(lambda: 0)
stoi.update((term, idx) for idx, term in enumerate(itos))

In [5]:
# Embedding size, no. of hidden activations per layer, no. of hidden layers
em_sz, nh, nl = 400, 1150, 3

# Pre-trained wikitext-103 language model; the map_location callback returns
# each storage unchanged instead of moving it to another device
wgts = torch.load('fwd_wt103.h5', map_location=lambda storage, loc: storage)
enc_wgts = to_np(wgts['0.encoder.weight'])
# Mean over all pre-trained embedding rows: the fallback for unmatched terms
row_m = enc_wgts.mean(0)

# index -> term list of wikitext-103
# NOTE(review): pickle.load can execute arbitrary code - only open a trusted file
with open('itos_wt103.pkl', 'rb') as f:
    itos2 = pickle.load(f)
# term -> wikitext-103 index; -1 marks a term the pre-trained vocab lacks
stoi2 = collections.defaultdict(lambda: -1)
stoi2.update((term, idx) for idx, term in enumerate(itos2))

# Embedding matrix re-indexed to our vocab: reuse the pre-trained row when the
# term exists in wikitext-103, otherwise use the mean row
new_w = np.zeros((vs, em_sz), dtype=np.float32)
for i, w in enumerate(itos):
    r = stoi2[w]
    if r >= 0:
        new_w[i] = enc_wgts[r]
    else:
        new_w[i] = row_m

# Store the remapped matrix under the encoder, dropout-embedding and decoder keys
wgts['0.encoder.weight'] = T(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))

In [6]:
# Weight decay, back-propagation-through-time (BPTT) setting, batch size
wd, bptt, bs = 1e-7, 70, 128
# Optimiser factory: Adam with betas pinned to (0.8, 0.99)
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [7]:
# Join the index-form descriptions of each split into one array and wrap it in
# a language-model loader for fine-tuning
trn_dl, val_dl = (
    LanguageModelLoader(np.concatenate(split_idx), bs, bptt)
    for split_idx in (trn_idx, val_idx)
)
# NOTE(review): the leading "" and 1 are presumably the path and the padding
# index - confirm against LanguageModelData's signature
md = LanguageModelData("", 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [8]:
# Base dropout rates, all scaled by 0.7
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.7
# Build the model, pairing each dropout keyword with its rate in order
learner = md.get_model(
    opt_fn, em_sz, nh, nl,
    **dict(zip(('dropouti', 'dropout', 'wdrop', 'dropoute', 'dropouth'), drops))
)
learner.metrics = [accuracy]
# Gradual unfreezing: start with freeze_to(-1)
# NOTE(review): presumably this leaves only the last layer group trainable - confirm
learner.freeze_to(-1)
# Load the pre-trained LM weights (with the remapped embeddings) into the model
learner.model.load_state_dict(wgts)

In [9]:
# Base learning rate; lrs (the value handed to the training calls) starts out equal to it
lrs = lr = 1e-3

In [10]:
# Training for 1 epoch at half the base learning rate (lrs/2), with weight decay wd
# NOTE(review): use_clr=(32,2) presumably configures fastai's cyclical LR schedule - confirm
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt after
# 2 of 3078 iterations, so the saved run never completed this epoch
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

  0%|          | 2/3078 [01:01<26:28:20, 30.98s/it, loss=10.5]

KeyboardInterrupt: 

In [None]:
# Saving progress under the name 'LM_LAST_FT'; the fine-tuning loop at the end
# of the notebook reloads this checkpoint before it starts training
learner.save('LM_LAST_FT')

In [None]:
# Clean memory: release the unused GPU memory cached by PyTorch's CUDA allocator
torch.cuda.empty_cache()

In [None]:
# Learning-rate finder: sweep from lrs/10 up to lrs*10, with linear=True
# (presumably linear rather than exponential spacing of the rates - confirm)
learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)

In [None]:
# Plot what the learner's scheduler recorded
# (presumably loss against learning rate from the preceding lr_find run - confirm)
learner.sched.plot()

In [None]:
# I guess accuracy ~ 35% for predicting the next word of the sequence is OK for a trial.
# To achieve maximum benefits, I am not sure to what extent we should tune the LM.

# Number of one-cycle fine-tuning rounds
n_epochs = 5

# One-time setup: resume from the saved checkpoint and unfreeze the model.
# This used to sit behind `if i == 0` inside the loop.
learner.load('LM_LAST_FT')
learner.unfreeze()

for epoch in range(n_epochs):
    # The model in memory already holds what the previous round saved, so the
    # checkpoint and encoder are not re-read from disk before each round.
    learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=1)
    # Checkpoint the full model after every round
    learner.save('LM1_TRIAL_N')
    # Saving the encoder for classifier training
    learner.save_encoder('LM1_ENC_TRIAL_N')
    # Cleaning cache memory of GPU
    torch.cuda.empty_cache()