In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import itertools as it
from operator import itemgetter

from spooky import *

In [3]:
# root directory for the competition data; models/ and tmp/ are created beneath it
PATH = 'data/spooky'

os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)

# batch size and bptt length passed to LanguageModelData.from_dataframes below
bs = 4
bptt = 5

# for NLP, configure Adam to use less momentum than the default of 0.9
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [4]:
# Load the raw train / test CSVs that live directly under PATH.
train_raw_df, test_df = (pd.read_csv(f'{PATH}/{split}.csv') for split in ('train', 'test'))

# Show the row count of each frame.
len(train_raw_df), len(test_df)

(19579, 8392)

## Get a baseline

Train once with default parameters to establish a baseline and find reasonable starting hyperparameters

In [5]:
# Standard CV
# To build a standard cross-validation dataset use this:
# hold out 10% of the training rows for validation.
val_idxs = get_cv_idxs(len(train_raw_df), val_pct=0.10)

# val_idxs is used positionally (iloc) for the validation rows, so translate it to
# index labels before dropping; otherwise the two frames only line up when
# train_raw_df happens to have a default 0..n-1 index.
val_df = train_raw_df.iloc[val_idxs]
train_df = train_raw_df.drop(train_raw_df.index[val_idxs])

len(train_df), len(val_df), len(test_df)

In [None]:
# tokenize = split each sentence into a list of words
# (shown re-joined with single spaces so the token boundaries are visible)
' '.join(spacy_tok(train_df.text.iloc[0]))

In [12]:
# create a torchtext field = describes how to preprocess a piece of text
# (here: lower-case it and tokenize with spacy_tok)
txt_fld = data.Field(lower=True, tokenize=spacy_tok)

In [None]:
dataframes = dict(train_df=train_df, val_df=val_df, test_df=test_df)

# min_freq = 10 says, "treat any word that appears less than 10 times as the word <unk>"
# It is assigned here because the baseline otherwise depends on a value that is only
# set later, inside the grid-search loop (NameError on a fresh kernel).
min_freq = 10

md = LanguageModelData.from_dataframes(PATH, txt_fld, 'text', **dataframes,
                                       bs=bs, bptt=bptt, min_freq=min_freq)

In [None]:
# After building the ModelData object the field's vocab is set. It is needed again
# later (the classifier section reloads this file as TEXT), so save it.
# The context manager closes the file handle as soon as the dump finishes.
with open(f'{PATH}/models/TEXT.pkl', 'wb') as fh:
    pickle.dump(txt_fld, fh)

In [None]:
# displayed as a tuple, in this order:
# batches, # of unique tokens in vocab, # of items in ds, # of words in ds
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

In [None]:
# int to string mapping (first 12 entries)
# The vocab lives on txt_fld, the field built above; no variable named TEXT
# exists at this point in the notebook.
txt_fld.vocab.itos[:12]

In [None]:
# string to int mapping
# Looked up on txt_fld, the field built above; TEXT is not defined yet at this point.
txt_fld.vocab.stoi['the']

In [None]:
# in a LanguageModelData object there is only one item in each dataset: all the words joined together
# (show the first 12 tokens of that single item)
md.trn_ds[0].text[:12]

In [None]:
# torchtext will handle turning these words into integer Ids
# Uses txt_fld, the field built above; TEXT is not defined yet at this point.
txt_fld.numericalize([md.trn_ds[0].text[:12]])

In [None]:
# Pull one mini-batch from the training loader and print the size of its first two parts.
batch = next(iter(md.trn_dl))
print(batch[0].size())
print(batch[1].size())

batch

In [None]:
# Model sizes for the baseline language model.
# The name is emb_sz (not em_sz) because that is what the get_model call below reads.
emb_sz = 200       # size of each embedding vector
n_hidden = 500     # number of hidden activations per layer
n_layers = 3       # number of layers

In [None]:
# build the language-model learner from the ModelData object with the chosen
# optimizer, model sizes and dropout settings
learner = md.get_model(opt_fn, emb_sz, n_hidden, n_layers,
                      dropouti=0.1, dropout=0.1, wdrop=0.2, dropoute=0.04, dropouth=0.1)

# attach a regularization function and a clip value to the learner
# NOTE(review): clip is presumably a gradient-clipping threshold — confirm against fastai
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip = 0.3

In [None]:
# run the learner's learning-rate finder and keep whatever it returns in lrf
lrf = learner.lr_find() 

In [None]:
# plot what the learner's scheduler recorded during lr_find
learner.sched.plot()

In [None]:
# Learning rate and weight decay used by the baseline fit calls below.
# wds was previously assigned only in the grid-search section, so those fit calls
# raised NameError on a fresh kernel; 1e-6 matches the value used there.
lr = 1e-3
wds = 1e-6

In [None]:
# first training round: 4 cycles, cycle_len=1, cycle_mult=2
# NOTE(review): presumably each cycle is twice as long as the previous one — confirm against fastai
learner.fit(lr, 4, wds=wds, cycle_len=1, cycle_mult=2)

In [None]:
# checkpoint the encoder after the first training round
learner.save_encoder('spooky_adam_enc1')
# to restore it later: learner.load_encoder('spooky_adam_enc1')

In [None]:
# second training round: 2 cycles of length 5
# NOTE(review): cycle_save_name presumably makes fastai save weights per cycle under this name — confirm
learner.fit(lr, 2, wds=wds, cycle_len=5, cycle_save_name='spooky_adam_enc2_c1_cl5')

In [None]:
# checkpoint the encoder after the second round (this is the name the classifier section loads)
learner.save_encoder('spooky_adam_enc2')

In [None]:
# third training round: 1 cycle of length 10
learner.fit(lr, 1, wds=wds, cycle_len=10, cycle_save_name='spooky_adam_enc3_c1_cl10')

In [None]:
# checkpoint the encoder after the third round
learner.save_encoder('spooky_adam_enc3')

In [None]:
# perplexity (the usual way language-model quality is reported) = exp() of the loss
# NOTE(review): 4.33935 is hard-coded, presumably copied from a training run above — update after re-training
np.exp(4.33935)

## GridSearchCV

Do a grid search to figure out params

In [6]:
# GridSearchCV ...

# 1. Define hyper parameters
# size of each embedding vector, # of hidden activations per layer, # of layers, min word freq
params = { 'emb_sz': [50, 200, 400], 'n_hidden': [512, 1024], 'n_layers': [3, 4], 'min_freq': [10] }

# 2. Define folds (stratified on the author column)
from sklearn.model_selection import StratifiedKFold
# A fixed random_state makes the shuffled folds identical on every run, so scores
# from different sessions are comparable.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Materialize the folds once so every hyper-parameter combination is scored on the
# same (train_idxs, val_idxs) splits.
kfolds = list(skf.split(train_raw_df.id, train_raw_df.author))

# 3. Get every combination (Cartesian product) of the hyper-parameter values
param_names = sorted(params)
param_combos = [dict(zip(param_names, combo)) for combo in it.product(*(params[k] for k in param_names))]

In [None]:
# Learning rate and weight decay shared by every grid-search run.
lr = 3e-3
wds = 1e-6

for hps in param_combos:
    # Hyper-parameters of this combination.
    emb_sz = hps['emb_sz']
    n_hidden = hps['n_hidden']
    n_layers = hps['n_layers']
    min_freq = hps['min_freq']

    fold_metrics = []

    for trn_pos, val_pos in kfolds:
        # Positional row selection for this fold's train / validation frames.
        train_df = train_raw_df.iloc[trn_pos]
        val_df = train_raw_df.iloc[val_pos]

        # A fresh torchtext field per fold (lower-casing, spacy tokenizer).
        txt_fld = data.Field(lower=True, tokenize=spacy_tok)

        dataframes = dict(train_df=train_df, val_df=val_df, test_df=test_df)

        # min_freq: words that appear fewer than this many times are treated as <unk>.
        md = LanguageModelData.from_dataframes(
            PATH, txt_fld, 'text', **dataframes,
            bs=bs, bptt=bptt, min_freq=min_freq)

        learner = md.get_model(
            opt_fn, emb_sz, n_hidden, n_layers,
            dropouti=0.1, dropout=0.1, wdrop=0.2, dropoute=0.04, dropouth=0.1)
        learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        learner.clip = 0.3

        # Same three training rounds as the baseline: (number of cycles, extra fit kwargs).
        for n_cycle, cycle_kwargs in (
            (4, dict(cycle_len=1, cycle_mult=2)),
            (2, dict(cycle_len=5)),
            (1, dict(cycle_len=10)),
        ):
            learner.fit(lr, n_cycle, wds=wds, **cycle_kwargs)

        # Score the fold.
        # NOTE(review): confirm accuracy() accepts the types predict_with_targs() returns.
        fold_metrics.append(accuracy(*learner.predict_with_targs()))

    # Record the mean fold score on the combination's own dict.
    hps['metrics'] = np.mean(fold_metrics)

## Train final LanguageModel

Use the best hyperparameters against the full training dataset to train a final model

In [None]:
# train on entire training dataset with the best hyper-parameters found above
train_df = train_raw_df.copy()
val_df = train_raw_df.copy()

lr = 3e-3
wds = 1e-6

# Pick the winner from param_combos (the list of dicts the grid search annotated with
# a 'metrics' score), not from `params`, whose items are plain string keys and cannot
# be indexed with 'metrics'. The score is an accuracy, so the highest value wins.
best_params = max(param_combos, key=lambda combo: combo['metrics'])
emb_sz, n_hidden, n_layers, min_freq = itemgetter('emb_sz', 'n_hidden', 'n_layers', 'min_freq')(best_params)

## Test

In [None]:
# create a short bit of text to "prime" the predictions, then use torchtext to numericalize it
# so we can feed it into our language model
m = learner.model
ss = """. It was a dark and scary night. The old"""
s = [spacy_tok(ss)]
# Numericalize with txt_fld, the field passed to LanguageModelData when this learner
# was built; TEXT is not assigned until the classifier section further down.
t = txt_fld.numericalize(s)
' '.join(s[0])

In [None]:
# run the priming text through the language model as a single sequence
m[0].bs = 1      # set batch size = 1
m.eval()         # turn-off dropout
m.reset()        # reset hidden state
res, *_ = m(t)   # get predictions from model
m[0].bs = bs     # put batch size back to what it was

In [None]:
# top 10 predictions for next word
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]

In [None]:
# try to generate more text
print(ss, "\n")

for i in range(50):
    # take the two most likely next tokens ...
    n = res[-1].topk(2)[1]
    # ... and fall back to the runner-up when the best one has id 0
    # NOTE(review): id 0 is presumably the vocab's <unk> token — confirm
    n = n[1] if n.data[0] == 0 else n[0]
    # Map the id back to a word through txt_fld, the field the language model was built
    # with (TEXT is not assigned until the classifier section).
    print(txt_fld.vocab.itos[n.data[0]], end=' ')
    # feed the chosen token back in to get the next prediction
    res, *_ = m(n[0].unsqueeze(0))
    
print('...')

## Predict the author

In [None]:
# batch size and bptt for the classifier (replaces the values set at the top of the notebook)
bs = 64
bptt = 70

# NOTE(review): presumably these sizes must match the encoder loaded later via load_encoder — confirm
emb_sz = 400       # size of each embedding vector
nh = 1024           # number of hidden activations per layer
nl = 3             # number of layers

# for NLP, configure Adam to use less momentum than the default of 0.9
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [None]:
# use the same vocab built from the language model so as to ensure words map to same Ids
# NOTE: unpickling executes arbitrary code; only load a TEXT.pkl that this notebook wrote.
# The context manager closes the file handle as soon as loading finishes.
with open(f'{PATH}/models/TEXT.pkl', 'rb') as fh:
    TEXT = pickle.load(fh)

In [None]:
# torchtext field for the author label, created with sequential=False
AUTHOR_LABEL = data.Field(sequential=False)
# build the dataset splits from the three dataframes using the text and label fields
# NOTE(review): SpookyDataset comes from the local spooky module; the returned order is presumably (train, val, test) — confirm
splits = SpookyDataset.splits(TEXT, AUTHOR_LABEL, train_df, val_df, test_df)

In [None]:
# grab the first example of the first split for inspection
t = splits[0].examples[0]

In [None]:
# its label and its first 10 tokens joined with spaces
t.label, ' '.join(t.text[:10])

In [None]:
# fastai can create a ModelData object directly from torchtext splits
# (bs is the batch size set in the classifier config above)
md2 = TextData.from_splits(PATH, splits, bs)

In [None]:
# Build the classifier learner from the TextData object.
m3 = md2.get_model(
    opt_fn, 1500, bptt,
    emb_sz=emb_sz, n_hid=nh, n_layers=nl,
    dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3,
)

m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# Start from the encoder weights saved by the language-model section.
m3.load_encoder('spooky_adam_enc2')

In [None]:
# clip value for the classifier learner and an array of three learning rates
# NOTE(review): the three rates presumably apply to successive layer groups — confirm against fastai
m3.clip = 25.
lrs = np.array([1e-4, 1e-3, 1e-2])

In [None]:
# train 2 epochs/cycles at half the learning rates with most of the model frozen, reporting accuracy
m3.freeze_to(-1) # freeze everything except last layer
m3.fit(lrs/2, 2, metrics=[accuracy])

In [None]:
# unfreeze the model and train 2 cycles of length 1 at the full learning rates
m3.unfreeze()
m3.fit(lrs, 2, metrics=[accuracy], cycle_len=1)

In [None]:
# 4 cycles at half the learning rates (cycle_len=1, cycle_mult=2), saved under 'spooky_sent1_c4_cl1x2'
m3.fit(lrs/2, 4, metrics=[accuracy], cycle_len=1, cycle_mult=2, cycle_save_name='spooky_sent1_c4_cl1x2')

In [None]:
# 3 cycles of length 3 at a quarter of the learning rates, saved under 'spooky_sent2_c3_cl3'
m3.fit(lrs/4, 3, metrics=[accuracy], cycle_len=3, cycle_save_name='spooky_sent2_c3_cl3')

In [None]:
# reload weights saved during the 'spooky_sent1_c4_cl1x2' run (second argument: cycle 1)
m3.load_cycle('spooky_sent1_c4_cl1x2', 1) # NOTE: using model with lower val loss is better
# m3

In [None]:
# int-to-string list of the label vocabulary built for AUTHOR_LABEL
classes = AUTHOR_LABEL.vocab.itos
classes

In [None]:
# Score every test sentence individually with the classifier model.
preds = []

m = m3.model
m[0].bs = 1   # one sentence per forward pass
m.eval()      # turn off dropout once; this does not need repeating per sentence

# Only the text column is needed, so iterate it directly instead of building a
# Series per row with iterrows().
for ss in test_df['text']:
    s = [spacy_tok(ss)]
    t = TEXT.numericalize(s)

    m.reset()           # reset hidden state before each sentence
    res, *_ = m(t)
    # Keep every output column except the first.
    # NOTE(review): column 0 presumably corresponds to the label vocab's <unk> entry — confirm
    preds.append(to_np(res).squeeze()[1:])

preds = np.array(preds)
preds.shape

In [None]:
# turn the raw scores into probabilities with softmax
# NOTE(review): no dim is passed to F.softmax; presumably it normalizes across the class axis of this 2-D input — confirm for the installed torch version
probs = to_np(F.softmax(torch.from_numpy(preds)))

In [None]:
# sanity check: dimensions of the probability array
probs.shape

In [None]:
def do_clip(arr, mx):
    """Clip row-wise class probabilities away from 0 and 1, then renormalize.

    Each value is limited to the range [(1 - mx) / (n_classes - 1), mx], where
    n_classes is the number of columns. With that floor, a fully confident row
    clipped to mx still sums to 1 before renormalization. The previous
    hard-coded divisor of 1 was only right for two classes.

    Args:
        arr: 2-D array-like of shape (n_rows, n_classes) holding probabilities.
        mx: upper bound applied to every probability (for example 0.98).

    Returns:
        A numpy array of the same shape whose rows sum to 1.
    """
    arr = np.asarray(arr)
    # number of "other" classes sharing the remaining 1 - mx mass; at least 1 so a
    # single-column input does not divide by zero
    n_other = max(arr.shape[1] - 1, 1)
    clipped = np.clip(arr, (1 - mx) / n_other, mx)
    return clipped / clipped.sum(axis=1, keepdims=True)

In [None]:
# probs = do_clip(probs, 0.98)

In [None]:
# Assemble the submission frame: everything from test_df except the raw text,
# plus one probability column per author, taken from probs in column order.
preds_test_df = test_df.drop('text', axis=1)
for col_pos, author in enumerate(('EAP', 'MWS', 'HPL')):
    preds_test_df[author] = probs[:, col_pos]

preds_test_df.head()

In [None]:
# Write the submission CSV without the row index.
preds_test_df.to_csv(f'{PATH}/subm_wg_20171127_4.csv', index=False)

In [None]:
# Read back the submission file this notebook writes (subm_wg_20171127_4.csv) as a
# sanity check. The previous path pointed at an older submission that a fresh run
# does not produce.
preds_test_df = pd.read_csv(f'{PATH}/subm_wg_20171127_4.csv', index_col=None)

In [None]:
# first rows of the frame read back from disk
preds_test_df.head()

In [None]:
from IPython.display import FileLink

In [None]:
# Download link for the submission file this notebook writes (subm_wg_20171127_4.csv);
# the previous path pointed at an older submission.
FileLink(f'{PATH}/subm_wg_20171127_4.csv')