In [1]:
import random
random.seed(42)
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
from torch.nn import functional as F
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.autograd import Variable as V
import torchtext
from torchtext import data
from pytorch_lightning_lm.data_module import QuotesDataModule
from pytorch_lightning_lm.metrics import Perplexity
from pytorch_lightning.loggers import WandbLogger
from argparse import ArgumentParser



## LSTM with Attention

In [3]:
from pytorch_lightning_lm.model import RNNAttentionModel

In [11]:
parser = ArgumentParser()
parser.add_argument("-f", "--fff", help="a dummy argument to fool ipython", default="1")

# add PROGRAM level args
parser.add_argument('--project-name', type=str, default='rnn_lm_attention')
parser.add_argument('--experiment-tag', type=str, default='Baseline_NoAtt')
parser.add_argument('--use-cuda', type=bool, default=True)
parser.add_argument('--use-wandb', type=bool, default=True)
parser.add_argument('--log-gradients', type=bool, default=True)
parser.add_argument('--unk-cutoff', type=int, default=1)

# add model specific args
# parser = LitModel.add_model_specific_args(parser)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--bptt', type=int, default=16)
parser.add_argument('--rnn-type', type=str, default="LSTM")
parser.add_argument('--attention', type=str, default="scaled_dot")
parser.add_argument('--nhid', type=int, default=256)
parser.add_argument('--query-dim', type=int, default=256)
parser.add_argument('--nlayers', type=int, default=2)
parser.add_argument('--dropout', type=float, default=0.5)
parser.add_argument('--pretrained-vector', type=str, default="fasttext.simple.300d")

# add all the available trainer options to argparse
parser.add_argument('--max_epochs', type=int, default=25)
parser.add_argument('--fast_dev_run', type=bool, default=False)
# ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli
# parser = Trainer.add_argparse_args(parser)
args = parser.parse_args()

In [12]:
# Train on the GPU only when CUDA is actually available AND the user asked
# for it. Logical `and` (rather than bitwise `&`) states that intent and
# short-circuits.
use_gpu = torch.cuda.is_available() and args.use_cuda
device = torch.device('cuda') if use_gpu else torch.device('cpu')

# Run name built from the hyperparameters that distinguish one experiment
# from another; reused below for the W&B run and the saved artefacts.
experiment_name = f"{args.experiment_tag}_{args.rnn_type}_{args.batch_size}_{args.bptt}_{args.nhid}_{args.nlayers}_{args.attention}"
print(experiment_name)

Baseline_NoAtt_LSTM_128_16_256_2_scaled_dot


In [8]:
# experiment_name=experiment_name+"_tied_weights"

In [9]:
# Data module for the quotes corpus, configured from the parsed CLI args.
dm = QuotesDataModule(
    # Batching / vocabulary settings taken from `args`.
    batch_size=args.batch_size,
    bptt=args.bptt,
    unk_limit=args.unk_cutoff,
    pretrained_vectors=args.pretrained_vector,
    # No tokenizer is passed explicitly.
    tokenizer=None,
    # Train / validation / test split files.
    train_file="data/quotesdb/funny_quotes.train.txt",
    valid_file="data/quotesdb/funny_quotes.val.txt",
    test_file="data/quotesdb/funny_quotes.test.txt",
)



In [13]:
# --- Vocabulary and pretrained embeddings ---------------------------------
vocab = dm.vocab
weight_matrix = vocab.vectors
# The embedding matrix shape gives both the vocabulary size and the input
# (embedding) dimension passed to the model.
ntoken, ninp = weight_matrix.shape

pad_idx = vocab.stoi["<pad>"]

# --- Model ----------------------------------------------------------------
ppl = Perplexity(pad_idx)
model = RNNAttentionModel(
    rnn_type=args.rnn_type,
    ntoken=ntoken,
    ninp=ninp,
    nhid=args.nhid,
    query_dim=args.query_dim,
    attention=args.attention,
    nlayers=args.nlayers,
    dropout=args.dropout,
    batch_size=args.batch_size,
    device_type=device.type,
    lr=1e-2,  # fixed learning rate; not exposed as a CLI flag
    pretrained_vectors=weight_matrix,
    metric=ppl,
    tie_weights=False,
)

# --- Logger ---------------------------------------------------------------
# Decide on the logger before building anything: a fast dev run uses no
# logger at all, so no W&B logger is created only to be thrown away.
if args.fast_dev_run:
    logger = None
elif args.use_wandb:
    wandb_logger = WandbLogger(name=experiment_name, project=args.project_name)
    if args.log_gradients:
        wandb_logger.watch(model, log='gradients', log_freq=100)
    logger = wandb_logger
    # Record settings that are not passed to the model constructor.
    logger.log_hyperparams({"bptt": args.bptt,
                            "pretrained_vector": args.pretrained_vector,
                            "unk_limit": args.unk_cutoff})
else:
    # `True` is handed to the Trainer as-is, leaving the choice of logger
    # to the library; from here on `logger` is a bool, not a logger object.
    logger = True

# --- Trainer --------------------------------------------------------------
# Early stopping in 'min' mode with min_delta=0.01 and patience=5. The
# monitored quantity is left at the library default.
early_stop_callback = pl.callbacks.EarlyStopping(
    min_delta=0.01,
    patience=5,
    verbose=False,
    mode='min'
)

trainer = pl.Trainer(gpus=1 if device.type == 'cuda' else 0,
                     max_epochs=args.max_epochs,
                     logger=logger,
                     fast_dev_run=args.fast_dev_run,
                     early_stop_callback=early_stop_callback)

trainer.fit(model, datamodule=dm)

# --- Persist and evaluate (skipped for fast dev runs) ---------------------
if not args.fast_dev_run:
    trainer.save_checkpoint(f"models/{experiment_name}.ckpt")
    torch.save(dm.vocab, f"models/{experiment_name}_vocab.sav")
    # NOTE(review): presumably keeps the LR finder from running again during
    # testing; confirm this attribute is still read by the Trainer.
    trainer.auto_lr_find = False
    test_eval = trainer.test(model, datamodule=dm)
    # Only the W&B logger is a real logger object here; without W&B `logger`
    # is the bool `True`, and calling `.log_metrics` on it would raise
    # AttributeError after training and testing had already finished.
    if args.use_wandb:
        logger.log_metrics({
            "test_ppl": test_eval[0]['test_ppl'],
            "test_loss": test_eval[0]['test_loss']
        })

wandb: Wandb version 0.9.6 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                           | Params
--------------------------------------------------------------
0 | criterion  | CrossEntropyLoss               | 0     
1 | metric     | Perplexity                     | 0     
2 | drop       | Dropout                        | 0     
3 | encoder    | Embedding                      | 13 M  
4 | rnn        | LSTM                           | 1 M   
5 | attn_layer | ScaledDotProductAttentionLayer | 196 K 
6 | decoder    | Linear                         | 11 M  
7 | softmax    | Softmax                        | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

RuntimeError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 4.00 GiB total capacity; 1023.23 MiB already allocated; 0 bytes free; 1.01 GiB reserved in total by PyTorch)

In [12]:
# Log the BPTT length on the active logger. `logger` can also be `True` or
# `None` (W&B disabled / fast dev run), neither of which has this method,
# so the call is skipped in those cases instead of raising AttributeError.
if hasattr(logger, "log_hyperparams"):
    logger.log_hyperparams({"bptt":args.bptt})

wandb: Wandb version 0.9.6 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
