In [None]:
import random
random.seed(42)
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
from torch.nn import functional as F
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.autograd import Variable as V
import torchtext
from torchtext import data
from pytorch_lightning_lm.data_module import QuotesDataModule
from pytorch_lightning_lm.metrics import Perplexity
from pytorch_lightning.loggers import WandbLogger
from argparse import ArgumentParser

## LSTM

In [None]:
from pytorch_lightning_lm.model import RNNModel

In [None]:
parser = ArgumentParser()
parser.add_argument("-f", "--fff", help="a dummy argument to fool ipython", default="1")

# add PROGRAM level args
parser.add_argument('--project-name', type=str, default='neural_lms')
parser.add_argument('--experiment-tag', type=str, default='RNN_LM')
parser.add_argument('--use-cuda', type=bool, default=True)
parser.add_argument('--use-wandb', type=bool, default=True)
parser.add_argument('--log-gradients', type=bool, default=False)
parser.add_argument('--unk-cutoff', type=int, default=2)

# add model specific args
# parser = LitModel.add_model_specific_args(parser)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--bptt', type=int, default=16)
parser.add_argument('--rnn-type', type=str, default="LSTM")
parser.add_argument('--nhid', type=int, default=64)
parser.add_argument('--nlayers', type=int, default=2)
parser.add_argument('--pretrained-vector', type=str, default="fasttext.simple.300d")

# add all the available trainer options to argparse
parser.add_argument('--max_epochs', type=int, default=25)
parser.add_argument('--fast_dev_run', type=bool, default=True)
# ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli
# parser = Trainer.add_argparse_args(parser)
args = parser.parse_args()

In [None]:
# Use the GPU only when one is present and --use-cuda has not been turned off.
if torch.cuda.is_available() and args.use_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Run name encodes the hyper-parameters that distinguish this experiment.
experiment_name = (
    f"{args.experiment_tag}_{args.rnn_type}_{args.batch_size}_{args.bptt}_{args.nhid}_{args.nlayers}"
)
print(experiment_name)

In [None]:
# Mark this run as a test run. The guard keeps the cell idempotent: without
# it, every re-execution would append another "_test" to the name.
if not experiment_name.endswith("_test"):
    experiment_name = experiment_name + "_test"

In [None]:
# Data module over the funny-quotes train/validation/test splits; batch size,
# BPTT length and the pretrained embedding name come from the parsed args.
dm = QuotesDataModule(
    pretrained_vectors=args.pretrained_vector,
    bptt=args.bptt,
    batch_size=args.batch_size,
    tokenizer=None,
    train_file="data/quotesdb/funny_quotes.train.txt",
    valid_file="data/quotesdb/funny_quotes.val.txt",
    test_file="data/quotesdb/funny_quotes.test.txt",
)

In [8]:
# Build, train and (outside fast_dev_run) evaluate the plain RNN language model.
vocab = dm.vocab
weight_matrix = vocab.vectors
# The pretrained embedding matrix fixes both the vocabulary size and the
# embedding width handed to the model.
ntoken, ninp = weight_matrix.shape

pad_idx = vocab.stoi["<pad>"]

ppl = Perplexity(pad_idx)
model = RNNModel(
    rnn_type=args.rnn_type,
    ntoken=ntoken,
    ninp=ninp,
    nhid=args.nhid,
    nlayers=args.nlayers,
    batch_size=args.batch_size,
    device_type=device.type,
    pretrained_vectors=weight_matrix,
    metric=ppl,
)

if args.use_wandb:
    wandb_logger = WandbLogger(name=experiment_name, project=args.project_name)
    if args.log_gradients:
        wandb_logger.watch(model, log='gradients', log_freq=100)
    logger = wandb_logger
else:
    # True asks the Trainer to use its default logger.
    logger = True

if args.fast_dev_run:
    # Smoke-test runs are not logged anywhere.
    logger = None

# Defined in this cell so that it runs on a fresh kernel; previously the name
# was only created by the later attention/transformer cells. Same settings as
# those cells.
early_stop_callback = pl.callbacks.EarlyStopping(
    min_delta=0.01,
    patience=5,
    verbose=False,
    mode='min',
)

trainer = pl.Trainer(
    gpus=1 if device.type == 'cuda' else 0,
    max_epochs=args.max_epochs,
    logger=logger,
    # The learning-rate finder is skipped for fast_dev_run smoke tests.
    auto_lr_find=not args.fast_dev_run,
    fast_dev_run=args.fast_dev_run,
    early_stop_callback=early_stop_callback,
)

trainer.fit(model, datamodule=dm)

if not args.fast_dev_run:
    trainer.save_checkpoint(f"models/{experiment_name}.ckpt")
    # f-string prefix added: the vocabulary used to be written to a file
    # literally named "{experiment_name}.sav".
    torch.save(dm.vocab, f"models/{experiment_name}.sav")
    # NOTE(review): presumably disables the LR finder so trainer.test does not
    # trigger it again — confirm against the installed Lightning version.
    trainer.auto_lr_find = False
    test_eval = trainer.test(model, datamodule=dm)
    if args.use_wandb:
        # Only the W&B logger object has log_metrics; with --use-wandb False
        # `logger` is the literal True and the call would raise AttributeError.
        logger.log_metrics({
            "test_ppl": test_eval[0]['test_ppl'],
            "test_loss": test_eval[0]['test_loss'],
        })

Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


wandb: Network error resolved after 0:00:34.660896, resuming normal operation.

  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | metric    | Perplexity       | 0     
2 | drop      | Dropout          | 0     
3 | encoder   | Embedding        | 13 M  
4 | rnn       | LSTM             | 126 K 
5 | decoder   | Linear           | 2 M   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': tensor(10.7377, device='cuda:0'),
 'test_ppl': tensor(46060.9336, device='cuda:0'),
 'val_checkpoint_on': None,
 'val_early_stop_on': None}
--------------------------------------------------------------------------------



## LSTM with Attention

In [10]:
from pytorch_lightning_lm.model import RNNAttentionModel

In [12]:
parser = ArgumentParser()
parser.add_argument("-f", "--fff", help="a dummy argument to fool ipython", default="1")

# add PROGRAM level args
parser.add_argument('--project-name', type=str, default='neural_lms')
parser.add_argument('--experiment-tag', type=str, default='RNN_LM_w_Att')
parser.add_argument('--use-cuda', type=bool, default=True)
parser.add_argument('--use-wandb', type=bool, default=True)
parser.add_argument('--log-gradients', type=bool, default=False)
parser.add_argument('--unk-cutoff', type=int, default=2)

# add model specific args
# parser = LitModel.add_model_specific_args(parser)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--bptt', type=int, default=16)
parser.add_argument('--rnn-type', type=str, default="LSTM")
parser.add_argument('--nhid', type=int, default=64)
parser.add_argument('--nlayers', type=int, default=2)
parser.add_argument('--att-width', type=int, default=6)
parser.add_argument('--pretrained-vector', type=str, default="fasttext.simple.300d")

# add all the available trainer options to argparse
parser.add_argument('--max_epochs', type=int, default=25)
parser.add_argument('--fast_dev_run', type=bool, default=False)
# ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli
# parser = Trainer.add_argparse_args(parser)
args = parser.parse_args()

In [14]:
# Fall back to the CPU unless CUDA is both available and requested.
if torch.cuda.is_available() and args.use_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# The attention width is part of the run name for this experiment.
experiment_name = (
    f"{args.experiment_tag}_{args.rnn_type}_{args.batch_size}_{args.bptt}_{args.nhid}_{args.nlayers}_{args.att_width}"
)
print(experiment_name)

RNN_LM_w_Att_LSTM_32_16_64_2_6


In [15]:
# Rebuild the quotes data module with the attention experiment's arguments.
dm = QuotesDataModule(
    batch_size=args.batch_size,
    bptt=args.bptt,
    pretrained_vectors=args.pretrained_vector,
    tokenizer=None,
    test_file="data/quotesdb/funny_quotes.test.txt",
    valid_file="data/quotesdb/funny_quotes.val.txt",
    train_file="data/quotesdb/funny_quotes.train.txt",
)



In [16]:
# Build, train and (outside fast_dev_run) evaluate the RNN model with attention.
vocab = dm.vocab
weight_matrix = vocab.vectors
# The pretrained embedding matrix fixes both the vocabulary size and the
# embedding width handed to the model.
ntoken, ninp = weight_matrix.shape

pad_idx = vocab.stoi["<pad>"]

ppl = Perplexity(pad_idx)
model = RNNAttentionModel(
    rnn_type=args.rnn_type,
    ntoken=ntoken,
    ninp=ninp,
    nhid=args.nhid,
    attention_width=args.att_width,
    nlayers=args.nlayers,
    batch_size=args.batch_size,
    device_type=device.type,
    pretrained_vectors=weight_matrix,
    metric=ppl,
)

if args.use_wandb:
    wandb_logger = WandbLogger(name=experiment_name, project=args.project_name)
    if args.log_gradients:
        wandb_logger.watch(model, log='gradients', log_freq=100)
    logger = wandb_logger
else:
    # True asks the Trainer to use its default logger.
    logger = True

if args.fast_dev_run:
    # Smoke-test runs are not logged anywhere.
    logger = None

early_stop_callback = pl.callbacks.EarlyStopping(
    min_delta=0.01,
    patience=5,
    verbose=False,
    mode='min',
)

trainer = pl.Trainer(
    gpus=1 if device.type == 'cuda' else 0,
    max_epochs=args.max_epochs,
    logger=logger,
    # The learning-rate finder is skipped for fast_dev_run smoke tests.
    auto_lr_find=not args.fast_dev_run,
    fast_dev_run=args.fast_dev_run,
    early_stop_callback=early_stop_callback,
)

trainer.fit(model, datamodule=dm)

if not args.fast_dev_run:
    trainer.save_checkpoint(f"models/{experiment_name}.ckpt")
    # f-string prefix added: the vocabulary used to be written to a file
    # literally named "{experiment_name}.sav".
    torch.save(dm.vocab, f"models/{experiment_name}.sav")
    # NOTE(review): presumably disables the LR finder so trainer.test does not
    # trigger it again — confirm against the installed Lightning version.
    trainer.auto_lr_find = False
    test_eval = trainer.test(model, datamodule=dm)
    if args.use_wandb:
        # Only the W&B logger object has log_metrics; with --use-wandb False
        # `logger` is the literal True and the call would raise AttributeError.
        logger.log_metrics({
            "test_ppl": test_eval[0]['test_ppl'],
            "test_loss": test_eval[0]['test_loss'],
        })

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | criterion      | CrossEntropyLoss | 0     
1 | metric         | Perplexity       | 0     
2 | drop           | Dropout          | 0     
3 | encoder        | Embedding        | 13 M  
4 | rnn            | LSTM             | 126 K 
5 | decoder        | Linear           | 2 M   
6 | softmax        | Softmax          | 0     
7 | AttentionLayer | AttentionLayer   | 4 K   
  a = self.softmax(a)


HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

Learning rate set to 0.01


wandb: Wandb version 0.9.6 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade

  | Name           | Type             | Params
----------------------------------------------------
0 | criterion      | CrossEntropyLoss | 0     
1 | metric         | Perplexity       | 0     
2 | drop           | Dropout          | 0     
3 | encoder        | Embedding        | 13 M  
4 | rnn            | LSTM             | 126 K 
5 | decoder        | Linear           | 2 M   
6 | softmax        | Softmax          | 0     
7 | AttentionLayer | AttentionLayer   | 4 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

LR finder stopped early due to diverging loss.
Failed to compute suggesting for `lr`. There might not be enough points.
Traceback (most recent call last):
  File "C:\Users\manujoseph\Anaconda3\envs\bot\lib\site-packages\pytorch_lightning\trainer\lr_finder.py", line 344, in suggestion
    min_grad = np.gradient(loss).argmin()
  File "<__array_function__ internals>", line 6, in gradient
  File "C:\Users\manujoseph\Anaconda3\envs\bot\lib\site-packages\numpy\lib\function_base.py", line 1053, in gradient
    "Shape of array too small to calculate a numerical gradient, "
ValueError: Shape of array too small to calculate a numerical gradient, at least (edge_order + 1) elements are required.
Learning rate set to None


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': tensor(6.4631, device='cuda:0'),
 'test_ppl': tensor(651.5473, device='cuda:0'),
 'val_checkpoint_on': None,
 'val_early_stop_on': None}
--------------------------------------------------------------------------------


TypeError: '<=' not supported between instances of 'float' and 'NoneType'

## Transformers

In [18]:
from pytorch_lightning_lm.model import TransformerModel

In [19]:
parser = ArgumentParser()
parser.add_argument("-f", "--fff", help="a dummy argument to fool ipython", default="1")

# add PROGRAM level args
parser.add_argument('--project-name', type=str, default='neural_lms')
parser.add_argument('--experiment-tag', type=str, default='Transformer_LM')
parser.add_argument('--use-cuda', type=bool, default=True)
parser.add_argument('--use-wandb', type=bool, default=True)
parser.add_argument('--log-gradients', type=bool, default=False)
parser.add_argument('--unk-cutoff', type=int, default=1)

# add model specific args
# parser = LitModel.add_model_specific_args(parser)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--bptt', type=int, default=16)
parser.add_argument('--nhid', type=int, default=64)
parser.add_argument('--nhead', type=int, default=3)
parser.add_argument('--nlayers', type=int, default=2)
parser.add_argument('--dropout', type=float, default=0.5)
parser.add_argument('--pretrained-vector', type=str, default="fasttext.simple.300d")

# add all the available trainer options to argparse
parser.add_argument('--max_epochs', type=int, default=25)
parser.add_argument('--fast_dev_run', type=bool, default=True)
# ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli
# parser = Trainer.add_argparse_args(parser)
args = parser.parse_args()

In [22]:
# CUDA is selected only if it is available and the --use-cuda flag is set.
if torch.cuda.is_available() and args.use_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Run name built from the transformer hyper-parameters.
experiment_name = (
    f"{args.experiment_tag}_{args.batch_size}_{args.bptt}_{args.nhead}_{args.nhid}_{args.nlayers}"
)
print(experiment_name)

Transformer_LM_32_16_3_64_2


In [23]:
# Rebuild the quotes data module with the transformer experiment's arguments.
dm = QuotesDataModule(
    tokenizer=None,
    pretrained_vectors=args.pretrained_vector,
    batch_size=args.batch_size,
    bptt=args.bptt,
    train_file="data/quotesdb/funny_quotes.train.txt",
    valid_file="data/quotesdb/funny_quotes.val.txt",
    test_file="data/quotesdb/funny_quotes.test.txt",
)



In [25]:
# Build, train and (outside fast_dev_run) evaluate the Transformer language model.
vocab = dm.vocab
weight_matrix = vocab.vectors
# The pretrained embedding matrix fixes both the vocabulary size and the
# embedding width handed to the model.
ntoken, ninp = weight_matrix.shape

pad_idx = vocab.stoi["<pad>"]

ppl = Perplexity(pad_idx)
# NOTE(review): args.dropout is parsed but not forwarded here — confirm whether
# TransformerModel takes a dropout argument before wiring it in.
model = TransformerModel(
    ntoken=ntoken,
    ninp=ninp,
    nhead=args.nhead,
    nhid=args.nhid,
    nlayers=args.nlayers,
    batch_size=args.batch_size,
    device_type=device.type,
    pretrained_vectors=weight_matrix,
    metric=ppl,
)

if args.use_wandb:
    wandb_logger = WandbLogger(name=experiment_name, project=args.project_name)
    if args.log_gradients:
        wandb_logger.watch(model, log='gradients', log_freq=100)
    logger = wandb_logger
else:
    # True asks the Trainer to use its default logger.
    logger = True

if args.fast_dev_run:
    # Smoke-test runs are not logged anywhere.
    logger = None

early_stop_callback = pl.callbacks.EarlyStopping(
    min_delta=0.01,
    patience=5,
    verbose=False,
    mode='min',
)

trainer = pl.Trainer(
    gpus=1 if device.type == 'cuda' else 0,
    max_epochs=args.max_epochs,
    logger=logger,
    # The learning-rate finder is skipped for fast_dev_run smoke tests.
    auto_lr_find=not args.fast_dev_run,
    fast_dev_run=args.fast_dev_run,
    early_stop_callback=early_stop_callback,
)

trainer.fit(model, datamodule=dm)
if not args.fast_dev_run:
    trainer.save_checkpoint(f"models/{experiment_name}.ckpt")
    # f-string prefix added: the vocabulary used to be written to a file
    # literally named "{experiment_name}.sav".
    torch.save(dm.vocab, f"models/{experiment_name}.sav")
    # NOTE(review): presumably disables the LR finder so trainer.test does not
    # trigger it again — confirm against the installed Lightning version.
    trainer.auto_lr_find = False
    test_eval = trainer.test(model, datamodule=dm)
    if args.use_wandb:
        # Only the W&B logger object has log_metrics; with --use-wandb False
        # `logger` is the literal True and the call would raise AttributeError.
        logger.log_metrics({
            "test_ppl": test_eval[0]['test_ppl'],
            "test_loss": test_eval[0]['test_loss'],
        })

Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name                | Type               | Params
-----------------------------------------------------------
0 | criterion           | CrossEntropyLoss   | 0     
1 | metric              | Perplexity         | 0     
2 | pos_encoder         | PositionalEncoding | 0     
3 | transformer_encoder | TransformerEncoder | 802 K 
4 | encoder             | Embedding          | 13 M  
5 | decoder             | Linear             | 13 M  
6 | drop                | Dropout            | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




