https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

## Basic includes

In [1]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
time: 1.71 ms (started: 2022-03-27 08:47:03 +00:00)


In [2]:
!pip install word2vec
!pip install ray[tune]

Collecting word2vec
  Downloading word2vec-0.11.1.tar.gz (42 kB)
[?25l[K     |███████▊                        | 10 kB 44.6 MB/s eta 0:00:01[K     |███████████████▌                | 20 kB 42.9 MB/s eta 0:00:01[K     |███████████████████████▎        | 30 kB 22.9 MB/s eta 0:00:01[K     |███████████████████████████████ | 40 kB 14.2 MB/s eta 0:00:01[K     |████████████████████████████████| 42 kB 1.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: word2vec
  Building wheel for word2vec (PEP 517) ... [?25l[?25hdone
  Created wheel for word2vec: filename=word2vec-0.11.1-py2.py3-none-any.whl size=164791 sha256=1c79fbd8f48ea1888d04397dcf772e5edc7cde5366bebfd99db4c2fe150e4e95
  Stored in directory: /root/.cache/pip/wheels/c9/c0/d4/29d797817e268124a32b6cf8beb8b8fe87b86f099d5a049e61
Successfully built word2vec
Installing c

In [3]:
import word2vec
from collections import Counter # https://pymotw.com/2/collections/counter.html

import pandas as pd
import numpy as np
import itertools
import re
import os

import glob

time: 299 ms (started: 2022-03-27 08:47:28 +00:00)


In [4]:
# Mount Google Drive into the Colab file system; the datasets are read from there.
from google.colab import drive
drive.mount('/content/drive/')

# Root folder of the Recipe1M data; every dataset path below is built from it.
dataPath = '/content/drive/MyDrive/TP2/Datasets/Recipe1M/'
import sys
# Put the data folder on the module search path as well.
sys.path.append(dataPath)

Mounted at /content/drive/
time: 37.4 s (started: 2022-03-27 08:47:28 +00:00)


## Import Data

In [5]:
# Name of the sub-folder of dataPath from which the recipe pickles are loaded.
TIMESTAMP = '2022_03_19'

time: 1.05 ms (started: 2022-03-27 08:48:06 +00:00)


In [6]:
# Load the validated recipes into `baseFrame`.
# Preference order: the single consolidated pickle if it exists, otherwise the
# partial pickles stored next to it (only the first one when smallSet is True).
# NOTE: read_pickle executes arbitrary code on load; only use trusted files.
smallSet = False  # True: load one partial pickle only, for quick experiments

snapshotDir = dataPath + TIMESTAMP
fullPickle = snapshotDir + '/recipes_valid_full.pkl'

if os.path.exists(fullPickle):
  baseFrame = pd.read_pickle(fullPickle)
else:
  # glob returns paths in arbitrary order; sorting keeps the row order (and
  # with it the seeded random split further down) identical between runs.
  partPickles = sorted(glob.glob(snapshotDir + '/recipes_valid_*.pkl'))
  if smallSet:
    partPickles = partPickles[:1]
  else:
    partPickles = [path for path in partPickles if 'full' not in path]

  if partPickles:
    # Concatenate once instead of calling the deprecated DataFrame.append in
    # a loop, which re-copies the whole frame for every file.
    baseFrame = pd.concat([pd.read_pickle(path) for path in partPickles])
  else:
    # Nothing found: keep the empty frame the original cell started from.
    baseFrame = pd.DataFrame()

baseFrame.head()

Unnamed: 0_level_0,title,ingredients,instructions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000033e39b,Dilly Macaroni Salad Recipe,amount unit ingredient 0 1....,0 Cook macaroni according to package direct...
000035f7ed,Gazpacho,amount unit ingredient 0 8.0 ...,0 Add the tomatoes to a food processor with...
00003a70b1,Crunchy Onion Potato Bake,amount unit ingredient 0 2...,0 Preheat oven to 350 degrees Fah...
00004320bb,Cool 'n Easy Creamy Watermelon Pie,amount unit ingredient 0 3....,0 Dissolve Jello in boiling water. 1 ...
0000631d90,Easy Tropical Beef Skillet,amount unit ingredient 0...,"0 In a large skillet, toast the coconut ove..."


time: 4min 26s (started: 2022-03-27 08:48:06 +00:00)


In [None]:
def _columnToList(row, column):
  """Return the entries of `column` of one ingredient table as a plain list."""
  values = row[column]
  return values.tolist()

def getAmount(row):
  """Return the 'amount' entries of one ingredient table as a list."""
  return _columnToList(row, 'amount')

def getUnit(row):
  """Return the 'unit' entries of one ingredient table as a list."""
  return _columnToList(row, 'unit')

def getIng(row):
  """Return the 'ingredient' entries of one ingredient table as a list."""
  return _columnToList(row, 'ingredient')

# Unpack each recipe's ingredient table into three list-valued columns and
# drop the table itself. The guard makes the cell safe to re-run: once
# 'ingredients' has been dropped there is nothing left to unpack.
if 'ingredients' in baseFrame.columns:
  ingredientTables = baseFrame['ingredients']
  baseFrame = baseFrame.assign(
      amount=ingredientTables.map(getAmount),
      unit=ingredientTables.map(getUnit),
      ingredient=ingredientTables.map(getIng),
  ).drop(columns=['ingredients'])
baseFrame.head()

## Imports for Learning
https://pytorch.org/tutorials/beginner/introyt/trainingyt.html

In [None]:
import torch

# Model
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable 
from torchsummary import summary

# Optimizer
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Tokenizer
# torch padding does only support constant padding (ConstantPad1d) for 1D or non-constant padding for >1D (nn.function.pad)
from tensorflow.keras.preprocessing.sequence import pad_sequences
# keras tokenizer more powerful than torch
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from torchtext.data import get_tokenizer # https://pytorch.org/text/stable/data_utils.html

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# hyperparameter tuning
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

# Seed

In [None]:
# Seed every random number generator the notebook draws from so that runs are
# repeatable. Python's `random` is included because predict() picks the next
# token with it.
import random

torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

# Setup
https://closeheat.com/blog/pytorch-lstm-text-generation-tutorial


## Tokenization
TODO: check whether punctuation needs to be kept as tokens. If it does, it can be reintroduced later (see https://stackoverflow.com/questions/49073673/include-punctuation-in-keras-tokenizer).

### Get Corpus

In [None]:
# Build a word -> index lookup from the word2vec vocabulary file and check
# whether it contains 'dilly', a word that occurs in the recipe titles.
w2v_model = word2vec.load(dataPath + 'vocab.bin')

ingredientDict = {}
for voc in w2v_model.vocab:
  if voc not in ingredientDict:
    # Indices start at 1 so that 0 stays free for empty fields.
    ingredientDict[voc] = len(ingredientDict) + 1

print("Word exists" if 'dilly' in ingredientDict else 'vocab.bin not to be used as dict misses words')

### Hyperparams

In [None]:
class HyperParams():
  """Bundle of the training and model hyper-parameters.

  Args:
    epochs: number of training epochs.
    batchSize: number of samples per mini-batch.
    lr: learning rate handed to the optimizer.
    ratio: three fractions for the train | validation | test split.

  The model dimensions (hidden_dim, num_layers, embedding_dim) are fixed
  attributes and not constructor arguments.
  """

  def __init__(self, epochs=10, batchSize=10, lr=1e-3, ratio=[0.7, 0.2, 0.1]):
    self.epochs = epochs
    self.batchSize = batchSize
    self.lr = lr
    # Store a copy: the default list object is shared by all calls, so keeping
    # a reference would let a change on one instance leak into every other.
    self.ratio = list(ratio)

    self.hidden_dim = 4       # number of features in the LSTM hidden state
    self.num_layers = 1       # number of stacked LSTM layers
    self.embedding_dim = 200  # size of each word embedding vector

  def __str__(self):
    """Return one 'name value' line per hyper-parameter."""
    fields = [
        ('epochs', self.epochs),
        ('batchSize', self.batchSize),
        ('lr', self.lr),
        ('ratio train|val|test', self.ratio),
        ('hidden_dim', self.hidden_dim),
        ('num_layers', self.num_layers),
        ('embedding_dim', self.embedding_dim),
    ]
    return ''.join(label + ' ' + str(value) + '\n' for label, value in fields)
    

### Dataset

In [None]:
class TitleDataset(Dataset):
    """Next-token training windows built from recipe ingredients and titles.

    Each recipe contributes one fixed-length window per title word: the padded
    ingredient tokens followed by the title are scanned with a sliding window
    of len(ingredients) + 1 tokens. An item is a pair (window[:-1], window[1:]),
    i.e. the input and the same sequence shifted by one token as target.

    Args:
        hyperparams: hyper-parameter object; stored, not read by this class.
        data: frame with a 'title' column (strings) and an 'ingredient'
            column (lists of strings).
    """

    def __init__(self, hyperparams, data):
        self.hyperparams = hyperparams

        self.tokenizer = Tokenizer(oov_token='OOV')

        # one {'ings': [...], 'title': [...]} dict of word lists per recipe
        self.wordSeq = np.vectorize(self.getTitleSequence, otypes=[np.ndarray])(data['title'], data['ingredient'])

        # training needs equally long sequences --> pad to the longest ingredient list
        self.maxSequenceLength = max([len(seq['ings']) for seq in self.wordSeq])

        # flat list of all words in the dataset (tokenizer corpus)
        self.words = np.concatenate(np.vectorize(self.getCorpus, otypes=[np.ndarray])(data['title'], data['ingredient']))

        self.tokenizer.fit_on_texts(self.words)

        # indexed word sequences (precomputed, doing this in __getitem__ is very slow)
        self.idxWords = np.vectorize(self.getIndexedSeqs, otypes=[np.ndarray])(self.wordSeq)

        # one row per sliding window; recipes without title words explode to
        # NaN and are dropped
        windows = pd.Series(np.vectorize(self.getMovWindSeq, otypes=[np.ndarray])(self.idxWords)).explode()
        self.movWindSeq = windows.dropna().to_numpy()

    def getCorpus(self, title, ingredient):
        """Return all words of one recipe (ingredients first, then title) as an array."""
        titleTok = text_to_word_sequence(title)
        ingTok = text_to_word_sequence(','.join(ingredient))
        return np.array(ingTok + titleTok)

    def getTitleSequence(self, title, ingredient):
        """Split one recipe into the word lists of its ingredients and its title."""
        titleTok = text_to_word_sequence(title)
        ingTok = text_to_word_sequence(','.join(ingredient))
        return {'ings': ingTok, 'title': titleTok}

    def getIndexedSeqs(self, seq):
        """Map the words of one recipe to tokenizer indices.

        The ingredient indices are left-padded to maxSequenceLength with the
        value 1 (pre-padding, see https://arxiv.org/abs/1903.07288); the title
        indices are returned unpadded.
        """
        ingTok = self.tokenizer.texts_to_sequences([seq['ings']])[0]
        ingTok = pad_sequences([ingTok], maxlen=self.maxSequenceLength, padding='pre', value=1)[0]
        titleTok = self.tokenizer.texts_to_sequences([seq['title']])[0]

        return {'ings': ingTok, 'title': titleTok}

    def getMovWindSeq(self, seq):
        """Return all sliding windows of one indexed recipe as a 2-D array.

        `seq['ings']` must already be pre-padded. The result has one row per
        title token and len(seq['ings']) + 1 columns; it has zero rows when
        the title is empty.
        """
        idxShift = len(seq['title'])
        ingLen = len(seq['ings'])

        if idxShift == 0:
            return np.empty((0, ingLen + 1), dtype=np.int32)

        fullSeq = np.append(seq['ings'], seq['title'])
        # window i covers ingredient/title tokens i .. i + ingLen (inclusive)
        return np.stack([fullSeq[i_shift:ingLen + i_shift + 1] for i_shift in range(idxShift)])

    def __len__(self):
        # Items are the sliding windows, not the recipes: __getitem__ indexes
        # movWindSeq, so the length has to be taken from the same array.
        return len(self.movWindSeq)

    def __getitem__(self, index):
        """Return (input, target): the window without its last token and the
        window without its first token."""
        window = self.movWindSeq[index]
        return (
            torch.tensor(window[:-1]),
            torch.tensor(window[1:])
        )

## Model
LSTM Net: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

Embedding Net: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html

Init state: https://stats.stackexchange.com/questions/224737/best-way-to-initialize-lstm-state

### base: https://github.com/yuchenlin/lstm_sentence_classifier/blob/master/LSTM_sentence_classifier.py

### base: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#lstms-in-pytorch

### base: https://www.analyticsvidhya.com/blog/2020/08/build-a-natural-language-generation-nlg-system-using-pytorch/

In [None]:
class EmbedLSTM(nn.Module):
    """Embedding -> LSTM -> linear layer producing next-token scores.

    Args:
        hyperParams: object providing batchSize, hidden_dim, num_layers and
            embedding_dim.
        dataset: object whose `tokenizer.word_index` maps words to indices.
        device: torch device on which the initial hidden state is created.
    """

    def __init__(self, hyperParams, dataset, device):
        super().__init__()

        # The Keras tokenizer numbers words 1..len(word_index) and keeps 0
        # reserved, so len(word_index) + 1 slots are needed; without the +1
        # the highest word index is out of range for the embedding table and
        # for the output layer.
        self.vocab_size = len(dataset.tokenizer.word_index) + 1
        self.batchSize = hyperParams.batchSize
        self.hidden_dim = hyperParams.hidden_dim
        self.device = device
        self.num_layers = hyperParams.num_layers

        # token index -> dense vector
        self.word_embeddings = nn.Embedding(self.vocab_size, hyperParams.embedding_dim)

        # batch_first=True: inputs and outputs are (batch, sequence, feature)
        self.lstm = nn.LSTM(input_size=hyperParams.embedding_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, batch_first=True)

        # hidden state -> one score per vocabulary entry
        self.linear = nn.Linear(self.hidden_dim, self.vocab_size)

    def forward(self, x, hidden):
        """Run one batch through the network.

        Args:
            x: integer tensor of token indices, (batch, sequence).
            hidden: (h, c) tuple as returned by init_hidden or a previous call.

        Returns:
            (scores, hidden) where scores has shape
            (batch * sequence, vocab_size) and contains raw, un-normalised
            logits, and hidden is the updated (h, c) tuple.
        """
        embeds = self.word_embeddings(x)

        lstm_out, hidden = self.lstm(embeds, hidden)

        # flatten batch and sequence so that every time step is one row
        out = self.linear(lstm_out.reshape(-1, self.hidden_dim))
        return out, hidden

    def init_hidden(self, batchSize=None):
        """Return a zero-initialised (hidden state, cell state) tuple.

        Both tensors have shape (num_layers, batchSize, hidden_dim), share the
        dtype of the model parameters and live on self.device. When batchSize
        is None the batch size from the hyper-parameters is used.
        """
        if batchSize is None:
            batchSize = self.batchSize

        weight = next(self.parameters())
        shape = (self.num_layers, batchSize, self.hidden_dim)

        return (weight.new_zeros(shape).to(self.device),
                weight.new_zeros(shape).to(self.device))

## Training
The training code is based on a mixture of:
* https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
* https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html
* https://stackoverflow.com/questions/67295494/correct-validation-loss-in-pytorch


In [None]:
def train_epoch(epoch, model, criterion, optimizer, train_loader, device, writer):
  """Train `model` for one pass over `train_loader`.

  Args:
    epoch: zero-based epoch number; used for logging only.
    model: network exposing init_hidden() and returning (outputs, hidden).
    criterion: loss called as criterion(outputs, flattened_targets).
    optimizer: optimizer updating the model parameters.
    train_loader: iterable of (input, target) batches supporting len().
    device: device the batches are moved to.
    writer: TensorBoard writer; receives the model graph on the very first
      batch of epoch 0.

  Returns:
    Mean training loss over all batches of this epoch.

  Raises:
    ZeroDivisionError: if the loader yields no batches.
  """
  running_loss = 0.

  h = model.init_hidden()

  model.train()

  for batch, (inputs, targets) in enumerate(train_loader):
    # assign input and target to device
    inputs, targets = inputs.to(device), targets.to(device)

    if epoch == 0 and batch == 0:
      writer.add_graph(model, input_to_model=(inputs, h), verbose=False)

    # Carry the hidden state over to the next batch, but cut it off from the
    # previous batch's graph so backpropagation stops at the batch boundary.
    h = tuple(each.detach() for each in h)

    # clear gradients
    optimizer.zero_grad()

    # batch prediction
    outputs, h = model(inputs, h)

    # outputs holds one row per time step, so the targets are flattened to match
    loss = criterion(outputs, targets.long().view(-1))

    # calc backward gradients
    loss.backward()

    # run optimizer
    optimizer.step()

    running_loss += loss.item()

  mean_loss = running_loss / len(train_loader)
  print("Epoch: %d, loss: %1.5f" % (epoch+1, mean_loss))
  return mean_loss


In [None]:
def val_epoch(epoch, model, criterion, optimizer, val_loader, device, writer):
  """Compute the mean loss of `model` over `val_loader` without training it.

  Args:
    epoch: epoch number (not used).
    model: network exposing init_hidden() and returning (outputs, hidden).
    criterion: loss called as criterion(outputs, flattened_targets).
    optimizer: not used; accepted so the signature mirrors train_epoch.
    val_loader: iterable of (input, target) batches supporting len().
    device: device the batches are moved to.
    writer: not used.

  Returns:
    Mean validation loss over all batches.

  Raises:
    ZeroDivisionError: if the loader yields no batches.
  """
  running_loss = 0.0

  h = model.init_hidden()

  # eval() switches layers such as dropout or batch-norm to inference behaviour
  model.eval()
  # no_grad() disables gradient tracking: no graph is built, which saves
  # memory and time, and no parameter can be updated by accident
  with torch.no_grad():
    for inputs, targets in val_loader:
      # assign input and target to device
      inputs, targets = inputs.to(device), targets.to(device)

      # carry the hidden state over to the next batch without its history
      h = tuple(each.detach() for each in h)

      # batch prediction
      outputs, h = model(inputs, h)

      # outputs holds one row per time step, so the targets are flattened to match
      loss = criterion(outputs, targets.long().view(-1))

      running_loss += loss.item()

  return running_loss / len(val_loader)


In [None]:
def train(dataset, model, hyperparams, device, logDir='/content/drive/MyDrive/runs/titleTrainer'):
  """Train `model` and log the train / validation loss to TensorBoard.

  Args:
    dataset: mapping with the keys 'train' and 'val' holding the two subsets.
    model: network to train; see train_epoch for the expected interface.
    hyperparams: object providing epochs, batchSize and lr.
    device: device the batches are moved to.
    logDir: root folder for the TensorBoard logs. Each call writes to its own
      timestamped sub-folder so earlier runs are not overwritten.
  """
  timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
  # The original strings had no '{}' placeholder, so .format(timestamp) was a
  # no-op and every run logged into the same two folders.
  trainWriter = SummaryWriter('{}/{}/train'.format(logDir, timestamp))
  valWriter = SummaryWriter('{}/{}/validation'.format(logDir, timestamp))

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=hyperparams.lr)

  train_set, val_set = dataset['train'], dataset['val']

  # drop_last=True: the hidden state is created for a fixed batch size, so a
  # smaller final batch would not fit it.
  train_loader = DataLoader(train_set, batch_size=hyperparams.batchSize, drop_last=True)
  val_loader   = DataLoader(val_set, batch_size=hyperparams.batchSize, drop_last=True)
  # further options: shuffle, num_workers

  try:
    for epoch in range(hyperparams.epochs):
      trainLoss = train_epoch(epoch, model, criterion, optimizer, train_loader, device, trainWriter)
      valLoss = val_epoch(epoch, model, criterion, optimizer, val_loader, device, valWriter)

      # same tag in two writers --> both curves share one TensorBoard chart
      trainWriter.add_scalar('loss', trainLoss, epoch)
      valWriter.add_scalar('loss', valLoss, epoch)
  finally:
    # close() flushes pending events and releases the log files even when
    # training is interrupted
    trainWriter.close()
    valWriter.close()
    

In [None]:
import random

def predict(model, dataset, tkn, h=None):
  """Feed one token to the model and sample the next word.

  Args:
    model: network returning (scores, hidden) for a (1, 1) index tensor.
    dataset: object whose tokenizer provides word_index and index_word.
    tkn: input word; must be a key of tokenizer.word_index.
    h: hidden state from a previous call, or None to start from
      model.init_hidden(batchSize=1).

  Returns:
    (word, hidden): a word picked at random from the three most probable
    vocabulary entries, and the updated hidden state.

  Raises:
    KeyError: if `tkn` is not in the tokenizer vocabulary.
    ValueError: if no predicted index maps to a word.
  """
  tokenizer = dataset.tokenizer

  # run on whatever device the model lives on instead of assuming a GPU
  device = next(model.parameters()).device

  if h is None:
    h = model.init_hidden(batchSize=1)

  # detach hidden state from history
  h = tuple(each.detach() for each in h)

  # (batch=1, sequence=1) tensor holding the index of the input word
  inputs = torch.tensor([[tokenizer.word_index[tkn]]], device=device)

  # inference only: no gradients needed
  with torch.no_grad():
    out, h = model(inputs, h)
    # scores -> token probabilities
    p = F.softmax(out, dim=1)

  p = p.cpu().numpy().reshape(-1)

  # Most probable indices first; indices without a word (e.g. the reserved
  # index 0) are skipped so the final lookup cannot fail.
  top_n_idx = []
  for idx in p.argsort()[::-1]:
    if int(idx) in tokenizer.index_word:
      top_n_idx.append(int(idx))
      if len(top_n_idx) == 3:
        break

  if not top_n_idx:
    raise ValueError('model output contains no index known to the tokenizer')

  # randomly select one of the (up to) three candidates
  sampled_token_index = random.choice(top_n_idx)

  return tokenizer.index_word[sampled_token_index], h


In [None]:
# function to generate text
def sample(model, dataset, size, device, initial):
    """Generate a title of `size` words from a list of seed words.

    The seed words are fed through the model one by one to build up the
    hidden state; the prediction after the last seed word is the first title
    word, and every further word is predicted from the previous one.

    Args:
        model: trained network, see predict.
        dataset: object providing the tokenizer, see predict.
        size: number of words to generate (at least one word is produced).
        device: not used; kept so existing calls keep working.
        initial: non-empty list of seed words. It is not modified.

    Returns:
        The generated words joined by single spaces.

    Raises:
        ValueError: if `initial` is empty.
    """
    if len(initial) == 0:
        raise ValueError('initial must contain at least one token')

    model.eval()

    # batch size is 1
    h = model.init_hidden(batchSize=1)

    # warm up on the seed words; only the last prediction is kept
    token = None
    for t in initial:
        token, h = predict(model, dataset, t, h)

    title = [token]

    # predict subsequent tokens, each from the previously generated one
    for _ in range(size - 1):
        token, h = predict(model, dataset, token, h)
        title.append(token)

    return ' '.join(title)

## Execution

In [None]:
# Hyper-parameters of this run: 30 epochs with mini-batches of 32; all other
# values keep their HyperParams defaults.
hyperParams = HyperParams(epochs=30, batchSize=32)
print(hyperParams)

In [None]:
# Build the dataset from the loaded recipes; tokenisation and the sliding
# windows are all computed here, in the constructor.
titleSet = TitleDataset(hyperParams, baseFrame)

In [None]:
# Display the fitted vocabulary (word -> index) as a table for inspection.
pd.DataFrame.from_dict(pd.Series(titleSet.tokenizer.word_index))

In [None]:
# Partition the dataset into train / validation / test subsets according to
# hyperParams.ratio. The first two sizes are rounded down and the test subset
# receives whatever remains, so the three sizes always add up to the total.
totalNum = len(titleSet)
trainNum = int(hyperParams.ratio[0] * totalNum)
valNum = int(hyperParams.ratio[1] * totalNum)
testNum = totalNum - trainNum - valNum

# fixed generator seed --> the same split on every run
trainPart, valPart, testPart = random_split(
    titleSet,
    [trainNum, valNum, testNum],
    generator=torch.Generator().manual_seed(0),
)
splitSet = {'train': trainPart, 'val': valPart, 'test': testPart}

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# get_device_name raises when no CUDA device is present, so it is only
# queried on a GPU runtime.
print(torch.cuda.get_device_name(0) if device.type == "cuda" else "No CUDA device available - running on CPU")

In [None]:
# Create the network (its vocabulary size is taken from the dataset's
# tokenizer), move its parameters to the selected device and print the layers.
model = EmbedLSTM(hyperParams, titleSet, device)
model.to(device)
print(model)
# summary(model, (16,53))

In [None]:
# Run the training loop on the 'train' subset and validate on 'val' after
# every epoch; the losses are written to TensorBoard.
train(splitSet, model, hyperParams, device)

## Prediction

In [None]:
# sample(model, titleSet, 6, device, initial=['dry', 'penne', 'pasta', 'broccoli', 'sun', 'dried', 'tomatoes', 'packed', 'in', 'oil', 'garlic', 'cloves', 'cheddar', 'cheese', 'salt', 'black', 'pepper'])
def remove_values_from_list(the_list, val):
   """Translate token indices to words, leaving out every index equal to `val`."""
   words = []
   for value in the_list:
      if value == val:
         continue
      words.append(titleSet.tokenizer.index_word[value])
   return words

# Draw one random test item and take its input sequence as a list of indices.
testSet = splitSet['test']
pick = np.random.randint(0, len(testSet))
seq = testSet[pick][0].tolist()

# Strip the 'OOV' index (also used as padding value) and convert to words.
seq = remove_values_from_list(seq, titleSet.tokenizer.word_index['OOV'])
print(seq)

# Generate a six-word title from the remaining words.
sample(model, titleSet, 6, device, initial=seq)

## Downside of this splitting: the other recipe information is lost during the random split and is hard to recover afterwards. Therefore, split the frame before passing it to the TitleDataset class.

https://towardsdatascience.com/lstm-for-time-series-prediction-de8aeb26f2ca for input sequence

## Save Model

In [None]:
# Save weights
weightsPath = '/content/drive/MyDrive/weights/titleGenerator_model.pt'
# torch.save does not create missing folders, so make sure the target
# directory exists first.
os.makedirs(os.path.dirname(weightsPath), exist_ok=True)
torch.save(model.state_dict(), weightsPath)

# Tensorboard visualization

* https://pytorch.org/docs/stable/tensorboard.html
* https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-tensorboard-with-pytorch.md

In [None]:
# Load the TensorBoard notebook extension and open the dashboard on the
# folder the training run writes its logs to.
%load_ext tensorboard
%tensorboard --logdir=/content/drive/MyDrive/runs/titleTrainer