## Language Models Lab ##

In these notebooks, we will explore a range of important and interesting techniques, approaches, and uses of language models, mainly to address Natural Language Processing tasks.

We will explore the following:

- Creating Recurrent Neural Network (RNN) and Long Short-Term Memory (LSTM) networks
- Word2Vec
    - Continuous Bag-Of-Words (CBOW)
- Using RNNs in practice!
    - Text classification
- Seq2Seq
    - Using Torchtext
    - Machine Translation
- Using Pre-trained models!

-------------
## Basic testing of RNN, LSTM, and GRU ##

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a 1-D tensor of integer indices.

    Args:
        seq: Iterable of tokens (for example words or tags).
        to_ix: Mapping from token to integer index. Every token in ``seq``
            must be a key; a missing token makes the lookup fail.

    Returns:
        A ``torch.long`` tensor with one index per token, in input order.
    """
    indices = list(map(to_ix.__getitem__, seq))
    return torch.tensor(indices, dtype=torch.long)

# Exercise: what will happen here?
# Toy corpus: every entry is a (words, tags) pair with one tag per word.
# Tags are: DET - determiner; NN - noun; V - verb; ADV - adverb
# For example, the word "The" is a determiner.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
    ("Everybody does machine learning nowadays".split(), ["NN", "V", "NN", "NN", "ADV"]),
]

# Vocabulary: each distinct word receives the next free index, in order of
# first appearance. Lookup is case-sensitive, so "The" and "the" are
# different entries.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        # setdefault only inserts when the word has no index yet
        word_to_ix.setdefault(word, len(word_to_ix))

print(word_to_ix)

# Each tag is assigned a fixed, unique index.
tag_to_ix = {"DET": 0, "NN": 1, "V": 2, "ADV": 3}

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8, 'does': 9, 'machine': 10, 'learning': 11, 'nowadays': 12}


In [None]:
# Hyperparameters passed to the tagger constructors in this notebook.
EMBEDDING_DIM = 6  # size of each word-embedding vector
HIDDEN_DIM = 12  # size of the recurrent hidden state
# Sizes derived from the lookup tables rather than hard-coded.
VOCAB_SIZE = len(word_to_ix)  # number of distinct words
NUM_CLASSES = len(tag_to_ix)  # number of distinct tags

In [None]:
def train(model, optimizer, criterion, epochs):
    """Train ``model`` on the module-level ``training_data``, one sentence per step.

    Args:
        model: Callable module mapping a 1-D tensor of word indices to one
            row of tag scores per word.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            around every sentence.
        criterion: Loss function invoked as ``criterion(tag_scores, targets)``.
        epochs: Number of full passes over ``training_data``.

    Returns:
        A list with one entry per epoch: the sum of the per-sentence losses
        seen during that epoch.
    """
    epoch_loss = []
    # Toy data: the epoch counts used in this notebook are far higher than
    # what you would normally use on a real data set.
    for _ in range(epochs):
        running_loss = 0.0
        for sentence, tags in training_data:
            # Clear gradients left over from the previous step. One reset is
            # enough: when the optimizer is built from model.parameters()
            # (as it is in this notebook), an extra model.zero_grad() would
            # only repeat the same work.
            optimizer.zero_grad()

            # Turn words and tags into index tensors for the network.
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = prepare_sequence(tags, tag_to_ix)

            # Forward pass, loss, backward pass, parameter update.
            tag_scores = model(sentence_in)
            loss = criterion(tag_scores, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        epoch_loss.append(running_loss)

    return epoch_loss


In [None]:
def evaluate(model, test_sequence):
    """Print the model's predicted tag indices for one training sentence.

    Args:
        model: Tagger returning one row of tag scores per input word.
        test_sequence: Index into ``training_data`` of the sentence to tag.

    Prints, in order: the tag-to-index mapping, the sentence's words, its
    reference tags, the predicted tag index for every word, and a separator
    line. Nothing is returned.
    """
    example = training_data[test_sequence]
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        scores = model(prepare_sequence(example[0], word_to_ix))

        print(tag_to_ix)
        print(example[0])
        print(example[1])

        # For every word keep the index of its highest-scoring tag.
        predicted = [row.topk(1).indices.item() for row in scores]

        print(predicted)
        print("--------------")

### Recurrent Neural Networks (RNN) ###

In [None]:
class RNNTagger(nn.Module):
    """Sequence tagger: word embedding -> ``nn.RNN`` -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the layers.

        Args:
            embedding_dim: Size of each word-embedding vector.
            hidden_dim: Size of the RNN hidden state.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into dense vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Recurrent layer: consumes the embeddings, produces one hidden
        # state per time step.
        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        # Projection from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return log-probabilities over the tags, one row per word.

        Args:
            sentence: 1-D tensor of word indices.
        """
        seq_len = len(sentence)
        embedded = self.word_embeddings(sentence)

        # The module expects [sentence_length, batch_size, embedding_dim];
        # the sentence is fed as a batch of one.
        rnn_input = embedded.view(seq_len, 1, -1)
        rnn_out, _ = self.rnn(rnn_input)

        # Exercise: in this case, which tensor function is
        # rnn_out.view(seq_len, -1) equivalent to?
        per_word_hidden = rnn_out.view(seq_len, -1)

        tag_space = self.hidden2tag(per_word_hidden)
        return F.log_softmax(tag_space, dim=1)

In [None]:
# Build, train and inspect the RNN tagger.
# The vocabulary and tag-set sizes come from the shared constants so that
# every model in the notebook is configured from the same place.
model = RNNTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, NUM_CLASSES)
# NLLLoss pairs with the log-softmax output produced by the tagger.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses = train(model, optimizer, loss_function, 100)
print(losses)
# Show the predictions for every training sentence, however many there are.
for sentence_index in range(len(training_data)):
    evaluate(model, sentence_index)

[4.01319944858551, 3.7409374713897705, 3.5142505168914795, 3.3068745136260986, 3.106687307357788, 2.9086394906044006, 2.712101995944977, 2.5186516642570496, 2.330113708972931, 2.147655189037323, 1.9718610644340515, 1.80312380194664, 1.6419906616210938, 1.4894486665725708, 1.3470511138439178, 1.2166416645050049, 1.0996626317501068, 0.9965263158082962, 0.9065359681844711, 0.8282851129770279, 0.760127991437912, 0.7004872411489487, 0.6479876488447189, 0.6014860197901726, 0.5600533112883568, 0.5229396373033524, 0.4895393028855324, 0.459358274936676, 0.4319900572299957, 0.4070950672030449, 0.38438767939805984, 0.36362361162900925, 0.34459254890680313, 0.3271123170852661, 0.3110240176320076, 0.2961874231696129, 0.2824797108769417, 0.2697916142642498, 0.25802647694945335, 0.2470984198153019, 0.2369306944310665, 0.22745467349886894, 0.21860947087407112, 0.2103400118649006, 0.20259720087051392, 0.19533676654100418, 0.18851905316114426, 0.1821078471839428, 0.17607104033231735, 0.17037911340594292

### Long Short-Term Memory (LSTM) ###

In [None]:
class LSTMTagger(nn.Module):
    """Sequence tagger: word embedding -> ``nn.LSTM`` -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the layers.

        Args:
            embedding_dim: Size of each word-embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into dense vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM consumes the embeddings and emits hidden states of
        # dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projection from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return log-probabilities over the tags, one row per word.

        Args:
            sentence: 1-D tensor of word indices.
        """
        n_words = len(sentence)
        # Reshape the embeddings to (n_words, 1, embedding size): the
        # sentence is fed to the LSTM as a batch of one.
        lstm_input = self.word_embeddings(sentence).view(n_words, 1, -1)
        lstm_out, _ = self.lstm(lstm_input)
        # Drop the batch axis again before projecting onto the tags.
        tag_space = self.hidden2tag(lstm_out.view(n_words, -1))
        return F.log_softmax(tag_space, dim=1)

In [None]:
# Build, train and inspect the LSTM tagger.
# The vocabulary and tag-set sizes come from the shared constants so that
# every model in the notebook is configured from the same place.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, NUM_CLASSES)
# NLLLoss pairs with the log-softmax output produced by the tagger.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses = train(model, optimizer, loss_function, 100)
print(losses)
# Show the predictions for every training sentence, however many there are.
for sentence_index in range(len(training_data)):
    evaluate(model, sentence_index)

[4.257006525993347, 4.126981019973755, 4.017700791358948, 3.925899624824524, 3.848695755004883, 3.7835817337036133, 3.728405475616455, 3.6813430786132812, 3.640865445137024, 3.6057045459747314, 3.574816584587097, 3.5473484992980957, 3.522607207298279, 3.5000311136245728, 3.4791669845581055, 3.4596489667892456, 3.4411827325820923, 3.4235302209854126, 3.4064990282058716, 3.389933228492737, 3.3737049102783203, 3.3577096462249756, 3.3418610095977783, 3.3260862827301025, 3.3103246688842773, 3.294524073600769, 3.278640031814575, 3.262632727622986, 3.2464678287506104, 3.230114698410034, 3.213545083999634, 3.196734070777893, 3.1796581745147705, 3.1622960567474365, 3.1446282863616943, 3.1266363859176636, 3.1083032488822937, 3.089613139629364, 3.070551812648773, 3.0511056184768677, 3.031262755393982, 3.0110122561454773, 2.9903443455696106, 2.9692501425743103, 2.9477224349975586, 2.9257550835609436, 2.9033421874046326, 2.880480468273163, 2.857166111469269, 2.833396792411804, 2.80917090177536, 2.7

## Replace LSTM and RNN with GRU ##

Implement a network with `nn.GRU` and compare it with the other networks in terms of loss and perplexity. If you like, you can extend this toy example with more sentences, or vary the task, to test the networks and observe the differences.