# NLP

## Logistic Regression BoW classifier

Map sparse BoW representations to log probabilities over labels.

In [1]:
%matplotlib inline
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### Data

In [2]:
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

test_data = [("Yo creo que si".split(), "SPANISH"),
             ("it is lost on me".split(), "ENGLISH")]

In [3]:
word2idx = {}
for sent, _ in data+test_data:
    for word in sent:
        if word not in word2idx:
            word2idx[word] = len(word2idx)

In [4]:
vocab_size = len(word2idx)

### Classifier

In [5]:
class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        self.linear = nn.Linear(vocab_size, num_labels)
    
    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec))

In [6]:
def make_vector(sentence, word2idx):
    vec = torch.zeros(len(word2idx))
    for word in sentence:
        vec[word2idx[word]] += 1
    return vec.view(1, -1)

In [7]:
def make_target(label, label2idx):
    return torch.LongTensor([label2idx[label]])

In [8]:
model = BoWClassifier(2, vocab_size)

In [9]:
for param in model.parameters():
    print(param)

Parameter containing:

Columns 0 to 9 
-0.0854 -0.0685 -0.1575  0.1464  0.0801 -0.0993 -0.1936  0.0904  0.0904 -0.0873
-0.0563  0.0899  0.0629 -0.1883 -0.1687 -0.0535  0.1446 -0.0348 -0.0169 -0.0755

Columns 10 to 19 
 0.0072 -0.0688  0.1950  0.1789 -0.1836  0.0864  0.0192  0.0405  0.0324  0.0224
 0.1155 -0.0095  0.1052 -0.1874  0.0439  0.1872 -0.1613  0.1525 -0.1849  0.1848

Columns 20 to 25 
 0.0056 -0.0590  0.0524  0.0011  0.1112 -0.0003
 0.1806  0.0427  0.1119 -0.1890 -0.0354  0.0835
[torch.FloatTensor of size 2x26]

Parameter containing:
-0.0640
-0.1933
[torch.FloatTensor of size 2]



### Test

In [10]:
sample = data[0]
bow_vector = make_vector(sample[0], word2idx)

In [11]:
log_probs = model(autograd.Variable(bow_vector)); log_probs

Variable containing:
-0.5719 -0.8311
[torch.FloatTensor of size 1x2]

### Run

In [12]:
label2idx = {'SPANISH': 0, 'ENGLISH': 1}

Run on test data before training -> see before-and-after

In [13]:
for sent, label in test_data:
    bow_vec = autograd.Variable(make_vector(sent, word2idx))
    log_probs = model(bow_vec)
    print(log_probs)

Variable containing:
-0.5550 -0.8534
[torch.FloatTensor of size 1x2]

Variable containing:
-0.5890 -0.8094
[torch.FloatTensor of size 1x2]



Negative log likelihood loss

In [14]:
loss_fn = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [15]:
for epoch in range(100):
    for sent, label in data:
        model.zero_grad()
        
        bow_vec = autograd.Variable(make_vector(sent, word2idx))
        target = autograd.Variable(make_target(label, label2idx))
        
        log_probs = model(bow_vec)
        
        loss = loss_fn(log_probs, target)
        loss.backward()
        optimizer.step()

In [16]:
for sent, label in test_data:
    bow_vec = autograd.Variable(make_vector(sent, word2idx))
    log_probs = model(bow_vec)
    print(log_probs)

Variable containing:
-0.1083 -2.2765
[torch.FloatTensor of size 1x2]

Variable containing:
-2.5334 -0.0827
[torch.FloatTensor of size 1x2]



## WordEmbeddings

### N-Gram Language Model

Given a sequence of words `w`, we want to compute:

`P(wi|wi−1,wi−2,…,wi−n+1)`

In [17]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

Shakespeare Sonnet 2

In [18]:
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

Trigram: [context_words, target_word]

`([word_i-2, word_i-1], target_word)`

In [19]:
trigrams = [([test_sentence[i], test_sentence[i+1]], test_sentence[i+2])
           for i in range(len(test_sentence) - 2)]

In [20]:
print(trigrams[:3])

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


In [21]:
vocab = set(test_sentence)
word2idx = {word: i for i, word in enumerate(vocab)}

In [22]:
class NGramModeler(nn.Module):
    def __init__(self, vocab_size, emb_dim, context_size):
        super(NGramModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, emb_dim)
        self.linear1 = nn.Linear(context_size * emb_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        
        log_probs = F.log_softmax(out)
        return log_probs

In [23]:
losses = []
loss_fn = nn.NLLLoss()

In [24]:
model = NGramModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

In [25]:
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [26]:
for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in trigrams:
        model.zero_grad()
        
        context_idxs = [word2idx[w] for w in context]
        context_var = autograd.Variable(torch.LongTensor(context_idxs))
        
        log_probs = model(context_var)
        
#         print('target', word2idx[target])
        target_var = autograd.Variable(torch.LongTensor([word2idx[target]]))
        loss = loss_fn(log_probs, target_var)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.data
    
    losses.append(total_loss)

In [27]:
# print(losses)

### Continous Bag-of-Words Model

Given a target word $w_i$ and an
$N$ context window on each side, 

$w_{i-1}, \dots, w_{i-N}$
and $w_{i+1}, \dots, w_{i+N}$, referring to all context words
collectively as $C$, CBOW tries to minimize

\begin{align}-\log p(w_i | C) = -\log \text{Softmax}(A(\sum_{w \in C} q_w) + b)\end{align}

where $q_w$ is the embedding for word $w$.

In [28]:
CONTEXT_SIZE = 2

In [29]:
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

In [30]:
vocab = set(raw_text)
vocab_size = len(vocab)

In [31]:
word2idx = {word: i for i, word in enumerate(vocab)}

In [32]:
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = [raw_text[i-2], raw_text[i-1],
               raw_text[i+1], raw_text[i+2]]
    target = raw_text[i]
    
    data.append((context, target))

In [33]:
data[:5]

[(['We', 'are', 'to', 'study'], 'about'),
 (['are', 'about', 'study', 'the'], 'to'),
 (['about', 'to', 'the', 'idea'], 'study'),
 (['to', 'study', 'idea', 'of'], 'the'),
 (['study', 'the', 'of', 'a'], 'idea')]

In [34]:
class CBoW(nn.Module):
    def __init__(self, vocab_size, emb_dim, context_size):
        super(CBoW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, emb_dim)
        self.linear1 = nn.Linear(context_size*2*emb_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        
        log_probs = F.log_softmax(out)
        return log_probs

In [41]:
def make_context_vector(context, word2idx):
    idxs = [word2idx[w] for w in context]
    tensor = torch.LongTensor(idxs).cuda()
    return autograd.Variable(tensor)

In [42]:
losses = []
loss_fn = nn.NLLLoss()

In [43]:
model = CBoW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
model.cuda()

CBoW (
  (embeddings): Embedding(49, 10)
  (linear1): Linear (40 -> 128)
  (linear2): Linear (128 -> 49)
)

In [44]:
optimizer = optim.SGD(model.parameters(), lr=1e-4)

In [45]:
for epoch in range(100):
    total_loss = torch.Tensor([0]).cuda()
    for context, target in data:
        model.zero_grad()
        
        context_var = make_context_vector(context, word2idx)
        
        log_probs = model(context_var)
        
        target_var = autograd.Variable(torch.LongTensor([word2idx[target]]).cuda())
        loss = loss_fn(log_probs, target_var)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.data
    
    losses.append(total_loss)

In [46]:
losses

[
  227.1947
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  227.0444
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  226.8942
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  226.7442
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  226.5942
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  226.4444
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  226.2946
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  226.1451
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  225.9955
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  225.8459
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  225.6965
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  225.5473
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  225.3981
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  225.2491
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  225.1001
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  224.9513
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  224.8026
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  224.6539
 [torch.cuda.Float

## Sequence Models and LSTM

### LSTM Example

In [51]:
torch.manual_seed(1)

<torch._C.Generator at 0x7f9077762600>

In [63]:
lstm = nn.LSTM(3, 3)
inputs = [autograd.Variable(torch.randn((1,3))) for _ in range(5)]; inputs

[Variable containing:
 -1.6000 -0.6771 -0.9848
 [torch.FloatTensor of size 1x3], Variable containing:
  1.8307  2.3463  0.0054
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.5591  0.0550  0.3098
 [torch.FloatTensor of size 1x3], Variable containing:
 -2.1258 -0.2598  0.3487
 [torch.FloatTensor of size 1x3], Variable containing:
 -1.5464  0.8535  0.1206
 [torch.FloatTensor of size 1x3]]

Step through the sequence one element at a time

In [66]:
# initialize hidden state
hidden = (autograd.Variable(torch.randn(1,1,3)),
          autograd.Variable(torch.randn((1,1,3))))
for i in inputs:
    out, hidden = lstm(i.view(1,1,-1), hidden)
hidden

(Variable containing:
 (0 ,.,.) = 
  -0.2146  0.0554  0.1427
 [torch.FloatTensor of size 1x1x3], Variable containing:
 (0 ,.,.) = 
  -0.5155  0.1003  0.2526
 [torch.FloatTensor of size 1x1x3])

Do entire sequence at once

In [67]:
inputs = torch.cat(inputs).view(len(inputs), 1, -1); inputs

Variable containing:
(0 ,.,.) = 
 -1.6000 -0.6771 -0.9848

(1 ,.,.) = 
  1.8307  2.3463  0.0054

(2 ,.,.) = 
 -0.5591  0.0550  0.3098

(3 ,.,.) = 
 -2.1258 -0.2598  0.3487

(4 ,.,.) = 
 -1.5464  0.8535  0.1206
[torch.FloatTensor of size 5x1x3]

In [68]:
hidden = (autograd.Variable(torch.randn(1,1,3)),
          autograd.Variable(torch.randn((1,1,3))))
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

Variable containing:
(0 ,.,.) = 
 -0.6347 -0.0698 -0.2026

(1 ,.,.) = 
 -0.3732 -0.2269  0.0138

(2 ,.,.) = 
 -0.5078 -0.1984  0.0181

(3 ,.,.) = 
 -0.4325 -0.0129  0.0019

(4 ,.,.) = 
 -0.2835  0.0734  0.1115
[torch.FloatTensor of size 5x1x3]

(Variable containing:
(0 ,.,.) = 
 -0.2835  0.0734  0.1115
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
 -0.7164  0.1307  0.1921
[torch.FloatTensor of size 1x1x3]
)


### LSTM Part-of-Speech Tagging

Let our input sentence be
$w_1, \dots, w_M$, where $w_i \in V$, our vocab. 

Also, let
$T$ be our tag set, and $y_i$ the tag of word $w_i$.
Denote our prediction of the tag of word $w_i$ by
$\hat{y}_i$.

This is a structure prediction, model, where our output is a sequence
$\hat{y}_1, \dots, \hat{y}_M$, where $\hat{y}_i \in T$.

To do the prediction, pass an LSTM over the sentence. Denote the hidden
state at timestep $i$ as $h_i$. Also, assign each tag a
unique index. Then our prediction rule for $\hat{y}_i$ is

\begin{align}\hat{y}_i = \text{argmax}_j \  (\log \text{Softmax}(Ah_i + b))_j\end{align}

That is, take the log softmax of the affine map of the hidden state,
and the predicted tag is the tag that has the maximum value in this
vector. Note this implies immediately that the dimensionality of the
target space of $A$ is $|T|$.


In [69]:
def prepare_sequence(seq, word2idx):
    idxs = [word2idx[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

In [70]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

In [75]:
word2idx = {}
for sent, tag in training_data:
    for word in sent:
        if word not in word2idx:
            word2idx[word] = len(word2idx)

In [76]:
tag2idx = {'DET': 0, 'NN': 1, 'V': 2}

In [77]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [78]:
class LSTMTagger(nn.Module):
    def __init__(self, emb_dim, hid_dim, vocab_size, target_size):
        super(LSTMTagger, self).__init__()
        self.hid_dim = hid_dim
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim)
        # linear layer that maps from hidden state to tag space
        self.hidden2tag = nn.Linear(hid_dim, target_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1,1,self.hid_dim)),
                autograd.Variable(torch.zeros(1,1,self.hid_dim)))
    
    def forward(self, sentence):
        embeds = self.word_emb(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

In [79]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx), len(tag2idx))

In [80]:
loss_fn = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

See what scores are before training

In [81]:
test_sentence = prepare_sequence(training_data[0][0], word2idx)
tag_scores = model(test_sentence); tag_scores

Variable containing:
-0.8221 -1.3920 -1.1651
-0.8030 -1.4162 -1.1731
-0.8267 -1.4007 -1.1518
-0.8091 -1.4459 -1.1418
-0.8324 -1.4931 -1.0779
[torch.FloatTensor of size 5x3]

In [83]:
for epoch in range(300):
    for sentence, tags in training_data:
        model.zero_grad()
        
        # clear out hidden state
        model.hidden = model.init_hidden()
        
        sentence_in = prepare_sequence(sentence, word2idx)
        targets = prepare_sequence(tags, tag2idx)
        
        tag_scores = model(sentence_in)
        
        loss = loss_fn(tag_scores, targets)
        loss.backward()
        optimizer.step()

In [84]:
tag_scores = model(test_sentence); tag_scores

Variable containing:
-0.0612 -2.9190 -5.2243
-3.8456 -0.0394 -4.0574
-2.3511 -2.7228 -0.1755
-0.0646 -3.3599 -3.5818
-3.9519 -0.0205 -6.8684
[torch.FloatTensor of size 5x3]