In [68]:
import math
import os
import urllib
import string

import h5py
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F

# Word Embedding

In [69]:
# Download data
if not os.path.isfile("datasets/mini.h5"):
    print("Downloading Conceptnet Numberbatch word embeddings...")
    conceptnet_url = "http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5"
    urllib.request.urlretrieve(conceptnet_url, "datasets/mini.h5")

In [70]:
# Load the file and pull out words and embeddings
with h5py.File("datasets/mini.h5", "r") as f:
    all_words = [word.decode("utf-8") for word in f["mat"]["axis1"][:]]
    all_embeddings = f["mat"]["block0_values"][:]

print(f"all_words dimensions: {len(all_words)}")
print(f"all_embeddings dimensions: {all_embeddings.shape}")

print(f"Random example word: {all_words[36289]}")

all_words dimensions: 362891
all_embeddings dimensions: (362891, 300)
Random example word: /c/en/brainwash


In [71]:
# Restrict our vocabulary to just the English words
english_words = []
english_embeddings = []
for i in range(len(all_words)):
    word = all_words[i]
    word_embedding = all_embeddings[i]
    if word.startswith("/c/en/"):
        english_words.append(word[6:])
        english_embeddings.append(word_embedding)
english_embeddings = np.array(english_embeddings)

print(f"Number of English words in all_words: {len(english_words)}")
print(f"english_embeddings dimensions: {english_embeddings.shape}")

print(f"random word: {english_words[1337]}")

Number of English words in all_words: 150875
english_embeddings dimensions: (150875, 300)
random word: activated_carbon


In [94]:
# nomalize embeddings, make all of them have norm equals to 1
norms = np.linalg.norm(english_embeddings, axis=1)
normalized_embeddings = english_embeddings.astype("float32") / norms.astype(
    "float32"
).reshape([-1, 1])

In [95]:
# fast lookup
index = {word: i for i, word in enumerate(english_words)}

In [96]:
# measure the similarity between words
def similarity_score(w1, w2):
    c1 = normalized_embeddings[index[w1], :]
    c2 = normalized_embeddings[index[w2], :]
    score = np.dot(c1, c2)
    return score


def print_similarity(w1, w2):
    print(f"{w1:10s} ~ {w2:10s} = {similarity_score(w1, w2):+.5f}")


# A word is as similar with itself as possible:
print_similarity("cat", "cat")

# Closely related words still get high scores:
print_similarity("cat", "feline")
print_similarity("cat", "dog")

# Unrelated words, not so much
print_similarity("cat", "moo")
print_similarity("cat", "freeze")

# Antonyms are still considered related, sometimes more so than synonyms
print_similarity("antonym", "opposite")
print_similarity("antonym", "synonym")

cat        ~ cat        = +1.00000
cat        ~ feline     = +0.81995
cat        ~ dog        = +0.59072
cat        ~ moo        = +0.00395
cat        ~ freeze     = -0.03023
antonym    ~ opposite   = +0.39411
antonym    ~ synonym    = +0.46884


In [34]:
def closest_to_vector(v, n):
    all_scores = np.dot(normalized_embeddings, v)
    best_words = list(map(lambda i: english_words[i], reversed(np.argsort(all_scores))))
    return best_words[:n]


def most_similar(w, n):
    return closest_to_vector(normalized_embeddings[index[w], :], n)


print(most_similar("cat", 10))
print(most_similar("dog", 10))
print(most_similar("duke", 10))

['cat', 'humane_society', 'kitten', 'feline', 'colocolo', 'cats', 'kitty', 'maine_coon', 'housecat', 'sharp_teeth']
['dog', 'dogs', 'wire_haired_dachshund', 'doggy_paddle', 'lhasa_apso', 'good_friend', 'puppy_dog', 'bichon_frise', 'woof_woof', 'golden_retrievers']
['duke', 'dukes', 'duchess', 'duchesses', 'ducal', 'dukedom', 'duchy', 'voivode', 'princes', 'prince']


In [40]:
def solve_analogy(a1, b1, a2):
    b2 = (
        normalized_embeddings[index[b1], :]
        - normalized_embeddings[index[a1], :]
        + normalized_embeddings[index[a2], :]
    )
    return closest_to_vector(b2, 1)


print(solve_analogy("grandmother", "niece", "grandfather"))
print(solve_analogy("man", "husband", "woman"))
print(solve_analogy("spain", "madrid", "france"))

['nephew']
['wife']
['paris']


# Word embeddings in deep models

In [47]:
# download data
movie_simple_file_path = "datasets/movie-simple.txt"

if not os.path.isfile(movie_simple_file_path):
    print("Downloading movie-simple.txt...")
    movie_simple_url = "https://raw.githubusercontent.com/duke-mlss/Duke-MLSS-2018/master/movie-simple.txt"
    urllib.request.urlretrieve(movie_simple_url, movie_simple_file_path)


remove_punct = str.maketrans("", "", string.punctuation)


# This function converts a line of our data file into
# a tuple (x, y), where x is 300-dimensional representation
# of the words in a review, and y is its label.
def convert_line_to_example(line):
    # Pull out the first character: that's our label (0 or 1)
    y = int(line[0])

    # Split the line into words using Python's split() function
    words = line[2:].translate(remove_punct).lower().split()

    # Look up the embeddings of each word, ignoring words not
    # in our pretrained vocabulary.
    embeddings = [normalized_embeddings[index[w]] for w in words if w in index]

    # Take the mean of the embeddings
    x = np.mean(np.vstack(embeddings), axis=0)
    return x, y


# Apply the function to each line in the file.
xs = []
ys = []
with open(movie_simple_file_path, "r", encoding="utf-8", errors="ignore") as f:
    for l in f.readlines():
        x, y = convert_line_to_example(l)
        xs.append(x)
        ys.append(y)

# Concatenate all examples into a numpy array
xs = np.vstack(xs)
ys = np.vstack(ys)

# shuffle
shuffle_idx = np.random.permutation(xs.shape[0])
xs = xs[shuffle_idx, :]
ys = ys[shuffle_idx, :]

# log
print("Shape of inputs: {}".format(xs.shape))
print("Shape of labels: {}".format(ys.shape))

Shape of inputs: (1411, 300)
Shape of labels: (1411, 1)


In [51]:
num_examples = xs.shape[0]

num_train = math.floor(0.8 * num_examples)

x_train = torch.tensor(xs[:num_train])
y_train = torch.tensor(ys[:num_train], dtype=torch.float32)

x_test = torch.tensor(xs[num_train:])
y_test = torch.tensor(ys[num_train:], dtype=torch.float32)

reviews_train = torch.utils.data.TensorDataset(x_train, y_train)
reviews_test = torch.utils.data.TensorDataset(x_test, y_test)

train_loader = torch.utils.data.DataLoader(reviews_train, batch_size=100, shuffle=True)
test_loader = torch.utils.data.DataLoader(reviews_test, batch_size=100, shuffle=False)

In [53]:
# define model
class SWEM(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(300, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x
    
## Training
# Instantiate model
model = SWEM()

# Binary cross-entropy (BCE) Loss and Adam Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Iterate through train set minibatchs 
for epoch in range(250):
    correct = 0
    num_examples = 0
    for inputs, labels in train_loader:
        # Zero out the gradients
        optimizer.zero_grad()
        
        # Forward pass
        y = model(inputs)
        loss = criterion(y, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        predictions = torch.round(torch.sigmoid(y))
        correct += torch.sum((predictions == labels).float())
        num_examples += len(inputs)
    
    # Print training progress
    if epoch % 25 == 0:
        acc = correct/num_examples
        print("Epoch: {0} \t Train Loss: {1} \t Train Acc: {2}".format(epoch, loss, acc))

## Testing
correct = 0
num_test = 0

with torch.no_grad():
    # Iterate through test set minibatchs 
    for inputs, labels in test_loader:
        # Forward pass
        y = model(inputs)
        
        predictions = torch.round(torch.sigmoid(y))
        correct += torch.sum((predictions == labels).float())
        num_test += len(inputs)
    
print('Test accuracy: {}'.format(correct/num_test))

Epoch: 0 	 Train Loss: 0.6855358481407166 	 Train Acc: 0.5460993051528931
Epoch: 25 	 Train Loss: 0.18884149193763733 	 Train Acc: 0.9521276354789734
Epoch: 50 	 Train Loss: 0.1113380566239357 	 Train Acc: 0.9689716100692749
Epoch: 75 	 Train Loss: 0.08995165675878525 	 Train Acc: 0.9751772880554199
Epoch: 100 	 Train Loss: 0.05221224203705788 	 Train Acc: 0.9796099066734314
Epoch: 125 	 Train Loss: 0.019935395568609238 	 Train Acc: 0.9858155846595764
Epoch: 150 	 Train Loss: 0.008389944210648537 	 Train Acc: 0.9867021441459656
Epoch: 175 	 Train Loss: 0.05249840021133423 	 Train Acc: 0.9902482032775879
Epoch: 200 	 Train Loss: 0.01095837913453579 	 Train Acc: 0.9946808218955994
Epoch: 225 	 Train Loss: 0.033099930733442307 	 Train Acc: 0.9982269406318665
Test accuracy: 0.9434629082679749


In [67]:
# Check some words
words_to_test = ["interesting", "brokeback", "bad", "what"]

for word in words_to_test:
    x = torch.tensor(normalized_embeddings[index[word]].reshape(1, 300))
    print("Sentiment of the word '{0}': {1}".format(word, round(float(torch.sigmoid(model(x))[0][0]))))

Sentiment of the word 'interesting': 1
Sentiment of the word 'brokeback': 1
Sentiment of the word 'bad': 0
Sentiment of the word 'what': 1


# Learning Word Embeddings

In [98]:
# define vocabulary and space vector
VOCAB_SIZE = 5000
EMBED_DIM = 300

embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)

print(f"size of the embedding weights: {embedding.weight.size()}")

size of the embedding weights: torch.Size([5000, 300])


In [99]:
class SWEMWithEmbeddings(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_dim, num_outputs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_outputs)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=0)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x


model = SWEMWithEmbeddings(
    vocab_size=5000,
    embedding_size=300,
    hidden_dim=64,
    num_outputs=1,
)
print(model)

SWEMWithEmbeddings(
  (embedding): Embedding(5000, 300)
  (fc1): Linear(in_features=300, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)


# RNN

In [100]:
mb = 1
x_dim = 300 
sentence = ["recurrent", "neural", "networks", "are", "great"]

xs = []
for word in sentence:
    xs.append(torch.tensor(normalized_embeddings[index[word]]).view(1, x_dim))
    
xs = torch.stack(xs, dim=0)
print("xs shape: {}".format(xs.shape))

xs shape: torch.Size([5, 1, 300])


In [101]:
h_dim = 128

# For projecting the input
Wx = torch.randn(x_dim, h_dim)/np.sqrt(x_dim)
Wx.requires_grad_()
bx = torch.zeros(h_dim, requires_grad=True)

# For projecting the previous state
Wh = torch.randn(h_dim, h_dim)/np.sqrt(h_dim)
Wh.requires_grad_()
bh = torch.zeros(h_dim, requires_grad=True)

print(Wx.shape, bx.shape, Wh.shape, bh.shape)

torch.Size([300, 128]) torch.Size([128]) torch.Size([128, 128]) torch.Size([128])


In [103]:
def RNN_step(x, h):
    return torch.tanh((torch.matmul(x, Wx) + bx) + (torch.matmul(h, Wh) + bh))


# Word embedding for first word
x1 = xs[0, :, :]

# Initialize hidden state to 0
h0 = torch.zeros([mb, h_dim])

# Forward pass of one RNN step for time step t=1
h1 = RNN_step(x1, h0)
print("Hidden state h1 dimensions: {0}".format(h1.shape))

# Word embedding for second word
x2 = xs[1, :, :]

# Forward pass of one RNN step for time step t=2
h2 = RNN_step(x2, h1)
print("Hidden state h2 dimensions: {0}".format(h2.shape))

Hidden state h1 dimensions: torch.Size([1, 128])
Hidden state h2 dimensions: torch.Size([1, 128])


In [105]:
rnn = nn.RNN(x_dim, h_dim)
print("RNN parameter shapes: {}".format([p.shape for p in rnn.parameters()]))

hs, h_T = rnn(xs)

print("Hidden states shape: {}".format(hs.shape))
print("Final hidden state shape: {}".format(h_T.shape))

RNN parameter shapes: [torch.Size([128, 300]), torch.Size([128, 128]), torch.Size([128]), torch.Size([128])]
Hidden states shape: torch.Size([5, 1, 128])
Final hidden state shape: torch.Size([1, 1, 128])


In [106]:
lstm = nn.LSTM(x_dim, h_dim)
print("LSTM parameters: {}".format([p.shape for p in lstm.parameters()]))

gru = nn.GRU(x_dim, h_dim)
print("GRU parameters: {}".format([p.shape for p in gru.parameters()]))

LSTM parameters: [torch.Size([512, 300]), torch.Size([512, 128]), torch.Size([512]), torch.Size([512])]
GRU parameters: [torch.Size([384, 300]), torch.Size([384, 128]), torch.Size([384]), torch.Size([384])]
