<a href="https://colab.research.google.com/github/maxwelljohn/siamese-word2vec/blob/master/siamese.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

import datetime
import itertools
import nltk
import os
import random
import skimage.transform
import sys
import time
import unittest
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet as wn
from keras.utils.data_utils import get_file
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda
from keras.optimizers import RMSprop
from keras import backend as K
from keras import regularizers
from keras.constraints import non_neg
from scipy.misc import imread

!pip install scikit-optimize
import skopt

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using TensorFlow backend.


Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/f4/44/60f82c97d1caa98752c7da2c1681cab5c7a390a0fdd3a55fac672b321cac/scikit_optimize-0.5.2-py2.py3-none-any.whl (74kB)
[K    100% |████████████████████████████████| 81kB 3.8MB/s 
Installing collected packages: scikit-optimize
Successfully installed scikit-optimize-0.5.2


In [0]:
def similarity(a, b):
    a, b = np.ravel(a), np.ravel(b)
    # Cosine similarity
    return np.dot(a, b) / max(np.linalg.norm(a) * np.linalg.norm(b), sys.float_info.epsilon)


def keras_norm(vect):
    return K.sqrt(K.batch_dot(vect, vect, axes=1))


def keras_similarity(vects):
    x, y = vects
    # Cosine similarity
    return K.batch_dot(x, y, axes=1) / K.maximum(keras_norm(x) * keras_norm(y), K.epsilon())


def sim_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def create_base_network(input_shape, output_size=128, map_reg_rate=0, scale_reg_rate=0):
    '''Base network to be shared (eq. to feature extraction).
    '''
    input = Input(shape=input_shape)
    x = input
    x = Dense(output_size, kernel_regularizer=regularizers.l2(map_reg_rate))(x)
    x = Dense(output_size, kernel_initializer='identity', kernel_constraint=non_neg(),
              kernel_regularizer=regularizers.l2(scale_reg_rate), use_bias=False)(x)
    return Model(input, x)


def syn_accuracy(y_true, y_pred):
    '''Compute synonym classification accuracy with a variable threshold on similarities.
    '''
    median = np.median(y_pred)
    pred = y_pred.ravel() > median
    return np.mean(pred == y_true)


def ant_accuracy(y_true, y_pred):
    '''Compute antonym classification accuracy with a variable threshold on similarities.
    '''
    median = np.median(y_pred)
    pred = -((y_pred.ravel() < median).astype(np.int, casting='safe', copy=False))
    return np.mean(pred == y_true)


def keras_syn_accuracy(y_true, y_pred):
    '''Compute synonym classification accuracy with a fixed threshold on similarities.
    '''
    return K.mean(K.equal(y_true, K.cast(y_pred > 0.75, y_true.dtype)))


def keras_ant_accuracy(y_true, y_pred):
    '''Compute antonym classification accuracy with a fixed threshold on similarities.
    '''
    return K.mean(K.equal(y_true, K.cast(y_pred < -0.75, y_true.dtype)))

In [3]:
!unzip -u glove.6B.zip || (wget http://nlp.stanford.edu/data/glove.6B.zip && unzip -u glove.6B.zip)

unzip:  cannot find or open glove.6B.zip, glove.6B.zip.zip or glove.6B.zip.ZIP.
--2019-01-25 22:36:07--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-01-25 22:36:07--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-01-25 22:37:36 (9.30 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
# 50K words is enough for 95% of English word usage, per the OED:
# https://web.archive.org/web/20160304170936/http://www.oxforddictionaries.com/words/the-oec-facts-about-the-language
# The GloVe text files appear to be roughly sorted by frequency of usage.
!egrep '^[a-z]+ ' glove.6B.300d.txt > glove.head.txt
!cut -d' ' -f1 glove.head.txt > glove.head.strings.txt
!cut -d' ' -f2- glove.head.txt > glove.head.vectors.txt

In [5]:
!wc glove.6B.300d.txt

    400000  120400000 1037962819 glove.6B.300d.txt


In [0]:
word_strings = np.loadtxt('glove.head.strings.txt', dtype=object)
word_vectors = np.loadtxt('glove.head.vectors.txt')
input_shape = word_vectors.shape[1:]
assert len(word_strings) == len(word_vectors)

index_for_word = {}
for i, word in enumerate(word_strings):
    index_for_word[word] = i

In [7]:
wnl = nltk.WordNetLemmatizer()


def might_be_synonyms(w1, w2):
    s1 = set()
    d1 = set()
    s2 = set()
    d2 = set()
    for synset in wn.synsets(w1):
        lemma_names = synset.lemma_names()
        s1.update(lemma_names)
        d1.update(nltk.word_tokenize(synset.definition()))
    for synset in wn.synsets(w2):
        lemma_names = synset.lemma_names()
        s2.update(lemma_names)
        d2.update(nltk.word_tokenize(synset.definition()))
    total_intersection = len(s1.intersection(s2)) + len(d1.intersection(s2)) + len(d2.intersection(s1))
    return total_intersection > 0


class TestSynonyms(unittest.TestCase):
    def test_true(self):
        pairs = [
            ('car', 'auto'),
            ('auto', 'car'),
            ('car', 'railcar'),
            ('small', 'tiny'),
            ('small', 'miniature'),
        ]
        for word1, word2 in pairs:
            self.assertTrue(might_be_synonyms(word1, word2))
    def test_false(self):
        pairs = [
            ('car', 'airplane'),
            ('car', 'fast'),
            ('small', 'focused'),
            ('small', 'accidental'),
            ('small', 'rabbit'),
            ('small', 'flippant'),
        ]
        for word1, word2 in pairs:
            self.assertFalse(might_be_synonyms(word1, word2))


def get_antonyms(word):
    result = set()
    for lemma in wn.lemmas(word):
        for antonym in lemma.antonyms():
            result.add(antonym.name())
    return result


def might_be_antonyms(w1, w2):
    w1_antonyms = get_antonyms(w1)
    w2_antonyms = get_antonyms(w2)
    for w1_antonym in w1_antonyms:
        if might_be_synonyms(w2, w1_antonym):
            return True
    for w2_antonym in w2_antonyms:
        if might_be_synonyms(w1, w2_antonym):
            return True


class TestAntonyms(unittest.TestCase):
    def test_get_antonyms(self):
        self.assertEqual(get_antonyms('big'), set(['little', 'small']))
        self.assertEqual(get_antonyms('fast'), set(['slow']))
        self.assertEqual(get_antonyms('big'), set(['little', 'small']))
    def test_true(self):
        pairs = [
            ('big', 'small'),
            ('big', 'minor'),
            ('big', 'tiny'),
            ('big', 'miniature'),
            ('fast', 'slow'),
            ('fast', 'sluggish'),
            ('loud', 'soft'),
            ('loud', 'quiet'),
        ]
        for word1, word2 in pairs:
            self.assertTrue(might_be_antonyms(word1, word2))
    def test_false(self):
        pairs = [
            ('big', 'huge'),
            ('big', 'clean'),
            ('fast', 'speedy'),
            ('fast', 'unusual'),
            ('loud', 'noisy'),
            ('loud', 'flat'),
        ]
        for word1, word2 in pairs:
            self.assertFalse(might_be_antonyms(word1, word2))


unittest.main(argv=['first-arg-is-ignored'], exit=False)


def create_pairs(word_strings, word_vectors, class_indices, pos_label, max_per_class=float('infinity')):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.
    '''
    string_pairs = []
    vector_pairs = []
    labels = []
    for family in class_indices:
        sibling_pairs = np.array(list(itertools.combinations(family, 2)))

        shuffled_indices = np.arange(len(word_strings))
        np.random.shuffle(shuffled_indices)
        next_index = 0

        if len(sibling_pairs) > max_per_class:
            sibling_pairs = sibling_pairs[np.random.choice(len(sibling_pairs), max_per_class)]

        for sibling_pair in sibling_pairs:
            np.random.shuffle(sibling_pair)
            anchor, pos = sibling_pair
            string_pairs.append([word_strings[anchor], word_strings[pos]])
            vector_pairs.append([word_vectors[anchor], word_vectors[pos]])
            labels.append(pos_label)
            
            random_neg = None
            while random_neg == None or random_neg in family or \
                    might_be_synonyms(word_strings[anchor], word_strings[random_neg]) or \
                    might_be_antonyms(word_strings[anchor], word_strings[random_neg]):
                if next_index >= len(shuffled_indices):
                    np.random.shuffle(shuffled_indices)
                    next_index = 0
                random_neg = shuffled_indices[next_index]
                next_index += 1

            string_pairs.append([word_strings[anchor], word_strings[random_neg]])
            vector_pairs.append([word_vectors[anchor], word_vectors[random_neg]])
            labels.append(0)
    assert len(string_pairs) == len(vector_pairs) == len(labels)
    return np.array(string_pairs), np.array(vector_pairs), np.array(labels)


def create_train_dev_test(word_strings, word_vectors, class_count, word_class, pos_label):
    classes = list(range(class_count))
    random.seed(850101)
    random.shuffle(classes)
    random.seed()
    train_dev_split = round(len(classes)*0.6)
    dev_test_split = round(len(classes)*0.8)
    train_classes = classes[:train_dev_split]
    dev_classes = classes[train_dev_split:dev_test_split]
    test_classes = classes[dev_test_split:]
    assert len(set(train_classes).intersection(set(dev_classes))) == 0
    assert len(set(dev_classes).intersection(set(test_classes))) == 0
    assert len(set(train_classes).intersection(set(test_classes))) == 0

    class_indices = [np.where(word_class == c)[0] for c in train_classes]
    tr_strings, tr_pairs, tr_y = create_pairs(word_strings, word_vectors, class_indices, pos_label)

    class_indices = [np.where(word_class == c)[0] for c in dev_classes]
    dev_strings, dev_pairs, dev_y = create_pairs(word_strings, word_vectors, class_indices, pos_label)

    class_indices = [np.where(word_class == c)[0] for c in test_classes]
    te_strings, te_pairs, te_y = create_pairs(word_strings, word_vectors, class_indices, pos_label)
    
    return tr_strings, tr_pairs, tr_y, dev_strings, dev_pairs, dev_y, te_strings, te_pairs, te_y

.....
----------------------------------------------------------------------
Ran 5 tests in 2.313s

OK


In [0]:
syn_class = -np.ones(len(word_strings), dtype=np.int)
shuffled_indices = np.arange(len(word_strings))
np.random.shuffle(shuffled_indices)
class_count = 0
for i in shuffled_indices:
    word = word_strings[i]
    synsets = wn.synsets(word)
    if synsets:
        syn_indices = [index_for_word[syn] for syn in synsets[0].lemma_names() if syn in index_for_word and syn_class[index_for_word[syn]] == -1]
        if len(syn_indices) > 1:
            syn_class[syn_indices] = class_count
            class_count += 1

tr_syn_strings, tr_syn_pairs, tr_syn_y, \
dev_syn_strings, dev_syn_pairs, dev_syn_y, \
te_syn_strings, te_syn_pairs, te_syn_y = create_train_dev_test(word_strings, word_vectors, class_count, syn_class, 1)

In [0]:
ant_class = -np.ones(len(word_strings), dtype=np.int)
shuffled_indices = np.arange(len(word_strings))
np.random.shuffle(shuffled_indices)
class_count = 0
for i in shuffled_indices:
    word = word_strings[i]
    antonyms = get_antonyms(word)
    if antonyms:
        known_antonyms = [a for a in antonyms if a in index_for_word]
        if known_antonyms:
            ant_indices = [i, index_for_word[known_antonyms[0]]]
            if all(ant_class[ant_indices] == -1):
                ant_class[ant_indices] = class_count
                class_count += 1

tr_ant_strings, tr_ant_pairs, tr_ant_y, \
dev_ant_strings, dev_ant_pairs, dev_ant_y, \
te_ant_strings, te_ant_pairs, te_ant_y = create_train_dev_test(word_strings, word_vectors, class_count, ant_class, -1)

In [0]:
def fit_model(tr_data, output_size, optimizer, batch_size, epochs, map_reg_rate, scale_reg_rate, deviation_dropoff):
    # network definition
    base_network = create_base_network(input_shape, output_size)

    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)

    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)

    distance = Lambda(keras_similarity,
                      output_shape=sim_output_shape)([processed_a, processed_b])

    model = Model([input_a, input_b], distance)

    def contrastive_sim_loss(y_true, y_pred):
        return K.mean(-y_true * y_pred +
                      K.cast(K.equal(y_true, 0), 'float32') * (K.abs(y_pred) ** deviation_dropoff))

    # train
    if optimizer == 'rms':
        opt = RMSprop()
    else:
        raise ValueError("unknown optimizer")
    model.compile(loss=contrastive_sim_loss, optimizer=opt)
    
    for tr_pairs, tr_y in tr_data:
        model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=0)

    return model

def eval_model(model):
    y_pred = model.predict([tr_syn_pairs[:, 0], tr_syn_pairs[:, 1]])
    syn_tr_acc = syn_accuracy(tr_syn_y, y_pred)
    y_pred = model.predict([dev_syn_pairs[:, 0], dev_syn_pairs[:, 1]])
    syn_dev_acc = syn_accuracy(dev_syn_y, y_pred)

    y_pred = model.predict([tr_ant_pairs[:, 0], tr_ant_pairs[:, 1]])
    ant_tr_acc = ant_accuracy(tr_ant_y, y_pred)
    y_pred = model.predict([dev_ant_pairs[:, 0], dev_ant_pairs[:, 1]])
    ant_dev_acc = ant_accuracy(dev_ant_y, y_pred)

    print('* Accuracy on synonym training set: %0.2f%%' % (100 * syn_tr_acc))
    print('* Accuracy on synonym dev set: %0.2f%%' % (100 * syn_dev_acc))
    print('* Accuracy on antonym training set: %0.2f%%' % (100 * ant_tr_acc))
    print('* Accuracy on antonym dev set: %0.2f%%' % (100 * ant_dev_acc))
    return syn_tr_acc, syn_dev_acc, ant_tr_acc, ant_dev_acc

def map_word_vectors(model):
    base_network = model.layers[2]
    l1_weights = base_network.layers[1].get_weights()[0]
    l1_biases = base_network.layers[1].get_weights()[1]
    l2_weights = base_network.layers[2].get_weights()[0]
    mapped = (word_vectors @ l1_weights + l1_biases) @ l2_weights
    mapped_2 = (l2_weights.T @ (l1_weights.T @ word_vectors.T + l1_biases.reshape((-1, 1)))).T
    assert np.isclose(mapped, mapped_2).all()
    return mapped

In [29]:
baseline_similarities = np.array([similarity(p[0], p[1]) for p in dev_syn_pairs])
print('* Baseline synonym accuracy on dev set: %0.2f%%' % (100 * syn_accuracy(dev_syn_y, baseline_similarities)))
baseline_similarities = np.array([similarity(p[0], p[1]) for p in dev_ant_pairs])
print('* Baseline antonym accuracy on dev set: %0.2f%%' % (100 * ant_accuracy(dev_ant_y, baseline_similarities)))

* Baseline synonym accuracy on dev set: 74.65%
* Baseline antonym accuracy on dev set: 9.90%


In [30]:
print('Training on synonyms...')
model = fit_model([(tr_syn_pairs, tr_syn_y)], 250, 'rms', 256, 30, 0.01, 0.00001, 20)
eval_model(model)
print('Training on antonyms...')
model = fit_model([(tr_ant_pairs, tr_ant_y)], 250, 'rms', 256, 30, 0.01, 0.00001, 20)
eval_model(model)
print('Training on synonyms & antonyms...')
model = fit_model([(np.vstack((tr_syn_pairs, tr_ant_pairs)), np.hstack((tr_syn_y, tr_ant_y)))], 250, 'rms', 256, 30, 0.01, 0.00001, 20)
eval_model(model)
print('Training on synonyms, then antonyms...')
model = fit_model([(tr_syn_pairs, tr_syn_y), (tr_ant_pairs, tr_ant_y)], 250, 'rms', 256, 30, 0.01, 0.00001, 20)
eval_model(model)
map_word_vectors(model)

Training on synonyms...
* Accuracy on synonym training set: 91.94%
* Accuracy on synonym dev set: 84.54%
* Accuracy on antonym training set: 10.38%
* Accuracy on antonym dev set: 9.65%
Training on antonyms...
* Accuracy on synonym training set: 58.74%
* Accuracy on synonym dev set: 58.74%
* Accuracy on antonym training set: 74.46%
* Accuracy on antonym dev set: 58.42%
Training on synonyms & antonyms...
* Accuracy on synonym training set: 90.23%
* Accuracy on synonym dev set: 82.90%
* Accuracy on antonym training set: 25.95%
* Accuracy on antonym dev set: 21.78%
Training on synonyms, then antonyms...
* Accuracy on synonym training set: 59.10%
* Accuracy on synonym dev set: 59.17%
* Accuracy on antonym training set: 72.32%
* Accuracy on antonym dev set: 58.66%


array([[-6.77601418e+00, -4.27214551e+00, -5.53955442e+00, ...,
        -5.40324240e+00, -4.75650413e+00, -3.01095209e+00],
       [-3.23905499e+00, -2.26440199e+00, -3.70610044e+00, ...,
        -3.11671756e+00, -2.11191786e+00, -9.20262692e-01],
       [-2.36944889e+00, -1.14235824e+00, -2.40405039e+00, ...,
        -2.28590647e+00, -1.82995304e+00, -5.34043462e-01],
       ...,
       [ 1.69680255e+00,  2.00927996e-03,  2.79827937e+00, ...,
         1.74249077e+00,  1.62535258e-01, -1.98569399e+00],
       [ 6.18807477e+00,  8.27245955e-01,  3.29448117e+00, ...,
         4.67244509e+00,  3.68744692e+00,  2.68172217e-01],
       [ 2.88924212e+00, -2.22860132e-01,  1.49813368e+00, ...,
         2.35710256e+00,  1.69267762e+00,  5.51473068e-02]])

In [36]:
space = [
    skopt.space.Integer(1, 300, name='output_size'),
    skopt.space.Categorical(['rms'], name='optimizer'),
    skopt.space.Categorical([128, 256, 512], name='batch_size'),
    skopt.space.Categorical([30], name='epochs'),
    skopt.space.Real(0.000000001, 1000, prior='log-uniform', name='map_reg_rate'),
    skopt.space.Real(0.000000001, 1000, prior='log-uniform', name='scale_reg_rate'),
    skopt.space.Integer(1, 50, name='deviation_dropoff'),
]

@skopt.utils.use_named_args(space)
def objective(output_size, optimizer, batch_size, epochs, map_reg_rate, scale_reg_rate, deviation_dropoff):
    start_time = datetime.datetime.now()
    model = fit_model([(tr_syn_pairs, tr_syn_y)], output_size, optimizer, batch_size, epochs, map_reg_rate, scale_reg_rate, deviation_dropoff)

    print('* Optimization took {:.0f} seconds'.format((datetime.datetime.now() - start_time).total_seconds()))
    syn_tr_acc, syn_dev_acc, ant_tr_acc, ant_dev_acc = eval_model(model)
    print('')
    return -syn_dev_acc

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

checkpoint_filename = "{}-checkpoint.pkl".format(time.asctime())
checkpoint_filepath = os.path.join(os.curdir, checkpoint_filename)

def backup(res):
    skopt.dump(res, checkpoint_filepath)
    uploaded = drive.CreateFile({'title': checkpoint_filename})
    uploaded.SetContentFile(checkpoint_filepath)
    uploaded.Upload()
    print('Uploaded file with ID {}'.format(uploaded.get('id')))

x0 = [270, 'rms', 256, 30, 0.01, 0.01, 14]
res = skopt.gp_minimize(objective, space, x0=x0, n_calls=12, callback=[backup])
res.x

* Optimization took 17 seconds
* Accuracy on synonym training set: 91.95%
* Accuracy on synonym dev set: 84.68%
* Accuracy on antonym training set: 10.05%
* Accuracy on antonym dev set: 8.66%

Uploaded file with ID 1ponF9b9TAu2PvOLNXL4QAiQyyyasPqAQ
* Optimization took 16 seconds
* Accuracy on synonym training set: 89.41%
* Accuracy on synonym dev set: 84.83%
* Accuracy on antonym training set: 9.56%
* Accuracy on antonym dev set: 9.41%

Uploaded file with ID 1pLX53JyN57KEUt14sVXdKCKxd0nZRfD6
* Optimization took 11 seconds
* Accuracy on synonym training set: 90.62%
* Accuracy on synonym dev set: 84.54%
* Accuracy on antonym training set: 10.05%
* Accuracy on antonym dev set: 9.90%

Uploaded file with ID 19OPO04Dfmy2PpUqeVIuoGjmciDMNHQ8z
* Optimization took 28 seconds
* Accuracy on synonym training set: 90.12%
* Accuracy on synonym dev set: 84.73%
* Accuracy on antonym training set: 10.13%
* Accuracy on antonym dev set: 9.90%

Uploaded file with ID 1J5b8uxQ0XpOGvcXanKFzO0En2aa6UNJ9
* Opt

[300, 'rms', 128, 30, 1e-09, 1000.0, 1]