[View in Colaboratory](https://colab.research.google.com/github/maxwelljohn/siamese-word2vec/blob/master/siamese.ipynb)

In [14]:
import numpy as np

import datetime
import itertools
import nltk
import os
import random
import skimage.transform
import sys
import time
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from keras.utils.data_utils import get_file
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda
from keras.optimizers import RMSprop
from keras import backend as K
from keras import regularizers
from keras.constraints import non_neg
from scipy.misc import imread

!pip install scikit-optimize
import skopt

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
def similarity(a, b):
    a, b = np.ravel(a), np.ravel(b)
    # Cosine similarity
    return np.dot(a, b) / max(np.linalg.norm(a) * np.linalg.norm(b), sys.float_info.epsilon)


def keras_norm(vect):
    return K.sqrt(K.batch_dot(vect, vect, axes=1))


def keras_similarity(vects):
    x, y = vects
    # Cosine similarity
    return K.batch_dot(x, y, axes=1) / K.maximum(keras_norm(x) * keras_norm(y), K.epsilon())


def sim_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def create_base_network(input_shape, output_size=128, map_reg_rate=0, scale_reg_rate=0):
    '''Base network to be shared (eq. to feature extraction).
    '''
    input = Input(shape=input_shape)
    x = input
    x = Dense(output_size, kernel_regularizer=regularizers.l2(map_reg_rate))(x)
    x = Dense(output_size, kernel_initializer='identity', kernel_constraint=non_neg(),
              kernel_regularizer=regularizers.l2(scale_reg_rate), use_bias=False)(x)
    return Model(input, x)


def accuracy(y_true, y_pred):
    '''Compute classification accuracy with a variable threshold on similarities.
    '''
    median = np.median(y_pred)
    pred = y_pred.ravel() > median
    return np.mean(pred == y_true)


def keras_accuracy(y_true, y_pred):
    '''Compute classification accuracy with a fixed threshold on similarities.
    '''
    return K.mean(K.equal(y_true, K.cast(y_pred > 0.75, y_true.dtype)))

In [16]:
!unzip -u glove.6B.zip || (wget http://nlp.stanford.edu/data/glove.6B.zip && unzip -u glove.6B.zip)

Archive:  glove.6B.zip


In [0]:
# 50K words is enough for 95% of English word usage per the OED:
# https://web.archive.org/web/20160304170936/http://www.oxforddictionaries.com/words/the-oec-facts-about-the-language
# The GloVe text files appear to be roughly sorted by frequency of usage.
!egrep '^[a-z]+ ' glove.6B.300d.txt > glove.head.txt
!cut -d' ' -f1 glove.head.txt > glove.head.strings.txt
!cut -d' ' -f2- glove.head.txt > glove.head.vectors.txt

In [0]:
word_strings = np.loadtxt('glove.head.strings.txt', dtype=object)
word_vectors = np.loadtxt('glove.head.vectors.txt')
input_shape = word_vectors.shape[1:]
assert len(word_strings) == len(word_vectors)

index_for_word = {}
for i, word in enumerate(word_strings):
    index_for_word[word] = i

In [0]:
wnl = nltk.WordNetLemmatizer()
def are_synonyms(word1, word2):
    lemma2 = wnl.lemmatize(word2)
    for synset in wn.synsets(word1):
        lemma_names = synset.lemma_names()
        if word2 in lemma_names or lemma2 in lemma_names:
            return True
    return False
assert are_synonyms('car', 'auto')
assert are_synonyms('auto', 'car')
assert are_synonyms('car', 'railcar')
assert not are_synonyms('car', 'airplane')


def create_pairs(word_strings, word_vectors, class_indices, choose_hard_negs=1, max_per_class=float('infinity')):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.
    '''
    string_pairs = []
    vector_pairs = []
    labels = []
    for family in class_indices:
        sibling_pairs = np.array(list(itertools.combinations(family, 2)))

        shuffled_indices = np.arange(len(word_strings))
        np.random.shuffle(shuffled_indices)
        next_index = 0
        num_outside_family = len(shuffled_indices) - len(family)
        assert choose_hard_negs < num_outside_family

        if len(sibling_pairs) > max_per_class:
            sibling_pairs = sibling_pairs[np.random.choice(len(sibling_pairs), max_per_class)]

        for sibling_pair in sibling_pairs:
            np.random.shuffle(sibling_pair)
            anchor, pos = sibling_pair
            string_pairs.append([word_strings[anchor], word_strings[pos]])
            vector_pairs.append([word_vectors[anchor], word_vectors[pos]])
            labels.append(1)

            hardest_neg = None
            closest_similarity = float('-infinity')
            candidates = 0
            while candidates < choose_hard_negs:
                try:
                    random_neg = shuffled_indices[next_index]
                    next_index += 1
                    while random_neg in family or are_synonyms(word_strings[anchor], word_strings[random_neg]):
                        random_neg = shuffled_indices[next_index]
                        next_index += 1
                except IndexError:
                    np.random.shuffle(shuffled_indices)
                    next_index = 0
                    continue
                sim = similarity(word_vectors[anchor], word_vectors[random_neg])
                if sim > closest_similarity:
                    hardest_neg = random_neg
                    closest_similarity = sim
                candidates += 1
            string_pairs.append([word_strings[anchor], word_strings[hardest_neg]])
            vector_pairs.append([word_vectors[anchor], word_vectors[hardest_neg]])
            labels.append(0)
    assert len(string_pairs) == len(vector_pairs) == len(labels)
    return np.array(string_pairs), np.array(vector_pairs), np.array(labels)

In [0]:
syn_class = -np.ones(len(word_strings), dtype=np.int)
shuffled_indices = np.arange(len(word_strings))
np.random.shuffle(shuffled_indices)
class_num = 0
for i in shuffled_indices:
    word = word_strings[i]
    synsets = wn.synsets(word)
    if synsets:
        syn_indices = [index_for_word[syn] for syn in synsets[0].lemma_names() if syn in index_for_word and syn_class[index_for_word[syn]] == -1]
        if len(syn_indices) > 1:
            syn_class[syn_indices] = class_num
            class_num += 1

In [0]:
classes = list(range(class_num))
random.shuffle(classes)
train_dev_split = round(len(classes)*0.6)
dev_test_split = round(len(classes)*0.8)
train_classes = classes[:train_dev_split]
dev_classes = classes[train_dev_split:dev_test_split]
test_classes = classes[dev_test_split:]
assert len(set(train_classes).intersection(set(dev_classes))) == 0
assert len(set(dev_classes).intersection(set(test_classes))) == 0
assert len(set(train_classes).intersection(set(test_classes))) == 0

class_indices = [np.where(syn_class == c)[0] for c in train_classes]
tr_strings, tr_pairs, tr_y = create_pairs(word_strings, word_vectors, class_indices, 5)

class_indices = [np.where(syn_class == c)[0] for c in dev_classes]
dev_strings, dev_pairs, dev_y = create_pairs(word_strings, word_vectors, class_indices)

class_indices = [np.where(syn_class == c)[0] for c in test_classes]
te_strings, te_pairs, te_y = create_pairs(word_strings, word_vectors, class_indices)

In [0]:
space = [
    skopt.space.Integer(1, 300, name='output_size'),
    skopt.space.Categorical(['rms'], name='optimizer'),
    skopt.space.Categorical([128, 256, 512], name='batch_size'),
    skopt.space.Categorical([30], name='epochs'),
    skopt.space.Real(0.000000001, 1000, prior='log-uniform', name='map_reg_rate'),
    skopt.space.Real(0.000000001, 1000, prior='log-uniform', name='scale_reg_rate'),
    skopt.space.Integer(1, 50, name='deviation_dropoff'),
]

@skopt.utils.use_named_args(space)
def objective(output_size, optimizer, batch_size, epochs, map_reg_rate, scale_reg_rate, deviation_dropoff):
    print(locals())
    start_time = datetime.datetime.now()
    
    # network definition
    base_network = create_base_network(input_shape, output_size)

    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)

    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)

    distance = Lambda(keras_similarity,
                      output_shape=sim_output_shape)([processed_a, processed_b])

    model = Model([input_a, input_b], distance)

    def contrastive_sim_loss(y_true, y_pred):
        return K.mean(y_true * (-y_pred + 1) +
                      (1 - y_true) * (K.maximum(y_pred, 0) ** deviation_dropoff))
    
    # train
    if optimizer == 'rms':
        opt = RMSprop()
    else:
        raise ValueError("unknown optimizer")
    model.compile(loss=contrastive_sim_loss, optimizer=opt, metrics=[keras_accuracy])
    model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
        validation_data=([dev_pairs[:, 0], dev_pairs[:, 1]], dev_y))
    
    # compute final accuracy on training and test sets
    y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
    tr_acc = accuracy(tr_y, y_pred)
    y_pred = model.predict([dev_pairs[:, 0], dev_pairs[:, 1]])
    dev_acc = accuracy(dev_y, y_pred)

    print('* Optimization took {:.0f} seconds'.format((datetime.datetime.now() - start_time).total_seconds()))
    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
    print('* Accuracy on dev set: %0.2f%%' % (100 * dev_acc))
    
    return -dev_acc

x0 = [270, 'rms', 256, 30, 0.01, 0.01, 14]

In [23]:
baseline_similarities = np.array([similarity(p[0], p[1]) for p in dev_pairs])
print('* Baseline accuracy on dev set: %0.2f%%' % (100 * accuracy(dev_y, baseline_similarities)))

* Baseline accuracy on dev set: 77.06%


In [24]:
checkpoint_filename = "{}-checkpoint.pkl".format(time.asctime())
checkpoint_filepath = os.path.join(os.curdir, checkpoint_filename)

def backup(res):
    skopt.dump(res, checkpoint_filepath)
    uploaded = drive.CreateFile({'title': checkpoint_filename})
    uploaded.SetContentFile(checkpoint_filepath)
    uploaded.Upload()
    print('Uploaded file with ID {}'.format(uploaded.get('id')))

res = skopt.gp_minimize(objective, space, x0=x0, n_calls=25, callback=[backup])

{'scale_reg_rate': 0.01, 'map_reg_rate': 0.01, 'epochs': 30, 'batch_size': 256, 'optimizer': 'rms', 'output_size': 270, 'deviation_dropoff': 14}
* Optimization took 27 seconds
* Accuracy on training set: 89.33%
* Accuracy on dev set: 84.11%
Uploaded file with ID 1jbrSMGRuqyXass8jn9hZHlbtKC5AbLNq
{'scale_reg_rate': 9.778344957500946e-09, 'map_reg_rate': 5.7670991626034244e-05, 'epochs': 30, 'batch_size': 512, 'optimizer': 'rms', 'output_size': 176, 'deviation_dropoff': 30}
* Optimization took 17 seconds
* Accuracy on training set: 89.56%
* Accuracy on dev set: 84.01%
Uploaded file with ID 1GST_kK5W6o0nOD02DW8ecXa1WMJnj3cG
{'scale_reg_rate': 0.0002578845453366895, 'map_reg_rate': 3.365753869765262e-09, 'epochs': 30, 'batch_size': 512, 'optimizer': 'rms', 'output_size': 70, 'deviation_dropoff': 45}
* Optimization took 17 seconds
* Accuracy on training set: 90.03%
* Accuracy on dev set: 83.06%
Uploaded file with ID 1LpjKDP1R0VfroElaikst-J_JFc3yjS5M
{'scale_reg_rate': 2.167327587686808e-08,



* Optimization took 29 seconds
* Accuracy on training set: 90.32%
* Accuracy on dev set: 83.06%
Uploaded file with ID 1Q50HK8FFfnaKzkzdM0ktm9pGshS-7EC4


In [25]:
res.x

[69, 'rms', 256, 30, 9.173980938998038e-06, 3.43121801054129e-05, 10]