In [3]:
import random
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [18]:
def sigmoid(x):
    x = 1.0/(1.0 + np.exp(-x))
    return x

In [3]:
x = np.array([[1, 2], [-1, -2]])
print sigmoid(x)

[[ 0.73105858  0.88079708]
 [ 0.26894142  0.11920292]]


In [7]:
def normalizeRows(x):
    (r, c) = x.shape
    row_sums = np.sum(x**2, axis=1)
    x = x / np.sqrt(row_sums.reshape((r, 1)))
    return x

In [4]:
dataset = type('sample', (), {})()
def sampleTokenIdx():
    return random.randint(0, 4)
def getRandomContext(C):
    tokens = ["once", "upon", "a", "time", ","]
    return tokens[random.randint(0,4)], [tokens[random.randint(0,4)] for i in xrange(2*C)]

dataset.sampleTokenIdx = sampleTokenIdx
dataset.getRandomContext = getRandomContext

In [5]:
print dataset.sampleTokenIdx()
print dataset.getRandomContext(2)

3
('once', [',', 'time', 'upon', 'upon'])


In [25]:
random.seed(31415)
np.random.seed(9265)
vectors = normalizeRows(np.random.randn(10,3))
tokens = dict([("once",0), ("upon",1), ("a",2),("time",3),(",",4)])
print vectors.shape
print vectors[:5,:] # first half
print vectors[5:,:] # last half

(10L, 3L)
[[-0.96735714 -0.02182641  0.25247529]
 [ 0.73663029 -0.48088687 -0.47552459]
 [-0.27323645  0.12538062  0.95374082]
 [-0.56713774 -0.27178229 -0.77748902]
 [-0.59609459  0.7795666   0.19221644]]
[[-0.6831809  -0.04200519  0.72904007]
 [ 0.18289107  0.76098587 -0.62245591]
 [-0.61517874  0.5147624  -0.59713884]
 [-0.33867074 -0.80966534 -0.47931635]
 [-0.52629529 -0.78190408  0.33412466]]


In [33]:
inputVectors = vectors[:5,:]
outputVectors = vectors[5:,:]

currentWord = ","
curr_index = tokens[currentWord]
curr_vector = inputVectors[curr_index]

print "Curr Word:", currentWord
print "Curr Word Idx:", curr_index
print "Curr Word Vec from Input Vec:", curr_vector

contextWords = ["upon"]

for context_word in contextWords:
    context_index = tokens[context_word]
    
    print "Context Word:", context_word
    print "Context Word Idx:", context_index

    print negativeSampling(curr_vector, context_index, outputVectors)

    # negativeSampling(predicted, target, outputVectors, K=10)
    

Curr Word: ,
Curr Word Idx: 4
Curr Word Vec from Input Vec: [-0.59609459  0.7795666   0.19221644]
Context Word: upon
Context Word Idx: 1
Sample random K indices from output vectors
[[ 0.18289107  0.76098587 -0.62245591]
 [-0.33867074 -0.80966534 -0.47931635]
 [ 0.18289107  0.76098587 -0.62245591]
 [-0.52629529 -0.78190408  0.33412466]
 [ 0.18289107  0.76098587 -0.62245591]
 [-0.52629529 -0.78190408  0.33412466]
 [-0.61517874  0.5147624  -0.59713884]
 [-0.33867074 -0.80966534 -0.47931635]
 [-0.6831809  -0.04200519  0.72904007]
 [-0.6831809  -0.04200519  0.72904007]]
dot
[[-0.59609459]
 [ 0.7795666 ]
 [ 0.19221644]]
(8.3419838773308896, array([-1.7289198 ,  0.02639603, -0.3885663 ]), array([[-0.74618022,  0.97584709,  0.24061299],
       [-0.81103882,  1.06066853,  0.26152727],
       [-0.39207222,  0.51274816,  0.12642746],
       [-0.44410883,  0.58080113,  0.14320717],
       [-0.52737366,  0.68969404,  0.17005671]]))


In [32]:
def negativeSampling(inputVec, contextIdx, outputVectors, K=10):
    # Implement the cost and gradients for one input/predicted word vector  
    # and one context/target word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample size.                                               
    
    N, D = outputVectors.shape
    
    # Get the K random indices (rows into outputVectors)
    k_indices = []
    for i in xrange(K):
        rand_index = dataset.sampleTokenIdx()
        k_indices.append(rand_index)
        
    
    w_out = outputVectors[k_indices, 0:D] # size K x D
    
    print "Sample random K indices from output vectors"
    print w_out    
    
    w_out_dot_r = sigmoid(np.dot(w_out, -1.0 * inputVec.reshape((D, 1)))) # size K x 1
            
    s = np.sum(np.log(w_out_dot_r))
    
    sigm_target = sigmoid(np.dot(inputVec, outputVectors[contextIdx]))
    
    cost = -1.0 * np.log(sigm_target) - s
        
    x1 = 1.0 - w_out_dot_r # shape is (K, 1)
    x2 = np.dot(x1.reshape((1, K)), w_out).reshape((1, D))
    
    gradPred = (sigm_target - 1.0) * outputVectors[contextIdx].reshape((1, D)) + x2
    gradPred = gradPred.reshape(inputVec.shape)
    
    grad = np.zeros(outputVectors.shape)
    grad[contextIdx, :] = inputVec * (sigmoid(np.dot(inputVec, outputVectors[contextIdx])) - 1.0)
    for k in k_indices:
        grad[k, :] += -1.0 * inputVec * (sigmoid(np.dot(-1.0 * inputVec, outputVectors[k])) - 1.0)
        
    # cost_curr, grad_in_curr, grad_out_curr     
    return cost, gradPred, grad