In [1]:
import random
import numpy as np

In [2]:
def sigmoid(x):
    x = 1.0/(1.0 + np.exp(-x))
    return x

In [3]:
x = np.array([[1, 2], [-1, -2]])
print sigmoid(x)

[[ 0.73105858  0.88079708]
 [ 0.26894142  0.11920292]]


In [69]:
def softmax(x):
    input_x = x
    
    if len(x.shape) == 1:
        x = x.reshape((1, x.shape[0]))
    
    row_maxes = np.amax(x, axis=1).reshape((x.shape[0], 1))
    x = np.exp(x - row_maxes)
    row_sums = np.sum(x, axis=1).reshape((x.shape[0], 1))
    x = x / row_sums
    x = x.reshape(input_x.shape)
    
    return x

In [4]:
def normalizeRows(x):
    (r, c) = x.shape
    row_sums = np.sum(x**2, axis=1)
    x = x / np.sqrt(row_sums.reshape((r, 1)))
    return x

In [5]:
VOCAB = dict([("the", 0), ("quick", 1), ("brown", 2), ("fox", 3), ("jumped", 4), 
              ("over", 5), ("lazy", 6), ("dog", 7), ("cat", 8), ("OOV", 9)])
DATASET = type('ds', (), {})()
V = len(VOCAB.keys())

def sampleTokenIdx():
    return random.randint(0, V-1)
def getRandomContext(C):
    return VOCAB.keys()[random.randint(0, V-1)], [VOCAB.keys()[random.randint(0, V-1)] for i in xrange(2 * C)]

DATASET.sampleTokenIdx = sampleTokenIdx
DATASET.getRandomContext = getRandomContext

In [6]:
print DATASET.sampleTokenIdx()
print DATASET.getRandomContext(2)

2
('brown', ['fox', 'quick', 'OOV', 'cat'])


In [7]:
D = 2
print "DxV =", D, "x", V
vectors = normalizeRows(np.random.randn(2 * V, D)) 
InputVectors = vectors[:V,:]   # first half
OutputVectors = vectors[V:,:]  # second half
print InputVectors.shape
print OutputVectors.shape

DxV = 2 x 10
(10L, 2L)
(10L, 2L)


In [90]:
def softmaxCostAndGradient(inputVec, contextIdx):
    N, D = OutputVectors.shape
    
    outputVec = OutputVectors[contextIdx]
    
    h = inputVec.reshape((D, 1))
    
    u = np.dot(OutputVectors, h)
        
    uT = u.reshape((1, N))
    
    yT= softmax(uT)
        
    y = yT.reshape((N,))
                
    cost = -np.log(y[contextIdx])
        
    gradPred = -1.0 * outputVec.reshape((1, D)) + np.dot(y.reshape((1, N)), OutputVectors)
    
    gradPred = gradPred.reshape((D,))

    grad = np.dot(y.reshape((N, 1)), inputVec.reshape((1, D)))
    
    m = np.zeros(grad.shape)
    
    m[contextIdx, :] = inputVec.reshape((1, D))
    
    grad = grad - m    
    
    #assert grad.shape == outputVectors.shape    
    return cost, gradPred, grad

In [91]:
def skipgram(centerWord, contextWords):   
    # Outputs:                                                        #
    #   - cost: the cost function value for the skip-gram model       #
    #   - grad: the gradient with respect to the word vectors         #
    
    N, D = InputVectors.shape
    
    centerIndex = VOCAB[centerWord]
    centerVec = InputVectors[centerIndex]
    
    cost = 0.0
    gradientIn = np.zeros(InputVectors.shape)
    gradientOut = np.zeros(OutputVectors.shape)

    for contextWord in contextWords:
        
        print "Context word:", contextWord
        
        contextIndex = VOCAB[contextWord]
        
        costContext, gradInContext, gradOutContext = softmaxCostAndGradient(centerVec, contextIndex)
        
        cost += costContext
        
        gradientIn[contextIndex, :] += gradInContext
        gradientOut += gradOutContext
    
    return cost, gradientIn, gradientOut

In [92]:
print "w1 = VxD", InputVectors.shape

cost, gradientIn, gradientOut = skipgram("fox", ["brown", "jumped"])
print "Cost =", cost
print "Gradient In =", gradientIn
print "Gradient Out =", gradientOut

w1 = VxD (10L, 2L)
Context word: brown
y[contextIdx] = 0.0512490737881
Context word: jumped
y[contextIdx] = 0.0605259377739
Cost = 5.77574101584
Gradient In = [[ 0.          0.        ]
 [ 0.          0.        ]
 [-0.4925329   0.93026938]
 [ 0.          0.        ]
 [-0.31552027  0.93180503]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]]
Gradient Out = [[-0.10888493  0.03850226]
 [-0.35590006  0.12584805]
 [ 0.84615913 -0.29920613]
 [-0.16625775  0.05878958]
 [ 0.82866679 -0.29302075]
 [-0.16013765  0.05662548]
 [-0.12831092  0.04537139]
 [-0.15026613  0.05313486]
 [-0.29927218  0.10582415]
 [-0.3057963   0.10813111]]


In [61]:
def negativeSampling(inputVec, contextIdx, K=40):
    # Implement the cost and gradients for one input/center word vector  
    # and one context word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample size.                                               
    
    N, D = OutputVectors.shape
    
    outputVec = OutputVectors[contextIdx]
    
    # Get the K random indices (rows into outputVectors)
    k_indices = []
    for i in xrange(K):
        rand_index = DATASET.sampleTokenIdx()
        k_indices.append(rand_index)        
    
    w_out = OutputVectors[k_indices, 0:D] # size K x D
    
    print "w2 = KxD", w_out.shape
    
    print "Sampling random", K, "indices from output vectors as w2"
    
    h = inputVec.reshape((D, 1))
    print "h = vT", h.shape
    #print h
    
    u = np.dot(w_out, -1.0 * h)
    print "u = w2 . -h", u.shape
    #print u
        
    print "inputVec =", inputVec, "outpurVec =", outputVec
    sigmoidContext = sigmoid(np.dot(inputVec, outputVec))
    
    # Equation (4) (negative sampling) in “Distributed Representations of Words and Phrases and their Compositionality”
    cost = -1.0 * np.log( sigmoidContext ) - np.sum( np.log(sigmoid(u)) )
        
    x1 = 1.0 - sigmoid(u) # shape is (K, 1)
    
    x2 = np.dot(x1.reshape((1, K)), w_out).reshape((1, D))
    
    gradIn = (sigmoidContext - 1.0) * outputVec.reshape((1, D)) + x2
    
    gradIn = gradIn.reshape(inputVec.shape)
    
    gradOut = np.zeros(OutputVectors.shape)
    
    gradOut[contextIdx, :] = inputVec * (sigmoidContext - 1.0)
    
    for k in k_indices:
        gradOut[k, :] += -1.0 * inputVec * (sigmoid(np.dot(-1.0 * inputVec, OutputVectors[k])) - 1.0)
        
    return cost, gradIn, gradOut