In [2]:
import numpy as np
from numba import jit

In [235]:
@jit
def jit_loop(train, test):
    """
    Compute the distance between each test point in test and each training point
    in train using explicit nested loops over the test points, the training
    points and the feature dimension. The function is decorated with numba's
    @jit.

    Inputs:
    - train: A numpy array of shape (num_train, D) containing training data.
    - test: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.

    Raises:
    - ValueError: if train and test do not have the same number of columns.
    """
    num_test = test.shape[0]
    num_train = train.shape[0]
    dim = test.shape[1]
    # The innermost loop indexes both arrays with the same k. Compiled numba
    # code does not bounds-check array accesses by default, so a width
    # mismatch has to be rejected up front instead of reading past a row.
    if train.shape[1] != dim:
        raise ValueError("train and test must have the same number of columns")
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        for j in range(num_train):
            # Accumulate the squared difference over the feature axis.
            sum_ = 0
            for k in range(dim):
                x = train[j, k] - test[i, k]
                sum_ += x*x
            dists[i,j] = sum_
    # dists holds squared distances up to here; take the root once at the end.
    return np.sqrt(dists)

def compute_distances_two_loops(data, X):
    """
    Compute the distance between each test point in X and each training point
    in data using a nested loop over both the training data and the
    test data.

    Inputs:
    - data: A numpy array of shape (num_train, D) containing training data.
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point. """
    num_test = X.shape[0]
    num_train = data.shape[0]
    dists = np.zeros((num_test, num_train))
    # range is used instead of xrange: xrange exists only on Python 2, while
    # range works on both Python 2 and Python 3.
    for i in range(num_test):
        for j in range(num_train):
            # Squared Euclidean distance between test row i and training row j.
            dists[i,j] = np.sum((data[j] - X[i])**2.)
    # dists holds squared distances up to here; take the root once at the end.
    return np.sqrt(dists)

def compute_distances_one_loop(data, X):
    """
    Compute the distance between each test point in X and each training point
    in data using a single loop over the test data.

    Each row of the result is built from the expansion
    |d - x|^2 = |d|^2 + |x|^2 - 2 d.x, so no per-pair difference is formed.

    Inputs:
    - data: A numpy array of shape (num_train, D) containing training data.
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point. """
    num_test = X.shape[0]
    num_train = data.shape[0]
    dists = np.zeros((num_test, num_train))
    # Squared norms of the training rows do not depend on i; compute them once.
    d2 = np.sum(data**2, axis=1)
    # range is used instead of xrange: xrange exists only on Python 2.
    for i in range(num_test):
        sq = d2 + np.sum(X[i,:]**2.) - 2.*np.einsum('ij,j->i', data, X[i,:])
        # With floating-point inputs the expansion can round to a tiny
        # negative number for (near-)identical points, which would make the
        # square root NaN. Clamp at zero.
        dists[i,:] = np.maximum(sq, 0.)
    return np.sqrt(dists)

def compute_distances_no_loops_bad(data, X):
    """
    Pairwise Euclidean distances between the rows of X (test points) and the
    rows of data (training points), computed without any explicit loop.

    The difference between every test row and every training row is formed
    by broadcasting before it is reduced, so the intermediate array carries
    one extra axis compared with the result.

    Input / Output: Same as compute_distances_two_loops"""
    # The inserted length-1 axis makes each row of X broadcast against every
    # row of data.
    diffs = data - X[:, np.newaxis]
    # Square element-wise and collapse the last (feature) axis.
    sq_dists = np.sum(diffs**2., axis=-1)
    return np.sqrt(sq_dists)

def compute_distances_no_loops(data, X):
    """
    Compute the distance between each test point in X and each training point
    in data using no explicit loops.

    Uses the expansion |x - d|^2 = |x|^2 + |d|^2 - 2 x.d, so only arrays of
    the result's shape (and smaller) are formed.

    Inputs:
    - data: A numpy array of shape (num_train, D) containing training data.
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point."""
    # keepdims=True leaves the test norms as a column so they broadcast across
    # the row of training norms.
    test_sq = np.sum(X**2, axis=1, keepdims=True)
    train_sq = np.sum(data**2, axis=1)
    # cross[i, k] is the dot product of test row i with training row k.
    cross = np.einsum('ij,kj->ik', X, data)
    sq_dists = test_sq + train_sq - 2.*cross
    # With floating-point inputs the expansion can round to a tiny negative
    # number for (near-)identical points, which would make the square root
    # NaN. Clamp at zero.
    return np.sqrt(np.maximum(sq_dists, 0.))

In [245]:
# Expanded-form distance matrix, |x|^2 + |d|^2 - 2 x.d, with the squared norms
# of X kept as a column through keepdims=True.
np.sqrt((X**2).sum(axis=1, keepdims=True) + (data**2).sum(axis=1) - 2.*np.einsum('ij,kj->ik', X, data))

array([[ 6280.25469229,  6412.55292376,  6383.29068741, ...,
         6487.92170113,  6341.13152363,  6175.24274827],
       [ 6377.76426344,  6260.5168317 ,  6459.86640729, ...,
         6307.28261615,  6355.12493976,  6285.49290032],
       [ 6285.34891633,  6388.13564352,  6360.75184235, ...,
         6282.60216789,  6369.49534893,  6234.88371985],
       ..., 
       [ 6351.24523224,  6457.6684647 ,  6387.41285655, ...,
         6302.57621929,  6441.8261386 ,  6360.20219804],
       [ 6307.5241577 ,  6290.77666111,  6427.11117066, ...,
         6349.16388196,  6308.64827043,  6315.5979131 ],
       [ 6420.03816188,  6309.68319331,  6311.81772551, ...,
         6404.38888576,  6342.34310015,  6407.09840724]])

In [246]:
# Expanded-form distance matrix, |x|^2 + |d|^2 - 2 x.d, with the squared norms
# of X turned into a column by indexing with np.newaxis.
np.sqrt((X**2).sum(axis=1)[:, np.newaxis] + (data**2).sum(axis=1) - 2.*np.einsum('ij,kj->ik', X, data))

array([[ 6280.25469229,  6412.55292376,  6383.29068741, ...,
         6487.92170113,  6341.13152363,  6175.24274827],
       [ 6377.76426344,  6260.5168317 ,  6459.86640729, ...,
         6307.28261615,  6355.12493976,  6285.49290032],
       [ 6285.34891633,  6388.13564352,  6360.75184235, ...,
         6282.60216789,  6369.49534893,  6234.88371985],
       ..., 
       [ 6351.24523224,  6457.6684647 ,  6387.41285655, ...,
         6302.57621929,  6441.8261386 ,  6360.20219804],
       [ 6307.5241577 ,  6290.77666111,  6427.11117066, ...,
         6349.16388196,  6308.64827043,  6315.5979131 ],
       [ 6420.03816188,  6309.68319331,  6311.81772551, ...,
         6404.38888576,  6342.34310015,  6407.09840724]])

In [72]:
# Floor division keeps the second dimension an integer; under true division
# i/2 is a float, which is not a valid array dimension.
a = np.zeros((i, i // 2))
# NOTE: as the recorded output shows, this call raises ValueError. The out
# array has to be compatible with the broadcast of the two inputs, and
# data, X and a cannot be broadcast together.
np.subtract(data, X, a)

ValueError: operands could not be broadcast together with shapes (6,5) (3,5) (6,3) 

In [157]:
2.5+7.4

9.9

In [159]:
a=0

In [160]:
a = data - X[:, np.newaxis]

In [153]:
a.shape

(500L, 5000L, 370L)

In [97]:
from math import log

In [164]:
'{:e}'.format(500*5000*3700*64//8)

'7.400000e+10'

In [94]:
a.shape

(50L, 500L, 370L)

In [240]:
# Problem size: i rows of training data, j features per row.
i = 5000
j = 3700
# Random integers in [0, 256) for the training set.
data = np.random.randint(0,256,(i,j)).astype('int')
# The test set has one tenth as many rows. Floor division is required because
# the shape must be an integer and i/10 is a float under true division.
X = np.random.randint(0,256,(i//10,j)).astype('int')

In [241]:
%%timeit
jit_loop(data,X)

1 loops, best of 3: 7.29 s per loop


In [242]:
%%timeit
compute_distances_no_loops(data,X)

1 loops, best of 3: 3.1 s per loop


In [243]:
%%timeit
compute_distances_one_loop(data,X)

1 loops, best of 3: 3.07 s per loop


In [244]:
%%timeit
compute_distances_two_loops(data,X)

1 loops, best of 3: 25.4 s per loop


In [188]:
# Expanded form |d|^2 - 2 d.x + |x|^2 built from a matrix product.
# Squared norms of the rows of data, kept as a column for broadcasting.
x2 = (data**2).sum(axis=1, keepdims=True)
# Squared norms of the rows of X.
y2 = (X**2).sum(axis=1)
# xy[p, q] is the dot product of row p of data with row q of X.
xy = data.dot(X.T)
# dist[p, q] pairs row p of data with row q of X.
dist = np.sqrt(x2 - 2*xy + y2)

In [170]:
# Function-call form of print: valid on both Python 2 and Python 3, and prints
# the same text for a single argument.
print(data.shape)
print(X.shape)

(5000L, 370L)
(500L, 370L)


In [171]:
xy.shape

(5000L, 500L)

In [220]:
dist = compute_distances_no_loops(data,X)

In [221]:
dist2 = jit_loop(data,X)

In [222]:
np.allclose(dist, dist2)

True