From 9089c02c44f3525559c19278baa5dafdcd6ada25 Mon Sep 17 00:00:00 2001
From: Nathaniel Saul
Date: Thu, 30 Nov 2017 00:05:48 -0800
Subject: [PATCH 1/4] setup travis to run python 3.6 also.

---
 .travis.yml           | 11 ++++---
 ci_scripts/install.sh | 73 +++++++++++++++++++++++--------------------
 2 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 73f3965a..3f1fd40a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,11 +12,12 @@ env:
   # Directory where tests are run from
   - TEST_DIR=/tmp/test_dir/
   - MODULE=umap
-  matrix:
-    - DISTRIB="conda" PYTHON_VERSION="2.7"
-      NUMPY_VERSION="1.13.3" SCIPY_VERSION="0.19.1"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" COVERAGE="true"
-      NUMPY_VERSION="1.13.3" SCIPY_VERSION="0.19.1"
+
+matrix:
+  include:
+    - python: 3.6
+    - env: DISTRIB="conda" PYTHON_VERSION="2.7" NUMPY_VERSION="1.13.3" SCIPY_VERSION="0.19.1"
+    - env: DISTRIB="conda" PYTHON_VERSION="3.6" COVERAGE="true" NUMPY_VERSION="1.13.3" SCIPY_VERSION="0.19.1"
 
 install: source ci_scripts/install.sh
 script: bash ci_scripts/test.sh

diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh
index f302ae99..c91365c3 100644
--- a/ci_scripts/install.sh
+++ b/ci_scripts/install.sh
@@ -1,40 +1,45 @@
-# Deactivate the travis-provided virtual environment and setup a
-# conda-based environment instead
-deactivate
+if [[ "$DISTRIB" == "conda" ]]; then
 
-# Use the miniconda installer for faster download / install of conda
-# itself
-pushd .
-cd
-mkdir -p download
-cd download
-echo "Cached in $HOME/download :"
-ls -l
-echo
-if [[ ! -f miniconda.sh ]]
-   then
-   wget http://repo.continuum.io/miniconda/Miniconda-3.6.0-Linux-x86_64.sh \
-       -O miniconda.sh
-   fi
-chmod +x miniconda.sh && ./miniconda.sh -b
-cd ..
-export PATH=/home/travis/miniconda/bin:$PATH
-conda update --yes conda
-popd
+    # Deactivate the travis-provided virtual environment and setup a
+    # conda-based environment instead
+    deactivate
 
-# Configure the conda environment and put it in the path using the
-# provided versions
-conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
-    numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION numba scikit-learn
+    # Use the miniconda installer for faster download / install of conda
+    # itself
+    pushd .
+    cd
+    mkdir -p download
+    cd download
+    echo "Cached in $HOME/download :"
+    ls -l
+    echo
+    if [[ ! -f miniconda.sh ]]
+        then
+        wget http://repo.continuum.io/miniconda/Miniconda-3.6.0-Linux-x86_64.sh \
+            -O miniconda.sh
+        fi
+    chmod +x miniconda.sh && ./miniconda.sh -b
+    cd ..
+    export PATH=/home/travis/miniconda/bin:$PATH
+    conda update --yes conda
+    popd
 
-source activate testenv
+    # Configure the conda environment and put it in the path using the
+    # provided versions
+    conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
+        numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION numba scikit-learn
+    source activate testenv
 
-if [[ "$COVERAGE" == "true" ]]; then
-    pip install coverage coveralls
-fi
-python --version
-python -c "import numpy; print('numpy %s' % numpy.__version__)"
-python -c "import scipy; print('scipy %s' % scipy.__version__)"
-python setup.py develop
+    if [[ "$COVERAGE" == "true" ]]; then
+        pip install coverage coveralls
+    fi
+
+    python --version
+    python -c "import numpy; print('numpy %s' % numpy.__version__)"
+    python -c "import scipy; print('scipy %s' % scipy.__version__)"
+    python setup.py develop
+else
+    pip install -e .
+fi

From bbd17f1d6249fc6d60f1da3415349cc889ddcaaf Mon Sep 17 00:00:00 2001
From: Nathaniel Saul
Date: Thu, 30 Nov 2017 09:31:47 -0800
Subject: [PATCH 2/4] skip test until test file is in place

---
 umap/tests/test_umap.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/umap/tests/test_umap.py b/umap/tests/test_umap.py
index faeaac56..2952e384 100644
--- a/umap/tests/test_umap.py
+++ b/umap/tests/test_umap.py
@@ -76,6 +76,7 @@
     'yule'
 )
 
+
 def test_nn_descent_neighbor_accuracy():
     rng_state = np.random.randint(INT32_MIN, INT32_MAX, size=3)
     nn_descent = make_nn_descent(dist.euclidean, ())
@@ -191,6 +192,7 @@ def test_sparse_metrics():
 def test_sparse_fit():
     pass
 
+@SkipTest
 def test_sklearn_digits():
     digits = datasets.load_digits()
     data = digits.data
@@ -201,4 +203,4 @@ def test_sklearn_digits():
                                'digits_embedding_42.npy'))
     assert_array_almost_equal(embedding, to_match, err_msg='Digits embedding '
                                                            'is not consistent '
-                                                           'with previous runs')
\ No newline at end of file
+                                                           'with previous runs')
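A note on the @SkipTest marker introduced above: assuming nose is the test runner (the test module's import block is not visible in this hunk), an equivalent and more explicit sketch raises SkipTest from the test body, which reports the test as skipped rather than letting it pass or fail:

    from nose import SkipTest

    def test_sklearn_digits():
        # Skipped until the reference file digits_embedding_42.npy lands.
        raise SkipTest('reference embedding not yet in place')
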
From 933b2eebc988c3973f9bb928ab18e20b8c79cdca Mon Sep 17 00:00:00 2001
From: Nathaniel Saul
Date: Thu, 30 Nov 2017 09:43:36 -0800
Subject: [PATCH 3/4] add fix for #28 to sparse matrix support also

---
 umap/sparse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/umap/sparse.py b/umap/sparse.py
index 8f812709..d948ce37 100644
--- a/umap/sparse.py
+++ b/umap/sparse.py
@@ -2,7 +2,7 @@
 # Enough simple sparse operations in numba to enable sparse UMAP
 #
 # License: BSD 3 clause
-
+from __future__ import print_function
 import numpy as np
 import numba
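The one-line change above matters because umap/sparse.py prints progress messages with multi-argument print calls. A minimal illustration of the Python 2 pitfall the future import guards against (standalone, not from this repository):

    from __future__ import print_function  # must precede all other statements

    # Without the future import, Python 2 parses the call below as the print
    # *statement* applied to a tuple and emits ('a', 'b'); with it, Python 2
    # and Python 3 both treat print as a function and emit: a b
    print('a', 'b')
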
From 4841fd7ac08b8cd93b580f83e7608ec1018e3aff Mon Sep 17 00:00:00 2001
From: Nathaniel Saul
Date: Thu, 30 Nov 2017 09:46:18 -0800
Subject: [PATCH 4/4] run auto pep8

---
 umap/sparse.py          | 91 +++++++++++++++++++++--------------------
 umap/tests/test_umap.py | 25 ++++++-----
 umap/umap_.py           | 18 ++++----
 umap/utils.py           |  2 -
 4 files changed, 70 insertions(+), 66 deletions(-)

diff --git a/umap/sparse.py b/umap/sparse.py
index d948ce37..825ad62d 100644
--- a/umap/sparse.py
+++ b/umap/sparse.py
@@ -192,10 +192,10 @@ def sparse_random_projection_cosine_split(inds,
     left = indices[left_index]
     right = indices[right_index]
 
-    left_inds = inds[indptr[left]:indptr[left+1]]
-    left_data = data[indptr[left]:indptr[left+1]]
-    right_inds = inds[indptr[right]:indptr[right+1]]
-    right_data = data[indptr[right]:indptr[right+1]]
+    left_inds = inds[indptr[left]:indptr[left + 1]]
+    left_data = data[indptr[left]:indptr[left + 1]]
+    right_inds = inds[indptr[right]:indptr[right + 1]]
+    right_data = data[indptr[right]:indptr[right + 1]]
 
     left_norm = norm(left_data)
     right_norm = norm(right_data)
@@ -222,8 +222,8 @@ def sparse_random_projection_cosine_split(inds,
     for i in range(indices.shape[0]):
         margin = 0.0
 
-        i_inds = inds[indptr[indices[i]]:indptr[indices[i]+1]]
-        i_data = data[indptr[indices[i]]:indptr[indices[i]+1]]
+        i_inds = inds[indptr[indices[i]]:indptr[indices[i] + 1]]
+        i_data = data[indptr[indices[i]]:indptr[indices[i] + 1]]
 
         mul_inds, mul_data = sparse_mul(hyperplane_inds,
                                         hyperplane_data,
@@ -314,10 +314,10 @@ def sparse_random_projection_split(inds,
     left = indices[left_index]
     right = indices[right_index]
 
-    left_inds = inds[indptr[left]:indptr[left+1]]
-    left_data = data[indptr[left]:indptr[left+1]]
-    right_inds = inds[indptr[right]:indptr[right+1]]
-    right_data = data[indptr[right]:indptr[right+1]]
+    left_inds = inds[indptr[left]:indptr[left + 1]]
+    left_data = data[indptr[left]:indptr[left + 1]]
+    right_inds = inds[indptr[right]:indptr[right + 1]]
+    right_data = data[indptr[right]:indptr[right + 1]]
 
     # Compute the normal vector to the hyperplane (the vector between
     # the two points) and the offset from the origin
@@ -347,8 +347,8 @@ def sparse_random_projection_split(inds,
     side = np.empty(indices.shape[0], np.int8)
     for i in range(indices.shape[0]):
         margin = hyperplane_offset
-        i_inds = inds[indptr[indices[i]]:indptr[indices[i]+1]]
-        i_data = data[indptr[indices[i]]:indptr[indices[i]+1]]
+        i_inds = inds[indptr[indices[i]]:indptr[indices[i] + 1]]
+        i_data = data[indptr[indices[i]]:indptr[indices[i] + 1]]
 
         mul_inds, mul_data = sparse_mul(hyperplane_inds,
                                         hyperplane_data,
@@ -421,11 +421,11 @@ def nn_descent(inds, indptr, data, n_vertices, n_neighbors, rng_state,
             indices = rejection_sample(n_neighbors, n_vertices, rng_state)
             for j in range(indices.shape[0]):
 
-                from_inds = inds[indptr[i]:indptr[i+1]]
-                from_data = data[indptr[i]:indptr[i+1]]
+                from_inds = inds[indptr[i]:indptr[i + 1]]
+                from_data = data[indptr[i]:indptr[i + 1]]
 
-                to_inds = inds[indptr[indices[j]]:indptr[indices[j]+1]]
-                to_data = data[indptr[indices[j]]:indptr[indices[j]+1]]
+                to_inds = inds[indptr[indices[j]]:indptr[indices[j] + 1]]
+                to_data = data[indptr[indices[j]]:indptr[indices[j] + 1]]
 
                 d = sparse_dist(from_inds, from_data,
                                 to_inds, to_data,
@@ -447,9 +447,9 @@ def nn_descent(inds, indptr, data, n_vertices, n_neighbors, rng_state,
                 from_data = data[indptr[leaf_array[n, i]]:indptr[leaf_array[n, i] + 1]]
 
                 to_inds = inds[
-                        indptr[leaf_array[n, j]]:indptr[leaf_array[n, j] + 1]]
+                    indptr[leaf_array[n, j]]:indptr[leaf_array[n, j] + 1]]
                 to_data = data[
-                        indptr[leaf_array[n, j]]:indptr[leaf_array[n, j] + 1]]
+                    indptr[leaf_array[n, j]]:indptr[leaf_array[n, j] + 1]]
 
                 d = sparse_dist(from_inds, from_data,
                                 to_inds, to_data,
@@ -464,7 +464,7 @@ def nn_descent(inds, indptr, data, n_vertices, n_neighbors, rng_state,
 
     for n in range(n_iters):
         if verbose:
-            print("\tnn descent iteration ", n, " / ", n_iters)
+            print("\t", n, " / ", n_iters)
 
         candidate_neighbors = build_candidates(current_graph, n_vertices,
                                                n_neighbors, max_candidates,
@@ -486,9 +486,9 @@ def nn_descent(inds, indptr, data, n_vertices, n_neighbors, rng_state,
                 from_data = data[indptr[p]:indptr[p + 1]]
 
                 to_inds = inds[
-                        indptr[q]:indptr[q + 1]]
+                    indptr[q]:indptr[q + 1]]
                 to_data = data[
-                        indptr[q]:indptr[q + 1]]
+                    indptr[q]:indptr[q + 1]]
 
                 d = sparse_dist(from_inds, from_data,
                                 to_inds, to_data,
@@ -504,6 +504,7 @@ def nn_descent(inds, indptr, data, n_vertices, n_neighbors, rng_state,
 
     return nn_descent
 
+
 @numba.njit()
 def sparse_euclidean(ind1, data1, ind2, data2):
     aux_inds, aux_data = sparse_diff(ind1, data1, ind2, data2)
@@ -560,6 +561,7 @@ def sparse_canberra(ind1, data1, ind2, data2):
 
     return np.sum(val_data)
 
+
 @numba.njit()
 def sparse_bray_curtis(ind1, data1, ind2, data2):
     abs_data1 = np.abs(data1)
@@ -615,7 +617,7 @@ def sparse_kulsinski(ind1, data1, ind2, data2, n_features):
         return 0.0
     else:
         return float(num_not_equal - num_true_true + n_features) / \
-               (num_not_equal + n_features)
+            (num_not_equal + n_features)
@@ -708,28 +710,29 @@ def sparse_correlation(ind1, data1, ind2, data2, n_features):
     else:
         return (1.0 - (dot_product / (norm1 * norm2)))
 
+
 sparse_named_distances = {
-    'euclidean' : sparse_euclidean,
-    'manhattan' : sparse_manhattan,
-    'l1' : sparse_manhattan,
-    'taxicab' : sparse_manhattan,
-    'chebyshev' : sparse_chebyshev,
-    'linf' : sparse_chebyshev,
-    'linfty' : sparse_chebyshev,
-    'linfinity' : sparse_chebyshev,
-    'minkowski' : sparse_minkowski,
-    'hamming' : sparse_hamming,
-    'canberra' : sparse_canberra,
-    'bray_curtis' : sparse_bray_curtis,
-    'jaccard' : sparse_jaccard,
-    'matching' : sparse_matching,
-    'kulsinski' : sparse_kulsinski,
-    'rogers_tanimoto' : sparse_rogers_tanimoto,
-    'russellrao' : sparse_russellrao,
-    'sokal_michener' : sparse_sokal_michener,
-    'sokal_sneath' : sparse_sokal_sneath,
-    'cosine' : sparse_cosine,
-    'correlation' : sparse_correlation,
+    'euclidean': sparse_euclidean,
+    'manhattan': sparse_manhattan,
+    'l1': sparse_manhattan,
+    'taxicab': sparse_manhattan,
+    'chebyshev': sparse_chebyshev,
+    'linf': sparse_chebyshev,
+    'linfty': sparse_chebyshev,
+    'linfinity': sparse_chebyshev,
+    'minkowski': sparse_minkowski,
+    'hamming': sparse_hamming,
+    'canberra': sparse_canberra,
+    'bray_curtis': sparse_bray_curtis,
+    'jaccard': sparse_jaccard,
+    'matching': sparse_matching,
+    'kulsinski': sparse_kulsinski,
+    'rogers_tanimoto': sparse_rogers_tanimoto,
+    'russellrao': sparse_russellrao,
+    'sokal_michener': sparse_sokal_michener,
+    'sokal_sneath': sparse_sokal_sneath,
+    'cosine': sparse_cosine,
+    'correlation': sparse_correlation,
 }
 
 sparse_need_n_features = (
     'hamming',
@@ -740,4 +743,4 @@ sparse_named_distances = {
     'russellrao',
     'sokal_michener',
     'correlation'
-)
\ No newline at end of file
+)
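Most of the slices reflowed above are one recurring CSR access pattern: row i of a matrix stored as (inds, indptr, data) lives at inds[indptr[i]:indptr[i + 1]] and data[indptr[i]:indptr[i + 1]]. A self-contained sketch of that pattern (variable names are illustrative, not from the patch):

    import numpy as np
    from scipy.sparse import csr_matrix

    X = csr_matrix(np.array([[0.0, 1.0, 0.0],
                             [2.0, 0.0, 3.0]]))
    inds, indptr, data = X.indices, X.indptr, X.data

    i = 1
    row_inds = inds[indptr[i]:indptr[i + 1]]  # column indices of row 1: [0, 2]
    row_data = data[indptr[i]:indptr[i + 1]]  # stored values of row 1: [2.0, 3.0]
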
diff --git a/umap/tests/test_umap.py b/umap/tests/test_umap.py
index 2952e384..039b7bb3 100644
--- a/umap/tests/test_umap.py
+++ b/umap/tests/test_umap.py
@@ -42,14 +42,14 @@
 spatial_data = np.random.randn(10, 20)
 binary_data = np.random.choice(a=[False, True],
                                size=(10, 20),
-                               p=[0.66, 1-0.66])
+                               p=[0.66, 1 - 0.66])
 sparse_spatial_data = sparse.csr_matrix(spatial_data * binary_data)
 sparse_binary_data = sparse.csr_matrix(binary_data)
 
 nn_data = np.random.uniform(0, 1, size=(1000, 5))
 binary_nn_data = np.random.choice(a=[False, True],
                                   size=(1000, 5),
-                                  p=[0.66, 1-0.66])
+                                  p=[0.66, 1 - 0.66])
 sparse_nn_data = sparse.csr_matrix(nn_data * binary_nn_data)
 
 spatial_distances = (
@@ -100,7 +100,8 @@ def test_nn_descent_neighbor_accuracy():
     percent_correct = num_correct / (spatial_data.shape[0] * 10)
     assert_greater_equal(percent_correct, 0.99, 'NN-descent did not get 99% '
-                                                'accuracy on nearest neighbors')
+                         'accuracy on nearest neighbors')
+
 
 def test_sparse_nn_descent_neighbor_accuracy():
     rng_state = np.random.randint(INT32_MIN, INT32_MAX, size=3)
@@ -120,7 +121,8 @@ def test_sparse_nn_descent_neighbor_accuracy():
         knn_indices[i] = knn_indices[i][order]
 
     tree = KDTree(sparse_nn_data.todense())
-    true_indices = tree.query(sparse_nn_data.todense(), 10, return_distance=False)
+    true_indices = tree.query(sparse_nn_data.todense(),
+                              10, return_distance=False)
 
     print(sparse_nn_data.shape)
@@ -133,6 +135,7 @@ def test_sparse_nn_descent_neighbor_accuracy():
                          '99% accuracy on nearest '
                          'neighbors')
 
+
 def test_trustworthiness():
     pass
@@ -142,8 +145,8 @@ def test_metrics():
         dist_matrix = pairwise_distances(spatial_data, metric=metric)
         dist_function = dist.named_distances[metric]
         test_matrix = np.array([[dist_function(spatial_data[i], spatial_data[j])
-                                for j in range(spatial_data.shape[0])]
-                                for i in range(spatial_data.shape[0])])
+                                 for j in range(spatial_data.shape[0])]
+                                for i in range(spatial_data.shape[0])])
         assert_array_almost_equal(test_matrix, dist_matrix,
                                   err_msg="Distances don't match "
                                           "for metric {}".format(metric))
@@ -152,12 +155,13 @@ def test_metrics():
         dist_matrix = pairwise_distances(binary_data, metric=metric)
         dist_function = dist.named_distances[metric]
         test_matrix = np.array([[dist_function(binary_data[i], binary_data[j])
-                                for j in range(binary_data.shape[0])]
-                                for i in range(binary_data.shape[0])])
+                                 for j in range(binary_data.shape[0])]
+                                for i in range(binary_data.shape[0])])
         assert_array_almost_equal(test_matrix, dist_matrix,
                                   err_msg="Distances don't match "
                                           "for metric {}".format(metric))
 
+
 def test_sparse_metrics():
     for metric in spatial_distances:
         # Sparse correlation has precision errors right now, leave out ...
@@ -174,7 +178,7 @@ def test_sparse_metrics():
                                                 sparse_spatial_data[j].data,
                                                 sparse_spatial_data.shape[1])
                   for j in range(sparse_spatial_data.shape[0])]
-                 for i in range(sparse_spatial_data.shape[0])])
+                    for i in range(sparse_spatial_data.shape[0])])
         else:
             test_matrix = np.array(
                 [[dist_function(sparse_spatial_data[i].indices,
                                 sparse_spatial_data[i].data,
                                 sparse_spatial_data[j].indices,
                                 sparse_spatial_data[j].data)
                   for j in range(sparse_spatial_data.shape[0])]
-                 for i in range(sparse_spatial_data.shape[0])])
+                    for i in range(sparse_spatial_data.shape[0])])
 
         assert_array_almost_equal(test_matrix, dist_matrix,
                                   err_msg="Distances don't match "
                                           "for metric {}".format(metric))
@@ -192,6 +196,7 @@ def test_sparse_metrics():
 def test_sparse_fit():
     pass
 
+
 @SkipTest
 def test_sklearn_digits():
     digits = datasets.load_digits()
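The test_metrics pattern touched above checks each metric by brute force against scikit-learn's pairwise_distances. A stripped-down sketch of the same idea, using a plain numpy norm in place of the project's dist module:

    import numpy as np
    from sklearn.metrics import pairwise_distances

    data = np.random.randn(6, 4)
    ref = pairwise_distances(data, metric='euclidean')
    # Brute force: evaluate the metric for every (i, j) pair.
    brute = np.array([[np.linalg.norm(data[i] - data[j])
                       for j in range(data.shape[0])]
                      for i in range(data.shape[0])])
    np.testing.assert_array_almost_equal(brute, ref)
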
diff --git a/umap/umap_.py b/umap/umap_.py
index 3342a1fd..922f3eb1 100644
--- a/umap/umap_.py
+++ b/umap/umap_.py
@@ -288,11 +288,11 @@ def make_tree(data, indices, rng_state, leaf_size=30, angular=False):
         else:
             left_indices, right_indices = \
                 sparse.sparse_random_projection_split(
-                                                    inds,
-                                                    indptr,
-                                                    spdata,
-                                                    indices,
-                                                    rng_state)
+                    inds,
+                    indptr,
+                    spdata,
+                    indices,
+                    rng_state)
     else:
         if angular:
             (left_indices,
@@ -340,7 +340,6 @@ def get_leaves(tree):
     return get_leaves(tree.left_child) + get_leaves(tree.right_child)
 
 
-
 def rptree_leaf_array(data, n_neighbors, rng_state, n_trees=10, angular=False):
     """Generate an array of sets of candidate nearest neighbors by
     constructing a random projection forest and taking the leaves of all the
@@ -398,7 +397,6 @@ def rptree_leaf_array(data, n_neighbors, rng_state, n_trees=10, angular=False):
     return leaf_array
 
 
-
 def make_nn_descent(dist, dist_args):
     """Create a numba accelerated version of nearest neighbor descent
     specialised for the given distance metric and metric arguments. Numba
@@ -652,7 +650,7 @@ def fuzzy_simplicial_set(X, n_neighbors, random_state,
         raise ValueError('Metric is neither callable, nor a recognised string')
 
     if metric in ('cosine', 'correlation', 'dice', 'jaccard'):
-        angular=True
+        angular = True
 
     rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
@@ -665,7 +663,7 @@ def fuzzy_simplicial_set(X, n_neighbors, random_state,
             raise ValueError('Metric {} not supported for sparse '
                              'data'.format(metric))
         metric_nn_descent = sparse.make_sparse_nn_descent(distance_func,
-                                                    tuple(metric_kwds.values()))
+                                                          tuple(metric_kwds.values()))
         leaf_array = rptree_leaf_array(X, n_neighbors,
                                        rng_state, n_trees=10,
                                        angular=angular)
@@ -1026,7 +1024,7 @@ def optimize_layout(embedding, positive_head, positive_tail,
             if alpha < (initial_alpha * 0.000001):
                 alpha = initial_alpha * 0.000001
 
-        if verbose and i % int(n_edge_samples/10) == 0 :
+        if verbose and i % int(n_edge_samples / 10) == 0:
             print("\t", i, " / ", n_edge_samples)
 
     return embedding
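One detail in the fuzzy_simplicial_set hunk above: the extra metric parameters reach the compiled nearest-neighbor descent as a positional tuple, tuple(metric_kwds.values()), since the jit-specialised distance function consumes its extra arguments positionally (the make_nn_descent docstring, truncated here, describes that specialisation). A tiny illustration with a hypothetical keyword dict:

    metric_kwds = {'p': 3}  # e.g. a parameter for a Minkowski-style metric
    dist_args = tuple(metric_kwds.values())
    # dist_args == (3,), ready to be expanded as dist(x, y, *dist_args)
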
diff --git a/umap/utils.py b/umap/utils.py
index 5d630191..42abef28 100644
--- a/umap/utils.py
+++ b/umap/utils.py
@@ -100,7 +100,6 @@ def rejection_sample(n_samples, pool_size, rng_state):
     return result
 
 
-
 @numba.njit('f8[:, :, :](i8,i8)')
 def make_heap(n_points, size):
     """Constructor for the numba enabled heap objects. The heaps are used
@@ -215,7 +214,6 @@ def heap_push(heap, row, weight, index, flag):
     return 1
 
 
-
 @numba.njit(parallel=True)
 def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates,
                      rng_state):
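A closing note on the decorators visible in the utils.py hunks: @numba.njit('f8[:, :, :](i8,i8)') supplies an explicit signature, so compilation happens eagerly for (int64, int64) -> 3-d float64 array instead of being deferred to the first call. A standalone sketch of the idiom (not the project's make_heap, whose body lies outside this diff):

    import numba
    import numpy as np

    @numba.njit('f8[:, :, :](i8,i8)')  # eager: (int64, int64) -> float64 3-d array
    def zero_cube(n_rows, n_cols):
        # Allocate a three-layer float64 array matching the declared return type.
        return np.zeros((3, n_rows, n_cols))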