diff --git a/.travis.yml b/.travis.yml index eaa843f94799e..472b79b34d0b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,22 +32,22 @@ matrix: - libatlas-dev # This environment tests the oldest supported anaconda env - env: DISTRIB="conda" PYTHON_VERSION="2.7" INSTALL_MKL="false" - NUMPY_VERSION="1.8.2" SCIPY_VERSION="0.13.3" CYTHON_VERSION="0.23.4" + NUMPY_VERSION="1.8.2" SCIPY_VERSION="0.13.3" CYTHON_VERSION="0.23.5" COVERAGE=true - # This environment tests the newest supported Anaconda release (4.3.1) + # This environment tests the newest supported Anaconda release (4.4.0) # It also runs tests requiring Pandas. - - env: DISTRIB="conda" PYTHON_VERSION="3.6" INSTALL_MKL="true" - NUMPY_VERSION="1.11.2" SCIPY_VERSION="0.18.1" PANDAS_VERSION="0.19.2" + - env: DISTRIB="conda" PYTHON_VERSION="3.6.1" INSTALL_MKL="true" + NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" PANDAS_VERSION="0.20.1" CYTHON_VERSION="0.25.2" COVERAGE=true # This environment use pytest to run the tests. It uses the newest - # supported Anaconda release (4.3.1). It also runs tests requiring Pandas. - # - env: USE_PYTEST="true" DISTRIB="conda" PYTHON_VERSION="3.6" - # INSTALL_MKL="true" NUMPY_VERSION="1.11.2" SCIPY_VERSION="0.18.1" - # PANDAS_VERSION="0.19.2" CYTHON_VERSION="0.25.2" + # supported Anaconda release (4.4.0). It also runs tests requiring Pandas. + # - env: USE_PYTEST="true" DISTRIB="conda" PYTHON_VERSION="3.6.1" + # INSTALL_MKL="true" NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" + # PANDAS_VERSION="0.20.1" CYTHON_VERSION="0.25.2" # flake8 linting on diff wrt common ancestor with upstream/master - env: RUN_FLAKE8="true" SKIP_TESTS="true" DISTRIB="conda" PYTHON_VERSION="3.5" INSTALL_MKL="true" - NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" + NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" CYTHON_VERSION="0.23.5" # This environment tests scikit-learn against numpy and scipy master # installed from their CI wheels in a virtualenv with the Python # interpreter provided by travis. diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py index 1e54b570c9d8a..a1e0358e392a0 100644 --- a/benchmarks/bench_plot_nmf.py +++ b/benchmarks/bench_plot_nmf.py @@ -24,7 +24,7 @@ from sklearn.decomposition.nmf import INTEGER_TYPES, _check_init from sklearn.externals.joblib import Memory from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.extmath import fast_dot, safe_sparse_dot, squared_norm +from sklearn.utils.extmath import safe_sparse_dot, squared_norm from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted, check_non_negative @@ -99,7 +99,7 @@ def _nls_subproblem(X, W, H, tol, max_iter, alpha=0., l1_ratio=0., http://www.csie.ntu.edu.tw/~cjlin/nmf/ """ WtX = safe_sparse_dot(W.T, X) - WtW = fast_dot(W.T, W) + WtW = np.dot(W.T, W) # values justified in the paper (alpha is renamed gamma) gamma = 1 diff --git a/doc/developers/performance.rst b/doc/developers/performance.rst index 3abba5dd84c7c..692e7ca1f99a7 100644 --- a/doc/developers/performance.rst +++ b/doc/developers/performance.rst @@ -84,38 +84,6 @@ C/C++ generated files are embedded in distributed stable packages. The goal is to make it possible to install scikit-learn stable version on any machine with Python, Numpy, Scipy and C/C++ compiler. 
-Fast matrix multiplications -=========================== - -Matrix multiplications (matrix-matrix and matrix-vector) are usually handled -using the NumPy function ``np.dot``, but in versions of NumPy before 1.7.2 -this function is suboptimal when the inputs are not both in the C (row-major) -layout; in that case, the inputs may be implicitly copied to obtain the right -layout. This obviously consumes memory and takes time. - -The function ``fast_dot`` in ``sklearn.utils.extmath`` offers a fast -replacement for ``np.dot`` that prevents copies from being made in some cases. -In all other cases, it dispatches to ``np.dot`` and when the NumPy version is -new enough, it is in fact an alias for that function, making it a drop-in -replacement. Example usage of ``fast_dot``:: - - >>> import numpy as np - >>> from sklearn.utils.extmath import fast_dot - >>> X = np.random.random_sample([2, 10]) - >>> np.allclose(np.dot(X, X.T), fast_dot(X, X.T)) - True - -This function operates optimally on 2-dimensional arrays, both of the same -dtype, which should be either single or double precision float. If these -requirements aren't met or the BLAS package is not available, the call is -silently dispatched to ``numpy.dot``. If you want to be sure when the original -``numpy.dot`` has been invoked in a situation where it is suboptimal, you can -activate the related warning:: - - >>> import warnings - >>> from sklearn.exceptions import NonBLASDotWarning - >>> warnings.simplefilter('always', NonBLASDotWarning) # doctest: +SKIP - .. _profiling-python-code: Profiling Python code @@ -425,4 +393,3 @@ A sample algorithmic trick: warm restarts for cross validation TODO: demonstrate the warm restart tricks for cross validation of linear regression with Coordinate Descent. - diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 8d7521f3c29e8..38febcaa86f66 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -389,6 +389,37 @@ API changes summary has been renamed to ``n_components`` and will be removed in version 0.21. :issue:`8922` by :user:Attractadore + - SciPy >= 0.13.3 and NumPy >= 1.8.2 are now the minimum supported versions + for scikit-learn. The following backported functions in ``sklearn.utils`` + have been removed or deprecated accordingly. + :issue:`8854` and :issue:`8874` by :user:`Naoya Kanai ` + + Removed in 0.19: + + - ``utils.fixes.argpartition`` + - ``utils.fixes.array_equal`` + - ``utils.fixes.astype`` + - ``utils.fixes.bincount`` + - ``utils.fixes.expit`` + - ``utils.fixes.frombuffer_empty`` + - ``utils.fixes.in1d`` + - ``utils.fixes.norm`` + - ``utils.fixes.rankdata`` + - ``utils.fixes.safe_copy`` + + Deprecated in 0.19, to be removed in 0.21: + + - ``utils.arpack.eigs`` + - ``utils.arpack.eigsh`` + - ``utils.arpack.svds`` + - ``utils.extmath.fast_dot`` + - ``utils.extmath.logsumexp`` + - ``utils.extmath.norm`` + - ``utils.extmath.pinvh`` + - ``utils.random.choice`` + - ``utils.sparsetools.connected_components`` + - ``utils.stats.rankdata`` + .. 
_changes_0_18_1: diff --git a/sklearn/cluster/_k_means.pyx b/sklearn/cluster/_k_means.pyx index a8f40ad9d349e..cdaa31fcb78ef 100644 --- a/sklearn/cluster/_k_means.pyx +++ b/sklearn/cluster/_k_means.pyx @@ -16,7 +16,6 @@ cimport cython from cython cimport floating from sklearn.utils.sparsefuncs_fast import assign_rows_csr -from sklearn.utils.fixes import bincount ctypedef np.float64_t DOUBLE ctypedef np.int32_t INT @@ -307,7 +306,7 @@ def _centers_dense(np.ndarray[floating, ndim=2] X, else: centers = np.zeros((n_clusters, n_features), dtype=np.float64) - n_samples_in_cluster = bincount(labels, minlength=n_clusters) + n_samples_in_cluster = np.bincount(labels, minlength=n_clusters) empty_clusters = np.where(n_samples_in_cluster == 0)[0] # maybe also relocate small clusters? @@ -367,7 +366,7 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters, cdef np.ndarray[floating, ndim=2, mode="c"] centers cdef np.ndarray[np.npy_intp, ndim=1] far_from_centers cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] n_samples_in_cluster = \ - bincount(labels, minlength=n_clusters) + np.bincount(labels, minlength=n_clusters) cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \ np.where(n_samples_in_cluster == 0)[0] cdef int n_empty_clusters = empty_clusters.shape[0] diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index f662402feb850..804ebabc5450a 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -16,7 +16,6 @@ from libc.math cimport sqrt from ..metrics import euclidean_distances from ._k_means import _centers_dense -from ..utils.fixes import partition cdef floating euclidian_dist(floating* a, floating* b, int n_features) nogil: @@ -169,7 +168,7 @@ def k_means_elkan(np.ndarray[floating, ndim=2, mode='c'] X_, int n_clusters, print("start iteration") cd = np.asarray(center_half_distances) - distance_next_center = partition(cd, kth=1, axis=0)[1] + distance_next_center = np.partition(cd, kth=1, axis=0)[1] if verbose: print("done sorting") diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index 2f3374027d7c2..6c7bba5af9f8c 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -14,7 +14,6 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import check_array, check_consistent_length -from ..utils.fixes import astype from ..neighbors import NearestNeighbors from ._dbscan_inner import dbscan_inner @@ -123,7 +122,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, neighborhoods = np.empty(X.shape[0], dtype=object) X.sum_duplicates() # XXX: modifies X's internals in-place X_mask = X.data <= eps - masked_indices = astype(X.indices, np.intp, copy=False)[X_mask] + masked_indices = X.indices.astype(np.intp, copy=False)[X_mask] masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))[X.indptr[1:]] # insert the diagonal: a point is its own neighbor, but 0 distance diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 680edc2672a71..d1b9f264fe759 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -22,14 +22,12 @@ from ..utils.extmath import row_norms, squared_norm, stable_cumsum from ..utils.sparsefuncs_fast import assign_rows_csr from ..utils.sparsefuncs import mean_variance_axis -from ..utils.fixes import astype from ..utils import check_array from ..utils import check_random_state from ..utils import as_float_array from ..utils import gen_batches from ..utils.validation import check_is_fitted 
from ..utils.validation import FLOAT_DTYPES -from ..utils.random import choice from ..externals.joblib import Parallel from ..externals.joblib import delayed from ..externals.six import string_types @@ -1062,16 +1060,15 @@ def _mini_batch_step(X, x_squared_norms, centers, counts, n_reassigns = to_reassign.sum() if n_reassigns: # Pick new clusters amongst observations with uniform probability - new_centers = choice(X.shape[0], replace=False, size=n_reassigns, - random_state=random_state) + new_centers = random_state.choice(X.shape[0], replace=False, + size=n_reassigns) if verbose: print("[MiniBatchKMeans] Reassigning %i cluster centers." % n_reassigns) if sp.issparse(X) and not sp.issparse(centers): - assign_rows_csr(X, - astype(new_centers, np.intp), - astype(np.where(to_reassign)[0], np.intp), + assign_rows_csr(X, new_centers.astype(np.intp), + np.where(to_reassign)[0].astype(np.intp), centers) else: centers[to_reassign] = X[new_centers] diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index d56845637fc48..7646459da3936 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -26,12 +26,10 @@ from .utils.validation import (_is_arraylike, _num_samples, column_or_1d) from .utils.multiclass import type_of_target -from .utils.random import choice from .externals.joblib import Parallel, delayed, logger from .externals.six import with_metaclass from .externals.six.moves import zip from .metrics.scorer import check_scoring -from .utils.fixes import bincount from .gaussian_process.kernels import Kernel as GPKernel from .exceptions import FitFailedWarning @@ -541,7 +539,7 @@ def __init__(self, y, n_folds=3, shuffle=False, y = np.asarray(y) n_samples = y.shape[0] unique_labels, y_inversed = np.unique(y, return_inverse=True) - label_counts = bincount(y_inversed) + label_counts = np.bincount(y_inversed) min_labels = np.min(label_counts) if np.all(self.n_folds > label_counts): raise ValueError("All the n_labels for individual classes" @@ -990,7 +988,7 @@ def _approximate_mode(class_counts, n_draws, rng): # if we need to add more, we add them all and # go to the next value add_now = min(len(inds), need_to_add) - inds = choice(inds, size=add_now, replace=False, random_state=rng) + inds = rng.choice(inds, size=add_now, replace=False) floored[inds] += 1 need_to_add -= add_now if need_to_add == 0: @@ -1072,7 +1070,7 @@ def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, self.classes, self.y_indices = np.unique(y, return_inverse=True) n_cls = self.classes.shape[0] - if np.min(bincount(self.y_indices)) < 2: + if np.min(np.bincount(self.y_indices)) < 2: raise ValueError("The least populated class in y has only 1" " member, which is too few. 
The minimum" " number of labels for any class cannot" @@ -1089,7 +1087,7 @@ def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, def _iter_indices(self): rng = check_random_state(self.random_state) - cls_count = bincount(self.y_indices) + cls_count = np.bincount(self.y_indices) for n in range(self.n_iter): # if there are ties in the class-counts, we want diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py index 7a4543aa2068a..3ba9dfd487868 100644 --- a/sklearn/datasets/samples_generator.py +++ b/sklearn/datasets/samples_generator.py @@ -15,7 +15,6 @@ from ..preprocessing import MultiLabelBinarizer from ..utils import check_array, check_random_state from ..utils import shuffle as util_shuffle -from ..utils.fixes import astype from ..utils.random import sample_without_replacement from ..externals import six map = six.moves.map @@ -28,9 +27,9 @@ def _generate_hypercube(samples, dimensions, rng): if dimensions > 30: return np.hstack([_generate_hypercube(samples, dimensions - 30, rng), _generate_hypercube(samples, 30, rng)]) - out = astype(sample_without_replacement(2 ** dimensions, samples, - random_state=rng), - dtype='>u4', copy=False) + out = sample_without_replacement(2 ** dimensions, samples, + random_state=rng).astype(dtype='>u4', + copy=False) out = np.unpackbits(out.view('>u1')).reshape((-1, 32))[:, -dimensions:] return out diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py index 52e81da086f62..a567e2091e1ab 100644 --- a/sklearn/datasets/svmlight_format.py +++ b/sklearn/datasets/svmlight_format.py @@ -28,7 +28,6 @@ from ..externals.six import u, b from ..externals.six.moves import range, zip from ..utils import check_array -from ..utils.fixes import frombuffer_empty def load_svmlight_file(f, n_features=None, dtype=np.float64, @@ -162,11 +161,11 @@ def _open_and_load(f, dtype, multilabel, zero_based, query_id): # convert from array.array, give data the right dtype if not multilabel: - labels = frombuffer_empty(labels, np.float64) - data = frombuffer_empty(data, actual_dtype) - indices = frombuffer_empty(ind, np.intc) + labels = np.frombuffer(labels, np.float64) + data = np.frombuffer(data, actual_dtype) + indices = np.frombuffer(ind, np.intc) indptr = np.frombuffer(indptr, dtype=np.intc) # never empty - query = frombuffer_empty(query, np.int64) + query = np.frombuffer(query, np.int64) data = np.asarray(data, dtype=dtype) # no-op for float{32,64} return data, indices, indptr, labels, query diff --git a/sklearn/decomposition/base.py b/sklearn/decomposition/base.py index 26d6b3ea7283b..fd6ab3bb7c9fd 100644 --- a/sklearn/decomposition/base.py +++ b/sklearn/decomposition/base.py @@ -13,7 +13,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array -from ..utils.extmath import fast_dot from ..utils.validation import check_is_fitted from ..externals import six from abc import ABCMeta, abstractmethod @@ -130,7 +129,7 @@ def transform(self, X, y=None): X = check_array(X) if self.mean_ is not None: X = X - self.mean_ - X_transformed = fast_dot(X, self.components_.T) + X_transformed = np.dot(X, self.components_.T) if self.whiten: X_transformed /= np.sqrt(self.explained_variance_) return X_transformed @@ -156,7 +155,7 @@ def inverse_transform(self, X, y=None): exact inverse operation, which includes reversing whitening. 
""" if self.whiten: - return fast_dot(X, np.sqrt(self.explained_variance_[:, np.newaxis]) * + return np.dot(X, np.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_) + self.mean_ else: - return fast_dot(X, self.components_) + self.mean_ + return np.dot(X, self.components_) + self.mean_ diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py index 3326ac197b3af..4440ee90bd84a 100644 --- a/sklearn/decomposition/factor_analysis.py +++ b/sklearn/decomposition/factor_analysis.py @@ -28,7 +28,7 @@ from ..base import BaseEstimator, TransformerMixin from ..externals.six.moves import xrange from ..utils import check_array, check_random_state -from ..utils.extmath import fast_logdet, fast_dot, randomized_svd, squared_norm +from ..utils.extmath import fast_logdet, randomized_svd, squared_norm from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning @@ -256,8 +256,8 @@ def transform(self, X): Wpsi = self.components_ / self.noise_variance_ cov_z = linalg.inv(Ih + np.dot(Wpsi, self.components_.T)) - tmp = fast_dot(X_transformed, Wpsi.T) - X_transformed = fast_dot(tmp, cov_z) + tmp = np.dot(X_transformed, Wpsi.T) + X_transformed = np.dot(tmp, cov_z) return X_transformed diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index 3cca0b7d6e89c..55f4c38cbe4c8 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -16,7 +16,6 @@ from ..externals import six from ..externals.six import moves from ..utils import check_array, as_float_array, check_random_state -from ..utils.extmath import fast_dot from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES @@ -74,7 +73,7 @@ def _ica_def(X, tol, g, fun_args, max_iter, w_init): w /= np.sqrt((w ** 2).sum()) for i in moves.xrange(max_iter): - gwtx, g_wtx = g(fast_dot(w.T, X), fun_args) + gwtx, g_wtx = g(np.dot(w.T, X), fun_args) w1 = (X * gwtx).mean(axis=1) - g_wtx.mean() * w @@ -103,12 +102,12 @@ def _ica_par(X, tol, g, fun_args, max_iter, w_init): del w_init p_ = float(X.shape[1]) for ii in moves.xrange(max_iter): - gwtx, g_wtx = g(fast_dot(W, X), fun_args) - W1 = _sym_decorrelation(fast_dot(gwtx, X.T) / p_ + gwtx, g_wtx = g(np.dot(W, X), fun_args) + W1 = _sym_decorrelation(np.dot(gwtx, X.T) / p_ - g_wtx[:, np.newaxis] * W) del gwtx, g_wtx # builtin max, abs are faster than numpy counter parts. - lim = max(abs(abs(np.diag(fast_dot(W1, W.T))) - 1)) + lim = max(abs(abs(np.diag(np.dot(W1, W.T))) - 1)) W = W1 if lim < tol: break @@ -345,7 +344,7 @@ def g(x, fun_args): if whiten: if compute_sources: - S = fast_dot(fast_dot(W, K), X).T + S = np.dot(np.dot(W, K), X).T else: S = None if return_X_mean: @@ -361,7 +360,7 @@ def g(x, fun_args): else: if compute_sources: - S = fast_dot(W, X).T + S = np.dot(W, X).T else: S = None if return_X_mean: @@ -551,7 +550,7 @@ def transform(self, X, y=None, copy=True): if self.whiten: X -= self.mean_ - return fast_dot(X, self.components_.T) + return np.dot(X, self.components_.T) def inverse_transform(self, X, copy=True): """Transform the sources back to the mixed data (apply mixing matrix). 
@@ -571,7 +570,7 @@ def inverse_transform(self, X, copy=True): check_is_fitted(self, 'mixing_') X = check_array(X, copy=(copy and self.whiten), dtype=FLOAT_DTYPES) - X = fast_dot(X, self.mixing_.T) + X = np.dot(X, self.mixing_.T) if self.whiten: X += self.mean_ diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 7623723125e96..72a52f802accb 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -20,7 +20,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm -from ..utils.extmath import fast_dot, safe_min +from ..utils.extmath import safe_min from ..utils.validation import check_is_fitted, check_non_negative from ..exceptions import ConvergenceWarning from .cdnmf_fast import _update_cdnmf_fast @@ -109,7 +109,7 @@ def _beta_divergence(X, W, H, beta, square_root=False): WH_data = _special_sparse_dot(W, H, X).data X_data = X.data else: - WH = fast_dot(W, H) + WH = np.dot(W, H) WH_data = WH.ravel() X_data = X.ravel() @@ -142,7 +142,7 @@ def _beta_divergence(X, W, H, beta, square_root=False): # np.sum(np.dot(W, H) ** beta) sum_WH_beta = 0 for i in range(X.shape[1]): - sum_WH_beta += np.sum(fast_dot(W, H[:, i]) ** beta) + sum_WH_beta += np.sum(np.dot(W, H[:, i]) ** beta) else: sum_WH_beta = np.sum(WH ** beta) @@ -166,7 +166,7 @@ def _special_sparse_dot(W, H, X): WH = sp.coo_matrix((dot_vals, (ii, jj)), shape=X.shape) return WH.tocsr() else: - return fast_dot(W, H) + return np.dot(W, H) def _compute_regularization(alpha, l1_ratio, regularization): @@ -379,7 +379,7 @@ def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, """ n_components = Ht.shape[1] - HHt = fast_dot(Ht.T, Ht) + HHt = np.dot(Ht.T, Ht) XHt = safe_sparse_dot(X, Ht) # L2 regularization corresponds to increase of the diagonal of HHt @@ -521,8 +521,8 @@ def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, # Denominator if HHt is None: - HHt = fast_dot(H, H.T) - denominator = fast_dot(W, HHt) + HHt = np.dot(H, H.T) + denominator = np.dot(W, HHt) else: # Numerator @@ -566,14 +566,14 @@ def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, # (compute row by row, avoiding the dense matrix WH) WHHt = np.empty(W.shape) for i in range(X.shape[0]): - WHi = fast_dot(W[i, :], H) + WHi = np.dot(W[i, :], H) if beta_loss - 1 < 0: WHi[WHi == 0] = EPSILON WHi **= beta_loss - 1 - WHHt[i, :] = fast_dot(WHi, H.T) + WHHt[i, :] = np.dot(WHi, H.T) else: WH **= beta_loss - 1 - WHHt = fast_dot(WH, H.T) + WHHt = np.dot(WH, H.T) denominator = WHHt # Add L1 and L2 regularization @@ -597,7 +597,7 @@ def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): """update H in Multiplicative Update NMF""" if beta_loss == 2: numerator = safe_sparse_dot(W.T, X) - denominator = fast_dot(fast_dot(W.T, W), H) + denominator = np.dot(np.dot(W.T, W), H) else: # Numerator @@ -641,14 +641,14 @@ def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma): # (compute column by column, avoiding the dense matrix WH) WtWH = np.empty(H.shape) for i in range(X.shape[1]): - WHi = fast_dot(W, H[:, i]) + WHi = np.dot(W, H[:, i]) if beta_loss - 1 < 0: WHi[WHi == 0] = EPSILON WHi **= beta_loss - 1 - WtWH[:, i] = fast_dot(W.T, WHi) + WtWH[:, i] = np.dot(W.T, WHi) else: WH **= beta_loss - 1 - WtWH = fast_dot(W.T, WH) + WtWH = np.dot(W.T, WH) denominator = WtWH # Add L1 and L2 regularization diff --git 
a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 852a0c42cf7cc..89b9f67ee437c 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -25,7 +25,7 @@ from ..utils import deprecated from ..utils import check_random_state, as_float_array from ..utils import check_array -from ..utils.extmath import fast_dot, fast_logdet, randomized_svd, svd_flip +from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted @@ -749,7 +749,7 @@ def transform(self, X, y=None): if self.mean_ is not None: X = X - self.mean_ - X = fast_dot(X, self.components_.T) + X = np.dot(X, self.components_.T) return X def fit_transform(self, X, y=None): @@ -768,7 +768,7 @@ def fit_transform(self, X, y=None): """ X = check_array(X) X = self._fit(X) - return fast_dot(X, self.components_.T) + return np.dot(X, self.components_.T) def inverse_transform(self, X, y=None): """Transform data back to its original space. @@ -792,7 +792,7 @@ def inverse_transform(self, X, y=None): """ check_is_fitted(self, 'mean_') - X_original = fast_dot(X, self.components_) + X_original = np.dot(X, self.components_) if self.mean_ is not None: X_original = X_original + self.mean_ return X_original diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 6254c147d45a5..3ce53b550cb0e 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -16,7 +16,7 @@ from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_greater from sklearn.utils.testing import ignore_warnings -from sklearn.utils.extmath import squared_norm, fast_dot +from sklearn.utils.extmath import squared_norm from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning @@ -241,7 +241,7 @@ def _beta_divergence_dense(X, W, H, beta): H = np.array([[H]]) X = np.array([[X]]) - WH = fast_dot(W, H) + WH = np.dot(W, H) if beta == 2: return squared_norm(X - WH) / 2 diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index bc7b1a7945c81..a646e9e6ba0a5 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -23,7 +23,6 @@ from .utils.multiclass import unique_labels from .utils import check_array, check_X_y from .utils.validation import check_is_fitted -from .utils.fixes import bincount from .utils.multiclass import check_classification_targets from .preprocessing import StandardScaler @@ -337,8 +336,7 @@ class scatter). 
This solver supports both classification and self.explained_variance_ratio_ = np.sort(evals / np.sum(evals) )[::-1][:self._max_components] evecs = evecs[:, np.argsort(evals)[::-1]] # sort eigenvectors - # evecs /= np.linalg.norm(evecs, axis=0) # doesn't work with numpy 1.6 - evecs /= np.apply_along_axis(np.linalg.norm, 0, evecs) + evecs /= np.linalg.norm(evecs, axis=0) self.scalings_ = evecs self.coef_ = np.dot(self.means_, evecs).dot(evecs.T) @@ -431,7 +429,7 @@ def fit(self, X, y): if self.priors is None: # estimate priors from sample _, y_t = np.unique(y, return_inverse=True) # non-negative ints - self.priors_ = bincount(y_t) / float(len(y)) + self.priors_ = np.bincount(y_t) / float(len(y)) else: self.priors_ = np.asarray(self.priors) @@ -642,7 +640,7 @@ def fit(self, X, y): if n_classes < 2: raise ValueError('y has less than 2 classes') if self.priors is None: - self.priors_ = bincount(y) / float(n_samples) + self.priors_ = np.bincount(y) / float(n_samples) else: self.priors_ = self.priors diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py index 711d089d057d9..243d898439315 100644 --- a/sklearn/ensemble/bagging.py +++ b/sklearn/ensemble/bagging.py @@ -21,7 +21,6 @@ from ..utils.random import sample_without_replacement from ..utils.validation import has_fit_parameter, check_is_fitted from ..utils import indices_to_mask, check_consistent_length -from ..utils.fixes import bincount from ..utils.metaestimators import if_delegate_has_method from ..utils.multiclass import check_classification_targets @@ -104,7 +103,7 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, curr_sample_weight = sample_weight.copy() if bootstrap: - sample_counts = bincount(indices, minlength=n_samples) + sample_counts = np.bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts else: not_indices_mask = ~indices_to_mask(indices, n_samples) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 340df89673646..60732bf83a446 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -61,7 +61,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..utils import check_random_state, check_array, compute_sample_weight from ..exceptions import DataConversionWarning, NotFittedError from .base import BaseEnsemble, _partition_estimators -from ..utils.fixes import bincount, parallel_helper +from ..utils.fixes import parallel_helper from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted @@ -85,7 +85,7 @@ def _generate_sample_indices(random_state, n_samples): def _generate_unsampled_indices(random_state, n_samples): """Private function used to forest._set_oob_score function.""" sample_indices = _generate_sample_indices(random_state, n_samples) - sample_counts = bincount(sample_indices, minlength=n_samples) + sample_counts = np.bincount(sample_indices, minlength=n_samples) unsampled_mask = sample_counts == 0 indices_range = np.arange(n_samples) unsampled_indices = indices_range[unsampled_mask] @@ -107,7 +107,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, curr_sample_weight = sample_weight.copy() indices = _generate_sample_indices(tree.random_state, n_samples) - sample_counts = bincount(indices, minlength=n_samples) + sample_counts = np.bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts if class_weight == 'subsample': diff --git a/sklearn/ensemble/gradient_boosting.py 
b/sklearn/ensemble/gradient_boosting.py index 8c16ccf78ffa6..779cf2d3b543c 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -56,7 +56,6 @@ from ..utils import check_X_y from ..utils import column_or_1d from ..utils import check_consistent_length -from ..utils.fixes import bincount from ..utils import deprecated from ..utils.stats import _weighted_percentile from ..utils.validation import check_is_fitted @@ -139,7 +138,7 @@ class in the training data. def fit(self, X, y, sample_weight=None): if sample_weight is None: sample_weight = np.ones_like(y, dtype=np.float64) - class_counts = bincount(y, weights=sample_weight) + class_counts = np.bincount(y, weights=sample_weight) self.priors = class_counts / class_counts.sum() def predict(self, X): diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 63f81e5b5550d..b964ed768d85e 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -41,7 +41,6 @@ from sklearn.ensemble import RandomTreesEmbedding from sklearn.model_selection import GridSearchCV from sklearn.svm import LinearSVC -from sklearn.utils.fixes import bincount from sklearn.utils.validation import check_random_state from sklearn.tree.tree import SPARSE_SPLITTERS @@ -255,7 +254,7 @@ def entropy(samples): n_samples = len(samples) entropy = 0. - for count in bincount(samples): + for count in np.bincount(samples): p = 1. * count / n_samples if p > 0: entropy -= p * np.log2(p) @@ -735,7 +734,7 @@ def check_min_samples_leaf(name): est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) - node_counts = bincount(out) + node_counts = np.bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert_greater(np.min(leaf_count), 4, @@ -777,7 +776,7 @@ def check_min_weight_fraction_leaf(name): est.fit(X, y, sample_weight=weights) out = est.estimators_[0].tree_.apply(X) - node_weights = bincount(out, weights=weights) + node_weights = np.bincount(out, weights=weights) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert_greater_equal( diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx index 39c2b10378132..e0c1d1bdaece5 100644 --- a/sklearn/feature_extraction/_hashing.pyx +++ b/sklearn/feature_extraction/_hashing.pyx @@ -76,8 +76,5 @@ def transform(raw_X, Py_ssize_t n_features, dtype): array.resize_smart(indptr, len(indptr) + 1) indptr[len(indptr) - 1] = size - if len(indices): - indices_a = np.frombuffer(indices, dtype=np.int32) - else: # workaround for NumPy < 1.7.0 - indices_a = np.empty(0, dtype=np.int32) + indices_a = np.frombuffer(indices, dtype=np.int32) return (indices_a, np.frombuffer(indptr, dtype=np.int32), values[:size]) diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py index 66390d7a2c963..53804ed83ac45 100644 --- a/sklearn/feature_extraction/dict_vectorizer.py +++ b/sklearn/feature_extraction/dict_vectorizer.py @@ -13,7 +13,6 @@ from ..externals import six from ..externals.six.moves import xrange from ..utils import check_array, tosequence -from ..utils.fixes import frombuffer_empty def _tosequence(X): @@ -183,7 +182,7 @@ def _transform(self, X, fitting): if len(indptr) == 1: raise ValueError("Sample sequence X is empty.") - indices = frombuffer_empty(indices, dtype=np.intc) + indices = np.frombuffer(indices, dtype=np.intc) indptr = 
np.frombuffer(indptr, dtype=np.intc) shape = (len(indptr) - 1, len(vocab)) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 708424cb3f843..37e1a7e3465e9 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,7 +16,6 @@ from numpy.lib.stride_tricks import as_strided from ..utils import check_array, check_random_state -from ..utils.fixes import astype from ..base import BaseEstimator __all__ = ['PatchExtractor', @@ -108,7 +107,7 @@ def _to_graph(n_x, n_y, n_z, mask=None, img=None, n_voxels = diag.size else: if mask is not None: - mask = astype(mask, dtype=np.bool, copy=False) + mask = mask.astype(dtype=np.bool, copy=False) mask = np.asarray(mask, dtype=np.bool) edges = _mask_edges_weights(mask, edges) n_voxels = np.sum(mask) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 341486abd3b1c..de6674646c981 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -24,7 +24,6 @@ from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal from numpy.testing import assert_raises -from sklearn.utils.random import choice from sklearn.utils.testing import (assert_equal, assert_false, assert_true, assert_not_equal, assert_almost_equal, assert_in, assert_less, assert_greater, @@ -863,8 +862,7 @@ def test_countvectorizer_vocab_sets_when_pickling(): vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad', 'sparkling', 'tomato', 'water']) for x in range(0, 100): - vocab_set = set(choice(vocab_words, size=5, replace=False, - random_state=rng)) + vocab_set = set(rng.choice(vocab_words, size=5, replace=False)) cv = CountVectorizer(vocabulary=vocab_set) unpickled_cv = pickle.loads(pickle.dumps(cv)) cv.fit(ALL_FOOD_DOCS) @@ -878,7 +876,7 @@ def test_countvectorizer_vocab_dicts_when_pickling(): 'salad', 'sparkling', 'tomato', 'water']) for x in range(0, 100): vocab_dict = dict() - words = choice(vocab_words, size=5, replace=False, random_state=rng) + words = rng.choice(vocab_words, size=5, replace=False) for y in range(0, 5): vocab_dict[words[y]] = y cv = CountVectorizer(vocabulary=vocab_dict) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 539e88973bcc0..3cf76187350f6 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -29,7 +29,6 @@ from ..preprocessing import normalize from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS -from ..utils.fixes import frombuffer_empty, bincount from ..utils.validation import check_is_fitted __all__ = ['CountVectorizer', @@ -503,7 +502,7 @@ def _get_hasher(self): def _document_frequency(X): """Count the number of non-zero values for each feature in sparse X.""" if sp.isspmatrix_csr(X): - return bincount(X.indices, minlength=X.shape[1]) + return np.bincount(X.indices, minlength=X.shape[1]) else: return np.diff(sp.csc_matrix(X, copy=False).indptr) @@ -783,7 +782,7 @@ def _count_vocab(self, raw_documents, fixed_vocab): j_indices = np.asarray(j_indices, dtype=np.intc) indptr = np.frombuffer(indptr, dtype=np.intc) - values = frombuffer_empty(values, dtype=np.intc) + values = np.frombuffer(values, dtype=np.intc) X = sp.csr_matrix((values, j_indices, indptr), shape=(len(indptr) - 1, len(vocabulary)), diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index 
dada33e9a75cc..ab6ce1ddb2545 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -8,7 +8,6 @@ from ..externals import six from ..exceptions import NotFittedError -from ..utils.fixes import norm from ..utils.metaestimators import if_delegate_has_method @@ -21,7 +20,8 @@ def _get_feature_importances(estimator, norm_order=1): importances = np.abs(estimator.coef_) else: - importances = norm(estimator.coef_, axis=0, ord=norm_order) + importances = np.linalg.norm(estimator.coef_, axis=0, + ord=norm_order) elif importances is None: raise ValueError( diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 6ef0d824b587c..6ac6b8630b139 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -17,7 +17,6 @@ from sklearn.feature_selection import SelectFromModel from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import PassiveAggressiveClassifier -from sklearn.utils.fixes import norm iris = datasets.load_iris() data, y = iris.data, iris.target @@ -101,7 +100,7 @@ def test_feature_importances_2d_coef(): # Manually check that the norm is correctly performed est.fit(X, y) - importances = norm(est.coef_, axis=0, ord=order) + importances = np.linalg.norm(est.coef_, axis=0, ord=order) feature_mask = importances > func(importances) assert_array_equal(X_new, X[:, feature_mask]) diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 910c23508592a..245fc95ff69b2 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -9,7 +9,6 @@ from .base import BaseEstimator, TransformerMixin, RegressorMixin from .utils import as_float_array, check_array, check_consistent_length from .utils import deprecated -from .utils.fixes import astype from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique import warnings import math @@ -291,7 +290,7 @@ def _build_y(self, X, y, sample_weight, trim_duplicates=True): sample_weight = np.ones(len(y)) order = np.lexsort((y, X)) - X, y, sample_weight = [astype(array[order], np.float64, copy=False) + X, y, sample_weight = [array[order].astype(np.float64, copy=False) for array in [X, y, sample_weight]] unique_X, unique_y, unique_sample_weight = _make_unique( X, y, sample_weight) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 59d55cad3eb7f..0cfe4c3cad031 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -14,7 +14,6 @@ from .cross_validation import _safe_split, _score, _fit_and_score from .metrics.scorer import check_scoring from .utils import indexable -from .utils.fixes import astype warnings.warn("This module was deprecated in version 0.18 in favor of the " @@ -214,7 +213,7 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): "must be within (0, 1], but is within [%f, %f]." 
% (n_min_required_samples, n_max_required_samples)) - train_sizes_abs = astype(train_sizes_abs * n_max_training_samples, + train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype( dtype=np.int, copy=False) train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index b3c61408470cc..85f2b8ef7df07 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -20,7 +20,6 @@ from ..externals import six from .sgd_fast import plain_sgd, average_sgd -from ..utils.fixes import astype from ..utils import compute_class_weight from ..utils import deprecated from .sgd_fast import Hinge @@ -864,7 +863,7 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate, n_iter, sample_weight, coef_init, intercept_init): X, y = check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64) - y = astype(y, np.float64, copy=False) + y = y.astype(np.float64, copy=False) n_samples, n_features = X.shape diff --git a/sklearn/linear_model/theil_sen.py b/sklearn/linear_model/theil_sen.py index b51f7d6dd3c32..544f79f9df054 100644 --- a/sklearn/linear_model/theil_sen.py +++ b/sklearn/linear_model/theil_sen.py @@ -21,7 +21,6 @@ from ..base import RegressorMixin from ..utils import check_random_state from ..utils import check_X_y, _get_n_jobs -from ..utils.random import choice from ..externals.joblib import Parallel, delayed from ..externals.six.moves import xrange as range from ..exceptions import ConvergenceWarning @@ -365,10 +364,8 @@ def fit(self, X, y): if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation: indices = list(combinations(range(n_samples), n_subsamples)) else: - indices = [choice(n_samples, - size=n_subsamples, - replace=False, - random_state=random_state) + indices = [random_state.choice(n_samples, size=n_subsamples, + replace=False) for _ in range(self.n_subpopulation_)] n_jobs = _get_n_jobs(self.n_jobs) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 30b9ca16e88c2..0fc30e1aaa166 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -17,12 +17,10 @@ from ..base import BaseEstimator from ..utils import check_array from ..utils import check_random_state -from ..utils.extmath import _ravel from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances from . import _utils from . 
import _barnes_hut_tsne -from ..utils.fixes import astype from ..externals.six import string_types from ..utils import deprecated @@ -53,7 +51,7 @@ def _joint_probabilities(distances, desired_perplexity, verbose): """ # Compute conditional probabilities such that they approximately match # the desired perplexity - distances = astype(distances, np.float32, copy=False) + distances = distances.astype(np.float32, copy=False) conditional_P = _utils._binary_search_perplexity( distances, None, desired_perplexity, verbose) P = conditional_P + conditional_P.T @@ -90,8 +88,8 @@ def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): """ # Compute conditional probabilities such that they approximately match # the desired perplexity - distances = astype(distances, np.float32, copy=False) - neighbors = astype(neighbors, np.int64, copy=False) + distances = distances.astype(np.float32, copy=False) + neighbors = neighbors.astype(np.int64, copy=False) conditional_P = _utils._binary_search_perplexity( distances, neighbors, desired_perplexity, verbose) m = "All probabilities should be finite" @@ -158,7 +156,8 @@ def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components, grad = np.ndarray((n_samples, n_components)) PQd = squareform((P - Q) * n) for i in range(skip_num_points, n_samples): - np.dot(_ravel(PQd[i]), X_embedded[i] - X_embedded, out=grad[i]) + np.dot(np.ravel(PQd[i], order='K'), X_embedded[i] - X_embedded, + out=grad[i]) grad = grad.ravel() c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom grad *= c @@ -277,9 +276,9 @@ def _kl_divergence_bh(params, P, neighbors, degrees_of_freedom, n_samples, Unraveled gradient of the Kullback-Leibler divergence with respect to the embedding. """ - params = astype(params, np.float32, copy=False) + params = params.astype(np.float32, copy=False) X_embedded = params.reshape(n_samples, n_components) - neighbors = astype(neighbors, np.int64, copy=False) + neighbors = neighbors.astype(np.int64, copy=False) if len(P.shape) == 1: sP = squareform(P).astype(np.float32) else: diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index bcf9c48ab30ba..a01314589869b 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -38,7 +38,6 @@ from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples from ..utils.sparsefuncs import count_nonzero -from ..utils.fixes import bincount from ..exceptions import UndefinedMetricWarning @@ -1088,16 +1087,16 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, tp_bins_weights = None if len(tp_bins): - tp_sum = bincount(tp_bins, weights=tp_bins_weights, + tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): - pred_sum = bincount(y_pred, weights=sample_weight, + pred_sum = np.bincount(y_pred, weights=sample_weight, minlength=len(labels)) if len(y_true): - true_sum = bincount(y_true, weights=sample_weight, + true_sum = np.bincount(y_true, weights=sample_weight, minlength=len(labels)) # Retain only selected labels diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 6c984ec9e39d7..9115b93abefba 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -22,7 +22,6 @@ from scipy import sparse as sp from .expected_mutual_info_fast import expected_mutual_information -from ...utils.fixes 
import bincount from ...utils.validation import check_array @@ -862,7 +861,7 @@ def entropy(labels): if len(labels) == 0: return 1.0 label_idx = np.unique(labels, return_inverse=True)[1] - pi = bincount(label_idx).astype(np.float64) + pi = np.bincount(label_idx).astype(np.float64) pi = pi[pi > 0] pi_sum = np.sum(pi) # log(a / b) should be calculated as log(a) - log(b) for diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 3be683ae08d98..adb141c3120f0 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -9,7 +9,6 @@ from ...utils import check_random_state from ...utils import check_X_y -from ...utils.fixes import bincount from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder @@ -169,7 +168,7 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): distances = pairwise_distances(X, metric=metric, **kwds) unique_labels = le.classes_ - n_samples_per_label = bincount(labels, minlength=len(unique_labels)) + n_samples_per_label = np.bincount(labels, minlength=len(unique_labels)) # For sample i, store the mean distance of the cluster to which # it belongs in intra_clust_dists[i] diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 5dd28c735c05c..9e7134961c7a0 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -30,8 +30,6 @@ from ..utils import column_or_1d, check_array from ..utils.multiclass import type_of_target from ..utils.extmath import stable_cumsum -from ..utils.fixes import bincount -from ..utils.fixes import array_equal from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning @@ -353,11 +351,11 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): # ensure binary classification if pos_label is not specified classes = np.unique(y_true) if (pos_label is None and - not (array_equal(classes, [0, 1]) or - array_equal(classes, [-1, 1]) or - array_equal(classes, [0]) or - array_equal(classes, [-1]) or - array_equal(classes, [1]))): + not (np.array_equal(classes, [0, 1]) or + np.array_equal(classes, [-1, 1]) or + np.array_equal(classes, [0]) or + np.array_equal(classes, [-1]) or + np.array_equal(classes, [1]))): raise ValueError("Data is not binary and pos_label is not specified") elif pos_label is None: pos_label = 1. 
@@ -789,10 +787,10 @@ def label_ranking_loss(y_true, y_score, sample_weight=None): # Sort and bin the label scores unique_scores, unique_inverse = np.unique(y_score[i], return_inverse=True) - true_at_reversed_rank = bincount( + true_at_reversed_rank = np.bincount( unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores)) - all_at_reversed_rank = bincount(unique_inverse, + all_at_reversed_rank = np.bincount(unique_inverse, minlength=len(unique_scores)) false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 2e6e24051507d..4163240e24b68 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -691,14 +691,8 @@ def test_classification_report_multiclass_with_unicode_label(): avg / total 0.51 0.53 0.47 75 """ - if np_version[:3] < (1, 7, 0): - expected_message = ("NumPy < 1.7.0 does not implement" - " searchsorted on unicode data correctly.") - assert_raise_message(RuntimeError, expected_message, - classification_report, y_true, y_pred) - else: - report = classification_report(y_true, y_pred) - assert_equal(report, expected_report) + report = classification_report(y_true, y_pred) + assert_equal(report, expected_report) def test_classification_report_multiclass_with_long_string_label(): diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 369125f339559..d51487cd1cb4e 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -29,9 +29,7 @@ from ..utils.multiclass import type_of_target from ..externals.six import with_metaclass from ..externals.six.moves import zip -from ..utils.fixes import bincount from ..utils.fixes import signature -from ..utils.random import choice from ..base import _pprint __all__ = ['BaseCrossValidator', @@ -577,7 +575,7 @@ def _make_test_folds(self, X, y=None, groups=None): y = np.asarray(y) n_samples = y.shape[0] unique_y, y_inversed = np.unique(y, return_inverse=True) - y_counts = bincount(y_inversed) + y_counts = np.bincount(y_inversed) min_groups = np.min(y_counts) if np.all(self.n_splits > y_counts): raise ValueError("All the n_groups for individual classes" @@ -1377,6 +1375,7 @@ def _approximate_mode(class_counts, n_draws, rng): ... n_draws=2, rng=42) array([1, 1, 0, 0]) """ + rng = check_random_state(rng) # this computes a bad approximation to the mode of the # multivariate hypergeometric given by class_counts and n_draws continuous = n_draws * class_counts / class_counts.sum() @@ -1397,7 +1396,7 @@ def _approximate_mode(class_counts, n_draws, rng): # if we need to add more, we add them all and # go to the next value add_now = min(len(inds), need_to_add) - inds = choice(inds, size=add_now, replace=False, random_state=rng) + inds = rng.choice(inds, size=add_now, replace=False) floored[inds] += 1 need_to_add -= add_now if need_to_add == 0: @@ -1476,7 +1475,7 @@ def _iter_indices(self, X, y, groups=None): classes, y_indices = np.unique(y, return_inverse=True) n_classes = classes.shape[0] - class_counts = bincount(y_indices) + class_counts = np.bincount(y_indices) if np.min(class_counts) < 2: raise ValueError("The least populated class in y has only 1" " member, which is too few. 
The minimum" diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index e105f0d0b122f..db830619567d8 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -21,7 +21,6 @@ from ..base import is_classifier, clone from ..utils import indexable, check_random_state, safe_indexing -from ..utils.fixes import astype from ..utils.validation import _is_arraylike, _num_samples from ..utils.metaestimators import _safe_split from ..externals.joblib import Parallel, delayed, logger @@ -855,8 +854,8 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): "must be within (0, 1], but is within [%f, %f]." % (n_min_required_samples, n_max_required_samples)) - train_sizes_abs = astype(train_sizes_abs * n_max_training_samples, - dtype=np.int, copy=False) + train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype( + dtype=np.int, copy=False) train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) else: diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 3f804d414b750..1d6cf50ec1c33 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -11,7 +11,6 @@ import numpy as np import scipy.sparse as sp -from sklearn.utils.fixes import in1d from sklearn.utils.fixes import sp_version from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_not_equal @@ -1015,7 +1014,7 @@ def test_grid_search_correct_score_results(): expected_keys = (("mean_test_score", "rank_test_score") + tuple("split%d_test_score" % cv_i for cv_i in range(n_splits))) - assert_true(all(in1d(expected_keys, result_keys))) + assert_true(all(np.in1d(expected_keys, result_keys))) cv = StratifiedKFold(n_splits=n_splits) n_splits = grid_search.n_splits_ diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index c351d1169f4b2..5c81dc7041124 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -29,7 +29,6 @@ from .utils import check_X_y, check_array, check_consistent_length from .utils.extmath import safe_sparse_dot from .utils.multiclass import _check_partial_fit_first_call -from .utils.fixes import in1d from .utils.validation import check_is_fitted from .externals import six @@ -387,7 +386,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, classes = self.classes_ unique_y = np.unique(y) - unique_y_in_classes = in1d(unique_y, classes) + unique_y_in_classes = np.in1d(unique_y, classes) if not np.all(unique_y_in_classes): raise ValueError("The target label(s) %s in y do not exist in the " diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 0cf8bc04ae230..e963c2744c416 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -18,7 +18,6 @@ from ..metrics import pairwise_distances from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices -from ..utils.fixes import argpartition from ..utils.multiclass import check_classification_targets from ..externals import six from ..externals.joblib import Parallel, delayed @@ -356,7 +355,7 @@ class from an array representing our data set and ask who's X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, **self.effective_metric_params_) - neigh_ind = argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) neigh_ind = neigh_ind[:, :n_neighbors] # argpartition doesn't 
guarantee sorted order, so we sort again neigh_ind = neigh_ind[ diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index e9821797e83a2..46937c77bee46 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -19,7 +19,6 @@ from ..utils import check_array from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var -from ..utils.fixes import bincount from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) from ..utils.sparsefuncs import (inplace_column_scale, @@ -1149,7 +1148,7 @@ def powers_(self): combinations = self._combinations(self.n_input_features_, self.degree, self.interaction_only, self.include_bias) - return np.vstack(bincount(c, minlength=self.n_input_features_) + return np.vstack(np.bincount(c, minlength=self.n_input_features_) for c in combinations) def get_feature_names(self, input_features=None): diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index e414e98f424df..12d5425fbf604 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -10,7 +10,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array -from ..utils.fixes import astype from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES @@ -226,7 +225,7 @@ def _sparse_fit(self, X, strategy, missing_values, axis): X.indptr[1:-1]) # astype necessary for bug in numpy.hsplit before v1.9 - columns = [col[astype(mask, bool, copy=False)] + columns = [col[mask.astype(bool, copy=False)] for col, mask in zip(columns_all, mask_valids)] # Median @@ -357,8 +356,8 @@ def transform(self, X): indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), np.diff(X.indptr))[mask] - X.data[mask] = astype(valid_statistics[indexes], X.dtype, - copy=False) + X.data[mask] = valid_statistics[indexes].astype(X.dtype, + copy=False) else: if sparse.issparse(X): X = X.toarray() diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f2f7d9afad347..e8ea17f413a59 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -15,10 +15,7 @@ from ..base import BaseEstimator, TransformerMixin -from ..utils.fixes import np_version from ..utils.fixes import sparse_min_max -from ..utils.fixes import astype -from ..utils.fixes import in1d from ..utils import column_or_1d from ..utils.validation import check_array from ..utils.validation import check_is_fitted @@ -39,20 +36,6 @@ ] -def _check_numpy_unicode_bug(labels): - """Check that user is not subject to an old numpy bug - - Fixed in master before 1.7.0: - - https://github.com/numpy/numpy/pull/243 - - """ - if np_version[:3] < (1, 7, 0) and labels.dtype.kind == 'U': - raise RuntimeError("NumPy < 1.7.0 does not implement searchsorted" - " on unicode data correctly. Please upgrade" - " NumPy to use LabelEncoder with unicode inputs.") - - class LabelEncoder(BaseEstimator, TransformerMixin): """Encode labels with value between 0 and n_classes-1. @@ -110,7 +93,6 @@ def fit(self, y): self : returns an instance of self. 
""" y = column_or_1d(y, warn=True) - _check_numpy_unicode_bug(y) self.classes_ = np.unique(y) return self @@ -127,7 +109,6 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ y = column_or_1d(y, warn=True) - _check_numpy_unicode_bug(y) self.classes_, y = np.unique(y, return_inverse=True) return y @@ -147,7 +128,6 @@ def transform(self, y): y = column_or_1d(y, warn=True) classes = np.unique(y) - _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.classes_)) < len(classes): diff = np.setdiff1d(classes, self.classes_) raise ValueError("y contains new labels: %s" % str(diff)) @@ -520,7 +500,7 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False): y = column_or_1d(y) # pick out the known labels from y - y_in_classes = in1d(y, classes) + y_in_classes = np.in1d(y, classes) y_seen = y[y_in_classes] indices = np.searchsorted(sorted_class, y_seen) indptr = np.hstack((0, np.cumsum(y_in_classes))) @@ -541,7 +521,7 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False): if not sparse_output: Y = Y.toarray() - Y = astype(Y, int, copy=False) + Y = Y.astype(int, copy=False) if neg_label != 0: Y[Y == 0] = neg_label @@ -549,7 +529,7 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False): if pos_switch: Y[Y == pos_label] = 0 else: - Y.data = astype(Y.data, int, copy=False) + Y.data = Y.data.astype(int, copy=False) # preserve label ordering if np.any(classes != sorted_class): diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 353f0023fc686..5b7637c0c3ee3 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,9 +4,6 @@ import numpy as np from ..externals import six -from ..utils.fixes import in1d - -from .fixes import bincount def compute_class_weight(class_weight, classes, y): @@ -55,7 +52,7 @@ def compute_class_weight(class_weight, classes, y): raise ValueError("classes should have valid labels that are in y") recip_freq = len(y) / (len(le.classes_) * - bincount(y_ind).astype(np.float64)) + np.bincount(y_ind).astype(np.float64)) weight = recip_freq[le.transform(classes)] else: # user-defined dictionary @@ -170,7 +167,7 @@ def compute_sample_weight(class_weight, y, indices=None): if classes_missing: # Make missing classes' weight zero - weight_k[in1d(y_full, list(classes_missing))] = 0. + weight_k[np.in1d(y_full, list(classes_missing))] = 0. expanded_class_weight.append(weight_k) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 18a3db760b0ba..c8365ae9ab166 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -12,7 +12,6 @@ # License: BSD 3 clause from __future__ import division -from functools import partial import warnings import numpy as np @@ -26,7 +25,6 @@ from ..externals.six.moves import xrange from .sparsefuncs_fast import csr_row_norms from .validation import check_array -from ..exceptions import NonBLASDotWarning @deprecated("sklearn.utils.extmath.norm was deprecated in version 0.19" @@ -40,20 +38,13 @@ def norm(x): return linalg.norm(x) -# Newer NumPy has a ravel that needs less copying. -if np_version < (1, 7, 1): - _ravel = np.ravel -else: - _ravel = partial(np.ravel, order='K') - - def squared_norm(x): """Squared Euclidean or Frobenius norm of x. Returns the Euclidean norm when x is a vector, the Frobenius norm when x is a matrix (2-d array). Faster than norm(x) ** 2. 
""" - x = _ravel(x) + x = np.ravel(x, order='K') if np.issubdtype(x.dtype, np.integer): warnings.warn('Array type is integer, np.dot may overflow. ' 'Data should be float type to avoid this issue', @@ -103,68 +94,10 @@ def _impose_f_order(X): return check_array(X, copy=False, order='F'), False -def _fast_dot(A, B): - if B.shape[0] != A.shape[A.ndim - 1]: # check adopted from '_dotblas.c' - raise ValueError - - if A.dtype != B.dtype or any(x.dtype not in (np.float32, np.float64) - for x in [A, B]): - warnings.warn('Falling back to np.dot. ' - 'Data must be of same type of either ' - '32 or 64 bit float for the BLAS function, gemm, to be ' - 'used for an efficient dot operation. ', - NonBLASDotWarning) - raise ValueError - - if min(A.shape) == 1 or min(B.shape) == 1 or A.ndim != 2 or B.ndim != 2: - raise ValueError - - # scipy 0.9 compliant API - dot = linalg.get_blas_funcs(['gemm'], (A, B))[0] - A, trans_a = _impose_f_order(A) - B, trans_b = _impose_f_order(B) - return dot(alpha=1.0, a=A, b=B, trans_a=trans_a, trans_b=trans_b) - - -def _have_blas_gemm(): - try: - linalg.get_blas_funcs(['gemm']) - return True - except (AttributeError, ValueError): - warnings.warn('Could not import BLAS, falling back to np.dot') - return False - - -# Only use fast_dot for older NumPy; newer ones have tackled the speed issue. -if np_version < (1, 7, 2) and _have_blas_gemm(): - def fast_dot(A, B): - """Compute fast dot products directly calling BLAS. - - This function calls BLAS directly while warranting Fortran contiguity. - This helps avoiding extra copies `np.dot` would have created. - For details see section `Linear Algebra on large Arrays`: - http://wiki.scipy.org/PerformanceTips - - Parameters - ---------- - A, B: instance of np.ndarray - Input arrays. Arrays are supposed to be of the same dtype and to - have exactly 2 dimensions. Currently only floats are supported. - In case these requirements aren't met np.dot(A, B) is returned - instead. To activate the related warning issued in this case - execute the following lines of code: - - >> import warnings - >> from sklearn.exceptions import NonBLASDotWarning - >> warnings.simplefilter('always', NonBLASDotWarning) - """ - try: - return _fast_dot(A, B) - except ValueError: - # Maltyped or malformed data. - return np.dot(A, B) -else: - fast_dot = np.dot +@deprecated("sklearn.utils.extmath.fast_dot was deprecated in version 0.19" + "and will be removed in 0.21. 
Use the equivalent np.dot instead.") +def fast_dot(a, b, out=None): + return np.dot(a, b, out) def density(w, **kwargs): @@ -191,7 +124,7 @@ def safe_sparse_dot(a, b, dense_output=False): ret = ret.toarray() return ret else: - return fast_dot(a, b) + return np.dot(a, b) def randomized_range_finder(A, size, n_iter, diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index a9a20c61b3428..bfa5c917b0030 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -43,17 +43,7 @@ def _parse_version(version_string): sp_version = _parse_version(scipy.__version__) -# little danse to see if np.copy has an 'order' keyword argument -# Supported since numpy 1.7.0 -if 'order' in signature(np.copy).parameters: - def safe_copy(X): - # Copy, but keep the order - return np.copy(X, order='K') -else: - # Before an 'order' argument was introduced, numpy wouldn't muck with - # the ordering - safe_copy = np.copy - +# Remove when minimum required NumPy >= 1.10 try: if (not np.allclose(np.divide(.4, 1, casting="unsafe"), np.divide(.4, 1, casting="unsafe", dtype=np.float64)) @@ -82,18 +72,6 @@ def divide(x1, x2, out=None, dtype=None): return out -try: - np.array(5).astype(float, copy=False) -except TypeError: - # Compat where astype accepted no copy argument (numpy < 1.7.0) - def astype(array, dtype, copy=True): - if not copy and array.dtype == dtype: - return array - return array.astype(dtype) -else: - astype = np.ndarray.astype - - try: with warnings.catch_warnings(record=True): # Don't raise the numpy deprecation warnings that appear in @@ -106,11 +84,7 @@ def astype(array, dtype, copy=True): def _minor_reduce(X, ufunc): major_index = np.flatnonzero(np.diff(X.indptr)) - if X.data.size == 0 and major_index.size == 0: - # Numpy < 1.8.0 don't handle empty arrays in reduceat - value = np.zeros_like(X.data) - else: - value = ufunc.reduceat(X.data, X.indptr[major_index]) + value = ufunc.reduceat(X.data, X.indptr[major_index]) return major_index, value def _min_or_max_axis(X, axis, min_or_max): @@ -164,79 +138,6 @@ def sparse_min_max(X, axis): X.max(axis=axis).toarray().ravel()) -try: - from numpy import argpartition -except ImportError: - # numpy.argpartition was introduced in v 1.8.0 - def argpartition(a, kth, axis=-1, kind='introselect', order=None): - return np.argsort(a, axis=axis, order=order) - -try: - from numpy import partition -except ImportError: - warnings.warn('Using `sort` instead of partition.' - 'Upgrade numpy to 1.8 for better performace on large number' - 'of clusters') - def partition(a, kth, axis=-1, kind='introselect', order=None): - return np.sort(a, axis=axis, order=order) - - -if np_version < (1, 7): - # Prior to 1.7.0, np.frombuffer wouldn't work for empty first arg. - def frombuffer_empty(buf, dtype): - if len(buf) == 0: - return np.empty(0, dtype=dtype) - else: - return np.frombuffer(buf, dtype=dtype) -else: - frombuffer_empty = np.frombuffer - - -if np_version < (1, 8): - def in1d(ar1, ar2, assume_unique=False, invert=False): - # Backport of numpy function in1d 1.8.1 to support numpy 1.6.2 - # Ravel both arrays, behavior for the first array could be different - ar1 = np.asarray(ar1).ravel() - ar2 = np.asarray(ar2).ravel() - - # This code is significantly faster when the condition is satisfied. 
- if len(ar2) < 10 * len(ar1) ** 0.145: - if invert: - mask = np.ones(len(ar1), dtype=np.bool) - for a in ar2: - mask &= (ar1 != a) - else: - mask = np.zeros(len(ar1), dtype=np.bool) - for a in ar2: - mask |= (ar1 == a) - return mask - - # Otherwise use sorting - if not assume_unique: - ar1, rev_idx = np.unique(ar1, return_inverse=True) - ar2 = np.unique(ar2) - - ar = np.concatenate((ar1, ar2)) - # We need this to be a stable sort, so always use 'mergesort' - # here. The values from the first array should always come before - # the values from the second array. - order = ar.argsort(kind='mergesort') - sar = ar[order] - if invert: - bool_ar = (sar[1:] != sar[:-1]) - else: - bool_ar = (sar[1:] == sar[:-1]) - flag = np.concatenate((bool_ar, [invert])) - indx = order.argsort(kind='mergesort')[:len(ar1)] - - if assume_unique: - return flag[indx] - else: - return flag[indx][rev_idx] -else: - from numpy import in1d - - if sp_version < (0, 15): # Backport fix for scikit-learn/scikit-learn#2986 / scipy/scipy#4142 from ._scipy_sparse_lsqr_backport import lsqr as sparse_lsqr @@ -249,22 +150,6 @@ def parallel_helper(obj, methodname, *args, **kwargs): return getattr(obj, methodname)(*args, **kwargs) -if np_version < (1, 6, 2): - # Allow bincount to accept empty arrays - # https://github.com/numpy/numpy/commit/40f0844846a9d7665616b142407a3d74cb65a040 - def bincount(x, weights=None, minlength=None): - if len(x) > 0: - return np.bincount(x, weights, minlength) - else: - if minlength is None: - minlength = 0 - minlength = np.asscalar(np.asarray(minlength, dtype=np.intp)) - return np.zeros(minlength, dtype=np.intp) - -else: - from numpy import bincount - - if 'exist_ok' in signature(os.makedirs).parameters: makedirs = os.makedirs else: @@ -287,20 +172,6 @@ def makedirs(name, mode=0o777, exist_ok=False): raise -if np_version < (1, 8, 1): - def array_equal(a1, a2): - # copy-paste from numpy 1.8.1 - try: - a1, a2 = np.asarray(a1), np.asarray(a2) - except: - return False - if a1.shape != a2.shape: - return False - return bool(np.asarray(a1 == a2).all()) -else: - from numpy import array_equal - - if np_version < (1, 12): class MaskedArray(np.ma.MaskedArray): # Before numpy 1.12, np.ma.MaskedArray object is not picklable @@ -317,33 +188,3 @@ def __getstate__(self): self._fill_value) else: from numpy.ma import MaskedArray # noqa - -if 'axis' not in signature(np.linalg.norm).parameters: - - def norm(X, ord=None, axis=None): - """ - Handles the axis parameter for the norm function - in old versions of numpy (useless for numpy >= 1.8). - """ - - if axis is None or X.ndim == 1: - result = np.linalg.norm(X, ord=ord) - return result - - if axis not in (0, 1): - raise NotImplementedError(""" - The fix that adds axis parameter to the old numpy - norm only works for 1D or 2D arrays. - """) - - if axis == 0: - X = X.T - - result = np.zeros(X.shape[0]) - for i in range(len(result)): - result[i] = np.linalg.norm(X[i], ord=ord) - - return result - -else: - norm = np.linalg.norm diff --git a/sklearn/utils/linear_assignment_.py b/sklearn/utils/linear_assignment_.py index 5282c84e21130..54a7d99fa369f 100644 --- a/sklearn/utils/linear_assignment_.py +++ b/sklearn/utils/linear_assignment_.py @@ -15,8 +15,6 @@ import numpy as np -from .fixes import astype - def linear_assignment(X): """Solve the linear assignment problem using the Hungarian algorithm. 
@@ -193,7 +191,7 @@ def _step4(state): # We convert to int as numpy operations are faster on int C = (state.C == 0).astype(np.int) covered_C = C * state.row_uncovered[:, np.newaxis] - covered_C *= astype(state.col_uncovered, dtype=np.int, copy=False) + covered_C *= state.col_uncovered.astype(dtype=np.int, copy=False) n = state.C.shape[0] m = state.C.shape[1] while True: @@ -215,7 +213,7 @@ def _step4(state): state.row_uncovered[row] = False state.col_uncovered[col] = True covered_C[:, col] = C[:, col] * ( - astype(state.row_uncovered, dtype=np.int, copy=False)) + state.row_uncovered.astype(dtype=np.int, copy=False)) covered_C[row] = 0 diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 2d3c80510db0d..34560225b1e8b 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -20,8 +20,7 @@ from ..externals.six import string_types from .validation import check_array -from ..utils.fixes import bincount -from ..utils.fixes import array_equal + def _unique_multiclass(y): @@ -299,7 +298,7 @@ def _check_partial_fit_first_call(clf, classes=None): elif classes is not None: if getattr(clf, 'classes_', None) is not None: - if not array_equal(clf.classes_, unique_labels(classes)): + if not np.array_equal(clf.classes_, unique_labels(classes)): raise ValueError( "`classes=%r` is not the same as on last call " "to partial_fit, was: %r" % (classes, clf.classes_)) @@ -360,7 +359,7 @@ def class_distribution(y, sample_weight=None): classes_k, y_k = np.unique(y.data[y.indptr[k]:y.indptr[k + 1]], return_inverse=True) - class_prior_k = bincount(y_k, weights=nz_samp_weight) + class_prior_k = np.bincount(y_k, weights=nz_samp_weight) # An explicit zero was found, combine its weight with the weight # of the implicit zeros @@ -382,7 +381,7 @@ def class_distribution(y, sample_weight=None): classes_k, y_k = np.unique(y[:, k], return_inverse=True) classes.append(classes_k) n_classes.append(classes_k.shape[0]) - class_prior_k = bincount(y_k, weights=sample_weight) + class_prior_k = np.bincount(y_k, weights=sample_weight) class_prior.append(class_prior_k / class_prior_k.sum()) return (classes, n_classes, class_prior) diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py index 5805f9be2c8fa..73ac0a9a572a3 100644 --- a/sklearn/utils/random.py +++ b/sklearn/utils/random.py @@ -4,18 +4,20 @@ from __future__ import division import numpy as np import scipy.sparse as sp -import operator import array from sklearn.utils import check_random_state -from sklearn.utils.fixes import astype from ._random import sample_without_replacement +from .deprecation import deprecated __all__ = ['sample_without_replacement', 'choice'] # This is a backport of np.random.choice from numpy 1.7 # The function can be removed when we bump the requirements to >=1.7 +@deprecated("sklearn.utils.random.choice was deprecated in version 0.19" + "and will be removed in 0.21. Use np.random.choice or" + "np.random.RandomState.choice instead.") def choice(a, size=None, replace=True, p=None, random_state=None): """ choice(a, size=None, replace=True, p=None) @@ -104,102 +106,11 @@ def choice(a, size=None, replace=True, p=None, random_state=None): dtype='|S11') """ - random_state = check_random_state(random_state) - - # Format and Verify input - a = np.array(a, copy=False) - if a.ndim == 0: - try: - # __index__ must return an integer by python rules. 
- pop_size = operator.index(a.item()) - except TypeError: - raise ValueError("a must be 1-dimensional or an integer") - if pop_size <= 0: - raise ValueError("a must be greater than 0") - elif a.ndim != 1: - raise ValueError("a must be 1-dimensional") + if random_state is not None: + random_state = check_random_state(random_state) + return random_state.choice(a, size, replace, p) else: - pop_size = a.shape[0] - if pop_size is 0: - raise ValueError("a must be non-empty") - - if p is not None: - p = np.array(p, dtype=np.double, ndmin=1, copy=False) - if p.ndim != 1: - raise ValueError("p must be 1-dimensional") - if p.size != pop_size: - raise ValueError("a and p must have same size") - if np.any(p < 0): - raise ValueError("probabilities are not non-negative") - if not np.allclose(p.sum(), 1): - raise ValueError("probabilities do not sum to 1") - - shape = size - if shape is not None: - size = np.prod(shape, dtype=np.intp) - else: - size = 1 - - # Actual sampling - if replace: - if p is not None: - cdf = p.cumsum() - cdf /= cdf[-1] - uniform_samples = random_state.random_sample(shape) - idx = cdf.searchsorted(uniform_samples, side='right') - # searchsorted returns a scalar - idx = np.array(idx, copy=False) - else: - idx = random_state.randint(0, pop_size, size=shape) - else: - if size > pop_size: - raise ValueError("Cannot take a larger sample than " - "population when 'replace=False'") - - if p is not None: - if np.sum(p > 0) < size: - raise ValueError("Fewer non-zero entries in p than size") - n_uniq = 0 - p = p.copy() - found = np.zeros(shape, dtype=np.int) - flat_found = found.ravel() - while n_uniq < size: - x = random_state.rand(size - n_uniq) - if n_uniq > 0: - p[flat_found[0:n_uniq]] = 0 - cdf = np.cumsum(p) - cdf /= cdf[-1] - new = cdf.searchsorted(x, side='right') - _, unique_indices = np.unique(new, return_index=True) - unique_indices.sort() - new = new.take(unique_indices) - flat_found[n_uniq:n_uniq + new.size] = new - n_uniq += new.size - idx = found - else: - idx = random_state.permutation(pop_size)[:size] - if shape is not None: - idx.shape = shape - - if shape is None and isinstance(idx, np.ndarray): - # In most cases a scalar will have been made an array - idx = idx.item(0) - - # Use samples as indices for a if a is array-like - if a.ndim == 0: - return idx - - if shape is not None and idx.ndim == 0: - # If size == () then the user requested a 0-d array as opposed to - # a scalar object when size is None. However a[idx] is always a - # scalar and not an array. So this makes sure the result is an - # array, taking into account that np.array(item) may not work - # for object arrays. 
- res = np.empty((), dtype=a.dtype) - res[()] = a[idx] - return res - - return a[idx] + return np.random.choice(a, size, replace, p) def random_choice_csc(n_samples, classes, class_probability=None, @@ -238,7 +149,7 @@ def random_choice_csc(n_samples, classes, class_probability=None, if classes[j].dtype.kind != 'i': raise ValueError("class dtype %s is not supported" % classes[j].dtype) - classes[j] = astype(classes[j], np.int64, copy=False) + classes[j] = classes[j].astype(np.int64, copy=False) # use uniform distribution if no class_probability is given if class_probability is None: diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 8515ff2593f31..9b081ec45f421 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -6,7 +6,7 @@ import scipy.sparse as sp import numpy as np -from .fixes import sparse_min_max, bincount +from .fixes import sparse_min_max from .sparsefuncs_fast import ( csr_mean_variance_axis0 as _csr_mean_var_axis0, csc_mean_variance_axis0 as _csc_mean_var_axis0, @@ -401,10 +401,10 @@ def count_nonzero(X, axis=None, sample_weight=None): return out * sample_weight elif axis == 0: if sample_weight is None: - return bincount(X.indices, minlength=X.shape[1]) + return np.bincount(X.indices, minlength=X.shape[1]) else: weights = np.repeat(sample_weight, np.diff(X.indptr)) - return bincount(X.indices, minlength=X.shape[1], + return np.bincount(X.indices, minlength=X.shape[1], weights=weights) else: raise ValueError('Unsupported axis: {0}'.format(axis)) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 40dcfbbe137ff..2b720310b21d5 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -32,7 +32,6 @@ from sklearn.utils.extmath import weighted_mode from sklearn.utils.extmath import cartesian from sklearn.utils.extmath import log_logistic -from sklearn.utils.extmath import fast_dot, _fast_dot from sklearn.utils.extmath import svd_flip from sklearn.utils.extmath import _incremental_mean_and_var from sklearn.utils.extmath import _deterministic_vector_sign_flip @@ -426,84 +425,6 @@ def naive_log_logistic(x): assert_array_almost_equal(log_logistic(extreme_x), [-100, 0]) -def test_fast_dot(): - # Check fast dot blas wrapper function - if fast_dot is np.dot: - return - - rng = np.random.RandomState(42) - A = rng.random_sample([2, 10]) - B = rng.random_sample([2, 10]) - - try: - linalg.get_blas_funcs(['gemm'])[0] - has_blas = True - except (AttributeError, ValueError): - has_blas = False - - if has_blas: - # Test _fast_dot for invalid input. - - # Maltyped data. - for dt1, dt2 in [['f8', 'f4'], ['i4', 'i4']]: - assert_raises(ValueError, _fast_dot, A.astype(dt1), - B.astype(dt2).T) - - # Malformed data. - - # ndim == 0 - E = np.empty(0) - assert_raises(ValueError, _fast_dot, E, E) - - # ndim == 1 - assert_raises(ValueError, _fast_dot, A, A[0]) - - # ndim > 2 - assert_raises(ValueError, _fast_dot, A.T, np.array([A, A])) - - # min(shape) == 1 - assert_raises(ValueError, _fast_dot, A, A[0, :][None, :]) - - # test for matrix mismatch error - assert_raises(ValueError, _fast_dot, A, A) - - # Test cov-like use case + dtypes. 
- for dtype in ['f8', 'f4']: - A = A.astype(dtype) - B = B.astype(dtype) - - # col < row - C = np.dot(A.T, A) - C_ = fast_dot(A.T, A) - assert_almost_equal(C, C_, decimal=5) - - C = np.dot(A.T, B) - C_ = fast_dot(A.T, B) - assert_almost_equal(C, C_, decimal=5) - - C = np.dot(A, B.T) - C_ = fast_dot(A, B.T) - assert_almost_equal(C, C_, decimal=5) - - # Test square matrix * rectangular use case. - A = rng.random_sample([2, 2]) - for dtype in ['f8', 'f4']: - A = A.astype(dtype) - B = B.astype(dtype) - - C = np.dot(A, B) - C_ = fast_dot(A, B) - assert_almost_equal(C, C_, decimal=5) - - C = np.dot(A.T, B) - C_ = fast_dot(A.T, B) - assert_almost_equal(C, C_, decimal=5) - - if has_blas: - for x in [np.array([[d] * 10] * 2) for d in [np.inf, np.nan]]: - assert_raises(ValueError, _fast_dot, x, x.T) - - def test_incremental_variance_update_formulas(): # Test Youngs and Cramer incremental variance formulas. # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html diff --git a/sklearn/utils/tests/test_fixes.py b/sklearn/utils/tests/test_fixes.py index de48d8a33691c..7bdcfc2fc13df 100644 --- a/sklearn/utils/tests/test_fixes.py +++ b/sklearn/utils/tests/test_fixes.py @@ -4,46 +4,18 @@ # License: BSD 3 clause import pickle -import numpy as np -import math from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_false -from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_array_equal from sklearn.utils.fixes import divide -from sklearn.utils.fixes import astype from sklearn.utils.fixes import MaskedArray -from sklearn.utils.fixes import norm def test_divide(): assert_equal(divide(.6, 1), .600000000000) -def test_astype_copy_memory(): - a_int32 = np.ones(3, np.int32) - - # Check that dtype conversion works - b_float32 = astype(a_int32, dtype=np.float32, copy=False) - assert_equal(b_float32.dtype, np.float32) - - # Changing dtype forces a copy even if copy=False - assert_false(np.may_share_memory(b_float32, a_int32)) - - # Check that copy can be skipped if requested dtype match - c_int32 = astype(a_int32, dtype=np.int32, copy=False) - assert_true(c_int32 is a_int32) - - # Check that copy can be forced, and is the case by default: - d_int32 = astype(a_int32, dtype=np.int32, copy=True) - assert_false(np.may_share_memory(d_int32, a_int32)) - - e_int32 = astype(a_int32, dtype=np.int32) - assert_false(np.may_share_memory(e_int32, a_int32)) - - def test_masked_array_obj_dtype_pickleable(): marr = MaskedArray([1, None, 'a'], dtype=object) @@ -52,26 +24,3 @@ def test_masked_array_obj_dtype_pickleable(): marr_pickled = pickle.loads(pickle.dumps(marr)) assert_array_equal(marr.data, marr_pickled.data) assert_array_equal(marr.mask, marr_pickled.mask) - - -def test_norm(): - X = np.array([[-2, 4, 5], - [1, 3, -4], - [0, 0, 8], - [0, 0, 0]]).astype(float) - - # Test various axis and order - assert_equal(math.sqrt(135), norm(X)) - assert_array_equal( - np.array([math.sqrt(5), math.sqrt(25), math.sqrt(105)]), - norm(X, axis=0) - ) - assert_array_equal(np.array([3, 7, 17]), norm(X, axis=0, ord=1)) - assert_array_equal(np.array([2, 4, 8]), norm(X, axis=0, ord=np.inf)) - assert_array_equal(np.array([0, 0, 0]), norm(X, axis=0, ord=-np.inf)) - assert_array_equal(np.array([11, 8, 8, 0]), norm(X, axis=1, ord=1)) - - # Test shapes - assert_equal((), norm(X).shape) - assert_equal((3,), norm(X, axis=0).shape) - assert_equal((4,), norm(X, axis=1).shape)