Fix tests; do not use numpy in randomSplit (it gives no performance gain)

msjgriffiths · Nov 13, 2014 · 78bf997
1 parent f5fdf63
commit 78bf997
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 3 deletions.
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
@@ -325,11 +325,11 @@ def randomSplit(self, weights, seed=None):
         :return: split RDDs in a list
 
         >>> rdd = sc.parallelize(range(5), 1)
-        >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 101)
+        >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 17)
         >>> rdd1.collect()
-        [2, 3]
+        [1, 3]
         >>> rdd2.collect()
-        [0, 1, 4]
+        [0, 2, 4]
         """
         s = float(sum(weights))
         cweights = [0.0]

diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py
@@ -119,6 +119,7 @@ class RDDRangeSampler(RDDSamplerBase):
 
     def __init__(self, lowerBound, upperBound, seed=None):
         RDDSamplerBase.__init__(self, False, seed)
+        self._use_numpy = False  # no performance gain from numpy
         self._lowerBound = lowerBound
         self._upperBound = upperBound