In [1]:
from math import sqrt

In [2]:
def isprime(n):
    '''Return True if n is a prime number, False otherwise.

    n is coerced with int(), so integral strings are accepted and floats
    are truncated toward zero. Every value below 2 -- including all
    negative numbers -- is reported as not prime.
    '''
    n = int(n)
    # primes are integers greater than 1, so 0, 1 and negatives are rejected
    if n < 2:
        return False
    # 2 is the only even prime
    if n == 2:
        return True
    # every other even number is composite
    if n % 2 == 0:
        return False
    # trial division by odd candidates up to sqrt(n); comparing x * x with n
    # stays in exact integer arithmetic, so very large n neither loses
    # precision nor overflows the way a float sqrt() would
    x = 3
    while x * x <= n:
        if n % x == 0:
            return False
        x += 2
    return True

In [3]:
# for each i in 0..24: (i, whether i is prime, int(sqrt(i)) + 1)
[(i, isprime(i), int(sqrt(i)) + 1) for i in range(25)]

[(0, False, 1),
 (1, False, 2),
 (2, True, 2),
 (3, True, 2),
 (4, False, 3),
 (5, True, 3),
 (6, False, 3),
 (7, True, 3),
 (8, False, 3),
 (9, False, 4),
 (10, False, 4),
 (11, True, 4),
 (12, False, 4),
 (13, True, 4),
 (14, False, 4),
 (15, False, 4),
 (16, False, 5),
 (17, True, 5),
 (18, False, 5),
 (19, True, 5),
 (20, False, 5),
 (21, False, 5),
 (22, False, 5),
 (23, True, 5),
 (24, False, 5)]

In [4]:
from pyspark import SparkContext
# getOrCreate() returns the SparkContext that is already running, if any, so
# re-executing this cell does not fail the way a second SparkContext() would
sc = SparkContext.getOrCreate()

In [5]:
# build an RDD of the integers 0..999,999, keep only the primes and count them
sc.range(10**6).filter(isprime).count()

78498

In [6]:
# RDD of every prime below one million
primes = sc.range(10**6).filter(isprime)

In [7]:
# draw 10 elements at random, sampling with replacement
primes.takeSample(withReplacement=True, num=10)

[388697,
 292141,
 563149,
 282389,
 946769,
 230309,
 852259,
 680831,
 159833,
 986767]

In [8]:
# cache() tells the Spark engine that this RDD is going to be reused
primes.cache()
# draw another sample of 10 elements (with replacement)
primes.takeSample(withReplacement=True, num=10)

[729331, 39451, 814393, 125107, 576581, 226531, 313763, 945631, 525769, 932549]

In [9]:
# sampling again is faster the second time around...
primes.takeSample(withReplacement=True, num=10)

[104479, 734479, 968963, 521789, 69193, 724643, 56171, 252157, 35863, 846689]

In [10]:
# if the RDD is very large, cache() would exhaust memory;
# persist() can keep the RDD on disk instead.
# Spark does not let you change the storage level of an RDD that already has
# one, so release the cached copy first and then redefine the RDD
primes.unpersist()
primes = sc.range(1000000).filter(isprime)

In [11]:
from pyspark import StorageLevel
# store the RDD on disk only; use MEMORY_AND_DISK if you aren't sure of the RDD's size
primes.persist(storageLevel=StorageLevel.DISK_ONLY)

PythonRDD[11] at RDD at PythonRDD.scala:48