In [1]:
import findspark
findspark.init()
import pyspark
# Reuse the already-running SparkContext when this cell is executed again.
# Calling the SparkContext constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once", so the cell would not be re-runnable.
# The app name only takes effect when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName('Word Count')
)

In [2]:
# Load the book as an RDD with one element per line of text, asking Spark
# for two partitions (the second argument of textFile is `minPartitions`).
lines = sc.textFile("war_and_peace.txt", minPartitions=2)
type(lines)

pyspark.rdd.RDD

In [3]:
# Number of elements in the RDD, i.e. the number of lines in the text file.
lines.count()

63877

In [4]:
# Peek at the first five elements without pulling the whole RDD to the driver.
lines.take(5)

['The Project Gutenberg EBook of War and Peace, by Leo Tolstoy',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with almost',
 'no restrictions whatsoever.  You may copy it, give it away or re-use it',
 'under the terms of the Project Gutenberg License included with this']

In [5]:
# Check how many partitions the RDD was actually split into.
lines.getNumPartitions()

2

In [6]:
def count_in_a_partition(iterator):
    """Yield a single integer: the number of items produced by `iterator`.

    Written as a generator so it can be handed to `mapPartitions`, which
    expects a function that takes an iterator over one partition's elements
    and returns an iterable of results. An empty iterator yields 0.
    """
    total = 0
    for _element in iterator:
        total += 1
    yield total
# Run the counter once per partition and collect one count per partition;
# the per-partition counts should add up to lines.count().
lines.mapPartitions(count_in_a_partition).collect()

[32385, 31492]

In [7]:
# Drop blank lines. Despite the name, "non-null" here means "non-empty":
# the elements are strings, and empty strings are the ones filtered out.
nonNullLines = lines.filter(lambda text: text != "")

In [8]:
# Number of lines left once the empty ones have been removed.
nonNullLines.count()

50902

In [9]:
# First five non-empty lines; the blank second line of the file is gone.
nonNullLines.take(5)

['The Project Gutenberg EBook of War and Peace, by Leo Tolstoy',
 'This eBook is for the use of anyone anywhere at no cost and with almost',
 'no restrictions whatsoever.  You may copy it, give it away or re-use it',
 'under the terms of the Project Gutenberg License included with this',
 'eBook or online at www.gutenberg.org']

In [10]:
# Split every line on whitespace and flatten the per-line lists into one RDD
# of tokens. Punctuation stays attached to the tokens (e.g. 'OTHERS,' or
# 'WORTH.'), so such forms are counted as distinct words further down.
words = nonNullLines.flatMap(lambda text_line: text_line.split())

In [11]:
# Total number of whitespace-separated tokens in the book.
words.count()

562613

In [12]:
# First five tokens, in the order they appear in the text.
words.take(5)

['The', 'Project', 'Gutenberg', 'EBook', 'of']

In [13]:
# Normalise case so that e.g. 'The' and 'the' end up under the same key.
upperWords = words.map(lambda token: token.upper())

In [14]:
# map() is one-to-one, so this should equal words.count().
upperWords.count()

562613

In [15]:
# Same first five tokens as before, now upper-cased.
upperWords.take(5)

['THE', 'PROJECT', 'GUTENBERG', 'EBOOK', 'OF']

In [16]:
# Turn each token into a (token, 1) pair so the 1s can be summed per token.
pairedOnes = upperWords.map(lambda token: (token, 1))

In [17]:
# Still one element per token, so the count is unchanged.
pairedOnes.count()

562613

In [18]:
# Each element is now a (word, 1) tuple.
pairedOnes.take(5)

[('THE', 1), ('PROJECT', 1), ('GUTENBERG', 1), ('EBOOK', 1), ('OF', 1)]

In [19]:
# Sum the 1s of all pairs that share a key, giving one (word, occurrences)
# pair per distinct word. The lambda's parameters are named so that they do
# not shadow the built-in `next`.
wordCounts = pairedOnes.reduceByKey(lambda count_a, count_b: count_a + count_b)

In [20]:
# One pair per key after the reduction, i.e. the number of distinct
# upper-cased tokens.
wordCounts.count()

39904

In [21]:
# Sample five (word, count) pairs. The result is not sorted by word or by
# count, and tokens still carry their punctuation.
wordCounts.take(5)

[('INTENTIONALLY,', 1),
 ('HIM--PEOPLE', 1),
 ('FANNED', 2),
 ('OTHERS,', 52),
 ('WORTH.', 1)]

In [22]:
# take() hands back an ordinary Python list of (word, count) tuples, so it can
# be iterated on the driver like any other list.
for word_and_count in wordCounts.take(5):
    print("*****", word_and_count)

***** ('INTENTIONALLY,', 1)
***** ('HIM--PEOPLE', 1)
***** ('FANNED', 2)
***** ('OTHERS,', 52)
***** ('WORTH.', 1)


We can also create an RDD from an existing collection of data elements using the parallelize method of SparkContext.

In [23]:
# A small local collection to distribute: [1, 2, 3, 4, 5].
data = list(range(1, 6))

In [24]:
# At this point `data` is still an ordinary Python list on the driver.
type(data)

list

In [25]:
# Distribute the local list as an RDD cut into two slices (partitions).
distData = sc.parallelize(data, numSlices=2)

In [26]:
# parallelize() returns an RDD, the same type textFile() produced earlier.
type(distData)

pyspark.rdd.RDD

In [27]:
# Should match the number of slices passed to parallelize().
distData.getNumPartitions()

2

In [28]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()