In [1]:
# Import relevant packages
import os

import findspark
findspark.init()    # Keep this call before the pyspark import below
from pyspark import SparkContext, SparkConf
from Utils import Utils

In [2]:
# Configure Spark: 'collect' is the application name and 'local[*]' is the master
# (the value inside the brackets of 'local[...]' selects how many threads to use).
conf = SparkConf().setAppName('collect').setMaster('local[*]')
# getOrCreate() returns the already-running SparkContext if there is one, so
# re-running this cell in the same kernel no longer fails with
# "Cannot run multiple SparkContexts at once". Note that conf is only applied
# when a new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Sample words to work with (some entries are deliberately repeated)
inputWords = [
    "I", "am", "Nishant", "Prajapati",
    "I", "love", "Data", "Sciences",
    "and", "I", "like", "Mechanical", "Sciences",
]
# Turn the Python list of strings into an RDD of strings
wordRdd = sc.parallelize(inputWords)

# The cells below demonstrate common Spark actions on this RDD


In [4]:
# Action: collect() returns all elements of the RDD as a Python list
words = wordRdd.collect()   # Convert the RDD back to a list of strings
words

['I',
 'am',
 'Nishant',
 'Prajapati',
 'I',
 'love',
 'Data',
 'Sciences',
 'and',
 'I',
 'like',
 'Mechanical',
 'Sciences']

In [5]:
# Action: count() returns the number of elements in the RDD
wordCount = wordRdd.count()
wordCount

13

In [6]:
# Action: countByValue() maps each distinct element to its number of occurrences
countByValue = wordRdd.countByValue()
countByValue

defaultdict(int,
            {'I': 3,
             'am': 1,
             'Nishant': 1,
             'Prajapati': 1,
             'love': 1,
             'Data': 1,
             'Sciences': 2,
             'and': 1,
             'like': 1,
             'Mechanical': 1})

In [7]:
# Action: countByKey() counts the elements of a key/value RDD per key.
# It uses element[0] as the key, so calling it directly on an RDD of plain
# strings silently groups by each word's first character. Build the
# (first letter, word) pairs explicitly with keyBy() so that intent is visible;
# the resulting counts are the same.
countByKey = wordRdd.keyBy(lambda word: word[0]).countByKey()
countByKey

defaultdict(int,
            {'I': 3, 'a': 2, 'N': 1, 'P': 1, 'l': 2, 'D': 1, 'S': 2, 'M': 1})

In [9]:
# Action: take(n) returns the first n elements of the RDD
# It's useful for unit tests and debugging
# Note: this rebinds `words`, replacing the full list collected earlier
words = wordRdd.take(3)   # First three entries / rows
words

['I', 'am', 'Nishant']

In [10]:
# Action: reduce() combines all elements of the RDD into a single value using
# a binary operator, which must be commutative and associative.
inputIntegers = [1, 2, 3, 4, 5]
intergerRdd = sc.parallelize(inputIntegers)

# Multiply all the elements together
product = intergerRdd.reduce(lambda left, right: left * right)
product

120

In [21]:
# Directory that holds the input text file. It defaults to the original local
# path but can be overridden through the SPARK_TUTORIAL_INPUT_DIR environment
# variable, so the notebook runs on other machines without editing this cell.
filePath = os.environ.get(
    'SPARK_TUTORIAL_INPUT_DIR',
    'e:\\Eskills-Academy-projects\\python-spark-tutorial-master\\in\\',
)
fileName = "prime_nums.text"
# os.path.join inserts a separator only when the directory lacks a trailing one
textFile = os.path.join(filePath, fileName)

# Load the file as an RDD with one string element per line
lines = sc.textFile(textFile)

In [27]:
# Split every line on tab characters and flatten the pieces into one RDD of strings
numbers = lines.flatMap(lambda row: row.split("\t"))
# Keep only non-empty tokens: an empty string is falsy, any other string is truthy
validNumbers = numbers.filter(bool)

In [38]:
# Convert every token from string to integer
intNumbers = validNumbers.map(int)

# Add all the numbers together with reduce(). The result is bound to `total`
# rather than `sum` so that Python's built-in sum() is not shadowed for the
# rest of the notebook session.
total = intNumbers.reduce(lambda x, y: x + y)
total


24133