In [8]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import os

In [3]:
spark = SparkSession.builder \
 .master("local") \
     .appName("TrainingDay1")\
     .getOrCreate()

In [38]:
sc=spark.sparkContext

### addFile(path, recursive=False)
Add a file to be downloaded with this Spark job on every node. The path passed can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.

To access the file in Spark jobs, use L{SparkFiles.get(fileName)<pyspark.files.SparkFiles.get>} with the filename to find its download location.

### addPyFile(path)[source]
Add a .py or .zip dependency for all tasks to be executed on this SparkContext in the future. The path passed can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.

### property applicationId
A unique identifier for the Spark application. Its format depends on the scheduler implementation.

in case of local spark app something like ‘local-1433865536131’

in case of YARN something like ‘application_1433865536131_34483’

In [6]:
spark.sparkContext.applicationId

'local-1613362176927'

In [17]:
from pyspark import SparkFiles
tempdir="C:\\Users\\RaviVerma\\Desktop\\personal"
path = os.path.join(tempdir, "test.txt")
with open(path, "w") as testFile:
   _ = testFile.write("100")
spark.sparkContext.addFile(path)
def func(iterator):
   with open(SparkFiles.get("test.txt")) as testFile:
       fileVal = int(testFile.readline())
       return [x * fileVal for x in iterator]
spark.sparkContext.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()

[100, 200, 300, 400]

### parallelize(c, numSlices=None)[source]
Distribute a local Python collection to form an RDD. Using xrange is recommended if the input represents a range for performance.

In [18]:
spark.sparkContext.parallelize([0,2,3,4,6],5).collect()

[0, 2, 3, 4, 6]

In [19]:
spark.sparkContext.parallelize([0,2,3,4,6],5).glom().collect()

[[0], [2], [3], [4], [6]]

### glom()[source]
Return an RDD created by coalescing all elements within each partition into a list.

In [20]:
rdd=spark.sparkContext.parallelize([1,2,3,4],2)
sorted(rdd.glom().collect())

[[1, 2], [3, 4]]

### pickleFile(name, minPartitions=None)[source]
Load an RDD previously saved using RDD.saveAsPickleFile method.

In [21]:
tmpFile = NamedTemporaryFile(delete=True)
tmpFile.close()
sc.parallelize(range(10)).saveAsPickleFile(tmpFile.name, 5)
sorted(sc.pickleFile(tmpFile.name, 3).collect())

NameError: name 'NamedTemporaryFile' is not defined

### runJob(rdd, partitionFunc, partitions=None, allowLocal=False)[source]
Executes the given partitionFunc on the specified set of partitions, returning the result as an array of elements.

If ‘partitions’ is not specified, this will run over all partitions.

In [30]:
myRdd=spark.sparkContext.parallelize(range(6),3)

In [31]:
myRdd.collect()

[0, 1, 2, 3, 4, 5]

In [33]:
spark.sparkContext.runJob(myRdd,lambda part:[x*x for x in part])

[0, 1, 4, 9, 16, 25]

In [35]:
spark.sparkContext.runJob(myRdd,lambda part:[x*x for x in part],[0,2],True)

[0, 1, 16, 25]

### textFile(name, minPartitions=None, use_unicode=True)[source]
Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings.

In [39]:
path = os.path.join(tempdir, "sample-text.txt")
with open(path, "w") as testFile:
   _ = testFile.write("Hello world!")
textFile = sc.textFile(path)
textFile.collect()

['Hello world!']

### union(rdds)[source]
Build the union of a list of RDDs.

This supports unions() of RDDs with different serialized formats, although this forces them to be reserialized using the default serializer:

In [40]:
path = os.path.join(tempdir, "union-text.txt")
with open(path, "w") as testFile:
   _ = testFile.write("Hello")
textFile = sc.textFile(path)
textFile.collect()
###['Hello']
parallelized = sc.parallelize(["World!"])
sorted(sc.union([textFile, parallelized]).collect())

['Hello', 'World!']

### wholeTextFiles(path, minPartitions=None, use_unicode=True)[source]
Read a directory of text files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. Each file is read as a single record and returned in a key-value pair, where the key is the path of each file, the value is the content of each file.

dirPath = os.path.join(tempdir, "files")
os.mkdir(dirPath)
with open(os.path.join(dirPath, "1.txt"), "w") as file1:
   _ = file1.write("1")
with open(os.path.join(dirPath, "2.txt"), "w") as file2:
   _ = file2.write("2")
textFiles = sc.wholeTextFiles(dirPath)
sorted(textFiles.collect())

### cartesian(other)[source]
Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements (a, b) where a is in self and b is in other.

In [42]:
rdd=sc.parallelize([1,2])
sorted(rdd.cartesian(rdd).collect())

[(1, 1), (1, 2), (2, 1), (2, 2)]

### coalesce(numPartitions, shuffle=False)[source]
Return a new RDD that is reduced into numPartitions partitions.