In [98]:
import findspark
findspark.init()

# Initializing Spark

In [99]:
from pyspark import SparkContext
sc = SparkContext(master='local[2]')

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=My app, master=local) created by __init__ at <ipython-input-2-68bedc93cd13>:6 

## Inspect SparkContext

In [3]:
sc.version

'2.3.0'

In [4]:
sc.pythonVer

'3.6'

In [5]:
sc.master

'local[2]'

In [6]:
str(sc.sparkHome)

'None'

In [7]:
str(sc.sparkUser())

'zhangyong'

In [8]:
sc.appName

'pyspark-shell'

In [11]:
sc.applicationId

'local-1531047437660'

In [12]:
sc.defaultParallelism

2

In [13]:
sc.defaultMinPartitions

2

## Configuration

In [2]:
from pyspark import SparkConf, SparkContext
conf = (SparkConf()
       .setMaster('local')
       .setAppName('My app')
       .set('spark.executor.memory','1g'))
sc = SparkContext(conf=conf)

## Using the Shell
```
$ ./bin/spark-shell --master local[2]
$ ./bin/pyspark --master local[2] --py-files code.py
```
Set which master the context connnect to with the `--master` argument, and add python .zip, .egg or .py files to runtime path b passing a comma-seprated list to `--py-files`

# Loading Data
## Parallelized Collections

In [82]:
rdd = sc.parallelize([('a',7),('a',2),('b',2)])
rdd2 = sc.parallelize([('a',2),('d',1),('b',1)])
rdd3 = sc.parallelize(range(100))
rdd4 = sc.parallelize([('a',['x','y','z']),
                      ('b',['p','r'])])

## External Data
Read either one text file from HDFS, a local file system or any Hadoop-supported file system URI

In [None]:
textFile = sc.textFile('/my/directory/*.txt')
textFile2 = sc.wholeTextFiles('/my/diretory')

# Retrieving RDD Information
## Basic Information

In [4]:
rdd.getNumPartitions()

1

In [5]:
rdd.count()

3

In [6]:
rdd.countByKey() # count rdd instances by key

defaultdict(int, {'a': 2, 'b': 1})

In [7]:
rdd.countByValue() # count rdd instances by value

defaultdict(int, {('a', 2): 1, ('a', 7): 1, ('b', 2): 1})

In [8]:
rdd.collectAsMap() #return (key, value) pair as dictionary

{'a': 2, 'b': 2}

In [9]:
rdd3.sum() #sum of rdd elements

4950

In [10]:
sc.parallelize([]).isEmpty() #check whether RDD is empty

True

## Summary

In [12]:
rdd3.max()

99

In [13]:
rdd3.min()

0

In [14]:
rdd3.mean()

49.5

In [15]:
rdd3.stdev()

28.86607004772212

In [16]:
rdd3.variance()

833.25

In [17]:
rdd3.histogram(3) #compute histogram by bins

([0, 33, 66, 99], [33, 33, 34])

In [18]:
rdd3.stats()

(count: 100, mean: 49.5, stdev: 28.86607004772212, max: 99.0, min: 0.0)

# Applying Functions

In [20]:
rdd.map(lambda x: x+(x[1],x[0])).collect()

[('a', 7, 7, 'a'), ('a', 2, 2, 'a'), ('b', 2, 2, 'b')]

In [22]:
# apply a function to each RDD elment and flatten the result
rdd5 = rdd.flatMap(lambda x: x+(x[1],x[0]))
rdd5.collect()

['a', 7, 7, 'a', 'a', 2, 2, 'a', 'b', 2, 2, 'b']

In [24]:
# apply a flatMap functions to each (key,value) pair without changing the keys
rdd4.flatMapValues(lambda x:x).collect()

[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]

# Selecting Data

## Getting

In [25]:
rdd.collect() # return a list with all RDD elements

[('a', 7), ('a', 2), ('b', 2)]

In [26]:
rdd.take(2)

[('a', 7), ('a', 2)]

In [27]:
rdd.first()

('a', 7)

In [28]:
rdd.top(2)

[('b', 2), ('a', 7)]

## Sampling

In [36]:
rdd3.sample(False,0.15,81).collect()

[3, 4, 27, 28, 35, 41, 43, 49, 53, 58, 85, 93]

## Filtering

In [39]:
rdd.filter(lambda x:'a' in x).collect()

[('a', 7), ('a', 2)]

In [40]:
rdd5.distinct().collect()

['a', 7, 2, 'b']

In [41]:
rdd.keys().collect()

['a', 'a', 'b']

# Iterating

In [43]:
# apply a function to all RDD elements
def g(x): print(x)
    
rdd.foreach(g)

# Reshaping Data
## Reducing

In [44]:
rdd.reduceByKey(lambda x,y : x+y).collect() #Merge the rdd values for each key

[('a', 9), ('b', 2)]

In [48]:
rdd.reduce(lambda x,y:x+y) #merge the rdd values

('a', 7, 'a', 2, 'b', 2)

## Grouping by

In [54]:
rdd3.groupBy(lambda x:x %2).mapValues(list).collect()

[(0,
  [0,
   2,
   4,
   6,
   8,
   10,
   12,
   14,
   16,
   18,
   20,
   22,
   24,
   26,
   28,
   30,
   32,
   34,
   36,
   38,
   40,
   42,
   44,
   46,
   48,
   50,
   52,
   54,
   56,
   58,
   60,
   62,
   64,
   66,
   68,
   70,
   72,
   74,
   76,
   78,
   80,
   82,
   84,
   86,
   88,
   90,
   92,
   94,
   96,
   98]),
 (1,
  [1,
   3,
   5,
   7,
   9,
   11,
   13,
   15,
   17,
   19,
   21,
   23,
   25,
   27,
   29,
   31,
   33,
   35,
   37,
   39,
   41,
   43,
   45,
   47,
   49,
   51,
   53,
   55,
   57,
   59,
   61,
   63,
   65,
   67,
   69,
   71,
   73,
   75,
   77,
   79,
   81,
   83,
   85,
   87,
   89,
   91,
   93,
   95,
   97,
   99])]

In [55]:
rdd.groupByKey().mapValues(list).collect()

[('a', [7, 2]), ('b', [2])]

## Aggregating

In [56]:
# aggregate RDD elements of each partition and then the results for all paritions
seqOp = (lambda x,y: (x[0]+y,x[1]+1))
combOp = (lambda x,y: (x[0]+y[0],x[1]+y[1]))
rdd3.aggregate((0,0),seqOp,combOp)

(4950, 100)

In [62]:
rdd.aggregateByKey((0,0),seqOp,combOp).collect()

[('a', (9, 2)), ('b', (2, 1))]

In [67]:
from operator import add
rdd3.fold(0,add)

4950

In [69]:
rdd.foldByKey(0,add).collect()

[('a', 9), ('b', 2)]

In [70]:
rdd3.keyBy(lambda x:x+x).collect()

[(0, 0),
 (2, 1),
 (4, 2),
 (6, 3),
 (8, 4),
 (10, 5),
 (12, 6),
 (14, 7),
 (16, 8),
 (18, 9),
 (20, 10),
 (22, 11),
 (24, 12),
 (26, 13),
 (28, 14),
 (30, 15),
 (32, 16),
 (34, 17),
 (36, 18),
 (38, 19),
 (40, 20),
 (42, 21),
 (44, 22),
 (46, 23),
 (48, 24),
 (50, 25),
 (52, 26),
 (54, 27),
 (56, 28),
 (58, 29),
 (60, 30),
 (62, 31),
 (64, 32),
 (66, 33),
 (68, 34),
 (70, 35),
 (72, 36),
 (74, 37),
 (76, 38),
 (78, 39),
 (80, 40),
 (82, 41),
 (84, 42),
 (86, 43),
 (88, 44),
 (90, 45),
 (92, 46),
 (94, 47),
 (96, 48),
 (98, 49),
 (100, 50),
 (102, 51),
 (104, 52),
 (106, 53),
 (108, 54),
 (110, 55),
 (112, 56),
 (114, 57),
 (116, 58),
 (118, 59),
 (120, 60),
 (122, 61),
 (124, 62),
 (126, 63),
 (128, 64),
 (130, 65),
 (132, 66),
 (134, 67),
 (136, 68),
 (138, 69),
 (140, 70),
 (142, 71),
 (144, 72),
 (146, 73),
 (148, 74),
 (150, 75),
 (152, 76),
 (154, 77),
 (156, 78),
 (158, 79),
 (160, 80),
 (162, 81),
 (164, 82),
 (166, 83),
 (168, 84),
 (170, 85),
 (172, 86),
 (174, 87),
 (176, 88

# Mathematical Operations

In [76]:
rdd.subtract(rdd2).collect() # return each rdd value not contained in rdd2

[('a', 7), ('b', 2), ('a', 2)]

In [78]:
rdd2.subtractByKey(rdd).collect() #return each (key,value) pair of rdd2 with no matching key in rdd

[('d', 1)]

In [83]:
rdd.cartesian(rdd2).collect() #return the cartesian product of rdd and rdd2

[(('a', 7), ('a', 2)),
 (('a', 7), ('d', 1)),
 (('a', 7), ('b', 1)),
 (('a', 2), ('a', 2)),
 (('a', 2), ('d', 1)),
 (('a', 2), ('b', 1)),
 (('b', 2), ('a', 2)),
 (('b', 2), ('d', 1)),
 (('b', 2), ('b', 1))]

## Sort

In [85]:
rdd2.sortBy(lambda x:x[1]).collect()

[('d', 1), ('b', 1), ('a', 2)]

In [86]:
rdd2.sortByKey().collect()

[('a', 2), ('b', 1), ('d', 1)]

# Repartitioning

In [91]:
rddx=rdd.repartition(4)
rddx.getNumPartitions()

4

In [92]:
# decrease the number of partiions in RDD to 1
rddx1=rddx.coalesce(1)
rddx1.getNumPartitions()

1

# Saving

In [95]:
rdd.saveAsTextFile('rdd.txt')

In [None]:
rdd.saveAsHadoopFile('hdfs://namenodehost/parent/child','org.apache.hadoop.mapred.TextOutputFormat')

# Stopping SparkContext

In [None]:
sc.stop()

# Execution

$./bin/spark-sumit examples/src/main/python/pi.py