# RDDs

### DF -> RDD and RDD -> DF

In [1]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start one named 'a'.
spark = SparkSession.builder.appName('a').getOrCreate()

# Three (name, age) records with explicit column names.
people = [('Matt', 22), ('Nick', 23), ('Mike', 20)]
myDF = spark.createDataFrame(people, ['name', 'age'])
myDF.show()

# DF -> RDD: .rdd exposes the rows; add one to every age (the second field).
# RDD -> DF: toDF() turns the mapped tuples back into a DataFrame.
aged_rdd = myDF.rdd.map(lambda row: (row[0], row[1] + 1))
aged_rdd.toDF(['name', 'age']).show()

+----+---+
|name|age|
+----+---+
|Matt| 22|
|Nick| 23|
|Mike| 20|
+----+---+

+----+---+
|name|age|
+----+---+
|Matt| 23|
|Nick| 24|
|Mike| 21|
+----+---+



In [1]:
# Visual guide to the RDD API used throughout this notebook:
# https://training.databricks.com/visualapi.pdf
from pyspark import SparkConf, SparkContext

# Only one SparkContext may be active per driver process, so constructing
# SparkContext('local', 'goose') directly fails with "Cannot run multiple
# SparkContexts at once" when a SparkSession has already been started in
# this kernel.  getOrCreate() reuses the running context when there is one
# and otherwise starts a local context named 'goose' (the conf is only
# applied when a new context is actually created).
sc = SparkContext.getOrCreate(SparkConf().setMaster('local').setAppName('goose'))

### Transformations

In [2]:
# Distribute a small list of letters as an RDD; collect() brings every
# element back to the driver as a plain Python list.
letters = ["b", "a", "c", "b"]
data = sc.parallelize(letters)
data.collect()

['b', 'a', 'c', 'b']

#### MAP

In [3]:
# map() emits exactly one output element per input element: here each
# letter becomes a (letter, 1) pair that later cells count by key.
counts = data.map(lambda letter: (letter, 1))
counts.collect()

[('b', 1), ('a', 1), ('c', 1), ('b', 1)]

#### FLAT MAP

In [4]:
# flatMap() unpacks whatever the function returns, so the (letter, 1)
# tuples are spread into one flat sequence instead of staying as pairs.
flattened = data.flatMap(lambda letter: (letter, 1))
flattened.collect()

['b', 1, 'a', 1, 'c', 1, 'b', 1]

#### FILTER

In [5]:
# filter() keeps only the elements for which the predicate is true.
wanted = ('a', 'b')
data.filter(lambda letter: letter in wanted).collect()

['b', 'a', 'b']

#### GROUP BY

In [6]:
names = sc.parallelize(['John', 'Matt', 'Mike', 'Bill', 'James', 'Josh'])

# groupBy() derives a key from each element (its first letter here) and
# gathers the elements that share a key.
grouped = names.groupBy(lambda name: name[0])

# Each group is a lazy iterable, so turn it into a list before printing.
print([(initial, list(members)) for initial, members in grouped.collect()])

[('J', ['John', 'James', 'Josh']), ('M', ['Matt', 'Mike']), ('B', ['Bill'])]


#### GROUP BY KEY

In [7]:
# groupByKey() gathers all values of a key into one iterable; the size of
# that iterable is the number of times the key occurred.
values_by_key = counts.groupByKey()
grouped2 = values_by_key.map(lambda kv: (kv[0], len(kv[1])))
grouped2.collect()

[('b', 2), ('a', 1), ('c', 1)]

#### REDUCE BY KEY

In [8]:
# reduceByKey() folds the values of each key pairwise with the given
# function; summing the 1s gives the same per-key counts as the
# groupByKey() version without materialising the groups.
grouped3 = counts.reduceByKey(lambda total, one: total + one)
grouped3.collect()

[('b', 2), ('a', 1), ('c', 1)]

#### MAP PARTITIONS

In [9]:
# Three elements spread over two partitions.
x = sc.parallelize([1, 2, 3], 2)


def f(iterator):
    """Yield the sum of the given elements, then the constant 42.

    Intended as a ``mapPartitions`` function: it receives an iterable of one
    partition's elements and produces exactly two output elements for it.
    """
    partition_total = sum(iterator)
    yield partition_total
    yield 42
    
# mapPartitions() calls f once per partition (not once per element), so
# every partition of y ends up holding [sum of that partition, 42].
y = x.mapPartitions(f)

# glom() gathers the elements of each partition into a list, which makes
# the partition boundaries visible in the collected output.
print(x.glom().collect(), y.glom().collect(), sep="\n")

[[1], [2, 3]]
[[1, 42], [5, 42]]


#### SAMPLE

In [10]:
nums = sc.parallelize([1, 2, 3, 4, 5])

# Sample without replacement.  The fraction is an expected proportion, not
# an exact size; the fixed seed makes the draw repeatable.
nums.sample(withReplacement=False, fraction=0.73, seed=89898).collect()

[3, 5]

#### UNION

In [11]:
# Two RDDs with two and one partition(s) respectively.
x = sc.parallelize([1, 2, 3], numSlices=2)
y = sc.parallelize([3, 4], numSlices=1)

# union() appends y's partitions after x's; duplicates (the 3) are kept.
z = x.union(y)
print(z.glom().collect())

[[1], [2, 3], [3, 4]]


#### JOIN

In [12]:
left_pairs = [("a", 1), ("b", 2)]
right_pairs = [("a", 3), ("a", 4), ("b", 5)]
x = sc.parallelize(left_pairs)
y = sc.parallelize(right_pairs)

# Inner join on the key: one output per matching (left, right) value pair,
# so a key that occurs twice on one side appears twice in the result.
z = x.join(y)
print(z.collect())

[('b', (2, 5)), ('a', (1, 3)), ('a', (1, 4))]


#### DISTINCT

In [13]:
# distinct() drops repeated elements, keeping one copy of each value.
nums = sc.parallelize([1, 2, 1, 4, 1]).distinct()

print(nums.collect())

[1, 2, 4]


#### COALESCE

In [14]:
x = sc.parallelize([1, 2, 3, 4, 5], numSlices=3)

# coalesce(2) merges the three existing partitions down to two.  It does
# not rebalance the data, so the resulting partitions can be uneven.
y = x.coalesce(2)

print(x.glom().collect(), y.glom().collect(), sep="\n")

[[1], [2, 3], [4, 5]]
[[1], [2, 3, 4, 5]]


#### KEY BY

In [15]:
x = sc.parallelize(['John', 'Fred', 'Anna', 'James'])

# keyBy() turns every element into a (key, element) pair, with the key
# computed from the element -- its first letter here.
y = x.keyBy(lambda word: word[0])
print(y.collect())

[('J', 'John'), ('F', 'Fred'), ('A', 'Anna'), ('J', 'James')]


#### PARTITION BY

In [16]:
# Four (key, value) pairs spread over three partitions.
pairs = [('J', 'James'), ('F', 'Fred'), ('A', 'Anna'), ('J', 'John')]
x = sc.parallelize(pairs, 3)

# Repartition into two.  The function is called with each pair's key and
# returns the target partition: 0 for keys before 'H', 1 for the rest.
y = x.partitionBy(2, lambda key: 0 if key[0] < 'H' else 1)

print(x.glom().collect())
print(y.glom().collect())

[[('J', 'James')], [('F', 'Fred')], [('A', 'Anna'), ('J', 'John')]]
[[('F', 'Fred'), ('A', 'Anna')], [('J', 'James'), ('J', 'John')]]


#### ZIP

In [17]:
x = sc.parallelize([1, 2, 3])
y = x.map(lambda n: n ** 2)   # squares: [1, 4, 9]

# zip() pairs the two RDDs element by element, in order.
z = x.zip(y)
print(z.collect())

[(1, 1), (2, 4), (3, 9)]


### ACTIONS

#### GET NUM PARTITIONS

In [18]:
x = sc.parallelize([1, 2, 3], numSlices=2)

# Number of partitions the RDD is split into.
y = x.getNumPartitions()

print(x.glom().collect(), y, sep="\n")

[[1], [2, 3]]
2


#### COLLECT

In [19]:
x = sc.parallelize([1, 2, 3], numSlices=2)

# collect() returns every element to the driver as one Python list; the
# partition structure shown by glom() is not part of that list.
y = x.collect()

print(x.glom().collect(), y, sep="\n")

[[1], [2, 3]]
[1, 2, 3]


#### REDUCE

In [20]:
x = sc.parallelize([1, 2, 3, 4])

# reduce() collapses the RDD to a single value by repeatedly combining two
# elements with the given function (addition here).
y = x.reduce(lambda left, right: left + right)

print(x.collect(), y, sep="\n")

[1, 2, 3, 4]
10


#### AGGREGATE

In [21]:
def seqOp(data, item):
    """Fold one element into an ``(items, total)`` accumulator.

    Returns a new accumulator; the one passed in is left unmodified.
    """
    return (data[0] + [item], data[1] + item)


def combOp(d1, d2):
    """Merge two ``(items, total)`` accumulators into a single one."""
    return (d1[0] + d2[0], d1[1] + d2[1])

x = sc.parallelize([1, 2, 3, 4])

# aggregate() starts from the zero value ([], 0), folds each partition's
# elements into it with seqOp, then merges the partial results with combOp.
zero_value = ([], 0)
y = x.aggregate(zero_value, seqOp, combOp)

print(y)

([1, 2, 3, 4], 10)


#### MAX

In [22]:
x = sc.parallelize([2, 4, 1])

# Largest element of the RDD.
y = x.max()

print(x.collect(), y, sep="\n")

[2, 4, 1]
4


#### SUM

In [23]:
x = sc.parallelize([2, 4, 1])

# Total of all elements.
y = x.sum()

print(x.collect(), y, sep="\n")

[2, 4, 1]
7


#### MEAN

In [24]:
x = sc.parallelize([2, 4, 1])

# Arithmetic mean of the elements (a float).
y = x.mean()

print(x.collect(), y, sep="\n")

[2, 4, 1]
2.3333333333333335


#### STDEV

In [25]:
x = sc.parallelize([2, 4, 1])

# Standard deviation of the elements.  The printed value matches the
# population form (divide by N, not N - 1).
y = x.stdev()

print(x.collect(), y, sep="\n")

[2, 4, 1]
1.247219128924647


#### COUNT BY KEY

In [26]:
pairs = [('J', 'James'), ('F', 'Fred'), ('A', 'Anna'), ('J', 'John')]
x = sc.parallelize(pairs)

# countByKey() counts how many pairs carry each key and returns the counts
# to the driver as a dictionary.
y = x.countByKey()
print(y)

defaultdict(<class 'int'>, {'J': 2, 'F': 1, 'A': 1})


#### SAVE

In [27]:
import shutil

# Directory the RDD is written to and then read back from.
demo_path = "/temp/demo"

# saveAsTextFile() refuses to write into an existing directory, so remove
# the output of any previous run first.  dbutils is only defined on
# Databricks; with a plain local SparkContext the name does not exist, so
# fall back to deleting the directory from the local filesystem.
try:
    dbutils.fs.rm(demo_path, True)
except NameError:
    # NOTE(review): assumes a path without a scheme resolves to the local
    # filesystem in local mode -- confirm against fs.defaultFS.
    shutil.rmtree(demo_path, ignore_errors=True)

x = sc.parallelize([2, 4, 1])
x.saveAsTextFile(demo_path)
y = sc.textFile(demo_path)

# textFile() yields lines of text, so the numbers come back as strings.
print(y.collect())