In [1]:
import findspark
# Run before the pyspark imports in the next cell so they resolve even when
# pyspark is not already on sys.path.
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [3]:
# Run Spark locally with 4 worker threads under the application name "another".
conf = SparkConf().setMaster("local[4]").setAppName("another")
# getOrCreate returns the already-running context when there is one, so
# re-executing this cell in a live kernel does not raise
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# SQL entry point bound to the same context.
# NOTE(review): SQLContext is deprecated in Spark 3.x in favour of SparkSession;
# confirm the target Spark version before migrating.
sqlContext = SQLContext(sc)

In [4]:
# Small RDD of (key, value) tuples; key 3 occurs twice so that by-key
# operations have something to merge.
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])

In [5]:
# Sum the values that share a key, then print each resulting (key, total) pair.
summed = rdd.reduceByKey(lambda acc, nxt: acc + nxt)
for val in summed.collect():
    print(val)

(1, 2)
(3, 10)


In [6]:
# Group the values by key and print one "key value" line per grouped value.
# The loop variables must not be called `list`: a notebook-level loop variable
# stays bound after the loop finishes, and shadowing the builtin breaks every
# later `list(...)` call executed in this kernel.
for key, values in rdd.groupByKey().collect():
    for value in values:
        print("{} {}".format(key, value))

1 2
3 4
3 6


In [7]:
# Add 1 to every value (keys stay as they are) and print each pair as "key value".
bumped = rdd.mapValues(lambda v: v + 1)
for key, val in bumped.collect():
    print("{} {}".format(key, val))

1 3
3 5
3 7


In [8]:
# Drop the keys and print each remaining value on its own line.
only_values = rdd.values().collect()
for val in only_values:
    print("{}".format(val))

2
4
6


In [9]:
# Incrementing the loop variable only rebinds the local name: collect() hands
# back a plain Python list on the driver, so the RDD itself is not modified.
for val in rdd.values().collect():
    val += 1

In [10]:
# Display the RDD's current contents (still the original three pairs).
rdd.collect()

[(1, 2), (3, 4), (3, 6)]

In [11]:
# mapValues applies the function to the value of each pair and keeps the key as-is.
rdd.mapValues(lambda s: s + 1).collect()

[(1, 3), (3, 5), (3, 7)]

In [12]:
# Same result with plain map: rebuild each (key, value) tuple by hand with the value incremented.
rdd.map(lambda a: (a[0], a[1] + 1)).collect()

[(1, 3), (3, 5), (3, 7)]

In [20]:
# Each value becomes range(value, 3). range objects are lazy, so they are shown
# unexpanded, and any range whose start is >= 3 is empty.
rdd.map(lambda a: (a[0], range(a[1], 3))).collect()

[(1, range(2, 3)), (3, range(4, 3)), (3, range(6, 3))]

In [6]:
# Rebind `rdd` to pairs of (n, [0, 1, ..., n + 2]) for n in 3, 4, 5.
seeds = sc.parallelize([3, 4, 5])
rdd = seeds.map(lambda n: (n, [*range(n + 3)]))

In [12]:
# Display the (n, list) pairs currently held in `rdd`.
rdd.collect()

[(3, [0, 1, 2, 3, 4, 5]),
 (4, [0, 1, 2, 3, 4, 5, 6]),
 (5, [0, 1, 2, 3, 4, 5, 6, 7])]

In [15]:
def mapper(l):
    """Return a new list holding each element of ``l`` increased by 1."""
    return [item + 1 for item in l]

In [9]:
# Sanity check: an identity map wrapped in list() gives back the input elements
# as a list. This only works while the builtin `list` is not shadowed in the kernel.
list(map(lambda a: a, [0, 1, 2, 3, 4, 5, 6]))

[0, 1, 2, 3, 4, 5, 6]

In [16]:
# Run mapper over the list stored under each key; the keys are left untouched.
rdd.mapValues(mapper).collect()

[(3, [1, 2, 3, 4, 5, 6]),
 (4, [1, 2, 3, 4, 5, 6, 7]),
 (5, [1, 2, 3, 4, 5, 6, 7, 8])]

In [None]:
def mapper(entry):
    """Keep the key of a (key, pairs) entry and reduce each pair to its second element.

    Args:
        entry: An indexable ``(key, pairs)`` record, where ``pairs`` is an
            iterable of indexable items with at least two elements each.

    Returns:
        tuple: ``(key, values)`` where ``values`` is a list holding element
        ``[1]`` of every pair, in order.
    """
    key, pairs = entry[0], entry[1]
    # Build a real list: in Python 3 ``map`` returns a lazy iterator, not the
    # list of values that callers expect to see in the collected result.
    return (key, [pair[1] for pair in pairs])

# Two (key, pairs) records; every pair carries an empty string and a value string.
data = [
    ("key1", [('', "val1"), ('', "val2")]),
    ("key2", [('', "val3"), ('', "val2"), ('', "val4")]),
]

rdd = sc.parallelize(data)

# Apply mapper to every record.
rdd2 = rdd.map(mapper)

rdd2.collect()
# Expected: [('key1', ['val1', 'val2']), ('key2', ['val3', 'val2', 'val4'])]

In [13]:
# Three (label, entries) records; each entries list holds three 2-tuples of strings.
data = [
    ("zip-1", [("file-11", "val-11"), ("file-12", "val-12"), ("file-13", "val-13")]),
    ("zip-2", [("file-21", "val-21"), ("file-22", "val-22"), ("file-23", "val-23")]),
    ("zip-3", [("file-31", "val-31"), ("file-32", "val-32"), ("file-33", "val-33")]),
]

In [14]:
# Rebind `rdd` to an RDD built from the `data` records.
rdd = sc.parallelize(data)

In [49]:
# Display every (label, entries) record held in the RDD.
rdd.collect()

[('zip-1',
  [('file-11', 'val-11'), ('file-12', 'val-12'), ('file-13', 'val-13')]),
 ('zip-2',
  [('file-21', 'val-21'), ('file-22', 'val-22'), ('file-23', 'val-23')]),
 ('zip-3',
  [('file-31', 'val-31'), ('file-32', 'val-32'), ('file-33', 'val-33')])]

In [51]:
# flatMap emits every element of each record's entries list (index 1), so the
# result is one flat list of 2-tuples with the labels dropped.
rdd.flatMap(lambda a: a[1]).collect()

[('file-11', 'val-11'),
 ('file-12', 'val-12'),
 ('file-13', 'val-13'),
 ('file-21', 'val-21'),
 ('file-22', 'val-22'),
 ('file-23', 'val-23'),
 ('file-31', 'val-31'),
 ('file-32', 'val-32'),
 ('file-33', 'val-33')]

In [25]:
def mapper(comp_files):
    """Return a list with the first element of every item in ``comp_files``.

    NOTE(review): this rebinds the name ``mapper``, which earlier cells of the
    notebook also define with different behaviour.
    """
    return [item[0] for item in comp_files]

In [39]:
# Plain Python nested list of 2-tuples (no Spark involved); the trailing bare
# name echoes it as the cell output.
test = [
    [("file-11", "val-11"), ("file-12", "val-12"), ("file-13", "val-13")],
    [("file-21", "val-21"), ("file-22", "val-22"), ("file-23", "val-23")],
    [("file-31", "val-31"), ("file-32", "val-32"), ("file-33", "val-33")],
]
test

[[('file-11', 'val-11'), ('file-12', 'val-12'), ('file-13', 'val-13')],
 [('file-21', 'val-21'), ('file-22', 'val-22'), ('file-23', 'val-23')],
 [('file-31', 'val-31'), ('file-32', 'val-32'), ('file-33', 'val-33')]]

In [33]:
# For each inner list, print the list itself and then the second element of
# every tuple it contains.
for group in test:
    print(group)
    for item in group:
        print(item[1])

[('file-11', 'val-11'), ('file-12', 'val-12'), ('file-13', 'val-13')]
val-11
val-12
val-13
[('file-21', 'val-21'), ('file-22', 'val-22'), ('file-23', 'val-23')]
val-21
val-22
val-23
[('file-31', 'val-31'), ('file-32', 'val-32'), ('file-33', 'val-33')]
val-31
val-32
val-33
