In [1]:
import os
import findspark
findspark.init()
import pyspark
# Create (or reuse, via getOrCreate) a local SparkSession named "myApp",
# configured from an otherwise empty SparkConf.
myConf = pyspark.SparkConf()
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("myApp")
    .config(conf=myConf)
    .getOrCreate()
)

# pair RDD
#### key,value 쌍으로 구성된 RDD
##### byKey(), byValue()

##### groupByKey(), reduceByKey(), combineByKey(), aggregateByKey(), mapValues()

In [3]:
# Sample pair data: each key occurrence carries the value 1, so per-key sums
# double as per-key counts in the cells that follow.
_testList = [
    (key, 1)
    for key in (
        "key1", "key1", "key1", "key2", "key2",
        "key1", "key2",
        "key1", "key1", "key2", "key2",
    )
]
# Distribute the local list as an RDD of (key, value) pairs.
_testRdd = spark.sparkContext.parallelize(_testList)

In [4]:
# Check how many partitions the RDD was split into (1 in the recorded run).
_testRdd.getNumPartitions()

1

In [5]:
# f-string warm-up: names inside braces are replaced by the variables' values.
name, year = 'jsl', 2020
f"Hello, {name} {year}."

'Hello, jsl 2020.'

In [6]:
# glom() gathers the elements of each partition into a list, so collect()
# returns one list per partition; print them with their partition index.
partitions = _testRdd.glom().collect()
for num, partition in enumerate(partitions):
    print(f'Partitions {num} -> {partition}')

Partitions 0 -> [('key1', 1), ('key1', 1), ('key1', 1), ('key2', 1), ('key2', 1), ('key1', 1), ('key2', 1), ('key1', 1), ('key1', 1), ('key2', 1), ('key2', 1)]


In [7]:

# keys() drops the values and keeps the key of every pair, duplicates included.
_testRdd.keys().collect()

['key1',
 'key1',
 'key1',
 'key2',
 'key2',
 'key1',
 'key2',
 'key1',
 'key1',
 'key2',
 'key2']

In [8]:
# Add up the values of each key; every value here is 1, so the result is a
# per-key occurrence count.
_testRdd.reduceByKey(lambda x,y:x+y).collect()

[('key1', 6), ('key2', 5)]

In [10]:
# groupByKey() pairs each key with a ResultIterable of its values, so a bare
# collect() shows iterable objects rather than the values themselves.
# Advice: avoid groupByKey when the goal is an aggregation; the Spark
# programming guide recommends reduceByKey/aggregateByKey for that case.
_testRdd.groupByKey().collect()

[('key1', <pyspark.resultiterable.ResultIterable at 0x26a9be9d460>),
 ('key2', <pyspark.resultiterable.ResultIterable at 0x26a9be8c190>)]

In [11]:
# mapValues(list) passes the built-in list constructor as the mapping function,
# turning each key's ResultIterable into a plain list that can be displayed.
_testRdd.groupByKey().mapValues(list).collect()

[('key1', [1, 1, 1, 1, 1, 1]), ('key2', [1, 1, 1, 1, 1])]

In [12]:
# mapValues() transforms only the value of each pair (here adding 1);
# the keys and the number of pairs are unchanged.
_testRdd.mapValues(lambda x:x+1).collect()

[('key1', 2),
 ('key1', 2),
 ('key1', 2),
 ('key2', 2),
 ('key2', 2),
 ('key1', 2),
 ('key2', 2),
 ('key1', 2),
 ('key1', 2),
 ('key2', 2),
 ('key2', 2)]

# 단어빈도 예제

In [14]:
# Load the sample wiki text (path relative to the working directory) as an RDD
# of text lines for the word-frequency examples.
myRdd2 = spark.sparkContext.textFile(
    os.path.join("data", "ds_spark_wiki.txt")
)

In [15]:
# Split every line into words, pair each word with 1 and group the pairs by word.
# Without a mapValues step each word comes back with a ResultIterable of its 1s.
(
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .groupByKey()
    .take(3)
)

[('Wikipedia', <pyspark.resultiterable.ResultIterable at 0x26a9be8c580>),
 ('Apache', <pyspark.resultiterable.ResultIterable at 0x26a9be8cdc0>),
 ('Spark', <pyspark.resultiterable.ResultIterable at 0x26a9be9d580>)]

In [16]:
# Word frequency via grouping: collect the 1s of each word, then sum every
# group with the built-in sum; show the first 20 (word, count) pairs.
(
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .groupByKey()
    .mapValues(sum)
    .take(20)
)

[('Wikipedia', 1),
 ('Apache', 6),
 ('Spark', 7),
 ('is', 1),
 ('an', 2),
 ('open', 1),
 ('source', 1),
 ('cluster', 1),
 ('computing', 1),
 ('framework.', 1),
 ('아파치', 5),
 ('스파크는', 1),
 ('오픈', 1),
 ('소스', 1),
 ('클러스터', 1),
 ('컴퓨팅', 1),
 ('프레임워크이다.', 1),
 ('스파크', 4),
 ('Originally', 1),
 ('developed', 1)]

In [17]:
def f(x):
    """Return the number of items in ``x``.

    Passed to ``mapValues`` to count how many values were grouped under a key.
    """
    size = len(x)
    return size
# Same word count, but measured as the size of each group (via f) instead of
# the sum of its 1s; sort by word in ascending order and show the first 10.
(
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .groupByKey()
    .mapValues(f)
    .sortByKey(ascending=True)
    .take(10)
)

[('AMPLab,', 1),
 ('Apache', 6),
 ("Berkeley's", 1),
 ('California,', 1),
 ('Foundation,', 1),
 ('Originally', 1),
 ('Software', 1),
 ('Spark', 7),
 ('University', 1),
 ('Wikipedia', 1)]

In [20]:
# Word counts for the first ten words in ascending key order.
# reduceByKey sums the 1s of each word and combines partial sums within each
# partition before the shuffle, so it yields the same (word, count) pairs as
# groupByKey().mapValues(sum) without materialising every group of values.
wc = (
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(ascending=True)
    .take(10)
)

In [21]:
# Print each (word, count) pair; the labels are Korean for "word" and "frequency".
for k, v in wc:
    print(f"단어:{k}\t\t빈도:{v}")

단어:AMPLab,		빈도:1
단어:Apache		빈도:6
단어:Berkeley's		빈도:1
단어:California,		빈도:1
단어:Foundation,		빈도:1
단어:Originally		빈도:1
단어:Software		빈도:1
단어:Spark		빈도:7
단어:University		빈도:1
단어:Wikipedia		빈도:1


# reduceByKey는 (K,V)로 반환

In [22]:
# Word frequency with reduceByKey: the 1s of each word are added pairwise,
# giving (word, count) tuples directly; show the first 10.
(
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .take(10)
)

[('Wikipedia', 1),
 ('Apache', 6),
 ('Spark', 7),
 ('is', 1),
 ('an', 2),
 ('open', 1),
 ('source', 1),
 ('cluster', 1),
 ('computing', 1),
 ('framework.', 1)]

# countByKey

In [23]:
# countByKey() counts the pairs per key and returns the result as a
# defaultdict rather than an RDD; call .items() on it to iterate over the
# (word, count) pairs, or wrap that in list() to get a list.
(
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .countByKey()
)

defaultdict(int,
            {'Wikipedia': 1,
             'Apache': 6,
             'Spark': 7,
             'is': 1,
             'an': 2,
             'open': 1,
             'source': 1,
             'cluster': 1,
             'computing': 1,
             'framework.': 1,
             '아파치': 5,
             '스파크는': 1,
             '오픈': 1,
             '소스': 1,
             '클러스터': 1,
             '컴퓨팅': 1,
             '프레임워크이다.': 1,
             '스파크': 4,
             'Originally': 1,
             'developed': 1,
             'at': 1,
             'the': 3,
             'University': 1,
             'of': 1,
             'California,': 1,
             "Berkeley's": 1,
             'AMPLab,': 1,
             'codebase': 1,
             'was': 1,
             'later': 1,
             'donated': 1,
             'to': 1,
             'Software': 1,
             'Foundation,': 1,
             'which': 1,
             'has': 1,
             'maintained': 1,
             'it': 1,
      