# S.6.3. Pair RDD

In [5]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Obtain the session through the builder so that re-running this cell reuses
# the existing session instead of failing with
# "Cannot run multiple SparkContexts at once".
# master('local') keeps the same 'local' master the cell used before.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` available for code that uses the SparkContext directly.
sc = spark.sparkContext

In [22]:
import os
# Load data/ds_spark_wiki.txt as a text RDD; the path is built with
# os.path.join so the separator matches the host OS.
myRdd2 = spark.sparkContext.textFile(
    os.path.join("data", "ds_spark_wiki.txt")
)

#### Paired RDD 생성

In [45]:
# Sample (key, value) pairs: six ("key1", 1) and five ("key2", 2), interleaved.
# Each row below is one run of identical pairs.
_testList = [
    ("key1", 1), ("key1", 1), ("key1", 1),
    ("key2", 2), ("key2", 2),
    ("key1", 1),
    ("key2", 2),
    ("key1", 1), ("key1", 1),
    ("key2", 2), ("key2", 2),
]
# Distribute the local list as an RDD; no partition count is requested here.
_testRdd=spark.sparkContext.parallelize(_testList)

In [46]:
# Show how many partitions the RDD currently has.
_testRdd.getNumPartitions()

1

In [47]:
# Rebuild the RDD, this time explicitly asking for 2 partitions.
_testRdd=spark.sparkContext.parallelize(_testList, 2)

In [48]:
# Show how many partitions the rebuilt RDD has.
_testRdd.getNumPartitions()

2

In [49]:
# Prefixing a string literal with f lets variables be embedded as-is:
# each {name} placeholder is replaced by that variable's value.
year = 2020
name = 'jsl'
f"Hello, {name} {year}."

'Hello, jsl 2020.'

In [50]:
# glom() turns every partition into a list of its elements, so collecting it
# shows both how many partitions there are and what each one contains.
partitions = _testRdd.glom().collect()
for num, partition in enumerate(partitions):
    print(f'Partitions {num} -> {partition}')

Partitions 0 -> [('key1', 1), ('key1', 1), ('key1', 1), ('key2', 2), ('key2', 2)]
Partitions 1 -> [('key1', 1), ('key2', 2), ('key1', 1), ('key1', 1), ('key2', 2), ('key2', 2)]


In [51]:
# Keys only: one entry per pair, duplicates kept.
_testRdd.keys().collect()

['key1',
 'key1',
 'key1',
 'key2',
 'key2',
 'key1',
 'key2',
 'key1',
 'key1',
 'key2',
 'key2']

In [52]:
# reduceByKey: merges the values of each key with the given function (here,
# addition), producing one (key, total) pair per key. Per the PySpark docs the
# merge is first applied within each partition and then across partitions.
_testRdd.reduceByKey(lambda x,y:x+y).collect()

[('key1', 6), ('key2', 10)]

#### groupByKey, reduceByKey, mapValues

In [53]:
# groupByKey() pairs each key with a ResultIterable; its contents are not
# shown directly when the result is displayed.
_testRdd.groupByKey().collect()

[('key1', <pyspark.resultiterable.ResultIterable at 0x15b6214e948>),
 ('key2', <pyspark.resultiterable.ResultIterable at 0x15b6214eb48>)]

In [54]:
# Convert each group's ResultIterable into a plain list so the values are visible.
_testRdd.groupByKey().mapValues(list).collect() # list is passed as a function, i.e. list() is applied to each group

[('key1', [1, 1, 1, 1, 1, 1]), ('key2', [2, 2, 2, 2, 2])]

In [55]:
# Add 1 to every value; the keys are left unchanged.
_testRdd.mapValues(lambda x:x+1).collect()

[('key1', 2),
 ('key1', 2),
 ('key1', 2),
 ('key2', 3),
 ('key2', 3),
 ('key1', 2),
 ('key2', 3),
 ('key1', 2),
 ('key1', 2),
 ('key2', 3),
 ('key2', 3)]

#### 단어 빈도 예제

In [69]:
# Word-frequency pipeline, stopped right after grouping to show what
# groupByKey() produces.
#
# The chain is wrapped in parentheses instead of being joined with
# backslashes: a backslash line continuation must be the last character on
# its line, so the comments that followed it made this cell a SyntaxError.
(
    myRdd2
    # Split every element on whitespace and flatten the pieces into one RDD of words.
    .flatMap(lambda x: x.split())
    # Turn each word into a (key, value) tuple with a count of 1.
    .map(lambda x: (x, 1))
    # Collect the 1s belonging to the same word under that word.
    .groupByKey()
    # Look at the first three (word, group) pairs only.
    .take(3)
)

[('Wikipedia', <pyspark.resultiterable.ResultIterable at 0x15b62197988>),
 ('Apache', <pyspark.resultiterable.ResultIterable at 0x15b62197388>),
 ('Spark', <pyspark.resultiterable.ResultIterable at 0x15b621891c8>)]

In [67]:
# Word-frequency pipeline: group the (word, 1) pairs by word and add up each
# group to get the count per word.
#
# Parentheses replace the backslash continuations because a comment cannot
# follow a backslash line continuation (the annotated line was a SyntaxError).
(
    myRdd2
    .flatMap(lambda x: x.split())
    .map(lambda x: (x, 1))
    .groupByKey()
    # Sum the 1s of each key, i.e. the frequency of that word.
    .mapValues(sum)
    .take(20)
)

[('Wikipedia', 1),
 ('Apache', 6),
 ('Spark', 7),
 ('is', 1),
 ('an', 2),
 ('open', 1),
 ('source', 1),
 ('cluster', 1),
 ('computing', 1),
 ('framework.', 1),
 ('아파치', 5),
 ('스파크는', 1),
 ('오픈', 1),
 ('소스', 1),
 ('클러스터', 1),
 ('컴퓨팅', 1),
 ('프레임워크이다.', 1),
 ('스파크', 4),
 ('Originally', 1),
 ('developed', 1)]

In [71]:
def f(x):
    """Return the number of items in ``x`` (delegates to ``len``)."""
    size = len(x)
    return size
# Count words by taking the size of each key's group (f returns len), sort
# the pairs by key, and keep the first ten.
(
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .groupByKey()
    .mapValues(f)
    .sortByKey(True)
    .take(10)
)

# Result: a list
# of (key, value) pairs

[('AMPLab,', 1),
 ('Apache', 6),
 ("Berkeley's", 1),
 ('California,', 1),
 ('Foundation,', 1),
 ('Originally', 1),
 ('Software', 1),
 ('Spark', 7),
 ('University', 1),
 ('Wikipedia', 1)]

In [59]:
# Count each word with groupByKey() followed by mapValues(sum), sort the pairs
# by key, and keep the first ten (word, count) pairs in wc.
wc = (
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .groupByKey()
    .mapValues(sum)
    .sortByKey(True)
    .take(10)
)

In [72]:
# Print every (word, count) pair held in wc, one per line.
for e in wc:
    k, v = e[0], e[1]
    print(f"단어:{k}\t\t빈도:{v}")  # \t inserts a tab character

# The "," attached to some words is not handled yet; that is left for later,
# when parts of speech are separated.

단어:AMPLab,	빈도:1
단어:Apache	빈도:6
단어:Berkeley's	빈도:1
단어:California,	빈도:1
단어:Foundation,	빈도:1
단어:Originally	빈도:1
단어:Software	빈도:1
단어:Spark	빈도:7
단어:University	빈도:1
단어:Wikipedia	빈도:1


In [61]:
# reduceByKey adds up the counts per word directly, so no separate
# mapValues step is needed.
(
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .take(10)
)

[('Wikipedia', 1),
 ('Apache', 6),
 ('Spark', 7),
 ('is', 1),
 ('an', 2),
 ('open', 1),
 ('source', 1),
 ('cluster', 1),
 ('computing', 1),
 ('framework.', 1)]

#### countByKey()

- 결과는 **딕셔너리** 형태(`defaultdict`)로 반환된다.


In [63]:
# countByKey() hands back the number of elements per key as a dictionary;
# append .items() to get the (key, count) pairs instead.
(
    myRdd2
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .countByKey()
)

defaultdict(int,
            {'Wikipedia': 1,
             'Apache': 6,
             'Spark': 7,
             'is': 1,
             'an': 2,
             'open': 1,
             'source': 1,
             'cluster': 1,
             'computing': 1,
             'framework.': 1,
             '아파치': 5,
             '스파크는': 1,
             '오픈': 1,
             '소스': 1,
             '클러스터': 1,
             '컴퓨팅': 1,
             '프레임워크이다.': 1,
             '스파크': 4,
             'Originally': 1,
             'developed': 1,
             'at': 1,
             'the': 3,
             'University': 1,
             'of': 1,
             'California,': 1,
             "Berkeley's": 1,
             'AMPLab,': 1,
             'codebase': 1,
             'was': 1,
             'later': 1,
             'donated': 1,
             'to': 1,
             'Software': 1,
             'Foundation,': 1,
             'which': 1,
             'has': 1,
             'maintained': 1,
             'it': 1,
      