In [6]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse the active session when this cell is re-run: constructing
# SparkContext('local') a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
# The master stays 'local', exactly as in the original constructor call.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

### combineByKey
- `combineByKey()`: 키별로 합계, 개수 (key, (sum, count))를 계산. createCombiner, mergeValue, mergeCombiners.
- `groupByKey`,`reduceByKey`는 빈도 계산 위해 사용했었음

In [7]:
# Sample (key, value) pairs for the combineByKey examples:
# six pairs under "key1" and five pairs under "key2".
_testList=[("key1",1),("key1",3),("key2",2),("key1",2),("key2",4),
           ("key1",5),("key2",6),
           ("key1",7),("key1",8),("key2",9),("key2",3)]

In [10]:
# Build an RDD from the sample pairs; no partition count is passed here,
# so the context's default is used.
_testRdd=spark.sparkContext.parallelize(_testList)

In [11]:
# Show how many partitions the RDD was split into.
_testRdd.getNumPartitions()

1

- partition이 하나이면 mergeCombiners는 호출되지 않고, 키별로 값이 하나의 combiner에 차례로 합쳐진다
    - 파티션에서 처음 나온 키: createCombiner 실행
    - 이미 나온 키: mergeValue 실행

In [12]:
# combineByKey takes three functions:
#   1. createCombiner - builds the initial combiner from the first value of a key
#   2. mergeValue     - folds one more value into an existing combiner
#   3. mergeCombiners - joins two combiners that belong to the same key
# The "*", "#" and "&" markers make it visible in the result which function ran.
_testRdd.combineByKey(
    createCombiner=lambda first: str(first) + "*",
    mergeValue=lambda acc, nxt: acc + "#" + str(nxt),
    mergeCombiners=lambda left, right: left + '&' + right,
).collect()

[('key1', '1*#3#2#5#7#8'), ('key2', '2*#4#6#9#3')]

#### partition 2개로 만들어보자

In [31]:
# Rebuild the RDD, this time asking for two partitions explicitly.
_testRdd = spark.sparkContext.parallelize(_testList, numSlices=2)

In [14]:
# glom() groups the elements of each partition into a list, so collect()
# returns one list per partition.
partitions = _testRdd.glom().collect()
for part_index, part_rows in enumerate(partitions):
    line = f'Partitions {part_index} -> {part_rows}'
    print(line)

Partitions 0 -> [('key1', 1), ('key1', 3), ('key2', 2), ('key1', 2), ('key2', 4)]
Partitions 1 -> [('key1', 5), ('key2', 6), ('key1', 7), ('key1', 8), ('key2', 9), ('key2', 3)]


In [18]:
# With two or more partitions each partition builds its own combiner per key,
# so mergeCombiners runs and its "&" marker shows up in the result.
_testRdd.combineByKey(
    createCombiner=lambda first: str(first) + "*",
    mergeValue=lambda acc, nxt: acc + "#" + str(nxt),
    mergeCombiners=lambda left, right: left + '&' + right,
).collect()

[('key1', '1*#3#2&5*#7#8'), ('key2', '2*#4&6*#9#3')]

In [43]:
# Aggregate every key into a (sum, count) pair.
# For key1 the values 1, 3, 2, 5, 7, 8 add up to 26 across 6 elements.
_testRdd.combineByKey(
    createCombiner=lambda first: (first, 1),
    mergeValue=lambda acc, nxt: (acc[0] + nxt, acc[1] + 1),
    mergeCombiners=lambda left, right: (left[0] + right[0], left[1] + right[1]),
).collect()

[('key1', (26, 6)), ('key2', (24, 5))]

In [44]:
# Same (sum, count) aggregation, but kept as an RDD (not collected) so that
# it can be transformed further.
_testCbkRdd = _testRdd.combineByKey(
    createCombiner=lambda first: (first, 1),
    mergeValue=lambda acc, nxt: (acc[0] + nxt, acc[1] + 1),
    mergeCombiners=lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

In [46]:
# Each element of _testCbkRdd is (key, (sum, count)).
# mapValues rewrites only the value and leaves the key as it is, so there is
# no need to unpack and re-pack the key by hand as a plain map() would.
averageByKey = _testCbkRdd.mapValues(lambda sum_count: sum_count[0] / sum_count[1])
averageByKey.collectAsMap()  # gather the (key, mean) pairs into a Python dict

{'key1': 4.333333333333333, 'key2': 4.8}

### combineByKey 예제
- 키별 (합계, 개수)를 구해 두면 평균을 쉽게 구할 수 있다.

In [47]:
# (student, score) pairs; each student appears one or more times.
marks = spark.sparkContext.parallelize(
    [
        ('kim', 86),
        ('lim', 87),
        ('kim', 75),
        ('kim', 91),
        ('lim', 78),
        ('lim', 92),
        ('lim', 79),
        ('lee', 99),
    ]
)

In [48]:
# combineByKey takes three functions:
#   1. createCombiner - turns the first score of a student into (score, 1)
#   2. mergeValue     - adds one more score to an existing (sum, count)
#   3. mergeCombiners - adds two (sum, count) pairs of the same student
marksByKey = marks.combineByKey(
    createCombiner=lambda first: (first, 1),
    mergeValue=lambda acc, nxt: (acc[0] + nxt, acc[1] + 1),
    mergeCombiners=lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

In [49]:
# Materialize the per-student (sum, count) pairs on the driver.
marksByKey.collect()

[('kim', (252, 3)), ('lim', (336, 4)), ('lee', (99, 1))]

In [50]:
# (gender, height) pairs; heights are floats.
heights = spark.sparkContext.parallelize(
    [
        ('M', 182.),
        ('F', 164.),
        ('M', 180.),
        ('M', 185.),
        ('M', 171.),
        ('F', 162.),
    ]
)

In [51]:
# Reduce the heights of each gender to a (sum, count) pair.
heightsByKey = heights.combineByKey(
    createCombiner=lambda first: (first, 1),
    mergeValue=lambda acc, nxt: (acc[0] + nxt, acc[1] + 1),
    mergeCombiners=lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

In [52]:
# Materialize the per-gender (sum, count) pairs on the driver.
heightsByKey.collect()

[('M', (718.0, 4)), ('F', (326.0, 2))]

In [53]:
# Each element of heightsByKey is (key, (sum, count)).
# mapValues rewrites only the value and leaves the key as it is, so there is
# no need to unpack and re-pack the key by hand as a plain map() would.
avgByKey = heightsByKey.mapValues(lambda sum_count: sum_count[0] / sum_count[1])

# Gather the (key, mean) pairs into a Python dict and print it.
print(avgByKey.collectAsMap())

{'M': 179.5, 'F': 163.0}
