In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create (or reuse) a SparkSession attached to the master at the given spark:// URL.
spark = (
    SparkSession.builder
    .master("spark://10.10.10.100:7077")
    .appName("Example")
    .getOrCreate()
)
# Optional tuning, left disabled: lower the number of shuffle partitions used by Spark SQL.
# spark.conf.set("spark.sql.shuffle.partitions", "5")

# 실행 계획

In [2]:
# Two ranges of even numbers. The initial partition count is not fixed here: Spark
# picks it from the default parallelism of the cluster the session is attached to.
df1 = spark.range(2, 10000000, 2)  # 2, 4, 6, ...
df2 = spark.range(2, 10000000, 4)  # 2, 6, 10, ...
step1 = df1.repartition(5)   # shuffle: redistribute into 5 partitions
step12 = df2.repartition(6)  # shuffle: redistribute into 6 partitions
step2 = step1.selectExpr("id * 5 as id")  # 10, 20, 30, ...
step3 = step2.join(step12, ["id"])  # shuffle: both sides are repartitioned on the join key
step4 = step3.selectExpr("sum(id)")
# Action: runs the job; the aggregation gathers partial sums into a single partition
# before the result row is returned to the driver.
print(step4.collect())
# explain() prints the physical plan itself and returns None, so wrapping it in
# print would emit a stray "None" after the plan.
step4.explain()

[Row(sum(id)=2500000000000)]
== Physical Plan ==
*(7) HashAggregate(keys=[], functions=[sum(id#6L)])
+- Exchange SinglePartition
   +- *(6) HashAggregate(keys=[], functions=[partial_sum(id#6L)])
      +- *(6) Project [id#6L]
         +- *(6) SortMergeJoin [id#6L], [id#2L], Inner
            :- *(3) Sort [id#6L ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(id#6L, 200)
            :     +- *(2) Project [(id#0L * 5) AS id#6L]
            :        +- Exchange RoundRobinPartitioning(5)
            :           +- *(1) Range (2, 10000000, step=2, splits=3)
            +- *(5) Sort [id#2L ASC NULLS FIRST], false, 0
               +- Exchange hashpartitioning(id#2L, 200)
                  +- Exchange RoundRobinPartitioning(6)
                     +- *(4) Range (2, 10000000, step=4, splits=3)
None


# RDD 생성

## 1. DataFrame -> RDD

In [3]:
# DataFrame -> RDD: the .rdd attribute exposes the DataFrame's contents as an RDD.
range_df = spark.range(10)
range_df.rdd

MapPartitionsRDD[4] at javaToPython at NativeMethodAccessorImpl.java:0

## 2. 파이썬 객체로 생성

In [4]:
# Tokenize the sentence on single spaces and distribute the tokens as an RDD
# split into 2 partitions.
sentence = "Spark The Definitive Guide : Big Data Processing Made Simple"
myCollection = sentence.split(" ")
words = spark.sparkContext.parallelize(myCollection, numSlices=2)
words

ParallelCollectionRDD[5] at parallelize at PythonRDD.scala:184

## 3. DataSource로 생성

In [5]:
# Create an RDD from a data source: textFile reads the file as an RDD of lines.
path = "/home/demo/test/Spark-The-Definitive-Guide/README.md"
readme_lines = spark.sparkContext.textFile(path)
readme_lines

/home/demo/test/Spark-The-Definitive-Guide/README.md MapPartitionsRDD[7] at textFile at NativeMethodAccessorImpl.java:0

# 트랜스포메이션
지연 처리 방식

## 1. distinct

In [6]:
# distinct() is a lazy transformation; each action below triggers its own job.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
distinct_words = words.distinct()
print(distinct_words.count())
print(distinct_words.collect())

10
['Simple', 'Spark', 'Data', 'The', 'Made', 'Definitive', 'Big', 'Processing', ':', 'Guide']


## 2. filter

In [7]:
def startsWithS(individual):
    """Tell whether ``individual`` begins with an uppercase "S".

    Args:
        individual: the string to inspect.

    Returns:
        True if the string starts with "S", otherwise False.
    """
    prefix = "S"
    return individual.startswith(prefix)

# Two equivalent filters: a named predicate passed directly, and an inline lambda.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(words.filter(startsWithS).collect())
print(words.filter(lambda word: word.startswith("S")).collect())

['Spark', 'Simple']
['Spark', 'Simple']


## 3. map

In [8]:
# Map every word to a tuple: (word, first character, starts-with-"S" flag).
words2 = words.map(lambda word: (word, word[0], word.startswith("S")))
# Keep only the tuples whose flag (index 2) is True and show at most five of them.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(words2.filter(lambda record: record[2]).take(5))

[('Spark', 'S', True), ('Simple', 'S', True)]


## 4. flatMap

In [9]:
# Flatten each word into its individual characters:
# ['Spark', 'The', ...] -> ['S', 'p', 'a', 'r', 'k', 'T', ...]
characters = words.flatMap(lambda word: word)
characters.take(15)

['S', 'p', 'a', 'r', 'k', 'T', 'h', 'e', 'D', 'e', 'f', 'i', 'n', 'i', 't']

## 5. sortBy

In [10]:
# Sort by negated length (keys -10, -9, -8, ...) so the longest words come first,
# then take the top two.
longest_first = words.sortBy(lambda word: -len(word))
longest_first.take(2)

['Definitive', 'Processing']

## 6. randomSplit

In [11]:
# Split the RDD into two RDDs with roughly 10% / 90% of the elements.
# A fixed seed makes the split reproducible across re-runs of the notebook.
randomSplit = words.randomSplit([0.1, 0.9], seed=42)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(randomSplit[0].collect())
print(randomSplit[1].collect())

[]
['Spark', 'The', 'Definitive', 'Guide', ':', 'Big', 'Data', 'Processing', 'Made', 'Simple']


# 액션
즉시 실행 방식, 데이터를 드라이버로 모으거나 외부 데이터 소스로 보냄

## 1. reduce

In [12]:
# Sum the integers 1..20 with reduce; expected result: 210.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(spark.sparkContext.parallelize(range(1, 21)).reduce(lambda x, y: x + y))

def wordLengthReducer(leftWord, rightWord):
    """Pick the longer of two words.

    Args:
        leftWord: first candidate.
        rightWord: second candidate.

    Returns:
        ``leftWord`` only when it is strictly longer; on a tie (or when the
        right word is longer) ``rightWord`` is returned.
    """
    return leftWord if len(leftWord) > len(rightWord) else rightWord

# Reduce to the longest word. The reducer breaks ties in favour of its right-hand
# argument, so with two equally long words ("Definitive" / "Processing") the result
# depends on the order in which elements and partitions are combined.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(words.reduce(wordLengthReducer))

210
Processing


## 2. count

In [13]:
# Action: count the elements of the RDD.
total_words = words.count()
total_words

10

## 3. first

In [14]:
# Action: fetch the first element of the RDD.
first_word = words.first()
first_word

'Spark'

## 4. max, min

In [15]:
# Build the 1..20 RDD once and query both extremes from it.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
numbers = spark.sparkContext.parallelize(range(1, 21))
print(numbers.max())
print(numbers.min())

20
1


## 5. take
먼저 하나의 파티션을 스캔, 그 다음 해당 파티션의 결과 수를 이용해 파라미터로 지정된 값을 만족하는데 필요한 추가 파티션 수를 예측

In [25]:
# take(1) returns the first element as a list; takeOrdered(1) returns the smallest
# element by natural ordering (here ':' sorts before the letters).
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(words.take(1))
print(words.takeOrdered(1))

['Spark']
[':']
