In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Build (or reuse) a local-mode Spark session. getOrCreate() keeps this cell
# safe to re-run in a live kernel; constructing SparkContext('local') a second
# time fails with "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master('local').getOrCreate()
# Keep the `sc` name available for any cell that uses the SparkContext directly.
sc = spark.sparkContext

In [2]:
import requests

In [3]:
# Download the World Cup player list. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() surfaces an HTTP
# error here instead of as a confusing JSON parse failure in a later cell.
r = requests.get(
    "https://raw.githubusercontent.com/jokecamp/FootballData/master/World%20Cups/all-world-cup-players.json",
    timeout=30,
)
r.raise_for_status()

In [4]:
# Parse the response body as JSON. JSON is UTF-8 by specification, so the
# encoding is pinned rather than trusting the charset requests derives from the
# Content-Type header (it falls back to ISO-8859-1 for text/* responses that
# declare none), which would turn accented names into mojibake.
# NOTE(review): the garbled names in the recorded outputs may instead come from
# the upstream file itself -- confirm after re-running.
r.encoding = "utf-8"
wc = r.json()

In [5]:
from pyspark.sql import Row

# Turn each player dict into a Row (dict keys become the field names) and
# build the DataFrame from the resulting list of rows.
wcDf = spark.createDataFrame([Row(**player) for player in wc])

---
# S.6 Spark SQL

In [6]:
# Print the column names and types of the DataFrame.
wcDf.printSchema()

root
 |-- Competition: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Team: string (nullable = true)
 |-- Number: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- FullName: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- ClubCountry: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- IsCaptain: boolean (nullable = true)



In [7]:
# Register the DataFrame as a temporary view named "wc" so it can be queried with SQL.
wcDf.createOrReplaceTempView("wc")
spark.sql("select Club,Team,Year from wc").show(1) # spark.sql() runs SQL statements; NOTE(review): the original note lists select, update, delete -- confirm update/delete work on a temp view

# Note: the SQL statement is written inside double quotes, i.e. it is passed as a string.

+--------------------+---------+----+
|                Club|     Team|Year|
+--------------------+---------+----+
|Club AtlÃ©tico Ta...|Argentina|1930|
+--------------------+---------+----+
only showing top 1 row



In [8]:
wcPlayers=spark.sql("select FullName,Club,Team,Year from wc") # keep the query result in a variable, then display it
wcPlayers.show(1)

+------------+--------------------+---------+----+
|    FullName|                Club|     Team|Year|
+------------+--------------------+---------+----+
|Ãngel Bossio|Club AtlÃ©tico Ta...|Argentina|1930|
+------------+--------------------+---------+----+
only showing top 1 row



In [9]:
# List the tables/views known to the session catalog (the "wc" temp view shows up here).
spark.catalog.listTables()

[Table(name='wc', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [10]:
# Go through the RDD API: the mapping function receives one row of wcPlayers
# at a time, and because a row is list-like its fields can be read by index.
# Index 0 is FullName, the first column selected into wcPlayers.
namesRdd = wcPlayers.rdd.map(lambda row: "Full name: " + row[0])
for labelled_name in namesRdd.take(5):
    print(labelled_name)

Full name: Ãngel Bossio
Full name: Juan Botasso
Full name: Roberto Cherro
Full name: Alberto Chividini
Full name: 


##  sql.functions and join
리스트에 포함되어 있는 과일에 고유번호를 할당해 보자.

In [21]:
# Two buckets, each with an id and a list of fruit names; the second argument
# supplies the column names.
bucketDf = spark.createDataFrame(
    [
        (1, ["orange", "apple", "pineapple"]),
        (2, ["watermelon", "apple", "bananas"]),
    ],
    ["bucketId", "items"],
)

`truncate=False`로 설정하면 셀의 값을 잘라내지 않고 출력한다. `show(bucketDf.count(), truncate=False)`는 모든 행을 완전하게 출력한다.

In [22]:
# Passing the total row count as the first argument prints every row;
# truncate=False keeps long cell values from being cut off.
bucketDf.show(bucketDf.count(), truncate=False)

+--------+----------------------------+
|bucketId|items                       |
+--------+----------------------------+
|1       |[orange, apple, pineapple]  |
|2       |[watermelon, apple, bananas]|
+--------+----------------------------+



####  explode
컬럼에 List 또는 배열이 포함된 경우 explode() 함수는 각 원소를 별도의 행으로 펼쳐(flatten) 새로운 컬럼에 담는다.  
`explode(분리할 컬럼)`  
`alias("새로운 열의 이름")` (여기서는 원래 이름 그대로 items)

In [30]:
from pyspark.sql.functions import explode
# explode() unpacks the "items" array into one output row per element; the
# resulting column is named "items" via alias().
bDf = bucketDf.select("bucketId", explode("items").alias("items"))

In [31]:
# Display the exploded DataFrame (one row per bucketId/item pair).
bDf.show()

+--------+----------+
|bucketId|     items|
+--------+----------+
|       1|    orange|
|       1|     apple|
|       1| pineapple|
|       2|watermelon|
|       2|     apple|
|       2|   bananas|
+--------+----------+



또 다른 DataFrame을 생성해보자. 나중에 앞의 DataFrame과 join하게 된다.

In [15]:
# Lookup table mapping an item name to its itemId; the second argument supplies
# the column names. Note that the F2 row has an empty item name.
fDf = spark.createDataFrame(
    [
        ("orange", "F1"),
        ("", "F2"),
        ("pineapple", "F3"),
        ("watermelon", "F4"),
        ("bananas", "F5"),
    ],
    ["item", "itemId"],
)

In [16]:
# Display the item -> itemId lookup DataFrame.
fDf.show()

+----------+------+
|      item|itemId|
+----------+------+
|    orange|    F1|
|          |    F2|
| pineapple|    F3|
|watermelon|    F4|
|   bananas|    F5|
+----------+------+



####  join
join에는 inner, cross, outer, full, full_outer, left, left_outer, right, right_outer, left_semi, left_anti 등 여러 종류가 있다. inner join은 양쪽 DataFrame에서 item이 일치하지 않는 행을 결과에서 제외한다.

In [17]:
# Inner join: rows whose item has no match in the other DataFrame are dropped
# automatically.
# bDf's exploded column is named "items" (the alias used when bDf is built), so
# the condition must reference bDf.items; bDf.item is not a column and raises
# AttributeError when the notebook is run top to bottom on a fresh kernel.
joinDf = fDf.join(bDf, fDf.item == bDf.items, "inner")

In [18]:
# Keep only the item id, item name and bucket id from the joined result and display them.
joinDf.select(fDf.itemId,fDf.item,bDf.bucketId).show()

+------+----------+--------+
|itemId|      item|bucketId|
+------+----------+--------+
|    F5|   bananas|       2|
|    F1|    orange|       1|
|    F3| pineapple|       1|
|    F4|watermelon|       2|
+------+----------+--------+



###  