In [1]:
from pyspark.sql import SparkSession

# SparkSession 생성
spark = SparkSession.builder \
    .appName("241211_01_SparkSQL_SQLtest") \
    .getOrCreate()

24/12/11 10:29:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
from pyspark.sql import Row

user_data = [
    Row(user_id=1, username="A", address="서울"),
    Row(user_id=2, username="B", address="대전"),
    Row(user_id=3, username="C", address="경기도"),
    Row(user_id=4, username="D", address=None),
    Row(user_id=5, username="E", address=None),
    Row(user_id=6, username="F", address="부산"),
    Row(user_id=7, username="G", address="대구"),
    Row(user_id=8, username="H", address="광주"),
    Row(user_id=9, username="I", address="울산"),
    Row(user_id=10, username="J", address="강원도"),
    Row(user_id=11, username="K", address="충청도")
]

In [3]:
user_df = spark.createDataFrame(user_data)
user_df.createOrReplaceTempView('users')

In [4]:
books_data = [
    Row(book_id=1, title="Book A", author_fname="John", author_lname="Doe", pages=300, released_year=2005, stock_quantity=55),
    Row(book_id=2, title="Book B", author_fname="Jane", author_lname="Smith", pages=250, released_year=2010, stock_quantity=40),
    Row(book_id=3, title="Book C", author_fname="Emily", author_lname="Jones", pages=180, released_year=2015, stock_quantity=20),
    Row(book_id=4, title="Book D", author_fname="Chris", author_lname="Brown", pages=320, released_year=2012, stock_quantity=75),
    Row(book_id=5, title="Book E", author_fname="Anna", author_lname="Davis", pages=270, released_year=2008, stock_quantity=35)
]

books_df = spark.createDataFrame(books_data)
books_df.createOrReplaceTempView("books")


In [7]:
# 두번째 쿼리
query_users = '''
SELECT username,
	IF(address IS NULL, '주소없음', address) AS adress
from users;
'''
spark.sql(query_users).show()

+--------+--------+
|username|  adress|
+--------+--------+
|       A|    서울|
|       B|    대전|
|       C|  경기도|
|       D|주소없음|
|       E|주소없음|
|       F|    부산|
|       G|    대구|
|       H|    광주|
|       I|    울산|
|       J|  강원도|
|       K|  충청도|
+--------+--------+



In [8]:
# 세번쨰 쿼리
query_users = '''
SELECT address,
       IF(address IN ('경기도', '서울'), '수도권', '지방') AS region
FROM users;
'''
spark.sql(query_users).show()

+-------+------+
|address|region|
+-------+------+
|   서울|수도권|
|   대전|  지방|
| 경기도|수도권|
|   null|  지방|
|   null|  지방|
|   부산|  지방|
|   대구|  지방|
|   광주|  지방|
|   울산|  지방|
| 강원도|  지방|
| 충청도|  지방|
+-------+------+



In [15]:
# 네번쨰 쿼리, books table
# stock_quantity >= 50 '재고많음', >=30 '중간', '재고없음'
query_users_1 = '''
SELECT *,
       CASE 
           WHEN stock_quantity >= 50 THEN '재고많음'
           WHEN stock_quantity >= 30 THEN '재고중간'
           ELSE '재고없음'
       END AS stock_status
FROM books;
'''
spark.sql(query_users).show()

+-------+------+------------+------------+-----+-------------+--------------+------------+
|book_id| title|author_fname|author_lname|pages|released_year|stock_quantity|stock_status|
+-------+------+------------+------------+-----+-------------+--------------+------------+
|      1|Book A|        John|         Doe|  300|         2005|            55|    재고많음|
|      2|Book B|        Jane|       Smith|  250|         2010|            40|        중간|
|      3|Book C|       Emily|       Jones|  180|         2015|            20|    재고없음|
|      4|Book D|       Chris|       Brown|  320|         2012|            75|    재고많음|
|      5|Book E|        Anna|       Davis|  270|         2008|            35|        중간|
+-------+------+------------+------------+-----+-------------+--------------+------------+



In [16]:
spark.sql(query_users_1).explain() #IF문을 써서 할때랑 차이가 있음

== Physical Plan ==
*(1) Project [book_id#6L, title#7, author_fname#8, author_lname#9, pages#10L, released_year#11L, stock_quantity#12L, CASE WHEN (stock_quantity#12L >= 50) THEN 재고많음 WHEN (stock_quantity#12L >= 30) THEN 재고중간 ELSE 재고없음 END AS stock_status#198]
+- *(1) Scan ExistingRDD[book_id#6L,title#7,author_fname#8,author_lname#9,pages#10L,released_year#11L,stock_quantity#12L]




In [19]:
books_sql_2 = '''
select distinct author_lname from books
'''
spark.sql(books_sql_2).explain() #IF문을 써서 할때랑 차이가 있음

== Physical Plan ==
*(2) HashAggregate(keys=[author_lname#9], functions=[])
+- Exchange hashpartitioning(author_lname#9, 200), ENSURE_REQUIREMENTS, [id=#115]
   +- *(1) HashAggregate(keys=[author_lname#9], functions=[])
      +- *(1) Project [author_lname#9]
         +- *(1) Scan ExistingRDD[book_id#6L,title#7,author_fname#8,author_lname#9,pages#10L,released_year#11L,stock_quantity#12L]




In [20]:
spark.sql(books_sql_2).show()

+------------+
|author_lname|
+------------+
|       Jones|
|       Davis|
|       Smith|
|         Doe|
|       Brown|
+------------+



In [22]:
books_sql_3 = '''
SELECT author_lname, COUNT(*)
FROM books
GROUP BY author_lname;
'''

spark.sql(books_sql_3).explain()
spark.sql(books_sql_3).show()

== Physical Plan ==
*(2) HashAggregate(keys=[author_lname#9], functions=[count(1)])
+- Exchange hashpartitioning(author_lname#9, 200), ENSURE_REQUIREMENTS, [id=#183]
   +- *(1) HashAggregate(keys=[author_lname#9], functions=[partial_count(1)])
      +- *(1) Project [author_lname#9]
         +- *(1) Scan ExistingRDD[book_id#6L,title#7,author_fname#8,author_lname#9,pages#10L,released_year#11L,stock_quantity#12L]


+------------+--------+
|author_lname|count(1)|
+------------+--------+
|       Jones|       1|
|       Davis|       1|
|       Smith|       1|
|         Doe|       1|
|       Brown|       1|
+------------+--------+



## 데이터 변경(조인을 위해 연결성 부여)

In [23]:
# books 테이블 데이터에 borrowed_by 추가
books_data_with_user = [
    Row(book_id=1, title="Book A", author_fname="John", author_lname="Doe", pages=300, released_year=2005, stock_quantity=55, borrowed_by=1),
    Row(book_id=2, title="Book B", author_fname="Jane", author_lname="Smith", pages=250, released_year=2010, stock_quantity=40, borrowed_by=2),
    Row(book_id=3, title="Book C", author_fname="Emily", author_lname="Jones", pages=180, released_year=2015, stock_quantity=20, borrowed_by=3),
    Row(book_id=4, title="Book D", author_fname="Chris", author_lname="Brown", pages=320, released_year=2012, stock_quantity=75, borrowed_by=None),
    Row(book_id=5, title="Book E", author_fname="Anna", author_lname="Davis", pages=270, released_year=2008, stock_quantity=35, borrowed_by=6)
]

# DataFrame 생성
books_df_with_user = spark.createDataFrame(books_data_with_user)

# Temp View 등록
books_df_with_user.createOrReplaceTempView("books")

In [24]:
books_sql_3 = '''
SELECT *
FROM books
'''

spark.sql(books_sql_3).explain()
spark.sql(books_sql_3).show()

== Physical Plan ==
*(1) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,pages#245L,released_year#246L,stock_quantity#247L,borrowed_by#248L]


+-------+------+------------+------------+-----+-------------+--------------+-----------+
|book_id| title|author_fname|author_lname|pages|released_year|stock_quantity|borrowed_by|
+-------+------+------------+------------+-----+-------------+--------------+-----------+
|      1|Book A|        John|         Doe|  300|         2005|            55|          1|
|      2|Book B|        Jane|       Smith|  250|         2010|            40|          2|
|      3|Book C|       Emily|       Jones|  180|         2015|            20|          3|
|      4|Book D|       Chris|       Brown|  320|         2012|            75|       null|
|      5|Book E|        Anna|       Davis|  270|         2008|            35|          6|
+-------+------+------------+------------+-----+-------------+--------------+-----------+



In [25]:
# book_id=3, stock_quantity=50으로 바꾼다. > 데이터 전처리 과정
from pyspark.sql.functions import *

updated_books_df = books_df_with_user.withColumn(
    "stock_quantity",
    when(books_df_with_user.book_id == 3, 50).otherwise(books_df_with_user.stock_quantity)
)

updated_books_df.show()

+-------+------+------------+------------+-----+-------------+--------------+-----------+
|book_id| title|author_fname|author_lname|pages|released_year|stock_quantity|borrowed_by|
+-------+------+------------+------------+-----+-------------+--------------+-----------+
|      1|Book A|        John|         Doe|  300|         2005|            55|          1|
|      2|Book B|        Jane|       Smith|  250|         2010|            40|          2|
|      3|Book C|       Emily|       Jones|  180|         2015|            50|          3|
|      4|Book D|       Chris|       Brown|  320|         2012|            75|       null|
|      5|Book E|        Anna|       Davis|  270|         2008|            35|          6|
+-------+------+------------+------------+-----+-------------+--------------+-----------+



In [28]:
# stock_quantity 값을 10% 증가
updated_books_df = books_df_with_user.withColumn(
    "stock_quantity",
    (col("stock_quantity") * 1.1).cast("int")  # 10% 증가 후 정수로 변환
)
# 뷰로 등록
updated_books_df.createOrReplaceTempView("updated_books")

# 확인
updated_books_df.show()

+-------+------+------------+------------+-----+-------------+--------------+-----------+
|book_id| title|author_fname|author_lname|pages|released_year|stock_quantity|borrowed_by|
+-------+------+------------+------------+-----+-------------+--------------+-----------+
|      1|Book A|        John|         Doe|  300|         2005|            60|          1|
|      2|Book B|        Jane|       Smith|  250|         2010|            44|          2|
|      3|Book C|       Emily|       Jones|  180|         2015|            22|          3|
|      4|Book D|       Chris|       Brown|  320|         2012|            82|       null|
|      5|Book E|        Anna|       Davis|  270|         2008|            38|          6|
+-------+------+------------+------------+-----+-------------+--------------+-----------+



In [35]:
# 데이터 저장 : overwrite, append, ignore, error
updated_books_df.write.csv("data/output/sqltest_updated_books.csv", header=True, mode="overwrite")

In [36]:
user_df.write.csv("data/output/sqltest_updated_users.csv", header=True, mode="overwrite")

In [38]:
updated_books_df1 = spark.read.csv("data/output/sqltest_updated_books.csv", header = True)
updated_books_df1.show()

+-------+------+------------+------------+-----+-------------+--------------+-----------+
|book_id| title|author_fname|author_lname|pages|released_year|stock_quantity|borrowed_by|
+-------+------+------------+------------+-----+-------------+--------------+-----------+
|      3|Book C|       Emily|       Jones|  180|         2015|            22|          3|
|      4|Book D|       Chris|       Brown|  320|         2012|            82|       null|
|      5|Book E|        Anna|       Davis|  270|         2008|            38|          6|
|      1|Book A|        John|         Doe|  300|         2005|            60|          1|
|      2|Book B|        Jane|       Smith|  250|         2010|            44|          2|
+-------+------+------------+------------+-----+-------------+--------------+-----------+



In [40]:
# 조인 실습
# book_id, title, author_fname, author_lname, username, address
join_query = '''
SELECT book_id, title, author_fname, author_lname, username, address
FROM books b INNER JOIN users u ON b.borrowed_by = u.user_id; 
'''

spark.sql(join_query).show()

                                                                                

+-------+------+------------+------------+--------+-------+
|book_id| title|author_fname|author_lname|username|address|
+-------+------+------------+------------+--------+-------+
|      5|Book E|        Anna|       Davis|       F|   부산|
|      1|Book A|        John|         Doe|       A|   서울|
|      3|Book C|       Emily|       Jones|       C| 경기도|
|      2|Book B|        Jane|       Smith|       B|   대전|
+-------+------+------------+------------+--------+-------+



In [None]:
# Left Join


In [41]:
# 사용자의 책 대여 목록 > 전체 사용자 > 대여한 정보가 있으면 나오고 없으면 NULL
# RIGHT JOIN

join_query = '''
SELECT book_id, title, author_fname, author_lname, username, address
FROM books b RIGHT JOIN users u ON b.borrowed_by = u.user_id; 
'''

spark.sql(join_query).show()

+-------+------+------------+------------+--------+-------+
|book_id| title|author_fname|author_lname|username|address|
+-------+------+------------+------------+--------+-------+
|   null|  null|        null|        null|       G|   대구|
|      5|Book E|        Anna|       Davis|       F|   부산|
|   null|  null|        null|        null|       I|   울산|
|   null|  null|        null|        null|       E|   null|
|      1|Book A|        John|         Doe|       A|   서울|
|   null|  null|        null|        null|       J| 강원도|
|      3|Book C|       Emily|       Jones|       C| 경기도|
|   null|  null|        null|        null|       H|   광주|
|   null|  null|        null|        null|       K| 충청도|
|      2|Book B|        Jane|       Smith|       B|   대전|
|   null|  null|        null|        null|       D|   null|
+-------+------+------------+------------+--------+-------+



In [44]:
# 특정 지역 = 서울에 거주하는 사용자가 대여한 책 목록
query = '''
SELECT book_id, title, author_fname, author_lname, username, address
FROM books b RIGHT JOIN users u ON b.borrowed_by = u.user_id
WHERE u.address = '서울';
'''
spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
*(5) Project [book_id#241L, title#242, author_fname#243, author_lname#244, username#1, address#2]
+- SortMergeJoin [borrowed_by#248L], [user_id#0L], RightOuter
   :- *(2) Sort [borrowed_by#248L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(borrowed_by#248L, 200), ENSURE_REQUIREMENTS, [id=#614]
   :     +- *(1) Project [book_id#241L, title#242, author_fname#243, author_lname#244, borrowed_by#248L]
   :        +- *(1) Filter isnotnull(borrowed_by#248L)
   :           +- *(1) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,pages#245L,released_year#246L,stock_quantity#247L,borrowed_by#248L]
   +- *(4) Sort [user_id#0L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(user_id#0L, 200), ENSURE_REQUIREMENTS, [id=#619]
         +- *(3) Filter (isnotnull(address#2) AND (address#2 = 서울))
            +- *(3) Scan ExistingRDD[user_id#0L,username#1,address#2]


+-------+------+------------+------------+--------+-------+
|book

In [47]:
# 사용자별로 대여한 책의 수(users가 중심이 되어야함)
query = '''
SELECT u.user_id, u.username, COUNT(b.book_id)
FROM users u LEFT JOIN books b ON u.user_id = b.borrowed_by
GROUP BY u.user_id, u.username;
'''

spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
*(5) HashAggregate(keys=[user_id#0L, username#1], functions=[count(book_id#241L)])
+- *(5) HashAggregate(keys=[user_id#0L, username#1], functions=[partial_count(book_id#241L)])
   +- *(5) Project [user_id#0L, username#1, book_id#241L]
      +- SortMergeJoin [user_id#0L], [borrowed_by#248L], LeftOuter
         :- *(2) Sort [user_id#0L ASC NULLS FIRST], false, 0
         :  +- Exchange hashpartitioning(user_id#0L, 200), ENSURE_REQUIREMENTS, [id=#724]
         :     +- *(1) Project [user_id#0L, username#1]
         :        +- *(1) Scan ExistingRDD[user_id#0L,username#1,address#2]
         +- *(4) Sort [borrowed_by#248L ASC NULLS FIRST], false, 0
            +- Exchange hashpartitioning(borrowed_by#248L, 200), ENSURE_REQUIREMENTS, [id=#729]
               +- *(3) Project [book_id#241L, borrowed_by#248L]
                  +- *(3) Filter isnotnull(borrowed_by#248L)
                     +- *(3) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,page

                                                                                

+-------+--------+--------------+
|user_id|username|count(book_id)|
+-------+--------+--------------+
|      7|       G|             0|
|      6|       F|             1|
|      9|       I|             0|
|      5|       E|             0|
|      1|       A|             1|
|     10|       J|             0|
|      3|       C|             1|
|      8|       H|             0|
|     11|       K|             0|
|      2|       B|             1|
|      4|       D|             0|
+-------+--------+--------------+



In [49]:
# 300페이지 이상이면 long, 아니면 short으로 하는 컬럼 만들기
query = '''
SELECT book_id, pages, title, CASE WHEN pages>=300 THEN 'Long' ELSE 'Short' END AS page_category
FROM books;
'''

spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
*(1) Project [book_id#241L, pages#245L, title#242, CASE WHEN (pages#245L >= 300) THEN Long ELSE Short END AS page_category#820]
+- *(1) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,pages#245L,released_year#246L,stock_quantity#247L,borrowed_by#248L]


+-------+-----+------+-------------+
|book_id|pages| title|page_category|
+-------+-----+------+-------------+
|      1|  300|Book A|         Long|
|      2|  250|Book B|        Short|
|      3|  180|Book C|        Short|
|      4|  320|Book D|         Long|
|      5|  270|Book E|        Short|
+-------+-----+------+-------------+



In [50]:
# Stock_quantity > 50이면 충분, 30이상이면 보통, 아니면 부족
query = '''
SELECT book_id, stock_quantity, title, 
       CASE 
           WHEN stock_quantity >= 50 THEN '충분'
           WHEN stock_quantity >= 30 THEN '보통'
           ELSE '부족'
       END AS stock_status
FROM books;
'''

spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
*(1) Project [book_id#241L, stock_quantity#247L, title#242, CASE WHEN (stock_quantity#247L >= 50) THEN 충분 WHEN (stock_quantity#247L >= 30) THEN 보통 ELSE 부족 END AS stock_status#847]
+- *(1) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,pages#245L,released_year#246L,stock_quantity#247L,borrowed_by#248L]


+-------+--------------+------+------------+
|book_id|stock_quantity| title|stock_status|
+-------+--------------+------+------------+
|      1|            55|Book A|        충분|
|      2|            40|Book B|        보통|
|      3|            20|Book C|        부족|
|      4|            75|Book D|        충분|
|      5|            35|Book E|        보통|
+-------+--------------+------+------------+



In [52]:
# 작가별 대여된 책의 수
query = '''
SELECT author_fname, author_lname, COUNT(borrowed_by) AS borrow_count
FROM books
WHERE borrowed_by IS NOT NULL
GROUP BY author_fname, author_lname;
'''

spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
*(2) HashAggregate(keys=[author_fname#243, author_lname#244], functions=[count(borrowed_by#248L)])
+- Exchange hashpartitioning(author_fname#243, author_lname#244, 200), ENSURE_REQUIREMENTS, [id=#950]
   +- *(1) HashAggregate(keys=[author_fname#243, author_lname#244], functions=[partial_count(borrowed_by#248L)])
      +- *(1) Project [author_fname#243, author_lname#244, borrowed_by#248L]
         +- *(1) Filter isnotnull(borrowed_by#248L)
            +- *(1) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,pages#245L,released_year#246L,stock_quantity#247L,borrowed_by#248L]


+------------+------------+------------+
|author_fname|author_lname|borrow_count|
+------------+------------+------------+
|        Anna|       Davis|           1|
|       Emily|       Jones|           1|
|        John|         Doe|           1|
|        Jane|       Smith|           1|
+------------+------------+------------+



In [53]:
# 책의 발행 연도별 대여 현황
query = '''
SELECT released_year, COUNT(borrowed_by) AS borrow_count
FROM books
GROUP BY released_year;
'''

spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
*(2) HashAggregate(keys=[released_year#246L], functions=[count(borrowed_by#248L)])
+- Exchange hashpartitioning(released_year#246L, 200), ENSURE_REQUIREMENTS, [id=#1001]
   +- *(1) HashAggregate(keys=[released_year#246L], functions=[partial_count(borrowed_by#248L)])
      +- *(1) Project [released_year#246L, borrowed_by#248L]
         +- *(1) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,pages#245L,released_year#246L,stock_quantity#247L,borrowed_by#248L]


+-------------+------------+
|released_year|borrow_count|
+-------------+------------+
|         2012|           0|
|         2010|           1|
|         2005|           1|
|         2008|           1|
|         2015|           1|
+-------------+------------+



In [55]:
# 대여되지 않은 책 중 페이지 수가 가장 많은 책
query = '''
SELECT book_id, title, pages
FROM books
WHERE borrowed_by IS NULL
ORDER BY pages DESC
LIMIT 1;
'''

spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
TakeOrderedAndProject(limit=1, orderBy=[pages#245L DESC NULLS LAST], output=[book_id#241L,title#242,pages#245L])
+- *(1) Project [book_id#241L, title#242, pages#245L]
   +- *(1) Filter isnull(borrowed_by#248L)
      +- *(1) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,pages#245L,released_year#246L,stock_quantity#247L,borrowed_by#248L]


+-------+------+-----+
|book_id| title|pages|
+-------+------+-----+
|      4|Book D|  320|
+-------+------+-----+



In [58]:
# 사용자의 지역별 대여된 책 수
query = '''
SELECT book_id, title, stock_quantity, 
       CASE 
           WHEN stock_quantity < 30 THEN '부족'
           ELSE '충분'
       END AS stock_status,
       
FROM books
WHERE stock_quantity < 30;
'''

spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
*(6) HashAggregate(keys=[address#2], functions=[count(borrowed_by#248L)])
+- Exchange hashpartitioning(address#2, 200), ENSURE_REQUIREMENTS, [id=#1176]
   +- *(5) HashAggregate(keys=[address#2], functions=[partial_count(borrowed_by#248L)])
      +- *(5) Project [address#2, borrowed_by#248L]
         +- *(5) SortMergeJoin [user_id#0L], [borrowed_by#248L], Inner
            :- *(2) Sort [user_id#0L ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(user_id#0L, 200), ENSURE_REQUIREMENTS, [id=#1161]
            :     +- *(1) Project [user_id#0L, address#2]
            :        +- *(1) Filter isnotnull(user_id#0L)
            :           +- *(1) Scan ExistingRDD[user_id#0L,username#1,address#2]
            +- *(4) Sort [borrowed_by#248L ASC NULLS FIRST], false, 0
               +- Exchange hashpartitioning(borrowed_by#248L, 200), ENSURE_REQUIREMENTS, [id=#1167]
                  +- *(3) Project [borrowed_by#248L]
                     +- *(3) Filter is

                                                                                

+-------+------------+
|address|borrow_count|
+-------+------------+
|   대전|           1|
| 경기도|           1|
|   부산|           1|
|   서울|           1|
+-------+------------+



In [59]:
# 재고가 부족한 책과 대여 상태 확인
query = '''
SELECT book_id, title, stock_quantity, 
       CASE 
           WHEN stock_quantity < 30 THEN '부족'
           ELSE '충분'
       END AS stock_status
FROM books
WHERE stock_quantity < 30 ; -- stock_quatity를 바로 조건식으로 걸 수는 없음. 실행 순서로 인해
'''

spark.sql(query).explain()
spark.sql(query).show()

== Physical Plan ==
*(1) Project [book_id#241L, title#242, stock_quantity#247L, CASE WHEN (stock_quantity#247L < 30) THEN 부족 ELSE 충분 END AS stock_status#1010]
+- *(1) Filter (isnotnull(stock_quantity#247L) AND (stock_quantity#247L < 30))
   +- *(1) Scan ExistingRDD[book_id#241L,title#242,author_fname#243,author_lname#244,pages#245L,released_year#246L,stock_quantity#247L,borrowed_by#248L]


+-------+------+--------------+------------+
|book_id| title|stock_quantity|stock_status|
+-------+------+--------------+------------+
|      3|Book C|            20|        부족|
+-------+------+--------------+------------+



In [60]:
spark.stop()