### SparkSQL
+ 스파크 데이터프레임에 저장된 데이터들을 SQL 문법을 이용해서
탐색할 수 있도록 해 줌
+ spark.sql() 함수 사용
+ OLTP 보다는 OLAP 처리에 적합

In [1]:
# Create (or reuse) the SparkSession used by the Spark SQL examples below.
from pyspark.sql import SparkSession

# 'local[*]' runs Spark locally with one worker thread per logical core.
# The previous value 'app' is not a master URL that Spark can parse.
spark = SparkSession.builder.master('local[*]').appName('sparkSQL').getOrCreate()

In [2]:
# Load the employee and department CSV files as DataFrames, treating the first
# row as the header and letting Spark infer the column types.
emp = spark.read.options(header=True, inferSchema=True).csv('data/employees.csv')
dept = spark.read.options(header=True, inferSchema=True).csv('data/departments.csv')

In [3]:
# Register each DataFrame as a temporary view so it can be queried with SQL:
#   dataframe.createOrReplaceTempView(view_name)
# The call returns None, so its result is deliberately not assigned to a name.
emp.createOrReplaceTempView('EMP')
dept.createOrReplaceTempView('DEPT')

In [5]:
# Preview the first five rows of the EMP view.
sql = "select * from EMP"
spark.sql(sql).show(n=5)

+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER| HIRE_DATE| JOB_ID| SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|2003-06-17|AD_PRES|24000.0|          null|      null|           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|2005-09-21|  AD_VP|17000.0|          null|       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|2001-01-13|  AD_VP|17000.0|          null|       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|2006-01-03|IT_PROG| 9000.0|          null|       102|           60|
|        104|     Bruce|    Ernst|  BERNST|590.423.4568|2007-05-21|IT_PROG| 6000.0|          null|       103|           60|
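+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
only showing top 5 rows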

In [6]:
# Preview the first five rows of the DEPT view.
sql = "select * from DEPT"
spark.sql(sql).show(n=5)

+-------------+---------------+----------+-----------+
|DEPARTMENT_ID|DEPARTMENT_NAME|MANAGER_ID|LOCATION_ID|
+-------------+---------------+----------+-----------+
|           10| Administration|       200|       1700|
|           20|      Marketing|       201|       1800|
|           30|     Purchasing|       114|       1700|
|           40|Human Resources|       203|       2400|
|           50|       Shipping|       121|       1500|
+-------------+---------------+----------+-----------+
only showing top 5 rows



In [30]:
# Employees hired from 2006-02-05 through 2006-11-15, ordered by hire date.
# A plain triple-quoted string replaces the backslash continuations so that
# keyword separation no longer depends on the next line's indentation.
sql = '''
    select FIRST_NAME, LAST_NAME, HIRE_DATE from EMP
    where HIRE_DATE between '2006-02-05' and '2006-11-15'
    order by HIRE_DATE
'''
spark.sql(sql).show()

+-----------+-----------+----------+
| FIRST_NAME|  LAST_NAME| HIRE_DATE|
+-----------+-----------+----------+
|      Valli|  Pataballa|2006-02-05|
|       John|        Seo|2006-02-12|
|       Jean|     Fleaur|2006-02-23|
|Jose Manuel|      Urman|2006-03-07|
|    Randall|      Matos|2006-03-15|
|   Harrison|      Bloom|2006-03-23|
|   Jonathon|     Taylor|2006-03-24|
|Christopher|      Olsen|2006-03-30|
|     Joshua|      Patel|2006-04-06|
|       Jack| Livingston|2006-04-23|
|      Alana|      Walsh|2006-04-24|
|      Kevin|     Feeney|2006-05-23|
|      Julia|  Dellinger|2006-06-24|
|     Samuel|     McCain|2006-07-01|
|      Peter|     Vargas|2006-07-09|
|    Timothy|      Gates|2006-07-11|
|    Michael|     Rogers|2006-08-26|
|      Irene|Mikkilineni|2006-09-28|
|     Sarath|     Sewall|2006-11-03|
|        Guy|     Himuro|2006-11-15|
+-----------+-----------+----------+



In [32]:
# Number of employees per job, sorted by headcount in descending order.
# A plain triple-quoted string replaces the backslash continuation, which only
# worked because the continued line happened to start with spaces.
sql = '''
    select JOB_ID, count(*) cnt from EMP
    group by JOB_ID order by cnt DESC
'''
spark.sql(sql).show()

+----------+---+
|    JOB_ID|cnt|
+----------+---+
|    SA_REP| 30|
|  SH_CLERK| 20|
|  ST_CLERK| 20|
|FI_ACCOUNT|  5|
|  PU_CLERK|  5|
|    ST_MAN|  5|
|   IT_PROG|  5|
|    SA_MAN|  5|
|     AD_VP|  2|
|    MK_MAN|  1|
|    FI_MGR|  1|
|AC_ACCOUNT|  1|
|    MK_REP|  1|
|    HR_REP|  1|
|    AC_MGR|  1|
|    PU_MAN|  1|
|   AD_PRES|  1|
|    PR_REP|  1|
|   AD_ASST|  1|
+----------+---+



In [33]:
# Employees who work in department 20 or 50 and earn between 5000 and 12000:
# show their LAST_NAME and SALARY.
# A plain triple-quoted string replaces the backslash continuations so that
# keyword separation no longer depends on the next line's indentation.
sql = '''
    select LAST_NAME, SALARY from EMP
    where DEPARTMENT_ID in (20, 50) and
    SALARY between 5000 and 12000
'''
spark.sql(sql).show()

+---------+------+
|LAST_NAME|SALARY|
+---------+------+
|    Weiss|8000.0|
|    Fripp|8200.0|
| Kaufling|7900.0|
|  Vollman|6500.0|
|  Mourgos|5800.0|
|      Fay|6000.0|
+---------+------+



In [34]:
## Medal counts per country at the Summer Olympics
# Keep `summer` bound to the DataFrame: createOrReplaceTempView returns None,
# so assigning its result back to `summer` would discard the loaded data.
summer = spark.read.csv('data/summermedals.csv', header=True, inferSchema=True)
summer.createOrReplaceTempView('summer')

In [37]:
# All medals combined (gold, silver and bronze): medal count per country,
# largest first; only the top ten rows are displayed.
sql = """
    select Country, count(Medal) medals from SUMMER
    group by Country order by medals desc
"""
spark.sql(sql).show(n=10)

+-------+------+
|Country|medals|
+-------+------+
|    USA|  4585|
|    URS|  2049|
|    GBR|  1720|
|    FRA|  1396|
|    GER|  1305|
|    ITA|  1296|
|    AUS|  1189|
|    HUN|  1079|
|    SWE|  1044|
|    NED|   851|
+-------+------+
only showing top 10 rows



In [46]:
# Split by medal type: medal count per (country, medal) pair, largest first;
# only the top ten rows are displayed.
sql = """
    select Country, Medal, count(Medal) medals from SUMMER
    group by Country, Medal order by medals desc
"""
spark.sql(sql).show(n=10)

+-------+------+------+
|Country| Medal|medals|
+-------+------+------+
|    USA|  Gold|  2235|
|    USA|Silver|  1252|
|    USA|Bronze|  1098|
|    URS|  Gold|   838|
|    URS|Silver|   627|
|    GBR|Silver|   621|
|    URS|Bronze|   584|
|    GBR|Bronze|   553|
|    GBR|  Gold|   546|
|    FRA|Bronze|   497|
+-------+------+------+
only showing top 10 rows



In [38]:
## Titanic passengers: survival counts by embarkation port and sex
titanic = spark.read.csv('data/titanic.csv', header=True, inferSchema=True)
# createOrReplaceTempView returns None, so its result is not assigned to a name.
titanic.createOrReplaceTempView('TITANIC')

In [43]:
# Passenger count per (embarked, sex, survived) combination, skipping rows
# with no embarkation port and ordering the result by port.
sql = """
    select embarked, sex, survived, count(survived) cnt from TITANIC
    where embarked is not null
    group by embarked, sex, survived order by embarked
"""
spark.sql(sql).show(n=15)

+--------+------+--------+---+
|embarked|   sex|survived|cnt|
+--------+------+--------+---+
|       C|female|       1|102|
|       C|female|       0| 11|
|       C|  male|       1| 48|
|       C|  male|       0|109|
|       Q|  male|       1|  7|
|       Q|female|       0| 23|
|       Q|  male|       0| 56|
|       Q|female|       1| 37|
|       S|female|       0| 93|
|       S|  male|       1|106|
|       S|  male|       0|517|
|       S|female|       1|198|
+--------+------+--------+---+



In [45]:
# Same breakdown restricted to survivors (survived == 1).
sql = """
    select embarked, sex, survived, count(survived) cnt from TITANIC
    where embarked is not null and survived == 1
    group by embarked, sex, survived order by embarked
"""
spark.sql(sql).show(n=15)

+--------+------+--------+---+
|embarked|   sex|survived|cnt|
+--------+------+--------+---+
|       C|female|       1|102|
|       C|  male|       1| 48|
|       Q|female|       1| 37|
|       Q|  male|       1|  7|
|       S|  male|       1|106|
|       S|female|       1|198|
+--------+------+--------+---+



In [None]:
## TODO: restaurant categories (original note reads "restaurant type reservation"; intent unclear — confirm)

In [None]:
## TODO: how many kinds of coffee shops are there?

In [None]:
## TODO: how many Starbucks stores are there per district (gu)?