## 데이터프레임 생성

In [1]:
from pyspark.sql.types import StringType, IntegerType

### 리스트를 이용해서 데이터프레임 만들기
* createDataFrame(RDD객체, 타입)

In [2]:
fruits = ['apple','peach','banana','mango','mincho']
words = sc.parallelize(fruits)
words.collect()   # RDD

['apple', 'peach', 'banana', 'mango', 'mincho']

In [3]:
df = spark.createDataFrame(words, StringType())
df.show()

+------+
| value|
+------+
| apple|
| peach|
|banana|
| mango|
|mincho|
+------+



### 리스트로 데이터프레임 객체 만들기
* (과일명, 가격)

In [4]:
data = [('apple', 1500), ('peach', 2000), ('banana', 1500), ('mango', 2500), ('mincho', 3000)]

In [5]:
fruits = spark.createDataFrame(data)
fruits.collect()

[Row(_1='apple', _2=1500),
 Row(_1='peach', _2=2000),
 Row(_1='banana', _2=1500),
 Row(_1='mango', _2=2500),
 Row(_1='mincho', _2=3000)]

In [6]:
# 컬럼명 지정하면서 데이터프레임 생성
fruits = spark.createDataFrame(data, ['fruits','price'])
fruits.collect()

[Row(fruits='apple', price=1500),
 Row(fruits='peach', price=2000),
 Row(fruits='banana', price=1500),
 Row(fruits='mango', price=2500),
 Row(fruits='mincho', price=3000)]

In [7]:
# 컬럼명, 데이터타입 지정하면서 데이터프레임 생성
fruits = spark.createDataFrame(data, "fruits:string, price:int")
fruits.collect()

[Row(fruits='apple', price=1500),
 Row(fruits='peach', price=2000),
 Row(fruits='banana', price=1500),
 Row(fruits='mango', price=2500),
 Row(fruits='mincho', price=3000)]

In [8]:
# 특정 컬럼 출력 : select
fruits.select('fruits').collect()

[Row(fruits='apple'),
 Row(fruits='peach'),
 Row(fruits='banana'),
 Row(fruits='mango'),
 Row(fruits='mincho')]

In [9]:
fruits.show()

+------+-----+
|fruits|price|
+------+-----+
| apple| 1500|
| peach| 2000|
|banana| 1500|
| mango| 2500|
|mincho| 3000|
+------+-----+



### 스파크세션을 이용한 고급 데이터프레임 생성

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

In [11]:
# 데이터프레임 스키마 정의 - employees
spark = SparkSession.builder.appName('emp').getOrCreate()

In [12]:
# 데이터프레임 생성전 스키마 정의
# add(컬럼명, 데이터타입)
emp_schema = StructType().add('empno', 'integer').add('fname', 'string')\
.add('lname', 'string').add('hdate', 'string').add('sal', 'integer')\
.add('deptid', 'integer')

In [13]:
# 지정한 스키마를 이용해서 데이터프레임 생성
# 데이터프레임의 각 행은 set 객체로 정의
# 스키마는 schema 속성으로 지정
emp = [(123,'steve','king','2003-06-17',35000,None),
   (456,'john','seo','2005-12-15',20000,50),
   (789,'david',None,'2004-03-01',22000,90)]
df = spark.createDataFrame(emp, schema=emp_schema)
df.show()

+-----+-----+-----+----------+-----+------+
|empno|fname|lname|     hdate|  sal|deptid|
+-----+-----+-----+----------+-----+------+
|  123|steve| king|2003-06-17|35000|  null|
|  456| john|  seo|2005-12-15|20000|    50|
|  789|david| null|2004-03-01|22000|    90|
+-----+-----+-----+----------+-----+------+



In [14]:
# 데이터프레임 스키마 확인
df.printSchema()

root
 |-- empno: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- hdate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- deptid: integer (nullable = true)



### csv 파일을 이용해서 데이터프레임 만들기
+ read.csv(경로, 헤더여부, 스키마여부)

In [3]:
emp = spark.read.csv('data/employees.csv',header=True, inferSchema=True)
emp.printSchema()

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: double (nullable = true)
 |-- COMMISSION_PCT: double (nullable = true)
 |-- MANAGER_ID: integer (nullable = true)
 |-- DEPARTMENT_ID: integer (nullable = true)



In [4]:
emp.count()

107

In [5]:
# 데이터프레임 컬럼목록 출력
emp.columns

['EMPLOYEE_ID',
 'FIRST_NAME',
 'LAST_NAME',
 'EMAIL',
 'PHONE_NUMBER',
 'HIRE_DATE',
 'JOB_ID',
 'SALARY',
 'COMMISSION_PCT',
 'MANAGER_ID',
 'DEPARTMENT_ID']

In [6]:
# 지정한 수만큼 행 출력
emp.show(5)

+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER| HIRE_DATE| JOB_ID| SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|2003-06-17|AD_PRES|24000.0|          null|      null|           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|2005-09-21|  AD_VP|17000.0|          null|       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|2001-01-13|  AD_VP|17000.0|          null|       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|2006-01-03|IT_PROG| 9000.0|          null|       102|           60|
|        104|     Bruce|    Ernst|  BERNST|590.423.4568|2007-05-21|IT_PROG| 6000.0|          null|       103|           60|
+-------

#### 데이터 요약
* summary(통계항목)

In [7]:
emp.summary().show()

+-------+----------------+----------+---------+-------+------------------+----------+----------+------------------+-------------------+------------------+------------------+
|summary|     EMPLOYEE_ID|FIRST_NAME|LAST_NAME|  EMAIL|      PHONE_NUMBER| HIRE_DATE|    JOB_ID|            SALARY|     COMMISSION_PCT|        MANAGER_ID|     DEPARTMENT_ID|
+-------+----------------+----------+---------+-------+------------------+----------+----------+------------------+-------------------+------------------+------------------+
|  count|             107|       107|      107|    107|               107|       107|       107|               107|                 35|               106|               106|
|   mean|           153.0|      null|     null|   null|              null|      null|      null|6461.8317757009345|0.22285714285714286|124.76415094339623| 63.20754716981132|
| stddev|31.0322412983658|      null|     null|   null|              null|      null|      null|3909.5797305524825|0.0851839334675

In [8]:
emp.select('SALARY','COMMISSION_PCT').summary().show()

+-------+------------------+-------------------+
|summary|            SALARY|     COMMISSION_PCT|
+-------+------------------+-------------------+
|  count|               107|                 35|
|   mean|6461.8317757009345|0.22285714285714286|
| stddev|3909.5797305524825|0.08518393346757594|
|    min|            2100.0|                0.1|
|    25%|            3100.0|               0.15|
|    50%|            6200.0|                0.2|
|    75%|            9000.0|                0.3|
|    max|           24000.0|                0.4|
+-------+------------------+-------------------+



In [9]:
emp.select('SALARY','COMMISSION_PCT')\
    .summary('mean','stddev','min','max').show()

+-------+------------------+-------------------+
|summary|            SALARY|     COMMISSION_PCT|
+-------+------------------+-------------------+
|   mean|6461.8317757009345|0.22285714285714286|
| stddev|3909.5797305524825|0.08518393346757594|
|    min|            2100.0|                0.1|
|    max|           24000.0|                0.4|
+-------+------------------+-------------------+



### 데이터프레임 데이터 탐색
+ select
+ where
+ orderBy
+ groupBy

In [10]:
# 모든 사원의 이름 조회
emp.select(['FIRST_NAME','LAST_NAME']).show(5)

+----------+---------+
|FIRST_NAME|LAST_NAME|
+----------+---------+
|    Steven|     King|
|     Neena|  Kochhar|
|       Lex|  De Haan|
| Alexander|   Hunold|
|     Bruce|    Ernst|
+----------+---------+
only showing top 5 rows



In [11]:
# 급여가 7000 이상인 사원 조회
# 컬럼 지정 : 객체명[컬럼명]
emp.where(emp['SALARY'] >= 7000).show(5)

+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER| HIRE_DATE| JOB_ID| SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+----------+-------+-------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|2003-06-17|AD_PRES|24000.0|          null|      null|           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|2005-09-21|  AD_VP|17000.0|          null|       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|2001-01-13|  AD_VP|17000.0|          null|       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|2006-01-03|IT_PROG| 9000.0|          null|       102|           60|
|        108|     Nancy|Greenberg|NGREENBE|515.124.4569|2002-08-17| FI_MGR|12008.0|          null|       101|          100|
+-------

In [22]:
# 급여가 7000 이상인 사원의 수 조회
emp.filter(emp['SALARY'] >= 7000).count()

47

In [14]:
# 2006-02-05부터 2006-11-15사이에 고용된 사원 조회
emp.where((emp['HIRE_DATE'] >= '2006-02-05') &
           (emp['HIRE_DATE'] >= '2006-02-05')).show(5)

+-----------+-----------+---------+--------+------------+----------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID| FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER| HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+-----------+---------+--------+------------+----------+----------+------+--------------+----------+-------------+
|        104|      Bruce|    Ernst|  BERNST|590.423.4568|2007-05-21|   IT_PROG|6000.0|          null|       103|           60|
|        106|      Valli|Pataballa|VPATABAL|590.423.4560|2006-02-05|   IT_PROG|4800.0|          null|       103|           60|
|        107|      Diana|  Lorentz|DLORENTZ|590.423.5567|2007-02-07|   IT_PROG|4200.0|          null|       103|           60|
|        112|Jose Manuel|    Urman| JMURMAN|515.124.4469|2006-03-07|FI_ACCOUNT|7800.0|          null|       108|          100|
|        113|       Luis|     Popp|   LPOPP|515.124.4567|2007-12-07|FI_ACCOUNT|6900.0|          null|       108

In [15]:
# 부서번호별 사원수 조회
emp.groupBy('DEPARTMENT_ID').count().show(5)

+-------------+-----+
|DEPARTMENT_ID|count|
+-------------+-----+
|         null|    1|
|           20|    2|
|           40|    1|
|          100|    6|
|           10|    1|
+-------------+-----+
only showing top 5 rows



In [16]:
# 직책별 사원수 조회
emp.groupBy('JOB_ID').count().show(5)

+----------+-----+
|    JOB_ID|count|
+----------+-----+
|FI_ACCOUNT|    5|
|    MK_MAN|    1|
|   IT_PROG|    5|
|    FI_MGR|    1|
|AC_ACCOUNT|    1|
+----------+-----+
only showing top 5 rows



In [17]:
# 부서번호별 사원수 조회후 부서번호 순으로 정렬
emp.groupBy('DEPARTMENT_ID').count()\
    .orderBy('DEPARTMENT_ID').show(5)

+-------------+-----+
|DEPARTMENT_ID|count|
+-------------+-----+
|         null|    1|
|           10|    1|
|           20|    2|
|           30|    6|
|           40|    1|
+-------------+-----+
only showing top 5 rows



In [19]:
# 직책별 사원수 조회후 직책 순으로 정렬
emp.groupBy('JOB_ID').count()\
    .orderBy('JOB_ID').show(5)

+----------+-----+
|    JOB_ID|count|
+----------+-----+
|AC_ACCOUNT|    1|
|    AC_MGR|    1|
|   AD_ASST|    1|
|   AD_PRES|    1|
|     AD_VP|    2|
+----------+-----+
only showing top 5 rows



In [21]:
# 직책별 사원수 조회후 사원수 내림차순 정렬
emp.groupBy('JOB_ID').count()\
    .orderBy('count', ascending=False).show(5)

+--------+-----+
|  JOB_ID|count|
+--------+-----+
|  SA_REP|   30|
|ST_CLERK|   20|
|SH_CLERK|   20|
|  ST_MAN|    5|
|  SA_MAN|    5|
+--------+-----+
only showing top 5 rows



#### 집계함수 사용
+ agg(집계함수명)

In [23]:
import pyspark.sql.functions as F

In [27]:
# 직책별 평균 급여를 조회해서 내림차순으로 정렬
# 단, '평균급여' 컬럼의 이름에 함수명이 포함되어 출력
emp.groupBy('JOB_ID').agg(F.avg('SALARY')).orderBy('avg(SALARY)', ascending=False).show(5)

+-------+-----------+
| JOB_ID|avg(SALARY)|
+-------+-----------+
|AD_PRES|    24000.0|
|  AD_VP|    17000.0|
| MK_MAN|    13000.0|
| SA_MAN|    12200.0|
| AC_MGR|    12008.0|
+-------+-----------+
only showing top 5 rows



In [28]:
# 직책별 평균 급여를 조회해서 내림차순으로 정렬 (별칭부여:alias)
emp.groupBy('JOB_ID').agg(F.avg('SALARY').alias('mean sal')).orderBy('mean sal', ascending=False).show(5)

+-------+--------+
| JOB_ID|mean sal|
+-------+--------+
|AD_PRES| 24000.0|
|  AD_VP| 17000.0|
| MK_MAN| 13000.0|
| SA_MAN| 12200.0|
| AC_MGR| 12008.0|
+-------+--------+
only showing top 5 rows



In [31]:
# 사원들의 직책을 모두 출력하세요
# 단, 중복없이 하나씩만 표시되도록 합니다 - distinct()
emp.select('JOB_ID').distinct().show(5)

+----------+
|    JOB_ID|
+----------+
|FI_ACCOUNT|
|    MK_MAN|
|   IT_PROG|
|    FI_MGR|
|AC_ACCOUNT|
+----------+
only showing top 5 rows



In [44]:
# 모든 직책 수는? (중복제외하고 카운팅)
emp.select('JOB_ID').distinct().count()

19

In [35]:
emp.select(F.countDistinct('JOB_ID').alias('JOB_ID')).show()

+------+
|JOB_ID|
+------+
|    19|
+------+



In [50]:
# 사원의 이름, 직책, 급여 출력하세요
# 단, 5% 인상한 급여도 같이 출력합니다
emp.select('FIRST_NAME', 'JOB_ID', F.round(emp.SALARY).alias('SALARY'), (emp.SALARY * 1.05).alias('105% sal')).show(5)

+----------+-------+-------+--------+
|FIRST_NAME| JOB_ID| SALARY|105% sal|
+----------+-------+-------+--------+
|    Steven|AD_PRES|24000.0| 25200.0|
|     Neena|  AD_VP|17000.0| 17850.0|
|       Lex|  AD_VP|17000.0| 17850.0|
| Alexander|IT_PROG| 9000.0|  9450.0|
|     Bruce|IT_PROG| 6000.0|  6300.0|
+----------+-------+-------+--------+
only showing top 5 rows



In [52]:
emp.select('FIRST_NAME', 'JOB_ID', emp.SALARY.cast('int'), (emp.SALARY * 1.05).cast('int').alias('105% sal')).show(5)

+----------+-------+------+--------+
|FIRST_NAME| JOB_ID|SALARY|105% sal|
+----------+-------+------+--------+
|    Steven|AD_PRES| 24000|   25200|
|     Neena|  AD_VP| 17000|   17850|
|       Lex|  AD_VP| 17000|   17850|
| Alexander|IT_PROG|  9000|    9450|
|     Bruce|IT_PROG|  6000|    6300|
+----------+-------+------+--------+
only showing top 5 rows



In [65]:
# 20번 또는 50번 부서에 근무하며, 
# 급여가 5000 ~ 12,000 사이인 사원들의 
# LAST_NAME 및 급여를 조회하세요
emp.where((emp.DEPARTMENT_ID == 20) | (emp.DEPARTMENT_ID == 50)) \
    .where((emp.SALARY >= 5000) & (emp.SALARY <= 12000))\
    .select('LAST_NAME', 'SALARY').show()

+---------+------+
|LAST_NAME|SALARY|
+---------+------+
|    Weiss|8000.0|
|    Fripp|8200.0|
| Kaufling|7900.0|
|  Vollman|6500.0|
|  Mourgos|5800.0|
|      Fay|6000.0|
+---------+------+



In [70]:
emp.where(emp.DEPARTMENT_ID.isin(20, 50))\
    .where(emp.SALARY.between(5000, 12000))\
    .select('LAST_NAME', 'DEPARTMENT_ID', 'SALARY').show()

+---------+-------------+------+
|LAST_NAME|DEPARTMENT_ID|SALARY|
+---------+-------------+------+
|    Weiss|           50|8000.0|
|    Fripp|           50|8200.0|
| Kaufling|           50|7900.0|
|  Vollman|           50|6500.0|
|  Mourgos|           50|5800.0|
|      Fay|           20|6000.0|
+---------+-------------+------+

