In [3]:
from pyspark.sql import  SQLContext, Row
from pyspark import SparkContext,SparkConf
import pandas as pd
from pyspark.sql.functions import regexp_extract, regexp_replace, when,udf,col,count,sum,avg,round
import pyspark.sql.functions as F

In [4]:
sc = SparkContext('local')
sqlCtx = SQLContext( sc )

In [5]:
df = sqlCtx.read.csv('../data/airline-passengers.csv', 
                     header = True, inferSchema = True)
df.toPandas() 

Unnamed: 0,Month,Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


In [6]:
df.printSchema()

root
 |-- Month: string (nullable = true)
 |-- Passengers: integer (nullable = true)



### http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html
### 포맷 확인!
#### 년 y
#### 달 M
#### 일 d

In [7]:
df1 = df.withColumn('parsed', F.to_timestamp('Month', 'yyyy-MM'))
df1.show()

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1949-01|       112|1949-01-01 00:00:00|
|1949-02|       118|1949-02-01 00:00:00|
|1949-03|       132|1949-03-01 00:00:00|
|1949-04|       129|1949-04-01 00:00:00|
|1949-05|       121|1949-05-01 00:00:00|
|1949-06|       135|1949-06-01 00:00:00|
|1949-07|       148|1949-07-01 00:00:00|
|1949-08|       148|1949-08-01 00:00:00|
|1949-09|       136|1949-09-01 00:00:00|
|1949-10|       119|1949-10-01 00:00:00|
|1949-11|       104|1949-11-01 00:00:00|
|1949-12|       118|1949-12-01 00:00:00|
|1950-01|       115|1950-01-01 00:00:00|
|1950-02|       126|1950-02-01 00:00:00|
|1950-03|       141|1950-03-01 00:00:00|
|1950-04|       135|1950-04-01 01:00:00|
|1950-05|       125|1950-05-01 00:00:00|
|1950-06|       149|1950-06-01 00:00:00|
|1950-07|       170|1950-07-01 00:00:00|
|1950-08|       170|1950-08-01 00:00:00|
+-------+----------+-------------------+
only showing top

# timestamp형이어야 시게열로 계산 가능!

In [8]:
# timestamp형이어야 시게열로 계산 가능!
df1.printSchema()

root
 |-- Month: string (nullable = true)
 |-- Passengers: integer (nullable = true)
 |-- parsed: timestamp (nullable = true)



In [14]:
# timestamp형일 경우, 이렇게 쓸 수 있음
# 스파크는 슬라이싱이 없기 때문에 where 조건절을 써서 추출

# df1.where(df1['parsed']>='1950').show()
df1.where("parsed>='1950'").show()

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1950-01|       115|1950-01-01 00:00:00|
|1950-02|       126|1950-02-01 00:00:00|
|1950-03|       141|1950-03-01 00:00:00|
|1950-04|       135|1950-04-01 01:00:00|
|1950-05|       125|1950-05-01 00:00:00|
|1950-06|       149|1950-06-01 00:00:00|
|1950-07|       170|1950-07-01 00:00:00|
|1950-08|       170|1950-08-01 00:00:00|
|1950-09|       158|1950-09-01 00:00:00|
|1950-10|       133|1950-10-01 00:00:00|
|1950-11|       114|1950-11-01 00:00:00|
|1950-12|       140|1950-12-01 00:00:00|
|1951-01|       145|1951-01-01 00:00:00|
|1951-02|       150|1951-02-01 00:00:00|
|1951-03|       178|1951-03-01 00:00:00|
|1951-04|       163|1951-04-01 00:00:00|
|1951-05|       172|1951-05-01 00:00:00|
|1951-06|       178|1951-06-01 00:00:00|
|1951-07|       199|1951-07-01 00:00:00|
|1951-08|       199|1951-08-01 00:00:00|
+-------+----------+-------------------+
only showing top

In [15]:
df1.where("parsed>='1950-03'").show()

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1950-03|       141|1950-03-01 00:00:00|
|1950-04|       135|1950-04-01 01:00:00|
|1950-05|       125|1950-05-01 00:00:00|
|1950-06|       149|1950-06-01 00:00:00|
|1950-07|       170|1950-07-01 00:00:00|
|1950-08|       170|1950-08-01 00:00:00|
|1950-09|       158|1950-09-01 00:00:00|
|1950-10|       133|1950-10-01 00:00:00|
|1950-11|       114|1950-11-01 00:00:00|
|1950-12|       140|1950-12-01 00:00:00|
|1951-01|       145|1951-01-01 00:00:00|
|1951-02|       150|1951-02-01 00:00:00|
|1951-03|       178|1951-03-01 00:00:00|
|1951-04|       163|1951-04-01 00:00:00|
|1951-05|       172|1951-05-01 00:00:00|
|1951-06|       178|1951-06-01 00:00:00|
|1951-07|       199|1951-07-01 00:00:00|
|1951-08|       199|1951-08-01 00:00:00|
|1951-09|       184|1951-09-01 00:00:00|
|1951-10|       162|1951-10-01 00:00:00|
+-------+----------+-------------------+
only showing top

In [19]:
# 년도만 추출
df1.withColumn('y', F.year('parsed')).show()

+-------+----------+-------------------+----+
|  Month|Passengers|             parsed|   y|
+-------+----------+-------------------+----+
|1949-01|       112|1949-01-01 00:00:00|1949|
|1949-02|       118|1949-02-01 00:00:00|1949|
|1949-03|       132|1949-03-01 00:00:00|1949|
|1949-04|       129|1949-04-01 00:00:00|1949|
|1949-05|       121|1949-05-01 00:00:00|1949|
|1949-06|       135|1949-06-01 00:00:00|1949|
|1949-07|       148|1949-07-01 00:00:00|1949|
|1949-08|       148|1949-08-01 00:00:00|1949|
|1949-09|       136|1949-09-01 00:00:00|1949|
|1949-10|       119|1949-10-01 00:00:00|1949|
|1949-11|       104|1949-11-01 00:00:00|1949|
|1949-12|       118|1949-12-01 00:00:00|1949|
|1950-01|       115|1950-01-01 00:00:00|1950|
|1950-02|       126|1950-02-01 00:00:00|1950|
|1950-03|       141|1950-03-01 00:00:00|1950|
|1950-04|       135|1950-04-01 01:00:00|1950|
|1950-05|       125|1950-05-01 00:00:00|1950|
|1950-06|       149|1950-06-01 00:00:00|1950|
|1950-07|       170|1950-07-01 00:

In [20]:
# 달만 추출
df1.withColumn('m', F.month('parsed')).show()

+-------+----------+-------------------+---+
|  Month|Passengers|             parsed|  m|
+-------+----------+-------------------+---+
|1949-01|       112|1949-01-01 00:00:00|  1|
|1949-02|       118|1949-02-01 00:00:00|  2|
|1949-03|       132|1949-03-01 00:00:00|  3|
|1949-04|       129|1949-04-01 00:00:00|  4|
|1949-05|       121|1949-05-01 00:00:00|  5|
|1949-06|       135|1949-06-01 00:00:00|  6|
|1949-07|       148|1949-07-01 00:00:00|  7|
|1949-08|       148|1949-08-01 00:00:00|  8|
|1949-09|       136|1949-09-01 00:00:00|  9|
|1949-10|       119|1949-10-01 00:00:00| 10|
|1949-11|       104|1949-11-01 00:00:00| 11|
|1949-12|       118|1949-12-01 00:00:00| 12|
|1950-01|       115|1950-01-01 00:00:00|  1|
|1950-02|       126|1950-02-01 00:00:00|  2|
|1950-03|       141|1950-03-01 00:00:00|  3|
|1950-04|       135|1950-04-01 01:00:00|  4|
|1950-05|       125|1950-05-01 00:00:00|  5|
|1950-06|       149|1950-06-01 00:00:00|  6|
|1950-07|       170|1950-07-01 00:00:00|  7|
|1950-08| 

In [21]:
df1.withColumn('f', F.date_format('parsed', 'yyyy년MM월dd일')).show()

+-------+----------+-------------------+--------------+
|  Month|Passengers|             parsed|             f|
+-------+----------+-------------------+--------------+
|1949-01|       112|1949-01-01 00:00:00|1949년01월01일|
|1949-02|       118|1949-02-01 00:00:00|1949년02월01일|
|1949-03|       132|1949-03-01 00:00:00|1949년03월01일|
|1949-04|       129|1949-04-01 00:00:00|1949년04월01일|
|1949-05|       121|1949-05-01 00:00:00|1949년05월01일|
|1949-06|       135|1949-06-01 00:00:00|1949년06월01일|
|1949-07|       148|1949-07-01 00:00:00|1949년07월01일|
|1949-08|       148|1949-08-01 00:00:00|1949년08월01일|
|1949-09|       136|1949-09-01 00:00:00|1949년09월01일|
|1949-10|       119|1949-10-01 00:00:00|1949년10월01일|
|1949-11|       104|1949-11-01 00:00:00|1949년11월01일|
|1949-12|       118|1949-12-01 00:00:00|1949년12월01일|
|1950-01|       115|1950-01-01 00:00:00|1950년01월01일|
|1950-02|       126|1950-02-01 00:00:00|1950년02월01일|
|1950-03|       141|1950-03-01 00:00:00|1950년03월01일|
|1950-04|       135|1950-04-01 01:00:

In [30]:
# selectExpr하면 year가 적용됨
# 약어도 써도 됨

df1.selectExpr('year(parsed) as year', 'Passengers').groupby('year').mean().orderBy('year').show()

+----+---------+------------------+
|year|avg(year)|   avg(Passengers)|
+----+---------+------------------+
|1949|   1949.0|126.66666666666667|
|1950|   1950.0|139.66666666666666|
|1951|   1951.0|170.16666666666666|
|1952|   1952.0|             197.0|
|1953|   1953.0|             225.0|
|1954|   1954.0|238.91666666666666|
|1955|   1955.0|             284.0|
|1956|   1956.0|            328.25|
|1957|   1957.0| 368.4166666666667|
|1958|   1958.0|             381.0|
|1959|   1959.0| 428.3333333333333|
|1960|   1960.0| 476.1666666666667|
+----+---------+------------------+



In [34]:
df1.groupBy( F.year( 'parsed').alias('year') ).mean().\
    orderBy('year').\
    withColumn('avg(Passengers)',F.round('avg(Passengers)',2) ).show()

+----+---------------+
|year|avg(Passengers)|
+----+---------------+
|1949|         126.67|
|1950|         139.67|
|1951|         170.17|
|1952|          197.0|
|1953|          225.0|
|1954|         238.92|
|1955|          284.0|
|1956|         328.25|
|1957|         368.42|
|1958|          381.0|
|1959|         428.33|
|1960|         476.17|
+----+---------------+



In [35]:
df1.show()

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1949-01|       112|1949-01-01 00:00:00|
|1949-02|       118|1949-02-01 00:00:00|
|1949-03|       132|1949-03-01 00:00:00|
|1949-04|       129|1949-04-01 00:00:00|
|1949-05|       121|1949-05-01 00:00:00|
|1949-06|       135|1949-06-01 00:00:00|
|1949-07|       148|1949-07-01 00:00:00|
|1949-08|       148|1949-08-01 00:00:00|
|1949-09|       136|1949-09-01 00:00:00|
|1949-10|       119|1949-10-01 00:00:00|
|1949-11|       104|1949-11-01 00:00:00|
|1949-12|       118|1949-12-01 00:00:00|
|1950-01|       115|1950-01-01 00:00:00|
|1950-02|       126|1950-02-01 00:00:00|
|1950-03|       141|1950-03-01 00:00:00|
|1950-04|       135|1950-04-01 01:00:00|
|1950-05|       125|1950-05-01 00:00:00|
|1950-06|       149|1950-06-01 00:00:00|
|1950-07|       170|1950-07-01 00:00:00|
|1950-08|       170|1950-08-01 00:00:00|
+-------+----------+-------------------+
only showing top

In [36]:
df1.groupBy( F.year( 'parsed')).mean().show()

+------------+------------------+
|year(parsed)|   avg(Passengers)|
+------------+------------------+
|        1959| 428.3333333333333|
|        1955|             284.0|
|        1952|             197.0|
|        1956|            328.25|
|        1951|170.16666666666666|
|        1950|139.66666666666666|
|        1949|126.66666666666667|
|        1957| 368.4166666666667|
|        1960| 476.1666666666667|
|        1953|             225.0|
|        1958|             381.0|
|        1954|238.91666666666666|
+------------+------------------+



In [39]:
# window함수는 주식 데이터로 살펴보자!

df1.groupBy( F.window( 'parsed', '1 week')).mean().show()

+--------------------+---------------+
|              window|avg(Passengers)|
+--------------------+---------------+
|[1949-06-30 10:00...|          148.0|
|[1951-09-27 09:00...|          162.0|
|[1959-08-27 09:30...|          463.0|
|[1955-07-28 09:30...|          347.0|
|[1959-09-24 08:30...|          407.0|
|[1949-08-25 10:00...|          136.0|
|[1949-12-29 09:00...|          115.0|
|[1959-01-29 08:30...|          342.0|
|[1954-07-29 08:30...|          293.0|
|[1957-04-25 08:30...|          355.0|
|[1953-12-31 09:00...|          204.0|
|[1956-11-29 08:30...|          306.0|
|[1953-11-26 09:00...|          201.0|
|[1958-03-27 08:30...|          348.0|
|[1949-09-29 09:00...|          119.0|
|[1955-01-27 08:30...|          233.0|
|[1956-06-28 09:30...|          413.0|
|[1957-12-26 08:30...|          340.0|
|[1958-08-28 09:30...|          404.0|
|[1954-02-25 09:00...|          235.0|
+--------------------+---------------+
only showing top 20 rows



# 주식데이터

In [49]:
# !pip install pandas_datareader

Collecting pandas_datareader
  Using cached pandas_datareader-0.9.0-py3-none-any.whl (107 kB)
Installing collected packages: pandas-datareader
Successfully installed pandas-datareader-0.9.0


In [50]:
import pandas as pd
from pandas_datareader import data

In [51]:
samsungDF = data.get_data_yahoo('005930.KS', '2017-01-01')
samsungDF.reset_index(inplace=True)

In [52]:
sDF = sqlCtx.createDataFrame( samsungDF )
sDF.show()

+-------------------+-------+-------+-------+-------+----------+---------------+
|               Date|   High|    Low|   Open|  Close|    Volume|      Adj Close|
+-------------------+-------+-------+-------+-------+----------+---------------+
|2017-01-02 00:00:00|36240.0|35880.0|35980.0|36100.0| 4650600.0|  32702.4609375|
|2017-01-03 00:00:00|36620.0|36020.0|36280.0|36480.0| 7357650.0|   33046.703125|
|2017-01-04 00:00:00|36520.0|36100.0|36500.0|36160.0| 7971750.0|32756.814453125|
|2017-01-05 00:00:00|36060.0|35540.0|36060.0|35560.0|1.096745E7|32213.283203125|
|2017-01-06 00:00:00|36440.0|36040.0|36180.0|36200.0| 8880950.0| 32793.05078125|
|2017-01-09 00:00:00|37500.0|36560.0|36600.0|37220.0| 1.31949E7|     33717.0625|
|2017-01-10 00:00:00|37400.0|37080.0|37280.0|37240.0| 9099800.0|   33735.171875|
|2017-01-11 00:00:00|38560.0|37420.0|37520.0|38280.0|1.201815E7|   34677.296875|
|2017-01-12 00:00:00|38800.0|37980.0|38000.0|38800.0|1.166915E7|  35148.3515625|
|2017-01-13 00:00:00|38320.0

In [53]:
# Date가 timestamp 형임을 확인!

sDF.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- Adj Close: double (nullable = true)



In [54]:
# 7일미다

sDF.groupBy( F.window( 'Date', '1 week')).mean('Close').orderBy('window').show()

+--------------------+------------------+
|              window|        avg(Close)|
+--------------------+------------------+
|[2016-12-29 09:00...|           36075.0|
|[2017-01-05 09:00...|           37548.0|
|[2017-01-12 09:00...|           37100.0|
|[2017-01-19 09:00...|           38544.0|
|[2017-01-26 09:00...|39313.333333333336|
|[2017-02-02 09:00...|           38928.0|
|[2017-02-09 09:00...|           37928.0|
|[2017-02-16 09:00...|           38788.0|
|[2017-02-23 09:00...|           38610.0|
|[2017-03-02 09:00...|           40060.0|
|[2017-03-09 09:00...|           41076.0|
|[2017-03-16 09:00...|           42224.0|
|[2017-03-23 09:00...|           41588.0|
|[2017-03-30 09:00...|           41740.0|
|[2017-04-06 09:00...|           41892.0|
|[2017-04-13 09:00...|           41252.0|
|[2017-04-20 09:00...|           42268.0|
|[2017-04-27 09:00...|45013.333333333336|
|[2017-05-04 09:00...|           46040.0|
|[2017-05-11 09:00...|           46116.0|
+--------------------+------------