# Spark로 데이터 파이프라인 만들기
+ 중요사항: Spark의 데이터 프레임은 RDD(Resilient Distributed Dataset)라 불리는 로우 레벨(low-level) 데이터 구조를 토대로 만들어져 있다.
+ RDD는 지연실행 되고, MapReduce와 마찬가지로 임의의 함수를 Map과 Reduce로 분산 처리할 수 있다.
+ 출처: 빅데이터를 지탱하는 기술 (니시다 케이스케)
+ 상세페이지: 232~236page

In [1]:
import os
import time 
from pyspark.sql import Row

## 1. 데이터프레임 생성 (뷰: tweets)

In [2]:
# Read the `tweet_small_sample` collection of the `twitter` database on the local
# MongoDB server through the MongoDB Spark connector's DataFrame source.
# NOTE(review): `spark` is presumably the SparkSession injected by the notebook kernel — confirm.
df = (spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://localhost/twitter.tweet_small_sample").load())
# Register the DataFrame as the temp view `tweets` so it can be queried via spark.sql().
df.createOrReplaceTempView("tweets")

                                                                                

In [3]:
# Preview a single row to inspect the top-level columns (_id, _timestamp, data).
df.show(1)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|                 _id|          _timestamp|                data|
+--------------------+--------------------+--------------------+
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null}, 24204959...|
+--------------------+--------------------+--------------------+
only showing top 1 row



                                                                                

In [4]:
#By using col() function
from pyspark.sql.functions import col
# df.select(col("data.created_at"), col("data.text"), col("data.lang") == 'ko').show(truncate=False)
# df.filter("data.lang = 'ko'").collect()

In [5]:
# Count tweets per language code (data.lang) in the `tweets` view, most frequent first.
query = """
select
        data.lang
    , count(*) as cnt
from tweets
group by data.lang
order by cnt desc
"""

In [7]:
# Execute the current `query`, display the result, and print the elapsed
# wall-clock seconds (covers both query execution and show()).
strt_time = time.time()
spark.sql(query).show()
print("time: ", time.time() - strt_time)



+----+------+
|lang|   cnt|
+----+------+
|  en|284549|
|  ja|189826|
| und| 88200|
|  th| 61411|
|  es| 56196|
|  ko| 44486|
|  ar| 44412|
|  tr| 42052|
|  pt| 40400|
|  in| 38053|
|  fr| 19267|
|  tl| 14915|
|  hi| 14359|
|  it|  9141|
|  ru|  5946|
|  de|  5654|
|  zh|  5024|
|  fa|  4956|
|  pl|  4031|
|  ur|  3548|
+----+------+
only showing top 20 rows

time:  33.68653750419617


                                                                                

## 2. 조건에 맞는 레코드 추출 

In [3]:
from pyspark.sql import Row
from datetime import datetime
import dateutil.parser

In [7]:
# Select the creation time and text of tweets whose language code is 'ko' (Korean).
query = """
select data.created_at, data.text
from tweets where data.lang = 'ko'
"""

In [8]:
# Run the current `query` and display its first five rows.
spark.sql(query).show(5)

+--------------------+---------------------------------+
|          created_at|                             text|
+--------------------+---------------------------------+
|2022-02-06T12:13:...|          @dkdtmxkcoth 나도 같...|
|2022-02-06T12:13:...|             RT @Rmlove09127: ...|
|2022-02-06T12:13:...|             RT @BYangsalang: ...|
|2022-02-06T12:13:...|모두들 힘껏 뛰었지만 늦어 버렸...|
|2022-02-06T12:13:...|           RT @Chuverall: 이게...|
+--------------------+---------------------------------+
only showing top 5 rows



                                                                                

In [13]:
#df.filter("data.lang = 'ko'").rdd.take(1)

In [None]:
#df.filter("data.lang = 'ko'").rdd.flatMap(text_split).take(5)

## 3. 단어로 분해

In [2]:
from pyspark.sql import Row
from datetime import datetime
import dateutil.parser

def string_to_datetime(str):
    """Normalize a date/time string to ``YYYY-MM-DD HH:MM:SS``.

    Args:
        str: Timestamp text in any format ``dateutil.parser.parse`` accepts.
            NOTE: the name shadows the ``str`` builtin inside this function;
            it is kept unchanged so existing callers keep working.

    Returns:
        The parsed moment rendered with ``'%Y-%m-%d %H:%M:%S'``. The format
        has no offset field, so any UTC offset in the input is not emitted.
    """
    parsed = dateutil.parser.parse(str)
    output_format = '%Y-%m-%d %H:%M:%S'
    return parsed.strftime(output_format)

def text_split(row):
    """Split one tweet into words, yielding one ``Row`` per word.

    Intended for use with ``rdd.flatMap`` so that each tweet expands into
    zero or more word records.

    Args:
        row: A record exposing ``row.data.text`` and ``row.data.created_at``.

    Yields:
        ``Row(time=..., word=...)`` for every whitespace-separated token of
        the tweet text, where ``time`` is ``created_at`` formatted by
        ``string_to_datetime``. Records without ``data``, without text, or
        with whitespace-only text yield nothing.
    """
    data = row.data
    # Skip records that carry no payload or no text rather than failing
    # with AttributeError on ``None.split()``.
    if not data or not data.text:
        return

    words = data.text.split()
    if not words:
        # Nothing to emit; avoid parsing the timestamp needlessly.
        return

    # Every word of a tweet shares the same timestamp, so parse it once
    # instead of once per word.
    created = string_to_datetime(data.created_at)
    for word in words:
        yield Row(time=created, word=word)

In [10]:
# Keep only Korean tweets, drop to the RDD API so text_split can expand each
# tweet into one Row(time, word) per word, then convert back to a DataFrame.
ko_words = df.filter("data.lang = 'ko'").rdd.flatMap(text_split).toDF()

                                                                                

## 4. 데이터 프레임 생성(뷰: ko_words)

In [11]:
# Register the word-level DataFrame as the temp view `ko_words` for SQL queries.
ko_words.createOrReplaceTempView("ko_words")

## 5. 단어별 카운트 시작
+ SQL 집계는 RDD에 직접 적용되지 않으므로(안 먹힘), DataFrame으로 변환해 등록한 `ko_words` 뷰를 대상으로 실행한다.

In [12]:
# Count occurrences of each word in the `ko_words` view, most frequent first.
query = """
select
        word
    , count(*) as cnt
from ko_words
group by word
order by cnt desc
"""

# spark.sql() only builds a lazy DataFrame here; no action (show/collect/count)
# is called, so the cell just echoes the resulting schema.
spark.sql(query)

DataFrame[word: string, cnt: bigint]

## 6. 열 지향 스토리지로 저장

In [13]:
# Persist the word rows as a table named `ko_words` in the session's warehouse.
# NOTE(review): no save mode is set, so this presumably fails if the table
# already exists — confirm, and consider an explicit .mode(...).
# NOTE(review): the storage format is whatever the session default is
# (presumably Parquet, matching the "columnar storage" heading) — confirm.
ko_words.write.saveAsTable("ko_words")

22/02/16 22:16:44 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/02/16 22:16:44 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
22/02/16 22:16:47 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
22/02/16 22:16:47 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore mentha@127.0.1.1
22/02/16 22:18:36 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
22/02/16 22:18:36 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
22/02/16 22:18:36 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/02/16 22:18:36 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


In [7]:
# List the warehouse directory (in reverse name order) to check which tables were saved.
!ls -r spark-warehouse

ko_words_sample  ko_words_20220214  ko_words


## 7. 데이터 불러와서 확인하기

In [2]:
# Count the rows of the saved `ko_words` table.
# NOTE(review): in the recorded run this session could not resolve `ko_words`
# (AnalysisException) even though a `ko_words` directory is listed in
# spark-warehouse — presumably a metastore mismatch between sessions; confirm.
spark.table("ko_words").count()

22/02/16 22:46:24 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/02/16 22:46:24 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
22/02/16 22:46:29 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
22/02/16 22:46:29 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore mentha@127.0.1.1
22/02/16 22:46:29 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException


AnalysisException: Table or view not found: ko_words;
'UnresolvedRelation [ko_words], [], false


In [18]:
# Count the rows of the previously saved `ko_words_20220214` table.
spark.table("ko_words_20220214").count()

11629522

In [3]:
# Word counts per hour from `ko_words`: substr(time, 1, 13) keeps the
# 'YYYY-MM-DD HH' prefix of the formatted timestamp as the hourly bucket.
# NOTE(review): `ko_words` was not resolvable in the recorded session; query3
# below is the same query against `ko_words_20220214`.
query = """
select 
        substr(time, 1, 13) as time
    , word
    , count(*) as cnt
from  ko_words
group by substr(time, 1, 13), word
order by cnt desc
"""

# Overall word counts from the saved table `ko_words_20220214`, most frequent first.
query2 = """
select
        word
    , count(*) as cnt
from ko_words_20220214
group by word
order by cnt desc
"""

# Word counts per hour ('YYYY-MM-DD HH' bucket) from `ko_words_20220214`.
query3 = """
select 
        substr(time, 1, 13) as time
    , word
    , count(*) as cnt
from  ko_words_20220214
group by substr(time, 1, 13), word
order by cnt desc
"""

In [6]:
# Time the hourly word count against `ko_words` and show the top 3 rows.
# NOTE(review): the recorded run raised AnalysisException because `ko_words`
# could not be found, so the timing line was never reached.
strt_time = time.time()
spark.sql(query).show(3)
print("time: ", time.time() - strt_time)

AnalysisException: Table or view not found: ko_words; line 6 pos 6;
'Sort ['cnt DESC NULLS LAST], true
+- 'Aggregate ['substr('time, 1, 13), 'word], ['substr('time, 1, 13) AS time#0, 'word, count(1) AS cnt#1L]
   +- 'UnresolvedRelation [ko_words], [], false


In [27]:
# Time the overall word count (query2) and show the top 3 rows;
# the elapsed wall-clock seconds include show().
strt_time = time.time()
spark.sql(query2).show(3)
print("time: ", time.time() - strt_time)

[Stage 41:>                                                         (0 + 4) / 4]

+----+------+
|word|   cnt|
+----+------+
|  RT|561390|
|너무| 39247|
|진짜| 36304|
+----+------+
only showing top 3 rows

time:  7.478374242782593


                                                                                

In [29]:
# Time the hourly word count (query3) and show the top 3 rows;
# the elapsed wall-clock seconds include show().
strt_time = time.time()
spark.sql(query3).show(3)
print("time: ", time.time() - strt_time)

22/02/16 22:35:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/02/16 22:35:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/02/16 22:35:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/02/16 22:35:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/02/16 22:35:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/02/16 22:35:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/02/16 22:35:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/02/16 22:35:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
[Stage 47:>                                                     

+-------------+----+-----+
|         time|word|  cnt|
+-------------+----+-----+
|2022-02-07 13|  RT|13683|
|2022-02-07 14|  RT|11665|
|2022-02-11 13|  RT| 8522|
+-------------+----+-----+
only showing top 3 rows

time:  23.757562398910522


                                                                                