# Spark 연동 및 Spark SQL 테스트

In [10]:
import os
import time 
from pyspark.sql import Row

In [2]:
# Load the tweet sample collection from the local MongoDB through the Spark connector.
# `spark` is not defined in the visible cells; presumably the kernel/session provides it.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://localhost/twitter.tweet_small_sample")
    .load()
)

                                                                                

In [3]:
# Register df as the temporary view "tweets" so it can be queried through spark.sql().
df.createOrReplaceTempView("tweets")

In [119]:
# Example of inspecting df: print its first 5 rows.
df.show(5)

+--------------------+--------------------+--------------------+
|                 _id|          _timestamp|                data|
+--------------------+--------------------+--------------------+
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null, null}, 24...|
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null, null}, 98...|
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null, null}, 14...|
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null, null}, 12...|
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null, null}, 84...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [137]:
#By using col() function
from pyspark.sql.functions import col
# df.select(col("data.created_at"), col("data.text"), col("data.lang") == 'ko').show(truncate=False)
# df.filter("data.lang = 'ko'").collect()

In [7]:
# spark.sql("select data from tweets").show(3)

In [5]:
# SQL: number of tweets per language (data.lang), ordered from most to least frequent.
query = """
select
        data.lang
    , count(*) as cnt
from tweets
group by data.lang
order by cnt desc
"""

### 속도가 매우 느리다
- 쿼리의 실행 속도는 스토리지의 성능에 의존한다.
- MongoDB는 열 지향 스토리지처럼 컬럼 단위의 읽기에 최적화되어 있지 않으므로, 그대로는 고속 집계에 적합하지 않다.
- 최적화를 위해서는 한 차례 데이터를 추출해야 한다.
- 234page

In [11]:
# Time the aggregation query end to end (execution plus printing the result).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
strt_time = time.perf_counter()
spark.sql(query).show()
print("time: ", time.perf_counter() - strt_time)



+----+------+
|lang|   cnt|
+----+------+
|  en|284549|
|  ja|189826|
| und| 88200|
|  th| 61411|
|  es| 56196|
|  ko| 44486|
|  ar| 44412|
|  tr| 42052|
|  pt| 40400|
|  in| 38053|
|  fr| 19267|
|  tl| 14915|
|  hi| 14359|
|  it|  9141|
|  ru|  5946|
|  de|  5654|
|  zh|  5024|
|  fa|  4956|
|  pl|  4031|
|  ur|  3548|
+----+------+
only showing top 20 rows

time:  30.504709243774414


                                                                                

## 텍스트 데이터의 가공
- 234 page

In [21]:
# Show the first 3 rows of the "tweets" view with all columns.
spark.sql("select * from tweets").show(3)

+--------------------+--------------------+--------------------+
|                 _id|          _timestamp|                data|
+--------------------+--------------------+--------------------+
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null, null}, 24...|
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null, null}, 98...|
|{61ffbb80f6b8e9bd...|2022-02-06T21:13:...|{{null, null}, 14...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [19]:
# (Stray scratch expression removed: it referenced an undefined name and raised NameError on a fresh-kernel run.)

In [39]:
# English tweets with their creation time as a timestamp.
# data.created_at holds an ISO-8601 string (e.g. '2022-02-06T12:13:41.000Z'), not
# epoch milliseconds, so from_unixtime(data.created_at / 1000) cannot produce a
# valid time here; to_timestamp() parses the string directly.
query = """
select to_timestamp(data.created_at) time, data.text
from tweets where data.lang = 'en'
"""

# Same selection, returning created_at as the raw string.
query_2 = """
select data.created_at, data.text
from tweets where data.lang = 'en'
"""

In [113]:
# Run query_2 and print its first 3 rows.
spark.sql(query_2).show(3)

+--------------------+--------------------+
|          created_at|                text|
+--------------------+--------------------+
|2022-02-06T12:13:...|@EyesRightPhoto @...|
|2022-02-06T12:13:...|RT @IanByrneMP: R...|
|2022-02-06T12:13:...| @ifnotyoursha tilly|
+--------------------+--------------------+
only showing top 3 rows



In [135]:
from pyspark.sql import Row
from datetime import datetime
import dateutil.parser

def string_to_datetime(str):
    """Reformat a date/time string as 'YYYY-MM-DD HH:MM:SS'.

    Args:
        str: date/time text to be parsed by dateutil.parser.parse
            (note: this parameter name shadows the built-in ``str``
            inside the function).

    Returns:
        The parsed value rendered with strftime('%Y-%m-%d %H:%M:%S').
    """
    return dateutil.parser.parse(str).strftime('%Y-%m-%d %H:%M:%S')

def text_split(row):
    """Generator that breaks one tweet record into whitespace-separated words.

    Args:
        row: record exposing ``row.data.text`` and ``row.data.created_at``.

    Yields:
        Row(time=..., word=...) for every word of the tweet text, where
        ``time`` is ``data.created_at`` formatted by string_to_datetime().

    Records without ``data``, without text, or with whitespace-only text
    yield nothing instead of raising.
    """
    data = row.data
    if not data or not data.text:
        return
    words = data.text.split()
    if not words:
        return
    # Every word of a tweet shares the same timestamp: convert it once, not per word.
    tweet_time = string_to_datetime(data.created_at)
    for word in words:
        yield Row(time=tweet_time, word=word)
            

#### '.rdd'로 원시 레코드 참조

In [138]:
# Take one raw record (through .rdd) from the rows whose data.lang is 'ko'.
df.filter("data.lang = 'ko'").rdd.take(1)

[Row(_id=Row(oid='61ffbb80f6b8e9bd026bb4f6'), _timestamp='2022-02-06T21:13:52.836027+09:00', data=Row(attachments=Row(media_keys=None, poll_ids=None), author_id='1136645494019461120', context_annotations=None, conversation_id='1490279570297307137', created_at='2022-02-06T12:13:41.000Z', entities=Row(annotations=None, cashtags=None, hashtags=None, mentions=[Row(start=0, end=12, username='dkdtmxkcoth', id='1424626232667242496')], urls=None), geo=Row(place_id=None), id='1490297645189177344', in_reply_to_user_id='1424626232667242496', lang='ko', possibly_sensitive=False, public_metrics=Row(retweet_count=0, reply_count=0, like_count=0, quote_count=0), referenced_tweets=[Row(type='replied_to', id='1490297548354899972')], reply_settings='everyone', source='Twitter for Android', text='@dkdtmxkcoth 나도 같커잖아...'))]

#### flatMap()에 제너레이터 함수 적용

In [139]:
# Apply the text_split generator with flatMap and take the first 5 resulting Row objects.
df.filter("data.lang = 'ko'").rdd.flatMap(text_split).take(5)

[Row(time='2022-02-06 12:13:41', word='@dkdtmxkcoth'),
 Row(time='2022-02-06 12:13:41', word='나도'),
 Row(time='2022-02-06 12:13:41', word='같커잖아...'),
 Row(time='2022-02-06 12:13:42', word='RT'),
 Row(time='2022-02-06 12:13:42', word='@Rmlove09127:')]

#### toDF()를 사용해 데이터 프레임으로 변환

In [161]:
# Convert the flatMap result into a DataFrame with toDF().
df.filter("data.lang = 'ko'").rdd.flatMap(text_split).toDF()

DataFrame[time: string, word: string]

### Spark 프로그램에 있어서의 DAG 실행
- 235 page

In [162]:
# Keep the word-level DataFrame (columns: time, word) built from the 'ko' tweets.
ko_words = df.filter("data.lang = 'ko'").rdd.flatMap(text_split).toDF()

In [163]:
# Register ko_words as the temporary view "ko_words" for SQL queries.
ko_words.createOrReplaceTempView("ko_words")

In [164]:
# SQL: occurrences of each word in the ko_words view, most frequent first.
query = """
select
        word
    , count(*) as cnt
from ko_words
group by word
order by cnt desc
"""

# Print the 10 most frequent words.
spark.sql(query).show(10)



+----+-----+
|word|  cnt|
+----+-----+
|  RT|23188|
|너무| 2088|
|진짜| 1615|
|  잘| 1323|
|  거| 1215|
|  이| 1146|
|  수| 1043|
|  다| 1041|
|  나|  989|
|우리|  967|
+----+-----+
only showing top 10 rows



                                                                                

### 저장
- 책 236page 

In [165]:
# Persist ko_words as the table "ko_words_sample".
# mode("overwrite") replaces a table left behind by an earlier run; with the
# default save mode the call raises once the table already exists, which makes
# the notebook fail when it is re-run.
ko_words.write.mode("overwrite").saveAsTable("ko_words_sample")

22/02/13 22:10:30 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/02/13 22:10:30 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
22/02/13 22:10:33 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
22/02/13 22:10:33 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore mentha@127.0.1.1
22/02/13 22:11:34 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
22/02/13 22:11:34 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
22/02/13 22:11:34 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/02/13 22:11:34 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


In [167]:
# Recursively list the files under the spark-warehouse directory.
!ls -R spark-warehouse

spark-warehouse:
ko_words_sample

spark-warehouse/ko_words_sample:
part-00000-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00001-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00002-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00003-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00004-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00005-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00006-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00007-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00008-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00009-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00010-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00011-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00012-f0393773-2581-4e08-91a1-971f8ef96972-c000.snappy.parquet
part-00013-f0393773-2581-4e08-91a1-971f8ef96972-c

### 저장한 파일 사용하기

In [169]:
# Count the rows of the saved table "ko_words_sample".
spark.table("ko_words_sample").count()

522415

In [173]:
# Print the first 3 rows of the saved table "ko_words_sample".
spark.table("ko_words_sample").show(3)

+-------------------+-------------+
|               time|         word|
+-------------------+-------------+
|2022-02-06 14:22:49|           RT|
|2022-02-06 14:22:49|@yangxxddeum:|
|2022-02-06 14:22:49|       여러분|
+-------------------+-------------+
only showing top 3 rows



In [176]:
# SQL: the same word-frequency aggregation, this time reading the saved table ko_words_sample.
query = """
select
        word
    , count(*) as cnt
from ko_words_sample
group by word
order by cnt desc
"""

# Print the 10 most frequent words.
spark.sql(query).show(10)

+----+-----+
|word|  cnt|
+----+-----+
|  RT|23188|
|너무| 2088|
|진짜| 1615|
|  잘| 1323|
|  거| 1215|
|  이| 1146|
|  수| 1043|
|  다| 1041|
|  나|  989|
|우리|  967|
+----+-----+
only showing top 10 rows



### 날짜 예제
- 참고1: https://ourcstory.tistory.com/109
- 참고2: https://docs.python.org/ko/3.9/library/datetime.html#strftime-and-strptime-format-codes

In [80]:
from datetime import datetime
import tzlocal

# Convert a Unix timestamp to a timezone-aware datetime in the machine's local zone.
unix_timestamp = float("1284101485")
local_timezone = tzlocal.get_localzone() # local timezone object returned by tzlocal
local_time = datetime.fromtimestamp(unix_timestamp, local_timezone)
# Print with microseconds, UTC offset (%z) and zone name (%Z).
print(local_time.strftime("%Y-%m-%d %H:%M:%S.%f%z (%Z)"))

# Parse a 'YYYY-MM-DDTHH:MM:SS' string; as the last expression of the cell its value is displayed.
datetime.strptime("2022-02-06T12:13:41", '%Y-%m-%dT%H:%M:%S')

2010-09-10 15:51:25.000000+0900 (KST)


datetime.datetime(2022, 2, 6, 12, 13, 41)

In [91]:
# Reference: https://stackoverflow.com/questions/214777/how-do-you-convert-yyyy-mm-ddthhmmss-000z-time-format-to-mm-dd-yyyy-time-forma
from datetime import datetime
# strptime pattern for strings such as '2008-09-26T01:51:42.000Z' (fraction via %f, literal trailing 'Z').
date_format = "%Y-%m-%dT%H:%M:%S.%fZ" 
# NOTE: this result is neither assigned nor the last expression of the cell, so it is not displayed.
datetime.strptime('2008-09-26T01:51:42.000Z', date_format)
# -> datetime(2008, 9, 26, 1, 51, 42)

import dateutil.parser

# Alternative: let dateutil parse the same string without an explicit format.
d = dateutil.parser.parse('2008-09-26T01:51:42.000Z')
print(d.strftime('%m/%d/%Y'))  #==> '09/26/2008'

09/26/2008
