In [3]:
import pyspark
myConf=pyspark.SparkConf()
spark = pyspark.sql.SparkSession\
    .builder\
    .master("local")\
    .appName("myApp")\
    .config(conf=myConf)\
    .getOrCreate()

---
# S.4.2 RDD에서 생성하기
## schema 자동 인식

In [6]:
# '', "" 상관 없다.
myList=[('1','kim, js',170), ('1','lee, sm', 175), ('2','lim, yg',180), ('2','lee',170)]

In [7]:
myRdd = spark.sparkContext.parallelize(myList)

In [8]:
rddDf=myRdd.toDF() #toDF() rdd를 df로

In [9]:
rddDf.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



In [10]:
rddDf=spark.createDataFrame(myRdd)

In [11]:
rddDf.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



## Row를 사용

In [12]:
from pyspark.sql import Row
#.map: 한줄씩 읽어서~(반복)
_myRdd=myRdd.map(lambda x:Row(year=int(x[0]), name=x[1], height=int(x[2])))

In [13]:
_myDf=spark.createDataFrame(_myRdd)

In [14]:
_myDf.printSchema()

root
 |-- year: long (nullable = true)
 |-- name: string (nullable = true)
 |-- height: long (nullable = true)



In [15]:
_myDf.take(1)

[Row(year=1, name='kim, js', height=170)]

In [16]:
from pyspark.sql import Row

r1=Row(name="js1", age=10)
r2=Row(name="js2", age=20)
_myRdd=spark.sparkContext.parallelize([r1,r2])

In [17]:
_myRdd.collect()

[Row(name='js1', age=10), Row(name='js2', age=20)]

## schema를 정의하고 생성

In [21]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType
schema=StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    #StructField("created", TimestampType(), True)
])
_myDf=spark.createDataFrame(_myRdd, schema)

In [22]:
_myDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [23]:
_myDf.show()

+----+---+
|name|age|
+----+---+
| js1| 10|
| js2| 20|
+----+---+



In [24]:
from pyspark.sql.types import *

myRdd=spark.sparkContext.parallelize([(1, 'kim', 50.0), (2, 'lee', 60.0), (3, 'park', 70.0)])
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("height", DoubleType(), True)
])
_myDf = spark.createDataFrame(myRdd, schema)

In [25]:
_myDf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- height: double (nullable = true)



In [26]:
_myDf.show()

+---+----+------+
| id|name|height|
+---+----+------+
|  1| kim|  50.0|
|  2| lee|  60.0|
|  3|park|  70.0|
+---+----+------+



# S.4.3 Pandas

In [115]:
from pyspark.sql import Row
Person = Row('year','name', 'height')
row1=Person('1','kim, js',170) #행 이름을 Person으로

In [116]:
myRows = [row1,
          Person('1','lee, sm', 175),
          Person('2','lim, yg',180),
          Person('2','lee',170)]

In [117]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType
mySchema=StructType([
    StructField("year", StringType(), True),
    StructField("name", StringType(), True),
    StructField("height", IntegerType(), True)
])
myDf=spark.createDataFrame(myRows, mySchema)


## Dataframe을 Pandas로 변환

In [118]:
myDf.toPandas() 

Unnamed: 0,year,name,height
0,1,"kim, js",170
1,1,"lee, sm",175
2,2,"lim, yg",180
3,2,lee,170


## Pandas에서 csv 쓰기

In [37]:
import os
myDf.write.format('com.databricks.spark.csv').save(os.path.join('data','_myDf.csv'))

In [122]:
!dir data\_myDf.csv

 C 드라이브의 볼륨에는 이름이 없습니다.
 볼륨 일련 번호: C826-817A

 C:\Users\user\Bigdata\data\_myDf.csv 디렉터리

2020-09-29  오전 12:55    <DIR>          .
2020-09-29  오전 12:55    <DIR>          ..
2020-09-29  오전 12:55                12 .part-00000-042cf187-9b6e-4c22-bc40-1b77acbd08ff-c000.csv.crc
2020-09-29  오전 12:55                 8 ._SUCCESS.crc
2020-09-29  오전 12:55                62 part-00000-042cf187-9b6e-4c22-bc40-1b77acbd08ff-c000.csv
2020-09-29  오전 12:55                 0 _SUCCESS
               4개 파일                  82 바이트
               2개 디렉터리  371,529,879,552 바이트 남음


In [39]:
myDf.toPandas().to_csv(os.path.join('data','myDf.csv'))
#파일을 육안으로 복 수 있다!

In [40]:
#json형식 , 콜론으로 구분 키:값 형태
import pandas as pd
icc = pd.DataFrame( { 'country': ['South Korea','Japan','Hong Kong'],'codes': [81, 82, 852] })

In [41]:
icc

Unnamed: 0,country,codes
0,South Korea,81
1,Japan,82
2,Hong Kong,852


In [42]:
icc[icc['codes']==81]

Unnamed: 0,country,codes
0,South Korea,81


## csv 파일에서 생성

### RDD에서 Dataframe

In [123]:
from pyspark.sql import Row
cfile= os.path.join("data", "ds_spark_2cols.csv")
lines = spark.sparkContext.textFile(cfile)

In [124]:
_col12 = lines.map(lambda l: l.split(",")) # , 로 분리해서 읽음

#각각 값을 분리해서 저장
#strip(): 앞 뒤 관련없는 기호가 달리면 떼줌
#int형으로 바꿈 (안바꿨으면 string형태일거임)
col12 = _col12.map(lambda p: Row(col1=int(p[0].strip()), col2=int(p[1].strip())))


_myDf = spark.createDataFrame(col12)

In [125]:
_myDf.printSchema()
_myDf.collect()

root
 |-- col1: long (nullable = true)
 |-- col2: long (nullable = true)



[Row(col1=35, col2=2),
 Row(col1=40, col2=27),
 Row(col1=12, col2=38),
 Row(col1=15, col2=31),
 Row(col1=21, col2=1),
 Row(col1=14, col2=19),
 Row(col1=46, col2=1),
 Row(col1=10, col2=34),
 Row(col1=28, col2=3),
 Row(col1=48, col2=1),
 Row(col1=16, col2=2),
 Row(col1=30, col2=3),
 Row(col1=32, col2=2),
 Row(col1=48, col2=1),
 Row(col1=31, col2=2),
 Row(col1=22, col2=1),
 Row(col1=12, col2=3),
 Row(col1=39, col2=29),
 Row(col1=19, col2=37),
 Row(col1=25, col2=2)]

### DataFrame으로 직접 읽기

In [126]:
%%writefile data/ds_spark.csv
1,2,3,4
11,22,33,44
111,222,333,444

Overwriting data/ds_spark.csv


In [127]:
df = spark\
        .read\
        .format('com.databricks.spark.csv')\
        .options(header='true', inferschema='true', delimiter=',')\
        .load(os.path.join('data','ds_spark.csv'))

In [128]:
df.show()

+---+---+---+---+
|  1|  2|  3|  4|
+---+---+---+---+
| 11| 22| 33| 44|
|111|222|333|444|
+---+---+---+---+



In [129]:
#자동으로 잘 추정해서 들어감
df.printSchema()

root
 |-- 1: integer (nullable = true)
 |-- 2: integer (nullable = true)
 |-- 3: integer (nullable = true)
 |-- 4: integer (nullable = true)



`inferschema`를 제외하면, string으로 자동인식한다.

In [130]:
df = spark\
        .read\
        .format('com.databricks.spark.csv')\
        .options(header='true', delimiter=',')\
        .load(os.path.join('data','ds_spark.csv'))

In [131]:
df.printSchema()

root
 |-- 1: string (nullable = true)
 |-- 2: string (nullable = true)
 |-- 3: string (nullable = true)
 |-- 4: string (nullable = true)



### csv

In [53]:
df = spark\
        .read\
        .options(header='true', inferschema='true', delimiter=',')\
        .csv(os.path.join('data', 'ds_spark.csv'))
df.show()

+---+---+---+---+
|  1|  2|  3|  4|
+---+---+---+---+
| 11| 22| 33| 44|
|111|222|333|444|
+---+---+---+---+



# S.4.5 tsv 파일 읽기
- tab으로 분류된 파일

In [55]:
import numpy as np
np.array([float(x) for x in '1.658985	4.285136'.split()])
#whitespace가 .split()]되기 때문에	도 된다.

array([1.658985, 4.285136])

In [60]:
# http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_Dinov_020108_HeightsWeights

In [65]:
%%writefile data/ds_spark_heightweight.txt
1	65.78	112.99
2	71.52	136.49
3	69.40	153.03
4	68.22	142.34
5	67.79	144.30
6	68.70	123.30
7	69.80	141.49
8	70.01	136.46
9	67.90	112.37
10	66.78	120.67
11	66.49	127.45
12	67.62	114.14
13	68.30	125.61
14	67.12	122.46
15	68.28	116.09
16	71.09	140.00
17	66.46	129.50
18	68.65	142.97
19	71.23	137.90
20	67.13	124.04
21	67.83	141.28
22	68.88	143.54
23	63.48	97.90
24	68.42	129.50
25	67.63	141.85
26	67.21	129.72
27	70.84	142.42
28	67.49	131.55
29	66.53	108.33
30	65.44	113.89
31	69.52	103.30
32	65.81	120.75
33	67.82	125.79
34	70.60	136.22
35	71.80	140.10
36	69.21	128.75
37	66.80	141.80
38	67.66	121.23
39	67.81	131.35
40	64.05	106.71
41	68.57	124.36
42	65.18	124.86
43	69.66	139.67
44	67.97	137.37
45	65.98	106.45
46	68.67	128.76
47	66.88	145.68
48	67.70	116.82
49	69.82	143.62
50	69.09	134.93

Overwriting data/ds_spark_heightweight.txt


### RDD

In [69]:
from pyspark.sql.types import *
_tRdd=spark.sparkContext\
    .textFile(os.path.join('data','ds_spark_heightweight.txt'))

In [70]:
#tRdd=rdd.map(lambda x:x.split('\t'))
_tRddSplitted = _tRdd.map(lambda x:x.split()) #split하여 rdd저장

### 형변환

In [71]:
import numpy as np
#myRdd=rdd.map(lambda line:np.array([float(x) for x in line.split('\t')]))
tRdd=_tRdd.map(lambda line:[float(x) for x in line.split('\t')])
tRdd.take(1)

[[1.0, 65.78, 112.99]]

### schema 설정
```py
mySchema = StructType([
    StructField("id", IntegerType(), True),
    StructField("weight", DoubleType(), True),
    StructField("height", DoubleType(), True)
])
myDf=spark.createDataFrame(myRdd, mySchema)
```

### Dataframe 생성

In [72]:
tDfNamed = spark.createDataFrame(tRdd, ["id","weight","height"])

In [73]:
tDfNamed.printSchema()

root
 |-- id: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- height: double (nullable = true)



In [74]:
tDfNamed.take(1)

[Row(id=1.0, weight=65.78, height=112.99)]

## 컬럼을 split으로 분할
`text()`함수 이용해서 파일을 읽는다

In [76]:
tDftxt = spark.read.text(os.path.join('data','ds_spark_heightweight.txt'))

In [77]:
tDftxt.printSchema()

root
 |-- value: string (nullable = true)



In [78]:
from pyspark.sql.functions import split

split_col = split(tDftxt['value'], '\t')


#분리된 컬럼은 getItem() 함수로 가져와서 각 각 weight, height 컬럼이 된다.

In [79]:
#weight(1) height(2)
split_col.getItem(1) # getitem(n): n번째꺼 가져오심

Column<b'split(value, \t, -1)[1]'>

In [80]:
tDftxt = tDftxt.withColumn('weight', split_col.getItem(1))
tDftxt = tDftxt.withColumn('height', split_col.getItem(2))

In [81]:
tDftxt.show()

+---------------+------+------+
|          value|weight|height|
+---------------+------+------+
| 1	65.78	112.99| 65.78|112.99|
| 2	71.52	136.49| 71.52|136.49|
| 3	69.40	153.03| 69.40|153.03|
| 4	68.22	142.34| 68.22|142.34|
| 5	67.79	144.30| 67.79|144.30|
| 6	68.70	123.30| 68.70|123.30|
| 7	69.80	141.49| 69.80|141.49|
| 8	70.01	136.46| 70.01|136.46|
| 9	67.90	112.37| 67.90|112.37|
|10	66.78	120.67| 66.78|120.67|
|11	66.49	127.45| 66.49|127.45|
|12	67.62	114.14| 67.62|114.14|
|13	68.30	125.61| 68.30|125.61|
|14	67.12	122.46| 67.12|122.46|
|15	68.28	116.09| 68.28|116.09|
|16	71.09	140.00| 71.09|140.00|
|17	66.46	129.50| 66.46|129.50|
|18	68.65	142.97| 68.65|142.97|
|19	71.23	137.90| 71.23|137.90|
|20	67.13	124.04| 67.13|124.04|
+---------------+------+------+
only showing top 20 rows



### csv함수로 tsv읽기

In [133]:
tDf = spark\
    .read\
    .options(header='false', inferschema='true', delimiter='\t')\
    .csv(os.path.join('data', 'ds_spark_heightweight.txt'))
tDf.show()

+---+-----+------+
|_c0|  _c1|   _c2|
+---+-----+------+
|  1|65.78|112.99|
|  2|71.52|136.49|
|  3| 69.4|153.03|
|  4|68.22|142.34|
|  5|67.79| 144.3|
|  6| 68.7| 123.3|
|  7| 69.8|141.49|
|  8|70.01|136.46|
|  9| 67.9|112.37|
| 10|66.78|120.67|
| 11|66.49|127.45|
| 12|67.62|114.14|
| 13| 68.3|125.61|
| 14|67.12|122.46|
| 15|68.28|116.09|
| 16|71.09| 140.0|
| 17|66.46| 129.5|
| 18|68.65|142.97|
| 19|71.23| 137.9|
| 20|67.13|124.04|
+---+-----+------+
only showing top 20 rows



---
# JSON 파일에서 생성

In [145]:
%%writefile src/ds_twitter_seoul_3.json
{"contributors": null, "truncated": false, "text": "RT @soompi: #SEVENTEEN’s Mingyu, Jin Se Yeon, And Leeteuk To MC For 2016 Super Seoul Dream Concert \nhttps://t.co/1XRSaRBbE0 https://t.co/fi…", "is_quote_status": false, "in_reply_to_status_id": null, "id": 801657325836763136, "favorite_count": 0, "entities": {"symbols": [], "user_mentions": [{"id": 17659206, "indices": [3, 10], "id_str": "17659206", "screen_name": "soompi", "name": "Soompi"}], "hashtags": [{"indices": [12, 22], "text": "SEVENTEEN"}], "urls": [{"url": "https://t.co/1XRSaRBbE0", "indices": [100, 123], "expanded_url": "http://www.soompi.com/2016/11/20/seventeens-mingyu-jin-se-yeon-leeteuk-mc-dream-concert/", "display_url": "soompi.com/2016/11/20/sev…"}]}, "retweeted": false, "coordinates": null, "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>", "in_reply_to_screen_name": null, "in_reply_to_user_id": null, "retweet_count": 1487, "id_str": "801657325836763136", "favorited": false, "retweeted_status": {"contributors": null, "truncated": false, "text": "#SEVENTEEN’s Mingyu, Jin Se Yeon, And Leeteuk To MC For 2016 Super Seoul Dream Concert \nhttps://t.co/1XRSaRBbE0 https://t.co/fifXHpF8or", "is_quote_status": false, "in_reply_to_status_id": null, "id": 800593781586132993, "favorite_count": 1649, "entities": {"symbols": [], "user_mentions": [], "hashtags": [{"indices": [0, 10], "text": "SEVENTEEN"}], "urls": [{"url": "https://t.co/1XRSaRBbE0", "indices": [88, 111], "expanded_url": "http://www.soompi.com/2016/11/20/seventeens-mingyu-jin-se-yeon-leeteuk-mc-dream-concert/", "display_url": "soompi.com/2016/11/20/sev…"}], "media": [{"expanded_url": "https://twitter.com/soompi/status/800593781586132993/photo/1", "display_url": "pic.twitter.com/fifXHpF8or", "url": "https://t.co/fifXHpF8or", "media_url_https": "https://pbs.twimg.com/media/CxxHMk8UsAA4cUT.jpg", "id_str": "800593115165798400", "sizes": {"small": {"h": 382, "resize": "fit", "w": 680}, "large": {"h": 449, "resize": "fit", "w": 800}, "medium": {"h": 449, "resize": "fit", "w": 800}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [112, 135], "type": "photo", "id": 800593115165798400, "media_url": "http://pbs.twimg.com/media/CxxHMk8UsAA4cUT.jpg"}]}, "retweeted": false, "coordinates": null, "source": "<a href=\"https://about.twitter.com/products/tweetdeck\" rel=\"nofollow\">TweetDeck</a>", "in_reply_to_screen_name": null, "in_reply_to_user_id": null, "retweet_count": 1487, "id_str": "800593781586132993", "favorited": false, "user": {"follow_request_sent": false, "has_extended_profile": true, "profile_use_background_image": true, "default_profile_image": false, "id": 17659206, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/699864769/1cdde0a85f5c0a994ae1fb06d545a5ec.png", "verified": true, "translator_type": "none", "profile_text_color": "999999", "profile_image_url_https": "https://pbs.twimg.com/profile_images/792117259489583104/4khJk3zz_normal.jpg", "profile_sidebar_fill_color": "000000", "entities": {"url": {"urls": [{"url": "http://t.co/3evT80UlR9", "indices": [0, 22], "expanded_url": "http://www.soompi.com", "display_url": "soompi.com"}]}, "description": {"urls": []}}, "followers_count": 987867, "profile_sidebar_border_color": "000000", "id_str": "17659206", "profile_background_color": "1E1E1E", "listed_count": 3982, "is_translation_enabled": true, "utc_offset": -28800, "statuses_count": 80038, "description": "The original K-pop community. We take gifs, OTPs, and reporting on your bias' fashion choices seriously. But not rumors. Ain't nobody got time for that.", "friends_count": 3532, "location": "Worldwide", "profile_link_color": "31B6F4", "profile_image_url": "http://pbs.twimg.com/profile_images/792117259489583104/4khJk3zz_normal.jpg", "following": false, "geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/17659206/1478803767", "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/699864769/1cdde0a85f5c0a994ae1fb06d545a5ec.png", "screen_name": "soompi", "lang": "en", "profile_background_tile": true, "favourites_count": 1493, "name": "Soompi", "notifications": false, "url": "http://t.co/3evT80UlR9", "created_at": "Wed Nov 26 20:48:27 +0000 2008", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": false, "is_translator": false}, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": "en", "created_at": "Mon Nov 21 06:56:46 +0000 2016", "in_reply_to_status_id_str": null, "place": null, "extended_entities": {"media": [{"expanded_url": "https://twitter.com/soompi/status/800593781586132993/photo/1", "display_url": "pic.twitter.com/fifXHpF8or", "url": "https://t.co/fifXHpF8or", "media_url_https": "https://pbs.twimg.com/media/CxxHMk8UsAA4cUT.jpg", "id_str": "800593115165798400", "sizes": {"small": {"h": 382, "resize": "fit", "w": 680}, "large": {"h": 449, "resize": "fit", "w": 800}, "medium": {"h": 449, "resize": "fit", "w": 800}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [112, 135], "type": "photo", "id": 800593115165798400, "media_url": "http://pbs.twimg.com/media/CxxHMk8UsAA4cUT.jpg"}]}, "metadata": {"iso_language_code": "en", "result_type": "recent"}}, "user": {"follow_request_sent": false, "has_extended_profile": false, "profile_use_background_image": true, "default_profile_image": true, "id": 791090169818521600, "profile_background_image_url_https": null, "verified": false, "translator_type": "none", "profile_text_color": "333333", "profile_image_url_https": "https://abs.twimg.com/sticky/default_profile_images/default_profile_6_normal.png", "profile_sidebar_fill_color": "DDEEF6", "entities": {"description": {"urls": []}}, "followers_count": 0, "profile_sidebar_border_color": "C0DEED", "id_str": "791090169818521600", "profile_background_color": "F5F8FA", "listed_count": 0, "is_translation_enabled": false, "utc_offset": null, "statuses_count": 96, "description": "", "friends_count": 7, "location": "", "profile_link_color": "1DA1F2", "profile_image_url": "http://abs.twimg.com/sticky/default_profile_images/default_profile_6_normal.png", "following": false, "geo_enabled": false, "profile_background_image_url": null, "screen_name": "enriquesanq", "lang": "es", "profile_background_tile": false, "favourites_count": 161, "name": "Enrique santos", "notifications": false, "url": null, "created_at": "Wed Oct 26 01:32:49 +0000 2016", "contributors_enabled": false, "time_zone": null, "protected": false, "default_profile": true, "is_translator": false}, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": "en", "created_at": "Thu Nov 24 05:22:55 +0000 2016", "in_reply_to_status_id_str": null, "place": null, "metadata": {"iso_language_code": "en", "result_type": "recent"}}

Overwriting src/ds_twitter_seoul_3.json


## 파일에서 트윗 읽기

In [146]:
import os

#파일 경로 읽기
_jfname=os.path.join('src','ds_twitter_seoul_3.json')

with open(_jfname, 'rb') as f:
    data = f.readlines()

In [147]:
import json
data_json_str = json.loads(data[0]) #loads: load string, 여기선 0번째 로드함

In [148]:
type(data_json_str)

dict

In [149]:
len(data_json_str)

26

## dataframe에서 트윗 읽기

In [150]:
jfile= os.path.join('src','ds_twitter_seoul_3.json')

tweetDf= spark.read.json(jfile)

In [151]:
tweetDf.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- symbols: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- urls: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- url: string (nullable = true)
 |    |-- user_mentions: array (nullable = true)
 |    |    |-- element: struct (containsNull 

In [152]:
tweetDf.count()

1

In [153]:
tweetDf.select('id').show(10)

+------------------+
|                id|
+------------------+
|801657325836763136|
+------------------+

