In [1]:
# Smoke test: confirm the kernel executes code and echoes stdout.
print('hello')

hello


In [2]:
# Shell escape: show which `python` executable is found first on PATH.
!which python

/opt/conda/bin/python


In [3]:
# Shell escape: print the version of the `python` executable on PATH.
!python --version

Python 3.11.6


In [4]:
# Shell escape: print the version of the installed Java runtime.
!java --version

openjdk 17.0.8.1 2023-08-24
OpenJDK Runtime Environment (build 17.0.8.1+1-Ubuntu-0ubuntu122.04)
OpenJDK 64-Bit Server VM (build 17.0.8.1+1-Ubuntu-0ubuntu122.04, mixed mode, sharing)


In [5]:
# Shell escape: show which `java` executable is found first on PATH.
!which java

/usr/bin/java


In [6]:
# Show the installed pyspark distribution (version, location, dependencies).
# Written as an explicit line magic so the cell does not depend on IPython's
# automagic setting; %pip targets the environment of the running kernel.
%pip show pyspark

Name: pyspark
Version: 3.5.0
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /usr/local/spark/python
Requires: py4j
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [7]:
# SPARK_HOME: look up the Spark installation directory in the kernel's
# environment; .get() returns None if the variable is not set.
import os
os.environ.get('SPARK_HOME')

'/usr/local/spark'

In [11]:
# JAVA_HOME: look up the Java installation directory in the kernel's
# environment. Environment variable names are case-sensitive on Linux, so the
# key must be spelled exactly 'JAVA_HOME'; the previous 'java_HOME' key named a
# different variable. .get() returns None if the variable is not set.
import os
java_home = os.environ.get('JAVA_HOME')
java_home

In [13]:
# PYTHONPATH as seen by the kernel (None if the variable is not set).
os.environ.get('PYTHONPATH')

'/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip:/usr/local/spark/python:'

In [None]:
import pyspark

In [21]:
from pyspark.sql import SparkSession

# Build the session through the builder's fluent (chained) interface;
# getOrCreate() returns an existing session when one is already active.
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)

In [15]:
# Echo the session object as the cell result.
spark

In [22]:
# Stop the running session.
spark.stop()

In [23]:
# Create the session again with the same application name (fluent builder chain).
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)

In [24]:
# Echo the session object as the cell result.
spark

In [27]:
# Sample (name, value) records held in an ordinary Python list.
data = [
    ('Alice', 1),
    ('Bob', 2),
    ('Charlie', 3),
]
type(data)

list

In [32]:
# Create a Spark DataFrame (a distributed object) from the local records.
# This is not a pandas DataFrame.
data1 = spark.createDataFrame(data, schema=['Name', 'Value'])
data1

DataFrame[Name: string, Value: bigint]

In [29]:
# Print the DataFrame's rows as a text table.
data1.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



## RDD 객체 생성

In [35]:
# A session is already active here, so getOrCreate() returns the existing one.
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)

In [37]:
# Turn a small local list (1..5) into an RDD via the session's SparkContext.
rdd = spark.sparkContext.parallelize(list(range(1, 6)))
rdd

ParallelCollectionRDD[23] at readRDDFromFile at PythonRDD.scala:289

In [38]:
# Echo the DataFrame; its repr lists column names and types, not the rows.
data1

DataFrame[Name: string, Value: bigint]

In [39]:
# take(n) returns the first n elements of the RDD as a list; n must be given.
rdd.take(5)

[1, 2, 3, 4, 5]

In [40]:
# map: apply a function to every element of the RDD (here: square it).
squared_rdd = rdd.map(lambda value: value ** 2)
squared_rdd


PythonRDD[26] at RDD at PythonRDD.scala:53

In [44]:
# First three elements of the source RDD.
rdd.take(3)

[1, 2, 3]

In [41]:
# First three elements of the squared RDD.
squared_rdd.take(3)

[1, 4, 9]

In [45]:
# collect() returns every element of the RDD as a Python list.
squared_rdd.collect()

[1, 4, 9, 16, 25]

In [50]:
# Keep only the rows whose Name column equals 'Bob', then print them.
data1.filter(data1['Name'] == 'Bob').show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [51]:
# Keep only the rows whose Value column is greater than 2, then print them.
data1.filter(data1['Value'] > 2).show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



In [52]:
# Register the DataFrame as a temporary view named 'people' so SQL can query it.
data1.createOrReplaceTempView('people')

In [54]:
# Query every row of the 'people' view with SQL and print the result.
spark.sql('select * from people').show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [58]:
# SQL equivalent of filtering on Name: only the 'Bob' row is returned.
spark.sql('select * from people where Name="Bob"').show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [59]:
# Select two columns where the value is 3. The lower-case `value` in the
# predicate still resolves to the `Value` column, as the output shows.
spark.sql('select Name,Value from people where value==3').show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



## Machine Learning library

In [61]:
from pyspark.ml.regression import LinearRegression

In [63]:
import numpy as np

In [64]:
from pyspark.ml.feature import VectorAssembler


In [68]:
# Sample (name, age) records and the Spark DataFrame built from them.
data_age = [
    ('Alice', 25),
    ('Bob', 30),
    ('Charlie', 33),
]
data2 = spark.createDataFrame(data_age, schema=['Name', 'Age'])
data2

DataFrame[Name: string, Age: bigint]

In [71]:
# Pack the 'Age' column into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=['Age'])
vector_df = assembler.transform(data2)
vector_df

DataFrame[Name: string, Age: bigint, features: vector]

In [73]:
# Fit a linear regression that predicts 'Age' from the 'features' vector.
# NOTE(review): 'features' is assembled from the 'Age' column itself, so the
# label and the only feature are the same values and the fit is trivial.
# Use a separate target column for a meaningful model.
lr = LinearRegression(labelCol='Age', featuresCol='features')
model = lr.fit(vector_df)


In [75]:
# Apply the fitted model; transform() appends a 'prediction' column.
pred=model.transform(vector_df)
pred

DataFrame[Name: string, Age: bigint, features: vector, prediction: double]

In [77]:
# Print the inputs beside the predictions; they match Age up to floating-point
# error because Age is both the feature and the label.
pred.show()

+-------+---+--------+-----------------+
|   Name|Age|features|       prediction|
+-------+---+--------+-----------------+
|  Alice| 25|  [25.0]|24.99999999999993|
|    Bob| 30|  [30.0]|30.00000000000001|
|Charlie| 33|  [33.0]|33.00000000000006|
+-------+---+--------+-----------------+



In [80]:
# Stop the running session.
spark.stop()

## Streaming

In [92]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

# explode(): breaks an array column apart into rows, one row per element.
#   e.g. ['a', 'b', 'c'] -> three rows: 'a', 'b', 'c'
# split(): cuts a string on the given delimiter (e.g. ' ') and returns an array.
#   The delimiter is treated as a regular-expression pattern.

In [83]:
# Create the SparkSession object used by the streaming example.
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)

In [89]:
# Define the streaming source: text arriving in real time on a TCP socket at
# localhost:9999. Each line received becomes one row whose `value` column holds
# the text, e.g. typing "hello spark" into the socket yields
# value = "hello spark".
# This cell only declares the source; no streaming query is started here.
lines = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', 9999)
    .load()
)

In [None]:
# Split every incoming line into words, one word per output row:
#   lines.value              one line of socket text, e.g. "hello spark world"
#   split(lines.value, ' ')  cuts it on spaces -> ["hello", "spark", "world"]
#   explode(...)             emits one row per array element
#   .alias('word')           names the resulting column 'word'
words = lines.select(
    explode(split(lines.value, ' ')).alias('word')
)

In [91]:
# Stop the running session.
spark.stop()