In [1]:
import pyspark
# Build (or reuse) a SparkSession named "myApp" that runs against the
# "local" master, configured from a default SparkConf.
myConf = pyspark.SparkConf()
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("myApp")
    .config(conf=myConf)
    .getOrCreate()
)

In [2]:
# Show the version of the running Spark session.
print(spark.version)

3.0.1


# S.4.1 schema 생성하기
## 자동으로 인식하는 schema

In [3]:
# Sample records as plain tuples.
# The first field is quoted, so '1' and '2' are strings rather than integers.
myList = [
    ('1', 'kim, js', 170),
    ('1', 'lee, sm', 175),
    ('2', 'lim, yg', 180),
    ('2', 'lee', 170),
]

In [4]:
# Create a DataFrame from the tuples without giving column names or a schema.
myDf = spark.createDataFrame(myList)

In [5]:
myDf.columns
# No column names were specified, so they were generated automatically.

['_1', '_2', '_3']

In [6]:
myDf.printSchema()
# Null values are allowed in every column.

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



In [7]:
# Print the first row.
print(myDf.take(1))


[Row(_1='1', _2='kim, js', _3=170)]


In [8]:
# Same data, this time with explicit column names passed as the schema.
cols = ['year', 'name', 'height']
_myDf = spark.createDataFrame(myList, schema=cols)

In [9]:
# List the column names of the DataFrame that was built with explicit names.
_myDf.columns

['year', 'name', 'height']

In [10]:
# Print the first row of the DataFrame that has named columns.
print(_myDf.take(1))

[Row(year='1', name='kim, js', height=170)]


In [11]:
# Source values for the generated coffee-order data.
names = ["kim", "lee", "lee", "lim"]
items = [
    "espresso",
    "latte",
    "americano",
    "affocato",
    "long black",
    "macciato",
]

In [12]:
# Build 100 (name, coffee) rows by cycling through both lists.
# Indexing modulo len(...) rather than the literal list sizes keeps this cell
# correct if names or items is edited later.
coffeeDf = spark.createDataFrame(
    [(names[i % len(names)], items[i % len(items)]) for i in range(100)],
    ["name", "coffee"],
)

In [13]:
# Print the schema Spark derived for the generated rows.
coffeeDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- coffee: string (nullable = true)



In [14]:
# Display only the first 10 of the generated rows.
coffeeDf.show(10)

+----+----------+
|name|    coffee|
+----+----------+
| kim|  espresso|
| lee|     latte|
| lee| americano|
| lim|  affocato|
| kim|long black|
| lee|  macciato|
| lee|  espresso|
| lim|     latte|
| kim| americano|
| lee|  affocato|
+----+----------+
only showing top 10 rows



## Row 객체를 사용하여 생성

In [15]:
from pyspark.sql import Row
# Row('year', 'name', 'height') defines the field names; the result is bound
# to the name Person and then called with values to build an individual row.
Person = Row('year', 'name', 'height')
row1 = Person('1', 'kim, js', 170)

In [16]:
# year, name and height are read with the dot operator.
print("row1: ", row1.year, row1.name, row1.height)

row1:  1 kim, js 170


In [17]:
# Collect the Person rows in a plain Python list.
myRows = [
    row1,
    Person('1', 'lee, sm', 175),
    Person('2', 'lim, yg', 180),
    Person('2', 'lee', 170),
]

In [18]:
# Convert the list of Row objects into a DataFrame.
myDf = spark.createDataFrame(myRows)

In [19]:
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
myDf.printSchema()
myDf.show()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: long (nullable = true)

None
+----+-------+------+
|year|   name|height|
+----+-------+------+
|   1|kim, js|   170|
|   1|lee, sm|   175|
|   2|lim, yg|   180|
|   2|    lee|   170|
+----+-------+------+



## Schema를 정의하고 생성

In [20]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType
# Explicit schema: year and name are strings, height is an integer,
# and every field is declared nullable.
mySchema = StructType([
    StructField(name="year", dataType=StringType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="height", dataType=IntegerType(), nullable=True),
])

In [21]:
# Rebuild the DataFrame from the same rows, this time applying the explicit schema.
myDf = spark.createDataFrame(myRows, schema=mySchema)

In [22]:
# Print the schema of the DataFrame built with the explicit schema.
myDf.printSchema()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: integer (nullable = true)



In [23]:
# Fetch the first row; the result is shown as the cell's value.
myDf.take(1)

[Row(year='1', name='kim, js', height=170)]

In [24]:
# Display the DataFrame contents as a text table.
myDf.show()

+----+-------+------+
|year|   name|height|
+----+-------+------+
|   1|kim, js|   170|
|   1|lee, sm|   175|
|   2|lim, yg|   180|
|   2|    lee|   170|
+----+-------+------+

