# DataFrame

In [12]:
import findspark
findspark.init()
import pyspark
myConf=pyspark.SparkConf()
spark = pyspark.sql.SparkSession.builder\
    .master("local")\
    .appName("myApp")\
    .config(conf=myConf)\
    .getOrCreate()

## DataFrame은 행,열로 구조화된 데이터구조이다. 

### RDD에 schema를 얹은 구조


- NullType
- StringType
- BinaryType
- BooleanType
- DateType
- TimestampType
- DoubleType
- DecimalType
- ShortType
- ArrayType
- MapType

# DataFrame 생성

## schema를 정해주어야 하나, 정해주지 않으면 알아서 정해줌
### schema는 row와 column으로 구성됨

#### 외부에서 읽기 : spark.createDataFrame() 또는 spark.read()
#### 내부에서 읽기 : spark.createDataFrame()

In [13]:
myList=[('1','kim, js', 170),
        ('1','lee, sm', 175),
        ('2','lim, yg',180),
        ('2','lee', 170)]

In [14]:
myDf=spark.createDataFrame(myList)

In [15]:
myDf.columns

['_1', '_2', '_3']

In [16]:
myDf.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



In [17]:
print (myDf.take(1))

[Row(_1='1', _2='kim, js', _3=170)]


In [18]:
cols = ['year','name','height']
_myDf = spark.createDataFrame(myList, cols)

In [19]:
_myDf.columns

['year', 'name', 'height']

In [20]:
print (_myDf.take(1))

[Row(year='1', name='kim, js', height=170)]


In [21]:
names = ["kim","lee","lee","lim"]
items = ["espresso","latte","americano","affocato","long black","macciato"]

In [22]:
coffeeDf = spark.createDataFrame([(names[i%4], items[i%6]) for i in range(100)],\
                           ["name","coffee"])

In [23]:
coffeeDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- coffee: string (nullable = true)



In [24]:
coffeeDf.show(10)

+----+----------+
|name|    coffee|
+----+----------+
| kim|  espresso|
| lee|     latte|
| lee| americano|
| lim|  affocato|
| kim|long black|
| lee|  macciato|
| lee|  espresso|
| lim|     latte|
| kim| americano|
| lee|  affocato|
+----+----------+
only showing top 10 rows



In [25]:
from pyspark.sql import Row
Person = Row('year','name', 'height')
row1=Person('1','kim, js',170)

In [26]:
print ("row1: ", row1.year, row1.name, row1.height)

row1:  1 kim, js 170


In [27]:
row1.asDict()

{'year': '1', 'name': 'kim, js', 'height': 170}

In [28]:
myRows = [row1,
          Person('1','lee, sm', 175),
          Person('2','lim, yg',180),
          Person('2','lee',170)]


In [29]:
myDf=spark.createDataFrame(myRows)

In [30]:
print (myDf.printSchema())
myDf.show()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: long (nullable = true)

None
+----+-------+------+
|year|   name|height|
+----+-------+------+
|   1|kim, js|   170|
|   1|lee, sm|   175|
|   2|lim, yg|   180|
|   2|    lee|   170|
+----+-------+------+



# 스키마 만들어서 생성

In [31]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType
mySchema=StructType([
    StructField("year", StringType(), True),
    StructField("name", StringType(), True),
    StructField("height", IntegerType(), True)
])

In [32]:
myDf=spark.createDataFrame(myRows, mySchema)

In [33]:
myDf.printSchema()

root
 |-- year: string (nullable = true)
 |-- name: string (nullable = true)
 |-- height: integer (nullable = true)



In [34]:
myDf.take(1)

[Row(year='1', name='kim, js', height=170)]

In [35]:
wcDf.groupBy('ClubCountry').pivot('Position').count().show()

NameError: name 'wcDf' is not defined

In [36]:
wcDf.groupBy(wcDf.ClubCountry).count().show()

NameError: name 'wcDf' is not defined

# F함수

In [37]:
from pyspark.sql import functions as F

myDf.agg(F.min(myDf.heightD),F.max(myDf.heightD),F.avg(myDf.heightD),F.sum(myDf.heightD)).show()

AttributeError: 'DataFrame' object has no attribute 'heightD'

# 행추가

In [38]:
from pyspark.sql import Row
toAppendDf = spark.createDataFrame([Row(1, "choi, js", 177)])

In [39]:
_myDf = myDf.union(toAppendDf)

In [44]:
_myDf.show()

+----+--------+------+
|year|    name|height|
+----+--------+------+
|   1| kim, js|   170|
|   1| lee, sm|   175|
|   2| lim, yg|   180|
|   2|     lee|   170|
|   1|choi, js|   177|
+----+--------+------+



# 결측값 처리

In [40]:
from pyspark.sql import functions as F
myDf.where(F.col("height").isNull())

DataFrame[year: string, name: string, height: int]

In [42]:
from pyspark.sql.functions import isnan, when, count, col
myDf.select([count(when(isnan(c), c)).alias(c) for c in myDf.columns]).show()

+----+----+------+
|year|name|height|
+----+----+------+
|   0|   0|     0|
+----+----+------+



In [43]:
from pyspark.sql.functions import isnan, when, count, col
myDf.select([count(when(col(c).isNull(), c)).alias(c) for c in myDf.columns]).show()

+----+----+------+
|year|name|height|
+----+----+------+
|   0|   0|     0|
+----+----+------+



# 결측값을 채우는 함수이다.

#### df.na.fill(0) 모든 컬럼의 na를 0으로 교체
#### df.fillna( { 'c0':0, 'c1':0 } ) 컬럼 c0, c1의 na를 0으로 교체
#### 결측값을 삭제할 수도 있다.

#### df.na.drop(subset=["c0"])