# DataFrame

http://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#pyspark-sql-module

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Start (or reuse) a Spark session running in local mode for this notebook.
spark = (
    SparkSession.builder
    .appName('DataFrame_1')
    .master('local[*]')
    .getOrCreate()
)

In [None]:
# Keep a handle to the SparkContext that backs the session (the RDD API entry point).
sc = spark.sparkContext

In [None]:
# Directory holding the sample files; placeholder value, adjust to the local setup.
data_path = '.../'

# Load DataFrames directly from JSON and CSV files.
people = spark.read.json(data_path + 'people.json')
employees = spark.read.json(data_path + 'employees.json')
# Ask Spark to infer the column types of the CSV file while reading it.
csv_reader = spark.read.option("inferSchema", "true")
people_txt = csv_reader.csv(data_path + 'people.txt')

In [None]:
# Print the contents of the DataFrame loaded from people.txt.
people_txt.show()

****

### DataFrame to lista wierszy

In [None]:
# Keyword order matches the Row("age", "name") factory used for the other people.
# Spark < 3.0 sorted keyword fields alphabetically, but Spark >= 3.0 keeps them in
# the order written, so a (name, age) row would not line up with the (age, name)
# rows when they are all passed to createDataFrame together.
newPerson1 = Row(age=32, name='Greg')

In [None]:
# Display the Row itself.
newPerson1

In [None]:
# Fields of a Row can be read as attributes ...
newPerson1.name

In [None]:
newPerson1.age

In [None]:
# ... or by key, like a dictionary.
newPerson1['age']

In [None]:
# `in` tests whether the Row has a field with the given name.
'age' in newPerson1

In [None]:
# A Row built from field names only acts as a factory for rows with those fields.
newPerson = Row("age", "name")

In [None]:
# Calling the factory fills the fields positionally: age=24, name='Alice'.
newPerson2 = newPerson(24, 'Alice')

In [None]:
newPerson2

In [None]:
# Build the remaining sample people, including rows with missing (None) values.
newPerson3, newPerson4, newPerson5, newPerson6, newPerson7 = (
    newPerson(age, name)
    for age, name in [
        (None, None),
        (33, None),
        (None, 'Peter'),
        (32, 'Peter'),
        (40, 'Greg'),
    ]
)

In [None]:
# Turn the list of sample Rows into a DataFrame (no explicit schema is given).
newPeopleDF = spark.createDataFrame([
    newPerson1,
    newPerson2,
    newPerson3,
    newPerson4,
    newPerson5,
    newPerson6,
    newPerson7,
])

In [None]:
# Print the DataFrame built from the sample Rows.
newPeopleDF.show()

### Przechodzenie RDD <-> DF

In [None]:
# Check which Python class represents the loaded data.
type(people)

In [None]:
# Evaluating the DataFrame shows its repr, not its rows.
people

In [None]:
# collect() brings all rows back to the driver as a Python list.
people.collect()

DF -> RDD 

In [None]:
# The .rdd attribute exposes the DataFrame's content as an RDD.
type(people.rdd)

In [None]:
people.rdd.collect()

In [None]:
# Convert every row to a plain tuple before collecting.
people.rdd.map(tuple).collect()

RDD -> DF

In [None]:
# Convert the RDD back into a DataFrame.
people.rdd.toDF()

In [None]:
# Plain tuples carry no field names, so toDF() has to generate column names itself.
people.rdd.map(tuple).toDF().collect()

Podanie schematu wprost przy tworzeniu DF

Typy danych: http://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#module-pyspark.sql.types <br>
https://spark.apache.org/docs/latest/sql-programming-guide.html#data-types <br>
Kilka podstawowych: IntegerType, DoubleType, FloatType, StringType, BooleanType, NullType

In [None]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

# Explicit schema: V1 is an integer column, V2 is a string column.
schema = StructType([StructField("V1", IntegerType()), StructField("V2", StringType())])
# V2 values are passed as strings: createDataFrame verifies the data against the
# schema and rejects Python ints for a StringType field with a TypeError.
df = spark.createDataFrame([[1, '2'], [3, '4']], schema)

In [None]:
# A DataFrame is still a list of rows
df.show()

> **TODO**: Z podanego RDD utworz DataFrame. <br>
Wskazowka: DataFrame latwo utworzyc z RDD zlozonego z obiektow Row.<br>

#### Ogolne wiadomosci na temat danych
printSchema, show, columns, dtypes <br>
Znane z RDD, np.: count, take, head

In [None]:
# Print the rows of the DataFrame as a text table.
people.show()

In [None]:
people.printSchema() ## Print the schema in a tree format

In [None]:
employees.printSchema()

In [None]:
# Schema of the CSV-based DataFrame (read with the inferSchema option).
people_txt.printSchema()

In [None]:
# show(n) limits the output to the first n rows.
people_txt.show(1)

In [None]:
people.show(1)

In [None]:
people_txt.show(1)

Liczba wierszy

In [None]:
# Number of rows.
people.count()

Lista kolumn

In [None]:
# Column names.
people.columns

Lista kolumn wraz z typami danych

In [None]:
# Column names together with their data types.
people.dtypes

#### Odwolania do poszczegolnych kolumn

In [None]:
# A column can be referenced as an attribute ...
people.age

In [None]:
# ... by name ...
people['age']

In [None]:
# ... or by position.
people[0]

#### Dodanie/usuniecie kolumny
withColumn, drop

In [None]:
people.show(2)

In [None]:
people.withColumn(colName = 'ageNextYear', col = people.age +1).show(2)

In [None]:
people.drop('age').show()

#### Zmiany nazwy kolumny

DataFrame people_txt ma domyslne nazwy kolumn ze Sparka. Niewiele nam one mowia.

In [None]:
# Replace the auto-generated column names (_c0, _c1) with meaningful ones.
people_txt = (
    people_txt
    .withColumnRenamed('_c0', 'name')
    .withColumnRenamed('_c1', 'age')
)
people_txt.show(1)

#### Podstawowe statystyki kolumn w DataFrames
describe

In [None]:
# describe() returns a new DataFrame with summary statistics; show() is needed to
# actually print them rather than just the DataFrame's repr.
people.describe().show()

In [None]:
# Summary statistics restricted to the 'salary' column.
employees.describe('salary').show()

#### Braki danych
isNull, isNotNull<br>
fillna, dropna, replace

In [None]:
# Sample data containing missing values in both columns.
newPeopleDF.show()

In [None]:
# Rows where age is missing.
newPeopleDF.filter(newPeopleDF.age.isNull()).show()

In [None]:
# Rows where age is present.
newPeopleDF.filter(newPeopleDF.age.isNotNull()).show()

In [None]:
# Fill nulls with -1.
# NOTE(review): a numeric fill value presumably affects only numeric columns (age) — confirm.
newPeopleDF.fillna(-1).show()

In [None]:
# Per-column fill values given as a dict.
newPeopleDF.fillna({'age':-1, 'name':'unknown'}).show()

In [None]:
# Fill missing names with a placeholder, then replace that placeholder with 'NN'.
newPeopleDF.fillna({'name':'unknown'}).replace('unknown', 'NN').show()

In [None]:
# Drop rows that contain nulls (default dropna settings).
newPeopleDF.dropna().show()

In [None]:
# Drop rows only when 'age' is null.
newPeopleDF.dropna(subset='age').show()

In [None]:
# IPython/Jupyter-only syntax: a trailing `?` displays the help for dropna.
newPeopleDF.dropna?

#### Funkcje wprost ze skladni SQL 
select, where (wymienne z filter), orderBy

In [None]:
# Full data set, for comparison with the query result below.
people.show()

In [None]:
# Names containing the letter 'n', ordered by age ascending.
(
    people
    .select('name')
    .where(people.name.like('%n%'))
    .orderBy(people.age.asc())
    .show()
)

> **TODO**: Wyswietl imiona osob ze zbioru 'people', ktore maja wiecej niz 29 lat

#### Operacje na zbiorach
union - dziala jak UNION ALL w SQL. <br>
intersect (INTERSECT z SQLa), subtract (EXCEPT z SQLa)

In [None]:
# IPython/Jupyter-only syntax: a trailing `?` displays the help for union.
people.union?

In [None]:
# The three DataFrames available for the set-operation examples.
people.show()

In [None]:
people_txt.show()

In [None]:
newPeopleDF.show()