<a href="https://colab.research.google.com/github/lukaszlewickii/spark_labs/blob/main/df1_students.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DataFrame

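Dokumentacja modulu pyspark.sql (uwaga: link prowadzi do dokumentacji Sparka 2.2.0, a ten notebook instaluje PySpark 3.5.0): http://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#pyspark-sql-module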

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 3.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=b0d5e6ab038d74fa965a8ca615eda0c44192de72ca7a9508b2db3dd4d1d4d118
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Start (or reuse) a Spark session named 'DataFrame_1' that runs locally.
spark = (
    SparkSession.builder
    .appName('DataFrame_1')
    .master('local[*]')
    .getOrCreate()
)

In [None]:
# Handle to the SparkContext behind the session; the RDD API is reached
# through it.
sc = spark.sparkContext

In [None]:
# Load DataFrames directly from the JSON and CSV source files.
people = spark.read.format('json').load('people.json')
employees = spark.read.format('json').load('employees.json')
# No header option is set for the CSV file, so Spark assigns its default
# column names; inferSchema asks Spark to guess the column types from the data.
people_txt = spark.read.csv('people.txt', inferSchema=True)

In [None]:
people_txt.show()

+-------+----+
|    _c0| _c1|
+-------+----+
|Michael|29.0|
|   Andy|30.0|
| Justin|19.0|
+-------+----+



****

### DataFrame to lista wierszy

In [None]:
# Build a single Row with keyword arguments; its fields are called
# name and age.
newPerson1 = Row(name='Greg', age=32)

In [None]:
newPerson1

Row(name='Greg', age=32)

In [None]:
newPerson1.name

'Greg'

In [None]:
newPerson1.age

32

In [None]:
newPerson1['age']

32

In [None]:
# Membership test against the Row's field names (not its values).
'age' in newPerson1

True

In [None]:
# Calling Row with bare strings creates a reusable row factory whose fields
# are, in this order, age and name.
newPerson = Row("age", "name")

In [None]:
# Arguments are matched to the factory's fields by position:
# age=24, name='Alice'.
newPerson2 = newPerson(24, 'Alice')

In [None]:
newPerson2

Row(age=24, name='Alice')

In [None]:
# More rows from the same factory (arguments are age, name). None marks a
# missing value, giving rows with a missing age, a missing name, or both.
newPerson3 = newPerson(None, None)
newPerson4 = newPerson(33, None)
newPerson5 = newPerson(None, 'Peter')
newPerson6 = newPerson(32, 'Peter')
newPerson7 = newPerson(40, 'Greg')

In [None]:
# newPerson1 was built with its fields in (name, age) order, while the rows
# produced by the newPerson factory use (age, name). Passing such mixed rows
# straight to createDataFrame fills the columns by position, which put the
# ages into the "name" column and left "age" null for every factory row.
# Reading each value by field name and supplying an explicit schema keeps
# every value in its own column; the column order (name, age) is unchanged.
newPeopleDF = spark.createDataFrame(
    [
        (person['name'], person['age'])
        for person in (
            newPerson1,
            newPerson2,
            newPerson3,
            newPerson4,
            newPerson5,
            newPerson6,
            newPerson7,
        )
    ],
    schema='name: string, age: bigint',
)

In [None]:
newPeopleDF.show()

+----+----+
|name| age|
+----+----+
|Greg|  32|
|  24|NULL|
|NULL|NULL|
|  33|NULL|
|NULL|NULL|
|  32|NULL|
|  40|NULL|
+----+----+



### Przechodzenie RDD <-> DF

In [None]:
type(people)

pyspark.sql.dataframe.DataFrame

In [None]:
people

DataFrame[age: bigint, name: string]

In [None]:
people.collect()

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

DF -> RDD

In [None]:
type(people.rdd)

pyspark.rdd.RDD

In [None]:
people.rdd.collect()

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

In [None]:
# Convert every Row of the underlying RDD to a plain tuple; the field names
# are dropped and only the values remain.
people.rdd.map(tuple).collect()

[(None, 'Michael'), (30, 'Andy'), (19, 'Justin')]

RDD -> DF

In [None]:
people.rdd.toDF()

DataFrame[age: bigint, name: string]

In [None]:
# An RDD of plain tuples carries no column names, so toDF() called without
# arguments falls back to the generated names _1, _2.
people.rdd.map(tuple).toDF().collect()

[Row(_1=None, _2='Michael'), Row(_1=30, _2='Andy'), Row(_1=19, _2='Justin')]

Podanie schematu wprost przy tworzeniu DF

Typy danych: http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#module-pyspark.sql.types <br>
https://spark.apache.org/docs/latest/sql-programming-guide.html#data-types <br>
Kilka podstawowych: IntegerType, DoubleType, FloatType, StringType, BooleanType, NullType

In [None]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
# Declare the schema explicitly: V1 is an integer column and V2 a string
# column (the sample rows below still supply integers for V2).
schema = (
    StructType()
    .add("V1", IntegerType())
    .add("V2", StringType())
)
df = spark.createDataFrame(data=[[1, 2], [3, 4]], schema=schema)

In [None]:
# A DataFrame is still a list of rows
df.show()

+---+---+
| V1| V2|
+---+---+
|  1|  2|
|  3|  4|
+---+---+



> **TODO**: Z podanego RDD utworz DataFrame. <br>
Wskazowka: DataFrame latwo utworzyc z RDD, ktorego elementami sa obiekty Row.<br>

In [None]:
# Read people.txt through the SparkContext; this yields an RDD, not a
# DataFrame.
myRDD = sc.textFile('people.txt')
# Print the type to confirm that the result is an RDD.
print(type(myRDD))

<class 'pyspark.rdd.RDD'>


In [None]:
myRDD

people.txt MapPartitionsRDD[112] at textFile at NativeMethodAccessorImpl.java:0

#### Ogolne wiadomosci na temat danych
printSchema, show, columns, dtypes <br>
Znane z RDD, np.: count, take, head

In [None]:
people.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
people.printSchema() ## Print the schema in a tree format

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [None]:
employees.printSchema()

root
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [None]:
people_txt.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: double (nullable = true)



In [None]:
people_txt.show(1)

+-------+----+
|    _c0| _c1|
+-------+----+
|Michael|29.0|
+-------+----+
only showing top 1 row



In [None]:
people.show(1)

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
+----+-------+
only showing top 1 row



In [None]:
people_txt.show(1)

+-------+----+
|    _c0| _c1|
+-------+----+
|Michael|29.0|
+-------+----+
only showing top 1 row



Liczba wierszy

In [None]:
people.count()

3

Lista kolumn

In [None]:
people.columns

['age', 'name']

Lista kolumn wraz z typami danych

In [None]:
people.dtypes

[('age', 'bigint'), ('name', 'string')]

#### Odwolania do poszczegolnych kolumn

In [None]:
people.age

Column<'age'>

In [None]:
people['age']

Column<'age'>

In [None]:
# Columns can also be addressed by position: index 0 is the first column,
# which is age in this DataFrame.
people[0]

Column<'age'>

#### Dodanie/usuniecie kolumny
withColumn, drop

In [None]:
people.show(2)

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
+----+-------+
only showing top 2 rows



In [None]:
# Append a derived column holding age + 1 (a null age stays null) and
# display the first two rows.
people.withColumn('ageNextYear', people['age'] + 1).show(2)

+----+-------+-----------+
| age|   name|ageNextYear|
+----+-------+-----------+
|NULL|Michael|       NULL|
|  30|   Andy|         31|
+----+-------+-----------+
only showing top 2 rows



In [None]:
people.drop('age').show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



#### Zmiany nazwy kolumny

DataFrame people_txt ma domyslne nazwy kolumn ze Sparka. Niewiele nam one mowia.

In [None]:
# Replace Spark's default column names (_c0, _c1) with meaningful ones.
people_txt = (
    people_txt
    .withColumnRenamed('_c0', 'name')
    .withColumnRenamed('_c1', 'age')
)
# Display the first row to check the new header.
people_txt.show(1)

#### Podstawowe statystyki kolumn w DataFrames
describe

In [None]:
# describe() only builds a DataFrame of summary statistics; show() is
# required to actually compute and print them.
people.describe().show()

In [None]:
# Summary statistics restricted to the salary column.
employees.describe('salary').show()

#### Braki danych
isNull, isNotNull<br>
fillna, dropna, replace

In [None]:
newPeopleDF.show()

In [None]:
# Keep only the rows whose age is null.
newPeopleDF.filter(newPeopleDF.age.isNull()).show()

In [None]:
# Keep only the rows whose age is not null.
newPeopleDF.filter(newPeopleDF.age.isNotNull()).show()

In [None]:
# Replace nulls with -1.
# NOTE(review): per the PySpark fillna documentation a numeric fill value is
# applied only to numeric columns, so null names should stay null — confirm.
newPeopleDF.fillna(-1).show()

In [None]:
# Per-column fill values: -1 for a missing age, 'unknown' for a missing name.
newPeopleDF.fillna({'age':-1, 'name':'unknown'}).show()

In [None]:
# Fill missing names with 'unknown', then swap that placeholder for 'NN'
# to demonstrate replace().
newPeopleDF.fillna({'name':'unknown'}).replace('unknown', 'NN').show()

In [None]:
# Drop every row that contains a null in any column (dropna's documented
# default is how='any').
newPeopleDF.dropna().show()

In [None]:
# Only consider the age column when dropping: rows with a null age are
# removed, rows with a null name are kept.
newPeopleDF.dropna(subset='age').show()

In [None]:
newPeopleDF.dropna?

#### Funkcje wprost ze skladni SQL
select, where (wymienne z filter), orderBy

In [None]:
people.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
# Names containing the letter 'n', ordered by ascending age.
# The filter and the sort run while the age column is still part of the
# DataFrame, and the projection to the name column comes last; the original
# order sorted on a column that select('name') had already removed from the
# result. The displayed rows are the same.
(
    people
    .where(people.name.like('%n%'))
    .orderBy(people.age.asc())
    .select('name')
    .show()
)

+------+
|  name|
+------+
|Justin|
|  Andy|
+------+



> **TODO**: Wyswietl imiona ludzi ze zbioru 'people' starszych niz 29 lat

#### Operacje na zbiorach
union - dziala jak UNION ALL w SQL. <br>
intersect (INTERSECT z SQLa), subtract (EXCEPT z SQLa)

In [None]:
people.union?

In [None]:
people.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
people_txt.show()

+-------+----+
|    _c0| _c1|
+-------+----+
|Michael|29.0|
|   Andy|30.0|
| Justin|19.0|
+-------+----+



In [None]:
newPeopleDF.show()

+----+----+
|name| age|
+----+----+
|Greg|  32|
|  24|NULL|
|NULL|NULL|
|  33|NULL|
|NULL|NULL|
|  32|NULL|
|  40|NULL|
+----+----+



In [None]:
# Mount Google Drive into the runtime under /content/drive.
# NOTE(review): google.colab is presumably only importable inside Colab, and
# the data files earlier in this notebook are read via relative paths before
# this mount happens — confirm whether this cell should run first.
from google.colab import drive
drive.mount('/content/drive')