In [2]:
# findspark.init() is expected to make the pyspark package importable from the
# local Spark installation; keep this cell ahead of the pyspark imports below.
import findspark
findspark.init()

In [3]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark configuration: master "local", application name "DataFrameBasic".
# Further properties can be added with config.set("property", "value").
config = SparkConf().setMaster("local").setAppName("DataFrameBasic")

# SparkSession is the entry point for Spark SQL and the DataFrame API.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

# SparkContext that belongs to the session.
sc = spark.sparkContext

In [4]:
# spark dataframe
# Structured Data
# data + schema
# schema will contain columns and data types
# data - rows with columns as per schema
# The DataFrame core engine and the Spark SQL core engine are the same
# A DataFrame internally has an RDD: RDD[Row]
# A DataFrame is only an alias/wrapper; the actual data still lives in the RDD
# DataFrame is an API: when we call the API, the call is converted internally
# into various plans [logical, optimized, physical], and the final physical
# plan is used to generate Java bytecode using Scala
# When it comes to execution, it is still RDD transformations and actions only


In [38]:
# Column names only -- no data types are given, so Spark infers each column's
# type by reading the data.
schema = ['product_id', 'product_name', 'price', 'brand_id', 'offer']

# One tuple per product, fields in the same order as `schema`.
products = [
    (1, 'iPhone', 1000.0, 100, 0),
    (2, 'Galaxy', 545.50, 101, None),
    (3, 'Pixel', 645.99, 101, None),
]

productDf = spark.createDataFrame(products, schema)

# Every DataFrame carries a schema; print it as a tree.
productDf.printSchema()
# Print the rows as an ASCII table (show() displays up to 20 rows by default).
productDf.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+



In [7]:
# Every DataFrame has an RDD internally.
# The DataFrame is nothing but an API applied on top of that RDD.
# The RDD's elements are Row objects; each Row holds column name/value pairs.
productDf.rdd.collect()

[Row(product_id=1, product_name='iPhone', price=1000.0, brand_id=100),
 Row(product_id=2, product_name='Galaxy', price=545.5, brand_id=101),
 Row(product_id=3, product_name='Pixel', price=645.99, brand_id=101)]

In [8]:
# Number of partitions of the RDD that backs the DataFrame.
productDf.rdd.getNumPartitions()

1

In [9]:
# DataFrames offer both transformations and actions.
# A transformation returns a new DataFrame (DataFrames are immutable) and is
# lazy: building the filter below runs no job -- nothing executes until an
# action is called.
# Keep only the products priced at 750 or less.
df = productDf.filter(productDf.price <= 750)

In [10]:
# show() is an action: calling it runs a job that evaluates the lazy filter
# built in the previous cell and prints the resulting rows.
df.show() # executes the job

+----------+------------+------+--------+
|product_id|product_name| price|brand_id|
+----------+------------+------+--------+
|         2|      Galaxy| 545.5|     101|
|         3|       Pixel|645.99|     101|
+----------+------------+------+--------+



In [11]:
# select() is a projection: the result keeps only the listed columns.
df = productDf.select("product_name", "price")
df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)

+------------+------+
|product_name| price|
+------------+------+
|      iPhone|1000.0|
|      Galaxy| 545.5|
|       Pixel|645.99|
+------------+------+



In [13]:
# selectExpr() takes SQL expressions as strings (function calls, arithmetic,
# CAST, ...). Equivalent SQL:
#   SELECT product_name, upper(product_name), price, price * 0.9
# Without an alias, the expression text itself becomes the column name.
df = productDf.selectExpr(
    "product_name",
    "upper(product_name)",
    "price",
    "price  * .9",
)

df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- upper(product_name): string (nullable = true)
 |-- price: double (nullable = true)
 |-- (price * 0.9): double (nullable = true)

+------------+-------------------+------+-----------------+
|product_name|upper(product_name)| price|    (price * 0.9)|
+------------+-------------------+------+-----------------+
|      iPhone|             IPHONE|1000.0|            900.0|
|      Galaxy|             GALAXY| 545.5|           490.95|
|       Pixel|              PIXEL|645.99|581.3910000000001|
+------------+-------------------+------+-----------------+



In [14]:
# selectExpr() again, mixing Python and SQL: "<expr> as <name>" gives each
# computed column an explicit name. Equivalent SQL:
#   SELECT product_name, upper(product_name) AS title,
#          price, price * 0.9 AS grand_total
df = productDf.selectExpr(
    "product_name",
    "upper(product_name) as title",
    "price",
    "price  * .9 as grand_total",
)

df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price: double (nullable = true)
 |-- grand_total: double (nullable = true)

+------------+------+------+-----------------+
|product_name| title| price|      grand_total|
+------------+------+------+-----------------+
|      iPhone|IPHONE|1000.0|            900.0|
|      Galaxy|GALAXY| 545.5|           490.95|
|       Pixel| PIXEL|645.99|581.3910000000001|
+------------+------+------+-----------------+



In [15]:
# withColumn() adds a column derived from existing ones:
# offer_price = price * 0.9.
df = productDf.withColumn("offer_price", productDf["price"] * 0.9)
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer_price: double (nullable = true)

+----------+------------+------+--------+-----------------+
|product_id|product_name| price|brand_id|      offer_price|
+----------+------------+------+--------+-----------------+
|         1|      iPhone|1000.0|     100|            900.0|
|         2|      Galaxy| 545.5|     101|           490.95|
|         3|       Pixel|645.99|     101|581.3910000000001|
+----------+------------+------+--------+-----------------+



In [16]:
# Rename a column: "price" becomes "total" in the returned DataFrame.
df = productDf.withColumnRenamed("price", "total")
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- total: double (nullable = true)
 |-- brand_id: long (nullable = true)

+----------+------------+------+--------+
|product_id|product_name| total|brand_id|
+----------+------------+------+--------+
|         1|      iPhone|1000.0|     100|
|         2|      Galaxy| 545.5|     101|
|         3|       Pixel|645.99|     101|
+----------+------------+------+--------+



In [17]:
# Drop a column: the returned DataFrame no longer has "brand_id".
df = productDf.drop("brand_id")
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)

+----------+------------+------+
|product_id|product_name| price|
+----------+------------+------+
|         1|      iPhone|1000.0|
|         2|      Galaxy| 545.5|
|         3|       Pixel|645.99|
+----------+------------+------+



In [21]:
# filter() and where() are aliases of each other.
# Here the condition is a Python Column expression; each comparison is
# parenthesised before being combined with `&`: 500 <= price < 600.
df = productDf.filter(
    (productDf["price"] >= 500) & (productDf["price"] < 600)
)
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)

+----------+------------+-----+--------+
|product_id|product_name|price|brand_id|
+----------+------------+-----+--------+
|         2|      Galaxy|545.5|     101|
+----------+------------+-----+--------+



In [22]:
# where() is the same operation as filter(): 500 <= price < 600.
df = productDf.where(
    (productDf["price"] >= 500) & (productDf["price"] < 600)
)
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)

+----------+------------+-----+--------+
|product_id|product_name|price|brand_id|
+----------+------------+-----+--------+
|         2|      Galaxy|545.5|     101|
+----------+------------+-----+--------+



In [23]:
# filter()/where() also accept the condition as a SQL expression string,
# so PySpark calls and SQL can be mixed.
df = productDf.where(" price >= 500 AND price < 600")
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)

+----------+------------+-----+--------+
|product_id|product_name|price|brand_id|
+----------+------------+-----+--------+
|         2|      Galaxy|545.5|     101|
+----------+------------+-----+--------+



In [25]:
from pyspark.sql.functions import col

# Three ways to reference a column in PySpark; each yields a Column object.
print(productDf.price)     # attribute access on the DataFrame
print(productDf['price'])  # item access on the DataFrame
print(col("price"))        # col(): by name only, no DataFrame reference

Column<b'price'>
Column<b'price'>
Column<b'price'>


In [29]:
from pyspark.sql.functions import lit

# lit() (literal) turns a fixed constant into a Column.
# Add a constant "qty" column, then derive amount = qty * price.
df = (
    productDf
    .withColumn("qty", lit(4))
    .withColumn("amount", col("qty") * col("price"))
)

df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- qty: integer (nullable = false)
 |-- amount: double (nullable = true)

+----------+------------+------+--------+---+-------+
|product_id|product_name| price|brand_id|qty| amount|
+----------+------------+------+--------+---+-------+
|         1|      iPhone|1000.0|     100|  4| 4000.0|
|         2|      Galaxy| 545.5|     101|  4| 2182.0|
|         3|       Pixel|645.99|     101|  4|2583.96|
+----------+------------+------+--------+---+-------+



In [34]:
# Sort by price; passing a plain column name sorts in ascending order.
df = productDf.sort("price")
df.show()

+----------+------------+------+--------+
|product_id|product_name| price|brand_id|
+----------+------------+------+--------+
|         2|      Galaxy| 545.5|     101|
|         3|       Pixel|645.99|     101|
|         1|      iPhone|1000.0|     100|
+----------+------------+------+--------+



In [35]:
# Sort in descending order: wrap the column name with desc().
from pyspark.sql.functions import desc
df = productDf.sort(desc("price"))
df.show()

+----------+------------+------+--------+
|product_id|product_name| price|brand_id|
+----------+------------+------+--------+
|         1|      iPhone|1000.0|     100|
|         3|       Pixel|645.99|     101|
|         2|      Galaxy| 545.5|     101|
+----------+------------+------+--------+



In [36]:
# Alternative when a DataFrame reference is at hand: ask the Column itself
# for its ordering.
# Ascending by price.
df = productDf.sort(productDf["price"].asc())
df.show()
# Descending by price.
df = productDf.sort(productDf["price"].desc())
df.show()


+----------+------------+------+--------+
|product_id|product_name| price|brand_id|
+----------+------------+------+--------+
|         2|      Galaxy| 545.5|     101|
|         3|       Pixel|645.99|     101|
|         1|      iPhone|1000.0|     100|
+----------+------------+------+--------+

+----------+------------+------+--------+
|product_id|product_name| price|brand_id|
+----------+------------+------+--------+
|         1|      iPhone|1000.0|     100|
|         3|       Pixel|645.99|     101|
|         2|      Galaxy| 545.5|     101|
+----------+------------+------+--------+



In [39]:
# fillna: replace null (not available) values.
# Show the original first so the nulls in "offer" are visible.
productDf.show()
df = productDf.fillna(value=0) # nulls become 0; a numeric fill value applies to numeric columns
df.show()

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101|    0|
|         3|       Pixel|645.99|     101|    0|
+----------+------------+------+--------+-----+



In [42]:
# fillna restricted to specific columns: `subset` lists the columns to fill.
productDf.show()
df = productDf.fillna(value=0, subset=['offer']) # only nulls in "offer" are replaced with 0
df.show()

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101|    0|
|         3|       Pixel|645.99|     101|    0|
+----------+------------+------+--------+-----+

