In [1]:
# Initialise findspark before any pyspark import in the cells below.
# NOTE(review): presumably this is what makes the local Spark install importable — confirm against the findspark docs.
import findspark
findspark.init()

In [2]:

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark configuration for this notebook: local master and an application name.
# Further properties can be added with config.set("property", "value").
config = SparkConf()
config.setMaster("local")
config.setAppName("DataFrameBasic")

# SparkSession is the entry point for Spark SQL and the DataFrame API.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

# SparkContext that belongs to the session.
sc = spark.sparkContext

22/05/09 20:15:06 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/09 20:15:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/09 20:15:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 20:15:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/09 20:15:17 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/09 20:15:17 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
# spark dataframe
# Structured Data
# data + schema
# schema will contain columns and data types
# data - rows with columns as per schema
# The DataFrame engine and the Spark SQL engine are the same core
# A DataFrame internally wraps an RDD of Row objects (RDD[Row])
# A DataFrame is an alias (an API view) over that RDD; the actual data still lives in the RDD
# DataFrame is an API: when we call it, the calls are converted internally into
# various plans [logical, optimized, physical plans], and finally the physical plan
# is used to generate Java code, which is compiled to JVM bytecode (by the Janino compiler, not the Scala compiler)
# At execution time it is still RDD transformations and actions only

In [4]:
# Column names only: no data types are given, so Spark infers them by reading the data.
schema = ['product_id', 'product_name', 'price', 'brand_id', 'offer']

# One tuple per row, fields in the same order as `schema`:
# (product_id, product_name, price, brand_id, offer)
products = [
    (1, 'iPhone', 1000.0, 100, 0),
    (2, 'Galaxy', 545.50, 101, None),
    (3, 'Pixel', 645.99, 101, None),
]

productDf = spark.createDataFrame(products, schema)

# Every DataFrame carries a schema; print it.
productDf.printSchema()
# Render the rows as an ASCII table (show() displays up to 20 records by default).
productDf.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)



                                                                                

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+



In [5]:
# Every DataFrame has an RDD internally.
# The DataFrame is an API applied on top of that RDD.
# The backing RDD is an RDD of Row objects; each Row carries column names and values.
productDf.rdd.collect()

[Row(product_id=1, product_name='iPhone', price=1000.0, brand_id=100, offer=0),
 Row(product_id=2, product_name='Galaxy', price=545.5, brand_id=101, offer=None),
 Row(product_id=3, product_name='Pixel', price=645.99, brand_id=101, offer=None)]

In [6]:
# Number of partitions of the RDD backing the DataFrame.
productDf.rdd.getNumPartitions()

1

In [7]:
# DataFrames have transformations and actions.
# filter() is a transformation: it is lazy and returns a new DataFrame
# (DataFrames are immutable); calling it alone runs no job and no action.
# filter() is the DataFrame API name for what SQL calls WHERE.
# Predicate written with Python column syntax:
df = productDf.filter(productDf.price <= 750)
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+



In [8]:
# select() API: projection onto a subset of columns.
# The resulting DataFrame's schema contains the selected columns with their data types.
df = productDf.select("product_name", "price")
df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)

+------------+------+
|product_name| price|
+------------+------+
|      iPhone|1000.0|
|      Galaxy| 545.5|
|       Pixel|645.99|
+------------+------+



In [9]:
# selectExpr() accepts SQL expression strings (functions, arithmetic, CAST, ...).
# SQL equivalent: SELECT product_name, upper(product_name), price, price * 0.9
df = productDf.selectExpr(
    "product_name",
    "upper(product_name)",
    "price",
    "price  * .9",
)

df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- upper(product_name): string (nullable = true)
 |-- price: double (nullable = true)
 |-- (price * 0.9): double (nullable = true)

+------------+-------------------+------+-----------------+
|product_name|upper(product_name)| price|    (price * 0.9)|
+------------+-------------------+------+-----------------+
|      iPhone|             IPHONE|1000.0|            900.0|
|      Galaxy|             GALAXY| 545.5|           490.95|
|       Pixel|              PIXEL|645.99|581.3910000000001|
+------------+-------------------+------+-----------------+



In [10]:
# selectExpr() with SQL aliases: mixes the Python API with SQL expression strings.
# SQL equivalent: SELECT product_name, upper(product_name) AS title, price, price * 0.9 AS grand_total
df = productDf.selectExpr(
    "product_name",
    "upper(product_name) as title",
    "price",
    "price  * .9 as grand_total",
)

df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price: double (nullable = true)
 |-- grand_total: double (nullable = true)

+------------+------+------+-----------------+
|product_name| title| price|      grand_total|
+------------+------+------+-----------------+
|      iPhone|IPHONE|1000.0|            900.0|
|      Galaxy|GALAXY| 545.5|           490.95|
|       Pixel| PIXEL|645.99|581.3910000000001|
+------------+------+------+-----------------+



In [11]:
# Derive a new column, offer_price, computed from the existing price column.
df = productDf.withColumn("offer_price", productDf["price"] * 0.9)
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)
 |-- offer_price: double (nullable = true)

+----------+------------+------+--------+-----+-----------------+
|product_id|product_name| price|brand_id|offer|      offer_price|
+----------+------------+------+--------+-----+-----------------+
|         1|      iPhone|1000.0|     100|    0|            900.0|
|         2|      Galaxy| 545.5|     101| null|           490.95|
|         3|       Pixel|645.99|     101| null|581.3910000000001|
+----------+------------+------+--------+-----+-----------------+



In [12]:
# Rename a column: "price" becomes "total" in the returned DataFrame.
df = productDf.withColumnRenamed("price", "total")
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- total: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+------+--------+-----+
|product_id|product_name| total|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+



In [13]:
# Drop a column: the returned DataFrame no longer has "brand_id".
df = productDf.drop("brand_id")
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+------+-----+
|product_id|product_name| price|offer|
+----------+------------+------+-----+
|         1|      iPhone|1000.0|    0|
|         2|      Galaxy| 545.5| null|
|         3|       Pixel|645.99| null|
+----------+------------+------+-----+



In [14]:
# filter() and where() are aliases of each other.
# Compound condition written as a Python expression; each comparison is
# parenthesised and the two are combined with &.
df = productDf.filter(
    (productDf["price"] >= 500) & (productDf.price < 600)
)
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+-----+--------+-----+
|product_id|product_name|price|brand_id|offer|
+----------+------------+-----+--------+-----+
|         2|      Galaxy|545.5|     101| null|
+----------+------------+-----+--------+-----+



In [15]:
# where() is the same operation as filter(); same condition as the previous cell.
df = productDf.where(
    (productDf["price"] >= 500) & (productDf.price < 600)
)
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+-----+--------+-----+
|product_id|product_name|price|brand_id|offer|
+----------+------------+-----+--------+-----+
|         2|      Galaxy|545.5|     101| null|
+----------+------------+-----+--------+-----+



In [22]:
# filter()/where() also accept the condition as a SQL expression string,
# mixing the PySpark API with SQL syntax.
df = productDf.where (" price >= 500 AND price < 600")
df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+-----+--------+-----+
|product_id|product_name|price|brand_id|offer|
+----------+------------+-----+--------+-----+
|         2|      Galaxy|545.5|     101| null|
+----------+------------+-----+--------+-----+



In [21]:
# drop() returns a new DataFrame; productDf itself keeps the brand_id column.
df = productDf.drop("brand_id")
productDf.printSchema()
print("df schema")
df.printSchema()

# withColumn() on an existing column name yields a new DataFrame with that
# column replaced; df is left as it was.
df2 = df.withColumn("price", df.price * 2)
df2.show()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

df schema
root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+-------+-----+
|product_id|product_name|  price|offer|
+----------+------------+-------+-----+
|         1|      iPhone| 2000.0|    0|
|         2|      Galaxy| 1091.0| null|
|         3|       Pixel|1291.98| null|
+----------+------------+-------+-----+

+----------+------------+------+-----+
|product_id|product_name| price|offer|
+----------+------------+------+-----+
|         1|      iPhone|1000.0|    0|
|         2|      Galaxy| 545.5| null|
|         3|       Pixel|645.99| null|
+----------+------------+------+-----+



In [23]:

from pyspark.sql.functions import col

# Ways to reference a column in PySpark; every form yields a Column object.
print(productDf.price)       # attribute access on the DataFrame
print(productDf['price'])    # item access on the DataFrame
print(col("price"))          # col() function, by column name


Column<'price'>
Column<'price'>
Column<'price'>


In [27]:
from pyspark.sql.functions import col, lit

# Add a constant column with lit() (literal), then derive "amount" from it.
df = (
    productDf
    .withColumn("qty", lit(4))
    .withColumn("amount", col("qty") * productDf.price)
)

df.printSchema()
df.show()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)
 |-- qty: integer (nullable = false)
 |-- amount: double (nullable = true)

+----------+------------+------+--------+-----+---+-------+
|product_id|product_name| price|brand_id|offer|qty| amount|
+----------+------------+------+--------+-----+---+-------+
|         1|      iPhone|1000.0|     100|    0|  4| 4000.0|
|         2|      Galaxy| 545.5|     101| null|  4| 2182.0|
|         3|       Pixel|645.99|     101| null|  4|2583.96|
+----------+------------+------+--------+-----+---+-------+



In [28]:
# Sort the data in ascending order of price.
df = productDf.sort("price") # as in Spark SQL, ascending is the default
df.show()

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
|         1|      iPhone|1000.0|     100|    0|
+----------+------------+------+--------+-----+



In [30]:
# Sort in descending order of price using the desc() function.
from pyspark.sql.functions import desc, asc
df = productDf.sort(desc("price"))
df.show()

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         3|       Pixel|645.99|     101| null|
|         2|      Galaxy| 545.5|     101| null|
+----------+------------+------+--------+-----+



In [31]:
# With a DataFrame reference at hand, the sort order can also be taken from
# the Column object itself.
# Ascending:
df = productDf.sort(productDf["price"].asc())
df.show()
# Descending:
df = productDf.sort(productDf["price"].desc())
df.show()

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
|         1|      iPhone|1000.0|     100|    0|
+----------+------------+------+--------+-----+

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         3|       Pixel|645.99|     101| null|
|         2|      Galaxy| 545.5|     101| null|
+----------+------------+------+--------+-----+



In [32]:
# fillna: replace null (not available) values.
# The original frame is shown first for comparison.
productDf.show()
df = productDf.fillna(value=0) # null values are replaced with 0
df.show()

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101|    0|
|         3|       Pixel|645.99|     101|    0|
+----------+------------+------+--------+-----+



In [35]:
# fillna limited to specific columns via `subset`.
# The original frame is shown first for comparison.
productDf.show()
df = productDf.fillna(value=0, subset=['offer']) # nulls in the 'offer' column are replaced with 0
df.show()

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+

+----------+------------+------+--------+-----+
|product_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iPhone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101|    0|
|         3|       Pixel|645.99|     101|    0|
+----------+------------+------+--------+-----+

