## DataFrame

A DataFrame is a distributed collection of data organized in a structured way: as rows and named columns.

1. A DataFrame is an abstraction built on top of RDDs.
2. It is similar to a table in a relational (SQL) database.
3. It enables query and other optimizations.

### Advantages
1. Optimized Execution
2. Ease of Use
3. Integration with Eco-System: Integrated easily with MLlib, Streaming and other extensions
4. Built-in Optimization
5. Powerful Data Conversions

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the session that every later cell refers to as `spark`.
spark = (
    SparkSession.builder
    .appName("MySparkApp-DataFrame")
    .getOrCreate()
)

In [2]:
spark

#### Creating DataFrame

In [3]:
# Sample records as plain tuples, one tuple per row.
data = [
    (1, "Paul", 32),
    (2, "Tina", 45),
    (3, "John", 28),
]

In [4]:
# No schema is passed, so Spark falls back to the default column names _1, _2, _3.
df = spark.createDataFrame(data=data)

In [5]:
df.show()

+---+----+---+
| _1|  _2| _3|
+---+----+---+
|  1|Paul| 32|
|  2|Tina| 45|
|  3|John| 28|
+---+----+---+



In [6]:
# A list of names as the schema sets the column names; the column types are still inferred.
df = spark.createDataFrame(data, ["id", "name", "age"])

In [7]:
df.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  1|Paul| 32|
|  2|Tina| 45|
|  3|John| 28|
+---+----+---+



In [8]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



#### DataFrame with External Schema

1. StructType
2. StructField
3. Data Types: LongType, StringType, IntegerType, ArrayType, MapType

In [9]:
# Import only the type classes that are used, rather than a wildcard import,
# so it is clear where each name comes from.
from pyspark.sql.types import (
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema: one StructField per column, giving its name, data type and nullability.
schema = StructType([
    StructField(name="id", dataType=LongType(), nullable=False),
    StructField(name="name", dataType=StringType(), nullable=False),
    StructField(name="age", dataType=IntegerType(), nullable=True)
])

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()

root
 |-- id: long (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = true)



In [10]:
df.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  1|Paul| 32|
|  2|Tina| 45|
|  3|John| 28|
+---+----+---+



#### ArrayType

In [12]:
# Each record pairs an order id with the list of item names in that order.
data = [
    (100, ["PC", "Monitor", "Keyboard"]),
    (101, ["Laptop", "Speaker"]),
    (102, ["Mouse", "Adapter"]),
    (103, ["Headphone"]),
]

In [13]:
# StructType and StructField are imported here as well, so this cell does not
# depend on names leaked by an earlier cell's import.
from pyspark.sql.types import ArrayType, IntegerType, StringType, StructField, StructType

# `items` holds a list of strings per order, hence ArrayType with a StringType element.
schema = StructType([
    StructField(name="order_id", dataType=IntegerType(), nullable=False),
    StructField(name="items", dataType=ArrayType(elementType=StringType()), nullable=False)
])

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()

root
 |-- order_id: integer (nullable = false)
 |-- items: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [16]:
df.show(truncate=False)

+--------+-----------------------+
|order_id|items                  |
+--------+-----------------------+
|100     |[PC, Monitor, Keyboard]|
|101     |[Laptop, Speaker]      |
|102     |[Mouse, Adapter]       |
|103     |[Headphone]            |
+--------+-----------------------+



#### MapType

In [30]:
# Each record pairs a user id with a dict of string attributes about that user.
data = [
    (1, {"name": "Paul", "gender": "male"}),
    (2, {"name": "Tina", "gender": "female"}),
    (3, {"name": "John", "gender": "male"}),
]

In [31]:
# StructType and StructField are imported here as well, so this cell does not
# depend on names leaked by an earlier cell's import.
from pyspark.sql.types import IntegerType, MapType, StringType, StructField, StructType

# `users_info` is a string-to-string map; `nullable` is not given for either field.
schema = StructType([
    StructField(name="user_id", dataType=IntegerType()),
    StructField(name="users_info", dataType=MapType(keyType=StringType(), valueType=StringType()))
])

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- users_info: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [33]:
df.show(truncate=False)

+-------+--------------------------------+
|user_id|users_info                      |
+-------+--------------------------------+
|1      |{gender -> male, name -> Paul}  |
|2      |{gender -> female, name -> Tina}|
|3      |{gender -> male, name -> John}  |
+-------+--------------------------------+



In [34]:
# One dict per record; the keys act as field names.
data = [
    {"id": 1, "name": "Paul", "age": 32},
    {"id": 2, "name": "Tina", "age": 45},
    {"id": 3, "name": "John", "age": 28},
]

In [35]:
# No schema is passed: the column names are taken from the dict keys.
df = spark.createDataFrame(data)
df.show()

+---+---+----+
|age| id|name|
+---+---+----+
| 32|  1|Paul|
| 45|  2|Tina|
| 28|  3|John|
+---+---+----+



In [36]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



#### Reading CSV File to DataFrame

In [9]:
# Read one CSV file: the first row supplies the column names and the column types are inferred.
df = spark.read.csv(
    path="resources/in/employee/employee_data_1.csv",
    header=True,
    inferSchema=True,
)
# Same read through the generic reader API:
#df = spark.read.format("csv").load("resources/in/employee/employee_data_1.csv", header=True, inferSchema=True)

In [10]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)



#### show() - Print or display the DataFrame records

In [None]:
df.show() # Display first 20 rows

+---+--------------+-----------+------+
| ID|          Name| Department|Salary|
+---+--------------+-----------+------+
|  1|      John Doe|Engineering| 50000|
|  2|    Jane Smith|  Marketing| 45000|
|  3|     Jim Brown|      Sales| 40000|
|  4|  Jackie White|         HR| 42000|
|  5|   Emily Davis|Engineering| 60000|
|  6| Michael Scott| Management| 75000|
|  7|    Pam Beesly|  Reception| 35000|
|  8|Dwight Schrute|      Sales| 50000|
|  9| Angela Martin| Accounting| 48000|
| 10|  Kevin Malone| Accounting| 45000|
| 11|Oscar Martinez| Accounting| 47000|
| 12|Stanley Hudson|      Sales| 46000|
+---+--------------+-----------+------+



In [14]:
df.show(5) # Display first N rows

+---+------------+-----------+------+
| ID|        Name| Department|Salary|
+---+------------+-----------+------+
|  1|    John Doe|Engineering| 50000|
|  2|  Jane Smith|  Marketing| 45000|
|  3|   Jim Brown|      Sales| 40000|
|  4|Jackie White|         HR| 42000|
|  5| Emily Davis|Engineering| 60000|
+---+------------+-----------+------+
only showing top 5 rows



In [19]:
df.show(n=5, truncate=10)

+---+----------+----------+------+
| ID|      Name|Department|Salary|
+---+----------+----------+------+
|  1|  John Doe|Enginee...| 50000|
|  2|Jane Smith| Marketing| 45000|
|  3| Jim Brown|     Sales| 40000|
|  4|Jackie ...|        HR| 42000|
|  5|Emily D...|Enginee...| 60000|
+---+----------+----------+------+
only showing top 5 rows



In [20]:
df.show(n=3, vertical=True)

-RECORD 0-----------------
 ID         | 1           
 Name       | John Doe    
 Department | Engineering 
 Salary     | 50000       
-RECORD 1-----------------
 ID         | 2           
 Name       | Jane Smith  
 Department | Marketing   
 Salary     | 45000       
-RECORD 2-----------------
 ID         | 3           
 Name       | Jim Brown   
 Department | Sales       
 Salary     | 40000       
only showing top 3 rows



#### Reading from multiple CSV files

In [3]:
# Passing a list of paths reads all the listed files into one DataFrame.
df = spark.read.csv(
    path=[
        "resources/in/employee/employee_data_1.csv",
        "resources/in/employee/employee_data_2.csv",
    ],
    header=True,
    inferSchema=True,
)

In [4]:
df.show()

+---+---------------+-----------------+------+
| ID|           Name|       Department|Salary|
+---+---------------+-----------------+------+
|  1|       John Doe|      Engineering| 50000|
|  2|     Jane Smith|        Marketing| 45000|
|  3|      Jim Brown|            Sales| 40000|
|  4|   Jackie White|               HR| 42000|
|  5|    Emily Davis|      Engineering| 60000|
|  6|  Michael Scott|       Management| 75000|
|  7|     Pam Beesly|        Reception| 35000|
|  8| Dwight Schrute|            Sales| 50000|
|  9|  Angela Martin|       Accounting| 48000|
| 10|   Kevin Malone|       Accounting| 45000|
| 11| Oscar Martinez|       Accounting| 47000|
| 12| Stanley Hudson|            Sales| 46000|
| 13|  Phyllis Vance|            Sales| 44000|
| 14|    Ryan Howard|             Temp| 30000|
| 15|   Kelly Kapoor| Customer Service| 37000|
| 16|Toby Flenderson|               HR| 43000|
| 17|  Creed Bratton|Quality Assurance| 38000|
| 18|   Andy Bernard|            Sales| 49000|
| 19|    Erin

#### Writing DataFrame to CSV File

In [5]:
# Start again from the single-file employee data before filtering.
df = spark.read.csv(
    path="resources/in/employee/employee_data_1.csv",
    header=True,
    inferSchema=True,
)
df.show()

+---+--------------+-----------+------+
| ID|          Name| Department|Salary|
+---+--------------+-----------+------+
|  1|      John Doe|Engineering| 50000|
|  2|    Jane Smith|  Marketing| 45000|
|  3|     Jim Brown|      Sales| 40000|
|  4|  Jackie White|         HR| 42000|
|  5|   Emily Davis|Engineering| 60000|
|  6| Michael Scott| Management| 75000|
|  7|    Pam Beesly|  Reception| 35000|
|  8|Dwight Schrute|      Sales| 50000|
|  9| Angela Martin| Accounting| 48000|
| 10|  Kevin Malone| Accounting| 45000|
| 11|Oscar Martinez| Accounting| 47000|
| 12|Stanley Hudson|      Sales| 46000|
+---+--------------+-----------+------+



In [110]:
# Keep only the rows whose Salary is at least 50000.
df = df.filter(df.Salary >= 50000)

In [111]:
df.show()

+---+--------------+-----------+------+
| ID|          Name| Department|Salary|
+---+--------------+-----------+------+
|  1|      John Doe|Engineering| 50000|
|  5|   Emily Davis|Engineering| 60000|
|  6| Michael Scott| Management| 75000|
|  8|Dwight Schrute|      Sales| 50000|
+---+--------------+-----------+------+



#### Download WinUtils from Hadoop

- Clone and Extract the Repo for WinUtils: https://github.com/cdarlint/winutils
- Resource: https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
- Set the environment variables: `HADOOP_HOME` = the Hadoop directory, and add `%HADOOP_HOME%\bin` to `Path`.

In [112]:
# Example write calls, kept commented out. The separator option is spelled `sep`.
#df.write.options(header=True, sep=",").csv("resources/out/high_paid_employee_data.csv")
#df.write.csv(path="hdfs://resources/out/employee_data", mode="overwrite", header=True, sep=",")
# NOTE(review): in the hdfs:// URI above, "resources" sits in the authority (host) position - confirm the intended HDFS path.

#### Reading from JSON File to DataFrame

- Read from single and multiple JSON files
- Read from line-separated JSON and multiline (JSON array) JSON files

In [17]:
# Line-separated JSON file; multiLine=False is also the default.
df = spark.read.json(
    path="resources/in/product_inventory/product_inventory_line_separated.json",
    multiLine=False,
)

In [18]:
df.show(5)

+----------+-----+---------+------------------+-----+
|  Category|Price|ProductID|       ProductName|Stock|
+----------+-----+---------+------------------+-----+
|Smartphone|  999|     P001|   Apple iPhone 15|   50|
|Smartphone|  899|     P002|Samsung Galaxy S23|   30|
|Headphones|  299|     P003|   Sony WH-1000XM5|  100|
|    Laptop| 1199|     P004|       Dell XPS 13|   20|
|    Laptop| 1399|     P005|   HP Spectre x360|   15|
+----------+-----+---------+------------------+-----+
only showing top 5 rows



In [19]:
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Price: long (nullable = true)
 |-- ProductID: string (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Stock: long (nullable = true)



In [25]:
# Multiline (JSON array) file, so multiLine must be switched on.
df = spark.read.json(
    path="resources/in/product_inventory/product_inventory_multiline.json",
    multiLine=True,
)

In [26]:
df.show(5)

+----------+-----+---------+------------------+-----+
|  Category|Price|ProductID|       ProductName|Stock|
+----------+-----+---------+------------------+-----+
|Smartphone|  999|     P001|   Apple iPhone 15|   50|
|Smartphone|  899|     P002|Samsung Galaxy S23|   30|
|Headphones|  299|     P003|   Sony WH-1000XM5|  100|
|    Laptop| 1199|     P004|       Dell XPS 13|   20|
|    Laptop| 1399|     P005|   HP Spectre x360|   15|
+----------+-----+---------+------------------+-----+
only showing top 5 rows



In [6]:
# Read several JSON files into one DataFrame by passing a list of paths.
df = spark.read.json(
    path=[
        "resources/in/product_inventory/product_inventory_part1.json",
        "resources/in/product_inventory/product_inventory_part2.json",
    ],
    multiLine=True,
)
# Alternative using a glob pattern instead of listing the files:
#df = spark.read.json(path="resources/in/product_inventory/product_inventory_part*.json", multiLine=True)

In [7]:
df.show()

+--------------+-----+---------+--------------------+-----+
|      Category|Price|ProductID|         ProductName|Stock|
+--------------+-----+---------+--------------------+-----+
|    Smartphone|  999|     P001|     Apple iPhone 15|   50|
|    Smartphone|  899|     P002|  Samsung Galaxy S23|   30|
|    Headphones|  299|     P003|     Sony WH-1000XM5|  100|
|        Laptop| 1199|     P004|         Dell XPS 13|   20|
|        Laptop| 1399|     P005|     HP Spectre x360|   15|
|    Headphones|  329|     P006|Bose QuietComfort 45|   80|
|      Wearable|  499|     P007|Apple Watch Series 9|   60|
|      Wearable|  399|     P008|Samsung Galaxy Wa...|   70|
|      E-Reader|  129|     P009|   Kindle Paperwhite|  120|
|   Accessories|   99|     P010|Logitech MX Master 3|   90|
|   Accessories|   69|     P011| Razer DeathAdder V2|  100|
|    Smartphone|  799|     P012|      Google Pixel 8|   40|
|        Laptop| 1499|     P013|      Lenovo Yoga 9i|   25|
|        Camera| 1999|     P014|        

In [8]:
df.count()

20

#### Writing DataFrame to JSON File

In [15]:
#df.write.json(path="resources/out/product_inventory", mode="append")

#### Reading from Parquet files to DataFrame

In [2]:
df = spark.read.parquet("resources/in/parquest_files/userdata1.parquet")

In [3]:
df.show(5)

+-------------------+---+----------+---------+--------------------+------+--------------+----------------+------------+---------+---------+--------------------+--------+
|  registration_dttm| id|first_name|last_name|               email|gender|    ip_address|              cc|     country|birthdate|   salary|               title|comments|
+-------------------+---+----------+---------+--------------------+------+--------------+----------------+------------+---------+---------+--------------------+--------+
|2016-02-03 13:25:29|  1|    Amanda|   Jordan|    ajordan0@com.com|Female|   1.197.201.2|6759521864920116|   Indonesia| 3/8/1971| 49756.53|    Internal Auditor|   1E+02|
|2016-02-03 22:34:03|  2|    Albert|  Freeman|     afreeman1@is.gd|  Male|218.111.175.34|                |      Canada|1/16/1968|150280.17|       Accountant IV|        |
|2016-02-03 06:39:31|  3|    Evelyn|   Morgan|emorgan2@altervis...|Female|  7.161.136.94|6767119071901597|      Russia| 2/1/1960|144972.51| Structural

In [4]:
df.count()

1000

In [2]:
# Alternative: read every Parquet file in the folder with a glob pattern.
#df = spark.read.parquet("resources/in/parquest_files/*.parquet")

# Several files are passed as separate path arguments (no list unpacking needed).
# NOTE(review): the previous version listed userdata1.parquet twice, which only
# duplicated its rows; this assumes userdata2.parquet exists in the same folder - confirm.
df = spark.read.parquet(
    "resources/in/parquest_files/userdata1.parquet",
    "resources/in/parquest_files/userdata2.parquet",
)

In [3]:
df.count()

2000

In [4]:
df.show(3)

+-------------------+---+----------+---------+--------------------+------+--------------+----------------+---------+---------+---------+-------------------+--------+
|  registration_dttm| id|first_name|last_name|               email|gender|    ip_address|              cc|  country|birthdate|   salary|              title|comments|
+-------------------+---+----------+---------+--------------------+------+--------------+----------------+---------+---------+---------+-------------------+--------+
|2016-02-03 13:25:29|  1|    Amanda|   Jordan|    ajordan0@com.com|Female|   1.197.201.2|6759521864920116|Indonesia| 3/8/1971| 49756.53|   Internal Auditor|   1E+02|
|2016-02-03 22:34:03|  2|    Albert|  Freeman|     afreeman1@is.gd|  Male|218.111.175.34|                |   Canada|1/16/1968|150280.17|      Accountant IV|        |
|2016-02-03 06:39:31|  3|    Evelyn|   Morgan|emorgan2@altervis...|Female|  7.161.136.94|6767119071901597|   Russia| 2/1/1960|144972.51|Structural Engineer|        |
+---

#### Writing DataFrame to Parquet File

In [7]:
#df.write.parquet(path="resources/out/parquest_files/", mode="overwrite")

#### partitionBy()
- Split the data into multiple small files based on specified columns.
- Use it carefully: it is recommended only for columns with a small, discrete set of values. If used on continuous values like name/id, it creates a very large number of folders.

In [None]:
#df.write.parquet(path="resources/out/parquest_files/user", mode="overwrite", partitionBy=["country"])

In [None]:
#df.write.parquet(path="resources/out/parquest_files/user", mode="overwrite", partitionBy=["country", "gender"])

#### Row()

In [12]:
from pyspark.sql import Row

# These Row objects carry values only, so the column names are supplied as the schema.
data = [
    Row("Laptop", 45000),
    Row("Monitor", 12000),
    Row("Keyboard", 2600),
    Row("Mouse", 1200),
]

df = spark.createDataFrame(data, ["item", "price"])

In [13]:
df.show()

+--------+-----+
|    item|price|
+--------+-----+
|  Laptop|45000|
| Monitor|12000|
|Keyboard| 2600|
|   Mouse| 1200|
+--------+-----+



In [16]:
# Row called with field names only gives a reusable record factory,
# e.g. Order("Laptop", 45000) builds one record with fields item and price.
Order = Row("item", "price")

data = [
    Order(item, price)
    for item, price in [("Laptop", 45000), ("Monitor", 12000), ("Keyboard", 2600), ("Mouse", 1200)]
]

# No schema argument here: the column names come from the Row fields.
df = spark.createDataFrame(data)

In [17]:
df.show()

+--------+-----+
|    item|price|
+--------+-----+
|  Laptop|45000|
| Monitor|12000|
|Keyboard| 2600|
|   Mouse| 1200|
+--------+-----+



#### Column()

In [18]:
from pyspark.sql.functions import lit

# lit() wraps a literal value in a Column. The result is deliberately not named `col`,
# so it does not shadow pyspark.sql.functions.col, which a later cell imports.
dummy_col = lit("Dummy")
type(dummy_col)

pyspark.sql.column.Column

In [19]:
# Item/price records as tuples, with the matching column names.
data = [
    ("Laptop", 45000),
    ("Monitor", 12000),
    ("Keyboard", 2600),
    ("Mouse", 1200),
]
schema = ["item", "price"]

df = spark.createDataFrame(data, schema)

In [20]:
df.show()

+--------+-----+
|    item|price|
+--------+-----+
|  Laptop|45000|
| Monitor|12000|
|Keyboard| 2600|
|   Mouse| 1200|
+--------+-----+



In [21]:
# Add a column holding the same literal value in every row.
df = df.withColumn(colName="country", col=lit("India"))
df.show()

+--------+-----+-------+
|    item|price|country|
+--------+-----+-------+
|  Laptop|45000|  India|
| Monitor|12000|  India|
|Keyboard| 2600|  India|
|   Mouse| 1200|  India|
+--------+-----+-------+



In [22]:
df.select(df.item).show()

+--------+
|    item|
+--------+
|  Laptop|
| Monitor|
|Keyboard|
|   Mouse|
+--------+



In [23]:
df.select(df["item"]).show()

+--------+
|    item|
+--------+
|  Laptop|
| Monitor|
|Keyboard|
|   Mouse|
+--------+



In [25]:
from pyspark.sql.functions import col
df.select(col("item")).show()

+--------+
|    item|
+--------+
|  Laptop|
| Monitor|
|Keyboard|
|   Mouse|
+--------+



In [27]:
df.select(df.item, df.country).show()

+--------+-------+
|    item|country|
+--------+-------+
|  Laptop|  India|
| Monitor|  India|
|Keyboard|  India|
|   Mouse|  India|
+--------+-------+



In [6]:
spark.stop()