In [0]:
# Read CSV file into DataFrame
csv_file_path = "/FileStore/tables/Online_Sales_Data_1.csv"
df = spark.read.csv(csv_file_path, header=True)



In [0]:
# Display schema of DataFrame
df.printSchema()

# Display content of DataFrame
df.show(5)

root
 |-- ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Units: string (nullable = true)
 |-- Unit_Price: string (nullable = true)
 |-- Total_Revenue: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Payment_Method: string (nullable = true)

+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+
|   ID|      Date|       Category|                Name|Units|Unit_Price|Total_Revenue|       Region|Payment_Method|
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+
|10001|01-01-2024|    Electronics|       iPhone 14 Pro|    2|    999.99|      1999.98|North America|   Credit Card|
|10002|02-01-2024|Home Appliances|    Dyson V11 Vacuum|    1|    499.99|       499.99|       Europe|        PayPal|
|10003|03-01-2024|       Clothing|    Levi's 501 Jeans|    3

In [0]:
# Select specific columns
selected_columns = df.select("ID", "Name", "Unit_Price")
print("Selected Columns:")
selected_columns.show(10)

Selected Columns:
+-----+--------------------+----------+
|   ID|                Name|Unit_Price|
+-----+--------------------+----------+
|10001|       iPhone 14 Pro|    999.99|
|10002|    Dyson V11 Vacuum|    499.99|
|10003|    Levi's 501 Jeans|     69.99|
|10004|   The Da Vinci Code|     15.99|
|10005|Neutrogena Skinca...|     89.99|
|10006|Wilson Evolution ...|     29.99|
|10007| MacBook Pro 16-inch|   2499.99|
|10008|Blueair Classic 480i|    599.99|
|10009|    Nike Air Force 1|     89.99|
|10010|Dune by Frank Her...|     25.99|
+-----+--------------------+----------+
only showing top 10 rows



Filter: Apply conditions to filter rows.


In [0]:
# Filter rows based on a condition
filtered_data = df.filter(df.Units > 5)
print("Filtered Data:", filtered_data.count())
filtered_data.show()

Filtered Data: 5
+-----+----------+--------+--------------------+-----+----------+-------------+------+--------------+
|   ID|      Date|Category|                Name|Units|Unit_Price|Total_Revenue|Region|Payment_Method|
+-----+----------+--------+--------------------+-----+----------+-------------+------+--------------+
|10009|09-01-2024|Clothing|    Nike Air Force 1|    6|     89.99|       539.94|  Asia|    Debit Card|
|10048|17-02-2024|  Sports|Yeti Rambler Tumbler|    6|     39.99|       239.94|  Asia|   Credit Card|
|10063|03-03-2024|Clothing|Hanes ComfortSoft...|   10|      9.99|         99.9|  Asia|    Debit Card|
|10084|24-03-2024|  Sports|Spalding NBA Stre...|    6|     24.99|       149.94|  Asia|   Credit Card|
|10099|08-04-2024|Clothing|Gap Essential Cre...|    6|     19.99|       119.94|  Asia|    Debit Card|
+-----+----------+--------+--------------------+-----+----------+-------------+------+--------------+



GroupBy: Group data based on specific columns
Aggregations: Perform functions like sum, average, etc., on grouped data.

In [0]:
# GroupBy and Aggregations
grouped_data = df.groupBy("Category").agg({"Units": "sum", "Unit_Price": "avg"})
print("Grouped and Aggregated Data:")
grouped_data.show()

Grouped and Aggregated Data:
+---------------+----------+------------------+
|       Category|sum(Units)|   avg(Unit_Price)|
+---------------+----------+------------------+
|         Sports|      88.0| 261.2839999999999|
|    Electronics|      66.0| 691.5915000000003|
|       Clothing|     145.0| 67.53649999999999|
|Beauty Products|      46.0|61.623000000000005|
|          Books|     114.0|16.153000000000006|
|Home Appliances|      59.0|320.18549999999993|
+---------------+----------+------------------+



Join: Combine multiple DataFrames based on specified columns.

In [0]:
# Join with another DataFrame
df2 = df.select("id", "category").limit(10)
joined_data = df.join(df2, "id", "inner")
print("Joined Data:")
joined_data.show()

Joined Data:
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+---------------+
|   ID|      Date|       Category|                Name|Units|Unit_Price|Total_Revenue|       Region|Payment_Method|       category|
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+---------------+
|10001|01-01-2024|    Electronics|       iPhone 14 Pro|    2|    999.99|      1999.98|North America|   Credit Card|    Electronics|
|10002|02-01-2024|Home Appliances|    Dyson V11 Vacuum|    1|    499.99|       499.99|       Europe|        PayPal|Home Appliances|
|10003|03-01-2024|       Clothing|    Levi's 501 Jeans|    3|     69.99|       209.97|         Asia|    Debit Card|       Clothing|
|10004|04-01-2024|          Books|   The Da Vinci Code|    4|     15.99|        63.96|North America|   Credit Card|          Books|
|10005|05-01-2024|Beauty Products|Neutrogena Skinca...|    1|  

Sort: Arrange rows based on one or more columns.

In [0]:
# Sort by a column
sorted_data = df.orderBy("Unit_Price")
print("Sorted Data:")
sorted_data.show(10)

Sorted Data:
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+
|   ID|      Date|       Category|                Name|Units|Unit_Price|Total_Revenue|       Region|Payment_Method|
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+
|10070|10-03-2024|          Books|The Great Gatsby ...|    2|     10.99|        21.98|North America|   Credit Card|
|10106|15-04-2024|          Books|The Girl on the T...|    4|     10.99|        43.96|North America|   Credit Card|
|10148|27-05-2024|          Books|The Outsiders by ...|    3|     10.99|        32.97|North America|   Credit Card|
|10190|08-07-2024|          Books|The Catcher in th...|    3|     10.99|        32.97|North America|   Credit Card|
|10214|01-08-2024|          Books|The Girl with the...|    3|     10.99|        32.97|North America|   Credit Card|
|10238|25-08-2024|          Books|The Handmaid's Ta...|    

In [0]:
# Sort by a column desc
from pyspark.sql.functions import col, desc
sorted_data = df.orderBy(col("Unit_Price").desc(), col("ID").desc())
print("Sorted Data Descending:")
sorted_data.show(10)

Sorted Data Descending:
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+
|   ID|      Date|       Category|                Name|Units|Unit_Price|Total_Revenue|       Region|Payment_Method|
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+
|10096|05-04-2024|         Sports| Garmin Fenix 6X Pro|    1|    999.99|       999.99|         Asia|   Credit Card|
|10001|01-01-2024|    Electronics|       iPhone 14 Pro|    2|    999.99|      1999.98|North America|   Credit Card|
|10091|31-03-2024|    Electronics|Logitech MX Maste...|    2|     99.99|       199.98|North America|   Credit Card|
|10085|25-03-2024|    Electronics| Ring Video Doorbell|    1|     99.99|        99.99|North America|   Credit Card|
|10020|20-01-2024|Home Appliances|Ninja Professiona...|    1|     99.99|        99.99|       Europe|        PayPal|
|10200|18-07-2024|Home Appliances|Instant Pot Du

Distinct: Get unique rows.

In [0]:
# Get distinct product category
distinct_rows = df.select("Category").distinct()
print("Distinct Product Categories:")
distinct_rows.show()

Distinct Product Categories:
+---------------+
|       Category|
+---------------+
|         Sports|
|    Electronics|
|       Clothing|
|Beauty Products|
|          Books|
|Home Appliances|
+---------------+



Drop: Remove specified columns.

In [0]:
# Drop columns
dropped_columns = df.drop("Units", "Category")
print("Dropped Columns:")
dropped_columns.show(10)

Dropped Columns:
+-----+----------+--------------------+----------+-------------+-------------+--------------+
|   ID|      Date|                Name|Unit_Price|Total_Revenue|       Region|Payment_Method|
+-----+----------+--------------------+----------+-------------+-------------+--------------+
|10001|01-01-2024|       iPhone 14 Pro|    999.99|      1999.98|North America|   Credit Card|
|10002|02-01-2024|    Dyson V11 Vacuum|    499.99|       499.99|       Europe|        PayPal|
|10003|03-01-2024|    Levi's 501 Jeans|     69.99|       209.97|         Asia|    Debit Card|
|10004|04-01-2024|   The Da Vinci Code|     15.99|        63.96|North America|   Credit Card|
|10005|05-01-2024|Neutrogena Skinca...|     89.99|        89.99|       Europe|        PayPal|
|10006|06-01-2024|Wilson Evolution ...|     29.99|       149.95|         Asia|   Credit Card|
|10007|07-01-2024| MacBook Pro 16-inch|   2499.99|      2499.99|North America|   Credit Card|
|10008|08-01-2024|Blueair Classic 480i|    

WithColumn: Add new calculated columns.

In [0]:
# Add a new calculated column
df_with_new_column = df.withColumn("Derived_Revenue", df.Units * df.Unit_Price)
print("DataFrame with New Column:")
df_with_new_column.show(10)

DataFrame with New Column:
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+------------------+
|   ID|      Date|       Category|                Name|Units|Unit_Price|Total_Revenue|       Region|Payment_Method|   Derived_Revenue|
+-----+----------+---------------+--------------------+-----+----------+-------------+-------------+--------------+------------------+
|10001|01-01-2024|    Electronics|       iPhone 14 Pro|    2|    999.99|      1999.98|North America|   Credit Card|           1999.98|
|10002|02-01-2024|Home Appliances|    Dyson V11 Vacuum|    1|    499.99|       499.99|       Europe|        PayPal|            499.99|
|10003|03-01-2024|       Clothing|    Levi's 501 Jeans|    3|     69.99|       209.97|         Asia|    Debit Card|209.96999999999997|
|10004|04-01-2024|          Books|   The Da Vinci Code|    4|     15.99|        63.96|North America|   Credit Card|             63.96|
|10005|05-01-2024|Beauty Pro

Alias: Rename columns for better readability.

In [0]:
# Rename columns using alias
df_with_alias = df.withColumnRenamed("Unit_Price", "Price")
print("DataFrame with Aliased Column:")
df_with_alias.show(10)

DataFrame with Aliased Column:
+-----+----------+---------------+--------------------+-----+-------+-------------+-------------+--------------+
|   ID|      Date|       Category|                Name|Units|  Price|Total_Revenue|       Region|Payment_Method|
+-----+----------+---------------+--------------------+-----+-------+-------------+-------------+--------------+
|10001|01-01-2024|    Electronics|       iPhone 14 Pro|    2| 999.99|      1999.98|North America|   Credit Card|
|10002|02-01-2024|Home Appliances|    Dyson V11 Vacuum|    1| 499.99|       499.99|       Europe|        PayPal|
|10003|03-01-2024|       Clothing|    Levi's 501 Jeans|    3|  69.99|       209.97|         Asia|    Debit Card|
|10004|04-01-2024|          Books|   The Da Vinci Code|    4|  15.99|        63.96|North America|   Credit Card|
|10005|05-01-2024|Beauty Products|Neutrogena Skinca...|    1|  89.99|        89.99|       Europe|        PayPal|
|10006|06-01-2024|         Sports|Wilson Evolution ...|    5|  29