# Indian Takeaway Orders Dataset - Analysis using PySpark

In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MyApp").getOrCreate()
print(spark.version)

3.5.0


## Load CSV Files

#### Restaurant 1

In [8]:
# Load orders file
r1_orders_df = spark.read.csv("restaurant-1-orders.csv", header = True, inferSchema = True)
r1_orders_df.show(5)

+------------+----------------+-------------------+--------+-------------+--------------+
|Order Number|      Order Date|          Item Name|Quantity|Product Price|Total products|
+------------+----------------+-------------------+--------+-------------+--------------+
|       16118|03/08/2019 20:25|      Plain Papadum|       2|          0.8|             6|
|       16118|03/08/2019 20:25|   King Prawn Balti|       1|        12.95|             6|
|       16118|03/08/2019 20:25|        Garlic Naan|       1|         2.95|             6|
|       16118|03/08/2019 20:25|      Mushroom Rice|       1|         3.95|             6|
|       16118|03/08/2019 20:25|Paneer Tikka Masala|       1|         8.95|             6|
+------------+----------------+-------------------+--------+-------------+--------------+
only showing top 5 rows



In [9]:
# Load products price file
r1_prices_df = spark.read.csv("restaurant-1-products-price.csv", header = True, inferSchema = True)
r1_prices_df.show(5)

+-------------+-------------+
|    Item Name|Product Price|
+-------------+-------------+
|   Mint Sauce|          0.5|
|  Lime Pickle|          0.5|
|Mango Chutney|          0.5|
|    Red Sauce|          0.5|
|Onion Chutney|          0.5|
+-------------+-------------+
only showing top 5 rows



#### Restaurant 2

In [10]:
# Load orders file
r2_orders_df = spark.read.csv("restaurant-2-orders.csv", header = True, inferSchema = True)
r2_orders_df.show(5)

+--------+----------------+--------------------+--------+-------------+--------------+
|Order ID|      Order Date|           Item Name|Quantity|Product Price|Total products|
+--------+----------------+--------------------+--------+-------------+--------------+
|   25583|03/08/2019 21:58|Tandoori Mixed Grill|       1|        11.95|            12|
|   25583|03/08/2019 21:58|        Madras Sauce|       1|         3.95|            12|
|   25583|03/08/2019 21:58|       Mushroom Rice|       2|         3.95|            12|
|   25583|03/08/2019 21:58|         Garlic Naan|       1|         2.95|            12|
|   25583|03/08/2019 21:58|             Paratha|       1|         2.95|            12|
+--------+----------------+--------------------+--------+-------------+--------------+
only showing top 5 rows



In [11]:
# Load products price file
r2_prices_df = spark.read.csv("restaurant-2-products-price.csv", header = True, inferSchema = True)
r2_prices_df.show(5)

+-------------+-------------+
|    Item Name|Product Price|
+-------------+-------------+
|Onion Chutney|          0.5|
|   Mint Sauce|          0.5|
|Mango Chutney|          0.5|
|    Red Sauce|          0.5|
|  Lime Pickle|          0.5|
+-------------+-------------+
only showing top 5 rows



## Data Exploration

### Check Schemas

In [12]:
print("Restaurant 1 Orders Schema:")
r1_orders_df.printSchema()
print("Restaurant 2 Orders Schema:")
r2_orders_df.printSchema()
print("Restaurant 1 Prices Schema:")
r1_prices_df.printSchema()
print("Restaurant 2 Prices Schema:")
r2_prices_df.printSchema()

Restaurant 1 Orders Schema:
root
 |-- Order Number: integer (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Item Name: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Product Price: double (nullable = true)
 |-- Total products: integer (nullable = true)

Restaurant 2 Orders Schema:
root
 |-- Order ID: integer (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Item Name: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Product Price: double (nullable = true)
 |-- Total products: integer (nullable = true)

Restaurant 1 Prices Schema:
root
 |-- Item Name: string (nullable = true)
 |-- Product Price: double (nullable = true)

Restaurant 2 Prices Schema:
root
 |-- Item Name: string (nullable = true)
 |-- Product Price: double (nullable = true)



### Number of Total and Unique Orders

In [13]:
print("Restaurant 1 - Total rows:", r1_orders_df.count())
print("Restaurant 1 - Unique orders:", r1_orders_df.select("Order Number").distinct().count())

print("Restaurant 2 - Total rows:", r2_orders_df.count())
print("Restaurant 2 - Unique orders:", r2_orders_df.select("Order ID").distinct().count())

Restaurant 1 - Total rows: 74818
Restaurant 1 - Unique orders: 13397
Restaurant 2 - Total rows: 119183
Restaurant 2 - Unique orders: 19658


### Summary Stats of Quantities and Prices

In [14]:
print("Restaurant 1:")
r1_orders_df.describe(["Quantity", "Product Price"]).show()
print("Restaurant 2:")
r2_orders_df.describe(["Quantity", "Product Price"]).show()

Restaurant 1:
+-------+------------------+-----------------+
|summary|          Quantity|    Product Price|
+-------+------------------+-----------------+
|  count|             74818|            74818|
|   mean|  1.24356438290251|5.286491886982787|
| stddev|0.7982073410496792|  3.3382213559897|
|    min|                 1|              0.5|
|    max|                51|            17.95|
+-------+------------------+-----------------+

Restaurant 2:
+-------+------------------+------------------+
|summary|          Quantity|     Product Price|
+-------+------------------+------------------+
|  count|            119183|            119183|
|   mean|1.2488693857345428| 5.108173145502638|
| stddev|0.7022026750515845|3.2077047387844217|
|    min|                 1|               0.5|
|    max|                20|             17.95|
+-------+------------------+------------------+



### Top Selling Items by Quantity

In [15]:
# Register the DataFrame as a SQL temporary view
r1_orders_df.createOrReplaceTempView("r1_orders")

# SQL query
top_items_r1 = spark.sql("""
    SELECT `Item Name`, SUM(Quantity) AS Total_Quantity
    FROM r1_orders
    GROUP BY `Item Name`
    ORDER BY Total_Quantity DESC
    LIMIT 10
""")

print("Restaurant 1:")
top_items_r1.show()


Restaurant 1:
+--------------------+--------------+
|           Item Name|Total_Quantity|
+--------------------+--------------+
|       Plain Papadum|         10648|
|          Pilau Rice|          6367|
|          Plain Naan|          4983|
|         Garlic Naan|          3318|
|          Plain Rice|          2964|
|        Onion Bhajee|          2749|
|       Mango Chutney|          2504|
|Chicken Tikka Masala|          2473|
|             Chapati|          1935|
|          Mint Sauce|          1840|
+--------------------+--------------+



In [16]:
# Register the DataFrame as a SQL temporary view
r2_orders_df.createOrReplaceTempView("r2_orders")

# SQL query:
top_items_r2 = spark.sql("""
    SELECT `Item Name`, SUM(Quantity) AS Total_Quantity
    FROM r2_orders
    GROUP BY `Item Name`
    ORDER BY Total_Quantity DESC
    LIMIT 10
""")

print("Restaurant 2:")
top_items_r2.show()


Restaurant 2:
+--------------------+--------------+
|           Item Name|Total_Quantity|
+--------------------+--------------+
|       Plain Papadum|         18056|
|          Pilau Rice|         11754|
|                Naan|          8730|
|         Garlic Naan|          4809|
|         Bombay Aloo|          4336|
|       Mango Chutney|          4124|
|Chicken Tikka Masala|          3970|
|         Onion Bhaji|          3965|
|          Plain Rice|          3532|
|       Mushroom Rice|          3424|
+--------------------+--------------+



### Total Revenue per Order

In [21]:
rev_per_order_r1 = spark.sql("""
    SELECT `Order Number`, SUM(Quantity * `Product Price`) AS Revenue
    FROM r1_orders
    GROUP BY `Order Number`
    ORDER BY Revenue DESC
""")

print("Restaurant 1:")
rev_per_order_r1.show(10)

Restaurant 1:
+------------+------------------+
|Order Number|           Revenue|
+------------+------------------+
|        6769|            1242.0|
|        6768|            685.25|
|       15840| 660.4499999999999|
|        9412| 581.9999999999999|
|        9411| 460.7499999999997|
|        9374|432.04999999999995|
|        9413|289.64999999999986|
|        3976|268.49999999999994|
|        9804|231.89999999999995|
|        9414|224.84999999999988|
+------------+------------------+
only showing top 10 rows



In [22]:
rev_per_order_r2 = spark.sql("""
    SELECT `Order ID`, SUM(Quantity * `Product Price`) AS Revenue
    FROM r2_orders
    GROUP BY `Order ID`
    ORDER BY Revenue DESC
""")

print("Restaurant 2:")
rev_per_order_r2.show(10)

Restaurant 2:
+--------+------------------+
|Order ID|           Revenue|
+--------+------------------+
|    7952|             283.3|
|   13246|234.50000000000006|
|   14224|193.45000000000005|
|   19569|183.65000000000003|
|   22328|182.34999999999997|
|   24856|175.95000000000002|
|   21984|171.04999999999998|
|    5170|167.60000000000002|
|   24855|             167.0|
|   17228|166.60000000000002|
+--------+------------------+
only showing top 10 rows



(Change either Order ID or Order Number later)

### First and Last Orders

In [20]:
r1_orders_df.select("Order Date").agg({"Order Date": "min", "Order Date": "max"}).show()
r2_orders_df.select("Order Date").agg({"Order Date": "min", "Order Date": "max"}).show()

+----------------+
| max(Order Date)|
+----------------+
|31/12/2018 21:56|
+----------------+

+----------------+
| max(Order Date)|
+----------------+
|31/12/2018 21:07|
+----------------+

