Importing necessary modules

In [1]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [2]:
import pyspark
import pyspark.sql.functions as F
import os

A spark session has been created

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Spark project')\
        .getOrCreate()

24/03/10 14:26:31 WARN Utils: Your hostname, DESKTOP-77VPNBL resolves to a loopback address: 127.0.1.1; using 172.25.173.185 instead (on interface eth0)
24/03/10 14:26:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/03/10 14:26:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Importing necessary datasets

In [4]:
schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("customer_id", StringType(), True),
    StructField("order_date", DateType(), True),
    StructField("location", StringType(), True),
    StructField("source_order", StringType(), True)
])

In [5]:
sales_df = spark.read.format("csv").option("inferschema","true").schema(schema).load("../dataset/sales.csv.txt")
display(sales_df)

DataFrame[product_id: int, customer_id: string, order_date: date, location: string, source_order: string]

In [6]:
sales_df.show()

+----------+-----------+----------+--------+------------+
|product_id|customer_id|order_date|location|source_order|
+----------+-----------+----------+--------+------------+
|         1|          A|2023-01-01|   India|      Swiggy|
|         2|          A|2022-01-01|   India|      Swiggy|
|         2|          A|2023-01-07|   India|      Swiggy|
|         3|          A|2023-01-10|   India|  Restaurant|
|         3|          A|2022-01-11|   India|      Swiggy|
|         3|          A|2023-01-11|   India|  Restaurant|
|         2|          B|2022-02-01|   India|      Swiggy|
|         2|          B|2023-01-02|   India|      Swiggy|
|         1|          B|2023-01-04|   India|  Restaurant|
|         1|          B|2023-02-11|   India|      Swiggy|
|         3|          B|2023-01-16|   India|      zomato|
|         3|          B|2022-02-01|   India|      zomato|
|         3|          C|2023-01-01|   India|      zomato|
|         1|          C|2023-01-01|      UK|      Swiggy|
|         6|  

In [7]:
schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("price", IntegerType(), True)
])

In [8]:
menu_df = spark.read.format("csv").option("inferschema","true").schema(schema).load("../dataset/menu.csv.txt")
display(sales_df)

DataFrame[product_id: int, customer_id: string, order_date: date, location: string, source_order: string]

In [9]:
menu_df.show()

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         1|       PIZZA| null|
|         2|     Chowmin| null|
|         3|    sandwich| null|
|         4|        Dosa| null|
|         5|     Biryani| null|
|         6|       Pasta| null|
+----------+------------+-----+



Deriving year, month and quarter from the given dates

In [10]:
from pyspark.sql.functions import month, quarter, year

In [11]:
sales_df = sales_df.withColumn('order_year', year(sales_df['order_date']))

In [13]:
sales_df = sales_df.withColumn('order_quarter', quarter(sales_df['order_date']))

In [15]:
sales_df = sales_df.withColumn('order_month', month(sales_df['order_date']))

In [16]:
sales_df.show()

+----------+-----------+----------+--------+------------+----------+-------------+-----------+
|product_id|customer_id|order_date|location|source_order|order_year|order_quarter|order_month|
+----------+-----------+----------+--------+------------+----------+-------------+-----------+
|         1|          A|2023-01-01|   India|      Swiggy|      2023|            1|          1|
|         2|          A|2022-01-01|   India|      Swiggy|      2022|            1|          1|
|         2|          A|2023-01-07|   India|      Swiggy|      2023|            1|          1|
|         3|          A|2023-01-10|   India|  Restaurant|      2023|            1|          1|
|         3|          A|2022-01-11|   India|      Swiggy|      2022|            1|          1|
|         3|          A|2023-01-11|   India|  Restaurant|      2023|            1|          1|
|         2|          B|2022-02-01|   India|      Swiggy|      2022|            1|          2|
|         2|          B|2023-01-02|   India|      