### INITIALISING PYSPARK AND STARTING SESSION

In [17]:
import findspark

# Locate the local Spark installation. This is kept ahead of the pyspark
# import below because findspark is what makes pyspark importable when it is
# not already on sys.path.
findspark.init()

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Big Data Analysis")
    .getOrCreate()
)

# Display the session object as the cell output to confirm Spark is up.
spark


### LOADING DATA SET

In [19]:
import os

# Location of the raw cafe sales CSV. Set the CAFE_SALES_CSV environment
# variable to point the notebook at a different copy of the file; when it is
# not set, the original local download path is used.
dataset_path = os.environ.get(
    "CAFE_SALES_CSV",
    r"C:\Users\MUSHARRAF\Downloads\archive\dirty_cafe_sales.csv",
)

# The first line of the file is a header row; ask Spark to infer column types.
dataset = spark.read.csv(dataset_path, header=True, inferSchema=True)

### DISPLAYING 15 ROWS

In [21]:
# Preview the first 15 rows of the raw data as loaded, before any cleaning.
dataset.show(n=15)

+--------------+--------+--------+--------------+-----------+--------------+--------+----------------+
|Transaction ID|    Item|Quantity|Price Per Unit|Total Spent|Payment Method|Location|Transaction Date|
+--------------+--------+--------+--------------+-----------+--------------+--------+----------------+
|   TXN_1961373|  Coffee|       2|           2.0|        4.0|   Credit Card|Takeaway|      2023-09-08|
|   TXN_4977031|    Cake|       4|           3.0|       12.0|          Cash|In-store|      2023-05-16|
|   TXN_4271903|  Cookie|       4|           1.0|      ERROR|   Credit Card|In-store|      2023-07-19|
|   TXN_7034554|   Salad|       2|           5.0|       10.0|       UNKNOWN| UNKNOWN|      2023-04-27|
|   TXN_3160411|  Coffee|       2|           2.0|        4.0|Digital Wallet|In-store|      2023-06-11|
|   TXN_2602893|Smoothie|       5|           4.0|       20.0|   Credit Card|    NULL|      2023-03-31|
|   TXN_4433211| UNKNOWN|       3|           3.0|        9.0|         ERR

### PRINTING DATA TYPES

In [23]:
# Print the column names and the types Spark assigned on load.
# NOTE(review): every column is reported as string in the output below even
# though inferSchema=True was requested - presumably because placeholder text
# such as "ERROR" / "UNKNOWN" appears in the numeric columns; confirm.
dataset.printSchema()

root
 |-- Transaction ID: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Price Per Unit: string (nullable = true)
 |-- Total Spent: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Transaction Date: string (nullable = true)



### HANDLING MISSING VALUES

In [25]:
# Keep only the rows in which no column is null.
# NOTE(review): this removes real nulls only. The placeholder strings "ERROR"
# and "UNKNOWN" seen in the preview are ordinary string values, so rows that
# contain them are kept (they still show up as items in the grouped counts).
dataset_clean = dataset.na.drop()

# Number of rows that remain after dropping nulls.
dataset_clean.count()

4550

### CONVERTING DATA TYPE

In [27]:
from pyspark.sql.functions import col

# Build the converted column first, then swap it in under the same name.
# Only "Price Per Unit" is converted here; "Quantity" and "Total Spent" keep
# the string type reported by printSchema().
# NOTE(review): presumably any value that is not numeric text becomes null
# after the cast - verify against the Spark configuration in use.
price_as_float = col("Price Per Unit").cast("float")
dataset_clean = dataset_clean.withColumn("Price Per Unit", price_as_float)

### GROUPING DATA

In [29]:
from pyspark.sql.functions import count

# One group per distinct value in the "Item" column.
rows_by_item = dataset_clean.groupBy("Item")

# Count the rows in each group.
# NOTE(review): the result column is named "Total_Spent" but holds a row
# count per item, not a sum of the "Total Spent" column.
sales_count = rows_by_item.agg(count("*").alias("Total_Spent"))
sales_count.show(n=15)


+--------+-----------+
|    Item|Total_Spent|
+--------+-----------+
|   Salad|        564|
|   ERROR|        136|
|     Tea|        505|
|Sandwich|        547|
|   Juice|        569|
| UNKNOWN|        159|
|Smoothie|        491|
|  Coffee|        503|
|    Cake|        539|
|  Cookie|        537|
+--------+-----------+



### PRICE STATISTICS

In [31]:
# Import the functions module under an alias rather than pulling avg/max/min
# into the notebook namespace: bare `max` and `min` would replace Python's
# built-in max()/min() for every cell that runs afterwards.
from pyspark.sql import functions as F

# Single-row summary (mean, largest, smallest) of the unit price over all
# rows of the cleaned data.
price_stats = dataset_clean.agg(
    F.avg("Price Per Unit").alias("average_price"),
    F.max("Price Per Unit").alias("max_price"),
    F.min("Price Per Unit").alias("min_price"),
)
price_stats.show()


+------------------+---------+---------+
|     average_price|max_price|min_price|
+------------------+---------+---------+
|2.9569831127339112|      5.0|      1.0|
+------------------+---------+---------+

