In [1]:
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing session is reused
# if one is already active).
spark = (
    SparkSession.builder
    .appName("Retail Sales Big Data Analysis")
    .getOrCreate()
)


In [3]:
# Load the retail sales CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/retail_sales_dataset.csv")
)


In [4]:
# Quick look at the raw data: first five rows, then the inferred schema.
df.show(n=5)
df.printSchema()

# Final expression of the cell, so the row count is rendered as its output.
df.count()


+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|Transaction ID|      Date|Customer ID|Gender|Age|Product Category|Quantity|Price per Unit|Total Amount|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           500|         500|
|             5|2023-05-06|    CUST005|  Male| 30|          Beauty|       2|            50|         100|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
only showing top 5 rows
root
 |-- Transaction ID: integ

1000

In [5]:
# Display the column names of the loaded DataFrame as a Python list.
df.columns


['Transaction ID',
 'Date',
 'Customer ID',
 'Gender',
 'Age',
 'Product Category',
 'Quantity',
 'Price per Unit',
 'Total Amount']

In [6]:
# Keep only rows with no null in any column; the raw `df` is left untouched.
df_clean = df.na.drop()

# Row count after cleaning (compare with the raw count shown earlier).
df_clean.count()


1000

In [7]:
from pyspark.sql.functions import sum, col

# Global aggregate: add up "Total Amount" over all rows of df_clean.
# NOTE: `sum` here is pyspark.sql.functions.sum from the import above,
# which shadows Python's builtin `sum` in this notebook.
(
    df_clean
    .agg(sum(col("Total Amount")).alias("Total_Revenue"))
    .show()
)


+-------------+
|Total_Revenue|
+-------------+
|       456000|
+-------------+



In [8]:
# Revenue per product category, largest first.
(
    df_clean
    .groupBy("Product Category")
    .agg(sum(col("Total Amount")).alias("Category_Revenue"))
    .orderBy(col("Category_Revenue").desc())
    .show()
)


+----------------+----------------+
|Product Category|Category_Revenue|
+----------------+----------------+
|     Electronics|          156905|
|        Clothing|          155580|
|          Beauty|          143515|
+----------------+----------------+



In [9]:
# Total sales per gender, largest first.
# A grouped aggregate has no guaranteed row order, so sort explicitly to keep
# the displayed table stable across runs and consistent with the
# per-category aggregations in this notebook.
(
    df_clean
    .groupBy("Gender")
    .agg(sum(col("Total Amount")).alias("Total_Sales"))
    .orderBy("Total_Sales", ascending=False)
    .show()
)


+------+-----------+
|Gender|Total_Sales|
+------+-----------+
|Female|     232840|
|  Male|     223160|
+------+-----------+



In [10]:
# Units sold per product category, largest first.
(
    df_clean
    .groupBy("Product Category")
    .agg(sum(col("Quantity")).alias("Total_Quantity"))
    .orderBy(col("Total_Quantity").desc())
    .show()
)


+----------------+--------------+
|Product Category|Total_Quantity|
+----------------+--------------+
|        Clothing|           894|
|     Electronics|           849|
|          Beauty|           771|
+----------------+--------------+



This project uses PySpark, a distributed computing framework, to analyze a retail sales dataset.
Because PySpark processes data in parallel and scales across multiple machines, the same code can handle much larger datasets than single-machine tools such as pandas.