In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one
# registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("Sales Data Analysis")
    .getOrCreate()
)


**PySpark for 5000 Sales Records**

In [2]:
# Load the sales CSV: the first row supplies column names and Spark infers
# the column types from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/sample_data/5000 Sales Records.csv")
)

+---------------+
|      Item Type|
+---------------+
|      Baby Food|
|         Snacks|
|      Beverages|
|         Cereal|
|         Snacks|
|  Personal Care|
|        Clothes|
|        Clothes|
|        Clothes|
|      Baby Food|
|Office Supplies|
|      Cosmetics|
|         Cereal|
|      Cosmetics|
|      Baby Food|
|           Meat|
|         Cereal|
|Office Supplies|
|      Beverages|
|      Baby Food|
+---------------+
only showing top 20 rows



**Q. Display the number of sales year-wise.**

In [4]:
from pyspark.sql.functions import year, to_date, col

# Parse the textual "Order Date" column (e.g. 7/5/2010) into a real date column.
df = df.withColumn(
    "Order_Date",
    to_date(col("Order Date"), "M/d/yyyy"),
)
df.show(5)

# Extract the calendar year of each order from the parsed date.
df = df.withColumn(
    "Year",
    year(col("Order_Date")),
)
df.show(5)

# One row per year: the number of order records placed in that year.
yearly_sales = (
    df.groupBy("Year")
    .count()
    .orderBy("Year")
    .withColumnRenamed("count", "Yearly Sales")
)
yearly_sales.show()

+--------------------+--------------------+---------+-------------+--------------+----------+---------+---------+----------+----------+---------+-------------+----------+------------+----------+----+
|              Region|             Country|Item Type|Sales Channel|Order Priority|Order Date| Order ID|Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|Order_Date|Year|
+--------------------+--------------------+---------+-------------+--------------+----------+---------+---------+----------+----------+---------+-------------+----------+------------+----------+----+
|Central America a...|Antigua and Barbuda |Baby Food|       Online|             M|12/20/2013|957081544|1/11/2014|       552|    255.28|   159.42|    140914.56|  87999.84|    52914.72|2013-12-20|2013|
|Central America a...|              Panama|   Snacks|      Offline|             C|  7/5/2010|301644504|7/26/2010|      2167|    152.58|    97.44|    330640.86| 211152.48|   119488.38|2010-07-05|2010|


**Q.Display the number of orders for each item**

In [7]:
# Count the order records for every item type, listed alphabetically.
item_sales = (
    df.groupBy("Item Type")
    .count()
    .orderBy("Item Type")
    .withColumnRenamed("count", "Total Sales")
)
item_sales.show()

+---------------+-----------+
|      Item Type|Total Sales|
+---------------+-----------+
|      Baby Food|        445|
|      Beverages|        447|
|         Cereal|        385|
|        Clothes|        386|
|      Cosmetics|        424|
|         Fruits|        447|
|      Household|        424|
|           Meat|        399|
|Office Supplies|        420|
|  Personal Care|        415|
|         Snacks|        398|
|     Vegetables|        410|
+---------------+-----------+



**Q. Display the country with the highest sales.**

In [10]:
from pyspark.sql.functions import col, sum, desc

# Total revenue per country, largest first.
country_wise_sales = (
    df.groupBy("Country")
    .sum("Total Revenue")
    .orderBy(col("sum(Total Revenue)").desc())
)

# The first row of the descending ranking is the top-revenue country.
top_country = country_wise_sales.limit(1)
top_country.show()

+-------+-------------------+
|Country| sum(Total Revenue)|
+-------+-------------------+
| Rwanda|6.039873958999999E7|
+-------+-------------------+



**PySpark For retailsales data**

In [13]:
# Load the retail sales CSV (header row present, column types inferred).
# NOTE: this rebinds `df`, replacing the 5000-sales-records frame loaded earlier.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/sample_data/retailsales.csv")
)


In [18]:
from pyspark.sql.functions import col, month, year, sum as _sum, to_date,udf,lower
from pyspark.sql.types import IntegerType

def calculate_fiscal_year(country, year, month):
    """Return the fiscal year (labelled by its starting calendar year) of a sale.

    Rules implemented:
      * India: months 1-3 belong to the fiscal year that started the previous
        calendar year (fiscal year starts in April).
      * USA: months 1-9 belong to the fiscal year that started the previous
        calendar year (fiscal year starts in October).
      * Any other country: the fiscal year equals the calendar year.

    Args:
        country: Country name; matched case-insensitively, ignoring surrounding
            whitespace. ``None`` is treated like an unrecognized country.
        year: Calendar year of the sale, or ``None``.
        month: Calendar month (1-12) of the sale, or ``None``.

    Returns:
        The fiscal year as an int, or ``None`` when ``year`` or ``month`` is
        missing (e.g. the sale date could not be parsed), so the UDF yields a
        null instead of failing the whole Spark job.
    """
    # Spark passes SQL NULLs to Python UDFs as None; comparing None with an
    # int would raise TypeError, so bail out early.
    if year is None or month is None:
        return None

    normalized_country = country.strip().lower() if country is not None else ""

    if normalized_country == "india" and month < 4:
        return year - 1
    if normalized_country == "usa" and month < 10:
        return year - 1
    return year

# Expose the plain-Python fiscal-year rule as a Spark UDF returning an integer.
fiscal_udf = udf(calculate_fiscal_year, IntegerType())

# Parse Sales_Date (MM/dd/yyyy) and derive the calendar year and month from it.
df_with_date_parts = (
    df.withColumn("DateParsed", to_date(col("Sales_Date"), "MM/dd/yyyy"))
    .withColumn("Year", year(col("DateParsed")))
    .withColumn("Month", month(col("DateParsed")))
)

# Attach the country-specific fiscal year to every row.
df_with_fiscal = df_with_date_parts.withColumn(
    "FiscalYear",
    fiscal_udf(col("Country"), col("Year"), col("Month")),
)

df_with_fiscal.show(truncate=False)

+----------+------+-------+-------+----------+----+-----+----------+
|Sales_Date|Amount|Country|Product|DateParsed|Year|Month|FiscalYear|
+----------+------+-------+-------+----------+----+-----+----------+
|01/23/2005|350000|India  |bear   |2005-01-23|2005|1    |2004      |
|01/27/2005|380000|India  |visky  |2005-01-27|2005|1    |2004      |
|02/12/2005|450000|India  |Rum    |2005-02-12|2005|2    |2004      |
|01/23/2006|500000|USA    |bear   |2006-01-23|2006|1    |2005      |
|01/27/2006|550000|USA    |rum    |2006-01-27|2006|1    |2005      |
|02/12/2006|650000|USA    |Visky  |2006-02-12|2006|2    |2005      |
|01/23/2006|500000|China  |Beer   |2006-01-23|2006|1    |2006      |
|01/27/2006|550000|China  |Visky  |2006-01-27|2006|1    |2006      |
|02/12/2006|658000|China  |Rum    |2006-02-12|2006|2    |2006      |
+----------+------+-------+-------+----------+----+-----+----------+



**Q.yearly sales report**

In [20]:

# Sum of Amount per fiscal year, in chronological order.
yearly_sales_report = (
    df_with_fiscal
    .groupBy("FiscalYear")
    .sum("Amount")
    .withColumnRenamed("sum(Amount)", "Total_Sales")
    .orderBy("FiscalYear")
)

yearly_sales_report.show(truncate=False)

# Persist the report; Spark writes a directory of part files at this path and
# replaces any previous run's output.
yearly_sales_report.write.mode("overwrite").csv(
    "/content/sample_data/yearly_sales_report.csv",
    header=True,
)

+----------+-----------+
|FiscalYear|Total_Sales|
+----------+-----------+
|2004      |1180000    |
|2005      |1700000    |
|2006      |1708000    |
+----------+-----------+



**Q. Yearly sum for all countries**

In [25]:
# Sum of Amount for every (fiscal year, country) pair.
yearly_country_sales = (
    df_with_fiscal
    .groupBy("FiscalYear", "Country")
    .sum("Amount")
    .withColumnRenamed("sum(Amount)", "Total_Sales")
    .orderBy("FiscalYear", "Country")
)

yearly_country_sales.show(truncate=False)

# Persist the report, replacing the output of any previous run.
yearly_country_sales.write.mode("overwrite").csv(
    "/content/sample_data/yearly_country_sales.csv",
    header=True,
)

+----------+-------+-----------+
|FiscalYear|Country|Total_Sales|
+----------+-------+-----------+
|2004      |India  |1180000    |
|2005      |USA    |1700000    |
|2006      |China  |1708000    |
+----------+-------+-----------+



**Q.Yearly sum for specified country**

In [28]:
from pyspark.sql.functions import lower, trim

specified_country = "India"

# Match the country case-insensitively and ignoring surrounding whitespace,
# the same normalization calculate_fiscal_year applies. An exact `==` would
# silently drop rows such as "india" or " India" that were nevertheless
# assigned an Indian fiscal year.
country_matches = lower(trim(col("Country"))) == specified_country.strip().lower()

# Sum of Amount per fiscal year for the selected country only.
yearly_sales_specific_country = (
    df_with_fiscal
    .filter(country_matches)
    .groupBy("FiscalYear")
    .sum("Amount")
    .withColumnRenamed("sum(Amount)", "Total_Sales")
    .orderBy("FiscalYear")
)

yearly_sales_specific_country.show(truncate=False)
yearly_sales_specific_country.write.csv("/content/sample_data/yearly_sales_specific_country.csv",header=True,mode="overwrite")

+----------+-----------+
|FiscalYear|Total_Sales|
+----------+-----------+
|2004      |1180000    |
+----------+-----------+



**Q. Yearly report, dumped into yr_sales_rep. We can also apply all the aggregation functions here.**



In [32]:
from pyspark.sql.functions import sum, avg, max, min


# Per-fiscal-year summary statistics of Amount. `sum`, `avg`, `max` and `min`
# here are the pyspark.sql.functions imported in this cell, not the builtins.
yr_sales_rep = (
    df_with_fiscal
    .groupBy("FiscalYear")
    .agg(
        sum("Amount").alias("Total_Sales"),
        avg("Amount").alias("Avg_Sales"),
        max("Amount").alias("Max_Sales"),
        min("Amount").alias("Min_Sales"),
    )
    .orderBy("FiscalYear")
)

yr_sales_rep.show(truncate=False)

# Persist the report, replacing the output of any previous run.
yr_sales_rep.write.mode("overwrite").csv(
    "/content/sample_data/yr_sales_rep.csv",
    header=True,
)

+----------+-----------+-----------------+---------+---------+
|FiscalYear|Total_Sales|Avg_Sales        |Max_Sales|Min_Sales|
+----------+-----------+-----------------+---------+---------+
|2004      |1180000    |393333.3333333333|450000   |350000   |
|2005      |1700000    |566666.6666666666|650000   |500000   |
|2006      |1708000    |569333.3333333334|658000   |500000   |
+----------+-----------+-----------------+---------+---------+



**Q. Monthly sales report of a particular year, dumped into mn_sales_rep**

In [37]:
from pyspark.sql.functions import sum, col

# Keep only the rows whose fiscal year is 2006.
filtered_df = df_with_fiscal.where(col("FiscalYear") == 2006)

# Sum of Amount per calendar month within that fiscal year, in month order.
mn_sales_rep = (
    filtered_df
    .groupBy("Month")
    .agg(sum("Amount").alias("Monthly_Sales"))
    .orderBy("Month")
)

mn_sales_rep.show(truncate=False)

# Persist the report, replacing the output of any previous run.
mn_sales_rep.write.mode("overwrite").csv(
    "/content/sample_data/mn_sales_rep.csv",
    header=True,
)


+-----+-------------+
|Month|Monthly_Sales|
+-----+-------------+
|1    |1050000      |
|2    |658000       |
+-----+-------------+



**Q. Quarterly report for each year**

In [39]:
from pyspark.sql.functions import when, sum, col

year_to_filter = 2006

# Restrict to the chosen fiscal year.
df_year = df_with_fiscal.where(col("FiscalYear") == year_to_filter)

# Label each row with its quarter based on the calendar Month column.
# There is no otherwise(): a Month outside 1-12 (or null) yields a null Quarter.
# NOTE(review): quarters are calendar quarters while the filter uses the
# fiscal year - confirm this mix is intended.
df_quartered = df_year.withColumn(
    "Quarter",
    when(col("Month").between(1, 3), "Q1")
    .when(col("Month").between(4, 6), "Q2")
    .when(col("Month").between(7, 9), "Q3")
    .when(col("Month").between(10, 12), "Q4"),
)

# Sum of Amount per quarter, in quarter order.
quarterly_sales_rep = (
    df_quartered
    .groupBy("Quarter")
    .agg(sum("Amount").alias("Quarterly_Sales"))
    .orderBy("Quarter")
)

quarterly_sales_rep.show(truncate=False)

# Persist the report, replacing the output of any previous run.
quarterly_sales_rep.write.mode("overwrite").csv(
    "/content/sample_data/quarterly_sales_rep.csv",
    header=True,
)

+-------+---------------+
|Quarter|Quarterly_Sales|
+-------+---------------+
|Q1     |1708000        |
+-------+---------------+



**Q. Quarterly report for each year. Write a UDF for that,
and cover all years.**


In [43]:
from pyspark.sql.functions import udf, col, sum
from pyspark.sql.types import StringType


def get_quarter(month):
    """Map a calendar month number to its quarter label.

    Args:
        month: Calendar month (1-12), or ``None`` when the source date is
            missing or could not be parsed.

    Returns:
        "Q1".."Q4" for months 1-12; "Invalid" for ``None`` or any value
        outside that range.
    """
    # Spark hands SQL NULLs to Python UDFs as None; `1 <= None` would raise
    # TypeError and abort the job, so route it to the existing fallback label.
    if month is None:
        return "Invalid"
    if 1 <= month <= 3:
        return "Q1"
    elif 4 <= month <= 6:
        return "Q2"
    elif 7 <= month <= 9:
        return "Q3"
    elif 10 <= month <= 12:
        return "Q4"
    else:
        return "Invalid"

# Expose get_quarter as a Spark UDF that returns the quarter label as a string.
quarter_udf = udf(get_quarter, StringType())

# Label every row with the quarter of its calendar Month.
df_with_quarter = df_with_fiscal.withColumn(
    "Quarter",
    quarter_udf(col("Month")),
)

# Sum of Amount for every (fiscal year, quarter) pair.
quarterly_sales_all_years = (
    df_with_quarter
    .groupBy("FiscalYear", "Quarter")
    .agg(sum("Amount").alias("Quarterly_Sales"))
    .orderBy("FiscalYear", "Quarter")
)

quarterly_sales_all_years.show(truncate=False)

# Persist the report, replacing the output of any previous run.
quarterly_sales_all_years.write.mode("overwrite").csv(
    "/content/sample_data/quarterly_sales_all_years.csv",
    header=True,
)

+----------+-------+---------------+
|FiscalYear|Quarter|Quarterly_Sales|
+----------+-------+---------------+
|2004      |Q1     |1180000        |
|2005      |Q1     |1700000        |
|2006      |Q1     |1708000        |
+----------+-------+---------------+



**Q. Half-yearly sales report, as above (for all years).
Report is dumped into all_hy_sales_rep**


In [46]:
from pyspark.sql.functions import when, col, sum, month, year, to_date

# Parse Sales_Date (MM/dd/yyyy) and derive the calendar year and month from it.
df_with_date_parts = (
    df.withColumn("DateParsed", to_date(col("Sales_Date"), "MM/dd/yyyy"))
    .withColumn("Year", year(col("DateParsed")))
    .withColumn("Month", month(col("DateParsed")))
)

# Months up to and including June are the first half; every other row
# (including rows with a null Month) falls through to "H2".
df_with_halfyear = df_with_date_parts.withColumn(
    "HalfYear",
    when(col("Month") <= 6, "H1").otherwise("H2"),
)

# Sum of Amount for every (calendar year, half-year) pair.
all_hy_sales_rep = (
    df_with_halfyear
    .groupBy("Year", "HalfYear")
    .agg(sum("Amount").alias("HalfYearly_Sales"))
    .orderBy("Year", "HalfYear")
)

all_hy_sales_rep.show(truncate=False)

# Persist the report, replacing the output of any previous run.
all_hy_sales_rep.write.mode("overwrite").csv(
    "/content/sample_data/all_hy_sales_rep.csv",
    header=True,
)

+----+--------+----------------+
|Year|HalfYear|HalfYearly_Sales|
+----+--------+----------------+
|2005|H1      |1180000         |
|2006|H1      |3408000         |
+----+--------+----------------+



**Q. For each year, monthly sales report**

In [50]:
from pyspark.sql.functions import sum, col, month, year, to_date


# Parse Sales_Date (MM/dd/yyyy) and derive the calendar year and month from it.
df_with_date_parts = (
    df.withColumn("DateParsed", to_date(col("Sales_Date"), "MM/dd/yyyy"))
    .withColumn("Year", year(col("DateParsed")))
    .withColumn("Month", month(col("DateParsed")))
)

# Sum of Amount for every (calendar year, month) pair, in chronological order.
monthly_sales_rep = (
    df_with_date_parts
    .groupBy("Year", "Month")
    .agg(sum("Amount").alias("Monthly_Sales"))
    .orderBy("Year", "Month")
)

monthly_sales_rep.show(truncate=False)

# Persist the report, replacing the output of any previous run.
monthly_sales_rep.write.mode("overwrite").csv(
    "/content/sample_data/monthly_sales_rep.csv",
    header=True,
)

+----+-----+-------------+
|Year|Month|Monthly_Sales|
+----+-----+-------------+
|2005|1    |730000       |
|2005|2    |450000       |
|2006|1    |2100000      |
|2006|2    |1308000      |
+----+-----+-------------+



**Q. For each year, quarterly sales report. Report is dumped into all_qrt_sales_rep**

In [52]:
from pyspark.sql.functions import udf, col, sum, month, year, to_date
from pyspark.sql.types import StringType


def get_quarter(month):
    """Map a calendar month number to its quarter label.

    Args:
        month: Calendar month (1-12), or ``None`` when the source date is
            missing or could not be parsed.

    Returns:
        "Q1".."Q4" for months 1-12; "Invalid" for ``None`` or any other value.
        Previously every non-matching value (including ``None``) was counted
        as "Q4", which silently inflated fourth-quarter totals; "Invalid"
        matches the label used by the notebook's earlier get_quarter.
    """
    if month in (1, 2, 3):
        return "Q1"
    if month in (4, 5, 6):
        return "Q2"
    if month in (7, 8, 9):
        return "Q3"
    if month in (10, 11, 12):
        return "Q4"
    return "Invalid"


# Expose get_quarter as a Spark UDF that returns the quarter label as a string.
quarter_udf = udf(get_quarter, StringType())

# Parse Sales_Date (MM/dd/yyyy) and derive the calendar year and month from it.
df_with_date_parts = (
    df.withColumn("DateParsed", to_date(col("Sales_Date"), "MM/dd/yyyy"))
    .withColumn("Year", year(col("DateParsed")))
    .withColumn("Month", month(col("DateParsed")))
)

# Label every row with the quarter of its calendar Month.
df_with_quarter = df_with_date_parts.withColumn(
    "Quarter",
    quarter_udf(col("Month")),
)

# Sum of Amount for every (calendar year, quarter) pair.
all_qrt_sales_rep = (
    df_with_quarter
    .groupBy("Year", "Quarter")
    .agg(sum("Amount").alias("Quarterly_Sales"))
    .orderBy("Year", "Quarter")
)

all_qrt_sales_rep.show(truncate=False)

# Persist the report, replacing the output of any previous run.
all_qrt_sales_rep.write.mode("overwrite").csv(
    "/content/sample_data/all_qrt_sales_rep.csv",
    header=True,
)

+----+-------+---------------+
|Year|Quarter|Quarterly_Sales|
+----+-------+---------------+
|2005|Q1     |1180000        |
|2006|Q1     |3408000        |
+----+-------+---------------+



**Q. For each year, half-yearly sales report. Report is dumped into fore_yr_sales_rep**

In [54]:
from pyspark.sql.functions import udf, col, sum, month, year, to_date
from pyspark.sql.types import StringType

def get_half_year(month):
    """Map a calendar month number to its half-year label.

    Args:
        month: Calendar month (1-12), or ``None`` when the source date is
            missing or could not be parsed.

    Returns:
        "H1" for months 1-6, "H2" for months 7-12, and "Invalid" for ``None``
        or any value outside 1-12.
    """
    # Spark hands SQL NULLs to Python UDFs as None; `None <= 6` would raise
    # TypeError and abort the job.
    if month is None:
        return "Invalid"
    if 1 <= month <= 6:
        return "H1"
    if 7 <= month <= 12:
        return "H2"
    # Out-of-range values used to be labelled H1/H2 silently.
    return "Invalid"


# Expose get_half_year as a Spark UDF that returns the half-year label as a string.
half_year_udf = udf(get_half_year, StringType())

# Parse Sales_Date (MM/dd/yyyy) and derive the calendar year and month from it.
df_with_date_parts = (
    df.withColumn("DateParsed", to_date(col("Sales_Date"), "MM/dd/yyyy"))
    .withColumn("Year", year(col("DateParsed")))
    .withColumn("Month", month(col("DateParsed")))
)

# Label every row with the half of the year its Month falls in.
df_with_half_year = df_with_date_parts.withColumn(
    "HalfYear",
    half_year_udf(col("Month")),
)

# Sum of Amount for every (calendar year, half-year) pair.
fore_yr_sales_rep = (
    df_with_half_year
    .groupBy("Year", "HalfYear")
    .agg(sum("Amount").alias("HalfYearly_Sales"))
    .orderBy("Year", "HalfYear")
)

fore_yr_sales_rep.show(truncate=False)

# Persist the report, replacing the output of any previous run.
fore_yr_sales_rep.write.mode("overwrite").csv(
    "/content/sample_data/fore_yr_sales_rep.csv",
    header=True,
)

+----+--------+----------------+
|Year|HalfYear|HalfYearly_Sales|
+----+--------+----------------+
|2005|H1      |1180000         |
|2006|H1      |3408000         |
+----+--------+----------------+



**Q. In a specific quarter, which product made more business?
Compare all quarters by product.**

In [55]:
# Read the retail sales CSV as raw text lines (RDD API, no schema).
rdd = spark.sparkContext.textFile("/content/sample_data/retailsales.csv")

# The first line is the column header; drop every line identical to it.
header = rdd.first()
data_rdd = rdd.filter(lambda line: line != header)

# Split each remaining line into its comma-separated fields.
# (The previous mid-cell `.collect()` calls were removed: their results were
# discarded, yet each one triggered a Spark job and pulled the entire dataset
# onto the driver.)
parsed_rdd = data_rdd.map(lambda line: line.split(","))
from datetime import datetime


def map_to_quarter(row):
    """Pair a parsed CSV row with the calendar quarter of its sale date.

    Args:
        row: Sequence of string fields; ``row[0]`` is expected to hold the sale
            date in ``MM/DD/YYYY`` form (surrounding whitespace is ignored).

    Returns:
        A ``(label, row)`` tuple where ``label`` is "Q1".."Q4", or "Invalid"
        when the date field is missing or cannot be parsed. ``row`` is returned
        unchanged.
    """
    try:
        date_str = row[0].strip()
        date_obj = datetime.strptime(date_str, "%m/%d/%Y")
    except (IndexError, AttributeError, TypeError, ValueError):
        # Only data problems are treated as "Invalid"; anything else (e.g. an
        # interrupt) must propagate instead of being silently swallowed.
        return ("Invalid", row)

    # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    quarter = (date_obj.month - 1) // 3 + 1
    return (f"Q{quarter}", row)


# Tag every parsed row with its quarter label, producing (label, row) pairs;
# rows whose date cannot be parsed are tagged "Invalid" by map_to_quarter.
quarter_rdd = parsed_rdd.map(map_to_quarter)


def get_amount(row):
    """Return the sale amount stored in ``row[1]`` as an int.

    Args:
        row: Sequence of fields whose second element holds the amount.

    Returns:
        The integer amount, or 0 when the field is missing or is not a valid
        integer, so malformed rows never win a max-by-amount comparison
        against rows with positive amounts.
    """
    try:
        return int(row[1])
    except (IndexError, TypeError, ValueError):
        # Catch only data problems; a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit.
        return 0


quarters = ["Q1", "Q2", "Q3", "Q4"]

# For every quarter, report the single row with the largest amount.
# NOTE(review): this picks the largest individual sale, not the product with
# the largest summed sales - confirm that is the intended comparison.
for q in quarters:
    # Rows tagged with the current quarter label.
    q_rdd = quarter_rdd.filter(lambda pair: pair[0] == q)

    if q_rdd.isEmpty():
        print(f"Top selling product in {q}: No data available")
        continue

    # Drop the quarter label and keep the row whose amount field is greatest.
    max_row = q_rdd.values().max(key=get_amount)
    print(f"Top selling product in {q}: {max_row}")


Top selling product in Q1: ['02/12/2006', '658000', 'China', 'Rum']
Top selling product in Q2: No data available
Top selling product in Q3: No data available
Top selling product in Q4: No data available
