In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below
# (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('Assesment')
    .getOrCreate()
)

**4. Number of New Users in January 2024: Count of users who joined in January 2024.**

In [7]:
from pyspark.sql.functions import col, month, year

# Load the users table: first row is the header, column types are inferred.
user_data = spark.read.csv('Users.csv', header=True, inferSchema=True)
user_data.show()

# Keep only the users whose join_date falls in January 2024.
joined_in_2024 = year(col('join_date')) == 2024
joined_in_january = month(col('join_date')) == 1
user_jan_24 = user_data.filter(joined_in_2024 & joined_in_january)

print(f'Count of user in January 2024 : {user_jan_24.count()} ')

+-------+---------+----------+
|user_id|user_name| join_date|
+-------+---------+----------+
|    101|    Alice|2023-05-10|
|    102|      Bob|2023-06-15|
|    103|  Charlie|2023-07-20|
|    104|     Dana|2023-08-25|
|    105|    Emily|2023-09-30|
+-------+---------+----------+

Count of user in January 2024 : 0 


**1. Monthly Active Users (MAU) for January 2024: Count of unique users active in January
2024.**

In [9]:
# Load the activity log: first row is the header, column types are inferred.
user_activity_df = spark.read.csv('UserActivity.csv', header=True, inferSchema=True)

# Restrict to activity rows dated in January 2024.
activity_in_2024 = year(col('activity_date')) == 2024
activity_in_january = month(col('activity_date')) == 1
user_activity_df_24 = user_activity_df.filter(activity_in_2024 & activity_in_january)

# Count each user_id once, however many activity rows it has.
user_activity_df_24_unique = (
    user_activity_df_24
    .select('user_id')
    .distinct()
    .count()
)
print(f'Monthly Active User for January 2024 :{user_activity_df_24_unique}')

Monthly Active User for January 2024 :5


**2. Total Sales Revenue for January 2024: Sum of sales in January 2024.**

In [11]:
from pyspark.sql.functions import sum as spark_sum

# Load the sales table: first row is the header, column types are inferred.
sales_df = spark.read.csv('Sales.csv', header=True, inferSchema=True)

# Keep only the sales dated in January 2024.
df_filtered = sales_df.filter((year(col('sale_date')) == 2024) & (month(col('sale_date')) == 1))

# Sum the amount column for January 2024. An aggregation without groupBy
# always yields exactly one row, so indexing [0] is safe.
total_sales_revenue = df_filtered.agg(spark_sum('amount').alias('total_sales_revenue')).collect()[0]['total_sales_revenue']

# Spark's sum is null (None in Python) when no rows match the filter or every
# amount is null; report that as zero revenue instead of printing "None".
if total_sales_revenue is None:
    total_sales_revenue = 0.0

print(f'Total sales revenue for January 2024: {total_sales_revenue}')

Total sales revenue for January 2024: 700.0


**3. Average Sale Amount Per Category for January 2024: Average sale amount per category in
January 2024.**

In [13]:

from pyspark.sql.functions import avg

# Load the sales table: first row is the header, column types are inferred.
sales_df = spark.read.csv('Sales.csv', header=True, inferSchema=True)

# Keep only the sales dated in January 2024.
sale_in_2024 = year(col('sale_date')) == 2024
sale_in_january = month(col('sale_date')) == 1
sales_df_24 = sales_df.filter(sale_in_2024 & sale_in_january)

# Mean of the amount column within each category_id.
avg_sales_per_categ = (
    sales_df_24
    .groupBy('category_id')
    .agg(avg('amount').alias('avg_sales_amount'))
)

print("Average Sale Amount Per Category for Jan 2024 :")
avg_sales_per_categ.show()

Average Sale Amount Per Category for Jan 2024 :
+-----------+----------------+
|category_id|avg_sales_amount|
+-----------+----------------+
|         C3|           200.0|
|         C1|           100.0|
|         C2|           150.0|
+-----------+----------------+



**5. Top Selling Product Category in January 2024: Product category with highest sales in
January 2024.**

In [15]:

# Load the sales and category tables: first row is the header, column types are inferred.
sales_df = spark.read.csv('Sales.csv', header=True, inferSchema=True)
categories_df = spark.read.csv('Categories.csv', header=True, inferSchema=True)

# Keep only the sales dated in January 2024.
sales_df_filtered = sales_df.filter((year(col('sale_date')) == 2024) & (month(col('sale_date')) == 1))

# Total sales amount per category.
total_sales_per_category = sales_df_filtered.groupBy('category_id').agg(spark_sum('amount').alias('total_sales'))

# Inner join with the categories table to attach category names;
# a category_id with no row in Categories.csv is dropped by this join.
result_df = total_sales_per_category.join(categories_df, on='category_id', how='inner')

# Highest total first; first() returns None when result_df has no rows.
top_selling_category = result_df.orderBy(col('total_sales').desc()).first()

if top_selling_category is None:
    # No January 2024 sales, or none whose category_id matched a category:
    # say so explicitly instead of failing on None['category_name'].
    print("Top Selling Product Category in January 2024: no sales found")
else:
    print(f"Top Selling Product Category in January 2024: {top_selling_category['category_name']} with total sales of ${top_selling_category['total_sales']}")

Top Selling Product Category in January 2024: Clothing with total sales of $300.0
