In [0]:
# Load the source events table and print a preview of its first rows.
EVENTS_TABLE = "ecommerce.silver.events_part"

df = spark.table(EVENTS_TABLE)
df.show(n=20, truncate=True)  # same as show(): 20 rows, long cell values truncated


In [0]:
# Descriptive statistics (statistical summary):
# total row count plus the average, minimum, and maximum of `price`.

# NOTE: importing min/max by bare name shadows Python's built-in min()/max()
# in every cell that runs after this one.
from pyspark.sql.functions import avg, min, max, count

# One aggregate expression per output column, in display order.
price_summary_exprs = [
    count("*").alias("total_rows"),
    avg("price").alias("avg_price"),
    min("price").alias("min_price"),
    max("price").alias("max_price"),
]

df.select(*price_summary_exprs).show()


In [0]:
from pyspark.sql.functions import dayofweek, when

# Tag every event as "Weekend" or "Weekday" based on `event_date`.
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
WEEKEND_DAY_NUMBERS = [1, 7]

event_day_number = dayofweek("event_date")

df2 = df.withColumn(
    "is_weekend",
    when(event_day_number.isin(WEEKEND_DAY_NUMBERS), "Weekend")
    # Only label a row "Weekday" when the day number is actually known.
    # A NULL event_date yields a NULL day number; with a blanket otherwise()
    # such rows were silently counted as weekdays. With no otherwise(),
    # unmatched rows keep a NULL label instead.
    .when(event_day_number.isNotNull(), "Weekday")
)
df2.show()

In [0]:
# Compare the average price between the "Weekend" and "Weekday" labels
# (used here as a proxy for sales).
# NOTE(review): the average is taken over every row of df2, whatever its
# event type -- no filter to purchase events is applied here.
avg_price_by_day_type = (
    df2
    .groupBy("is_weekend")
    .agg(avg("price").alias("avg_price"))
)

avg_price_by_day_type.show()


In [0]:
# Correlation analysis (demo): correlation between `price` and `product_id`,
# computed with Spark SQL's corr() aggregate.
# NOTE(review): product_id is presumably an identifier rather than a measured
# quantity -- confirm this coefficient is meaningful before relying on it.
price_product_corr_sql = "corr(price, product_id) as price_product_corr"

df.selectExpr(price_product_corr_sql).show()


In [0]:
# Feature engineering (ML prep): derive row-level features from df2.
#   year        - calendar year of event_date
#   month       - calendar month of event_date
#   is_purchase - 1 when event_type equals "purchase", otherwise 0

from pyspark.sql.functions import year, month

# `when` is imported in the earlier weekend-flag cell.
purchase_flag = when(df2["event_type"] == "purchase", 1).otherwise(0)

features_df = (
    df2
    .withColumn("year", year("event_date"))
    .withColumn("month", month("event_date"))
    .withColumn("is_purchase", purchase_flag)
)

features_df.show()




In [0]:

# NOTE: this import shadows Python's built-in sum() in every cell that runs
# after this one.
from pyspark.sql.functions import sum

# Per-user aggregates; `count` and `avg` come from the earlier
# descriptive-statistics cell.
event_count = count("*").alias("total_events")
# NOTE(review): avg_spend averages `price` over all of a user's events,
# not only the rows flagged is_purchase == 1.
average_price = avg("price").alias("avg_spend")
purchase_count = sum("is_purchase").alias("total_purchases")

user_features = (
    features_df
    .groupBy("user_id")
    .agg(event_count, average_price, purchase_count)
)

user_features.show()
