In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Load the October and November 2019 CSV exports. Each file is read with a
# header row, and column types are inferred from the data.
oct_df = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    header=True,
    inferSchema=True,
)

nov_df = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    header=True,
    inferSchema=True,
)

# Stack both months into a single DataFrame, matching columns by name rather
# than by position.
events = oct_df.unionByName(nov_df)



In [0]:
# Distinct (product_id, brand, category_code) combinations seen in the events.
# A product_id appears more than once here if it occurs with different brand
# or category_code values.
products = events.select(["product_id", "brand", "category_code"]).distinct()

# Distinct user ids seen in the events.
users = events.select(["user_id"]).distinct()


In [0]:
# Render the combined events and the two derived lookup DataFrames.
display(events)
display(products)
display(users)


In [0]:
# Enrich events with product and user attributes using left joins, so every
# event row is kept even when the right-hand side has no match.
# (Other values accepted by `how` include "inner", "right" and "outer".)
#
# `products` carries `brand` and `category_code`, which `events` already has,
# and it can hold several rows per product_id. Joining it as-is would produce
# duplicate column names (ambiguous when referenced by name) and could
# multiply event rows. The lookup is therefore reduced to one row per
# product_id and its attributes are renamed before the join.
# NOTE: dropDuplicates(["product_id"]) keeps an arbitrary row when a product
# occurs with more than one brand/category_code combination.
product_lookup = products.dropDuplicates(["product_id"]).select(
    "product_id",
    F.col("brand").alias("product_brand"),
    F.col("category_code").alias("product_category_code"),
)

events_joined = (
    events.join(product_lookup, on="product_id", how="left")
    .join(users, on="user_id", how="left")
)
display(events_joined)

In [0]:
# Per-user window, ordered chronologically by event_time.
user_window = Window.partitionBy(F.col("user_id")).orderBy(F.col("event_time"))

# Number each user's events 1, 2, 3, ... following the window order.
# NOTE(review): events of one user that share the same event_time have no
# further ordering key, so their relative numbering is not fixed.
events_with_running_total = events_joined.withColumn(
    "running_events",
    F.row_number().over(user_window),
)

In [0]:
# Preview the per-user event counter on the first five rows.
(
    events_with_running_total
    .select(["user_id", "event_time", "event_type", "running_events"])
    .show(n=5)
)

In [0]:
# Per-category conversion rate: purchases expressed as a percentage of views.
conversion_rate = (
    events
    .groupBy(F.col("category_code"))
    .agg(
        # Count purchase events by summing a 1/0 indicator per row.
        F.sum(
            F.when(F.col("event_type") == "purchase", F.lit(1)).otherwise(F.lit(0))
        ).alias("purchase"),
        # Count view events the same way.
        F.sum(
            F.when(F.col("event_type") == "view", F.lit(1)).otherwise(F.lit(0))
        ).alias("view"),
    )
    .withColumn(
        "conversion_rate",
        # Categories with no views get 0 instead of dividing by zero.
        F.when(
            F.col("view") > 0,
            (F.col("purchase") / F.col("view")) * 100,
        ).otherwise(0),
    )
)

conversion_rate.show()

In [0]:
# Small hand-written tables used to demonstrate the join types below.
#
# NOTE(review): assumes the notebook runtime already provides `spark`. If it
# does not, create a session first:
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.getOrCreate()
#
# NOTE: these assignments rebind `events`, `products` and `users`, replacing
# the CSV-based DataFrames defined earlier in the notebook.

# Events table: product 104 has no matching row in `products`.
events = spark.createDataFrame(
    data=[
        (1, 101, "click"),
        (2, 102, "view"),
        (3, 103, "purchase"),
        (4, 104, "click"),
    ],
    schema=["event_id", "product_id", "event_type"],
)

# Products table
products = spark.createDataFrame(
    data=[
        (101, "Mobile"),
        (102, "Laptop"),
        (103, "Tablet"),
    ],
    schema=["product_id", "product_name"],
)

# Users table
users = spark.createDataFrame(
    data=[
        (1, "Pavan"),
        (2, "Aman"),
        (3, "Neha"),
    ],
    schema=["user_id", "user_name"],
)


In [0]:
# Render the three demo tables.
display(events)
display(products)
display(users)

In [0]:
### Inner join: keeps only events whose product_id also exists in `products`
events.join(products, on="product_id", how="inner").show()


In [0]:
### Left join: keeps every event; product columns are null where there is no match
events.join(products, on="product_id", how="left").show()


In [0]:
### Right join: keeps every product; event columns are null where there is no match
events.join(products, on="product_id", how="right").show()


In [0]:
### Full outer join: keeps all rows from both sides, with nulls where there is no match
events.join(products, on="product_id", how="outer").show()
