In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Obtain the SparkSession used by every cell below; getOrCreate() hands back
# an already-active session when one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .appName("Ecom-DataPipeline")
    .getOrCreate()
)

In [0]:
# Load the four Delta tables stored under the silver path; each one is
# aggregated per country further down in the notebook.
def _load_delta(path):
    """Read the Delta table stored at `path` and return it as a DataFrame."""
    return spark.read.format("delta").load(path)


userDf = _load_delta("/mnt/delta/tables/silver/user")

buyersDf = _load_delta("/mnt/delta/tables/silver/buyers")

sellersDf = _load_delta("/mnt/delta/tables/silver/seller")

countriesDf = _load_delta("/mnt/delta/tables/silver/countries")

In [0]:
# Print buyersDf's column names, types and nullability as a tree, to check
# which columns are available before aggregating.
buyersDf.printSchema()

root
 |-- country: string (nullable = true)
 |-- buyers: integer (nullable = true)
 |-- topbuyers: integer (nullable = true)
 |-- topbuyerratio: double (nullable = true)
 |-- femalebuyers: integer (nullable = true)
 |-- malebuyers: integer (nullable = true)
 |-- topfemalebuyers: integer (nullable = true)
 |-- topmalebuyers: integer (nullable = true)
 |-- femalebuyersratio: double (nullable = true)
 |-- topfemalebuyersratio: double (nullable = true)
 |-- boughtperwishlistratio: double (nullable = true)
 |-- boughtperlikeratio: double (nullable = true)
 |-- topboughtperwishlistratio: double (nullable = true)
 |-- topboughtperlikeratio: double (nullable = true)
 |-- totalproductsbought: integer (nullable = true)
 |-- totalproductswished: integer (nullable = true)
 |-- totalproductsliked: integer (nullable = true)
 |-- toptotalproductsbought: integer (nullable = true)
 |-- toptotalproductswished: integer (nullable = true)
 |-- toptotalproductsliked: integer (nullable = true)
 |-- meanprodu

In [0]:
# Print userDf's column names, types and nullability as a tree, to check
# which columns are available before aggregating.
userDf.printSchema()

root
 |-- identifierHash: long (nullable = true)
 |-- type: string (nullable = true)
 |-- country: string (nullable = true)
 |-- language: string (nullable = true)
 |-- socialNbFollowers: integer (nullable = true)
 |-- socialNbFollows: integer (nullable = true)
 |-- socialProductsLiked: integer (nullable = true)
 |-- productsListed: integer (nullable = true)
 |-- productsSold: integer (nullable = true)
 |-- productsPassRate: double (nullable = true)
 |-- productsWished: integer (nullable = true)
 |-- productsBought: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- civilityGenderId: integer (nullable = true)
 |-- civilityTitle: string (nullable = true)
 |-- hasAnyApp: boolean (nullable = true)
 |-- hasAndroidApp: integer (nullable = true)
 |-- hasIosApp: integer (nullable = true)
 |-- hasProfilePicture: integer (nullable = true)
 |-- daysSinceLastLogin: integer (nullable = true)
 |-- seniority: integer (nullable = true)
 |-- seniorityAsMonths: double (nullable = true

In [0]:
# Print countriesDf's column names, types and nullability as a tree, to check
# which columns are available before aggregating.
countriesDf.printSchema()

root
 |-- country: string (nullable = true)
 |-- sellers: integer (nullable = true)
 |-- topsellers: integer (nullable = true)
 |-- topsellerratio: double (nullable = true)
 |-- femalesellersratio: double (nullable = true)
 |-- topfemalesellersratio: double (nullable = true)
 |-- femalesellers: integer (nullable = true)
 |-- malesellers: integer (nullable = true)
 |-- topfemalesellers: integer (nullable = true)
 |-- topmalesellers: integer (nullable = true)
 |-- countrysoldratio: double (nullable = true)
 |-- bestsoldratio: double (nullable = true)
 |-- toptotalproductssold: double (nullable = true)
 |-- totalproductssold: double (nullable = true)
 |-- toptotalproductslisted: double (nullable = true)
 |-- totalproductslisted: double (nullable = true)
 |-- topmeanproductssold: double (nullable = true)
 |-- topmeanproductslisted: double (nullable = true)
 |-- meanproductssold: double (nullable = true)
 |-- meanproductslisted: double (nullable = true)
 |-- meanofflinedays: double (nullabl

In [0]:
# Print sellersDf's column names, types and nullability as a tree, to check
# which columns are available before aggregating.
sellersDf.printSchema()

root
 |-- country: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- nbsellers: integer (nullable = true)
 |-- meanproductssold: double (nullable = true)
 |-- meanproductslisted: double (nullable = true)
 |-- meansellerpassrate: double (nullable = true)
 |-- totalproductssold: integer (nullable = true)
 |-- totalproductslisted: integer (nullable = true)
 |-- meanproductsbought: double (nullable = true)
 |-- meanproductswished: double (nullable = true)
 |-- meanproductsliked: double (nullable = true)
 |-- totalbought: integer (nullable = true)
 |-- totalwished: integer (nullable = true)
 |-- totalproductsliked: integer (nullable = true)
 |-- meanfollowers: double (nullable = true)
 |-- meanfollows: double (nullable = true)
 |-- percentofappusers: double (nullable = true)
 |-- percentofiosusers: double (nullable = true)
 |-- meanseniority: double (nullable = true)
 |-- isTopPerformer: integer (nullable = true)
 |-- appUsageCategory: string (nullable = true)



### Creating One Big Table

In [0]:

# Build one country-level aggregate per silver table, then merge the four
# aggregates into a single wide table keyed on `country`.
#
# Spark functions are reached through the `F` namespace (imported in the
# notebook's import cell) so the Python builtins `sum` and `round` are not
# shadowed by a cell-local import.


def _rounded_avg(column_name, alias, scale=2):
    """Return avg(column_name) rounded to `scale` decimals, named `alias`."""
    return F.round(F.avg(column_name), scale).alias(alias)


# Aggregate buyersDf at country level.
# NOTE(review): femaleBuyerRatio, maleBuyerRatio, wishlistConversionRate and
# meanfollowers are not in the (truncated) schema output shown earlier in the
# notebook; confirm they exist in the silver buyers table.
buyersAgg = buyersDf.groupBy("country").agg(
    # `buyers` is an integer column, presumably a per-row head count (confirm);
    # it is summed because count() would only return the number of non-null
    # rows per country, not the number of buyers.
    F.sum("buyers").alias("total_buyers"),
    F.sum("totalproductsbought").alias("total_products_bought"),
    _rounded_avg("meanproductsbought", "mean_products_bought"),
    _rounded_avg("meanfollowers", "buyer_mean_followers"),
    _rounded_avg("femaleBuyerRatio", "avg_female_buyer_ratio"),
    _rounded_avg("maleBuyerRatio", "avg_male_buyer_ratio"),
    _rounded_avg("wishlistConversionRate", "avg_wishlist_conversion_rate"),
)

# Aggregate countriesDf at country level.
# NOTE(review): femaleSellerRatio and maleSellerRatio are not in the
# (truncated) schema output shown earlier; confirm they exist in the table.
countriesAgg = countriesDf.groupBy("country").agg(
    # Summed for the same reason as `buyers` above: count() yields a row
    # count, not the number of sellers.
    F.sum("sellers").alias("total_sellers"),
    F.round(F.sum("totalproductssold"), 2).alias("total_products_sold"),
    _rounded_avg("meanproductssold", "mean_products_sold"),
    _rounded_avg("meanproductslisted", "mean_products_listed"),
    _rounded_avg("femaleSellerRatio", "avg_female_seller_ratio"),
    _rounded_avg("maleSellerRatio", "avg_male_seller_ratio"),
)

# Aggregate sellersDf at country level. The table also carries a `sex`
# column, so a country can span several rows that are collapsed here.
sellersAgg = sellersDf.groupBy("country").agg(
    # Add up the per-row seller counts instead of counting rows.
    F.sum("nbsellers").alias("total_sellers_count"),
    _rounded_avg("meanproductssold", "seller_mean_products_sold"),
    F.sum("totalproductssold").alias("seller_total_products_sold"),
    _rounded_avg("meanfollowers", "seller_mean_followers"),
    _rounded_avg("percentofappusers", "avg_percent_app_users"),
    _rounded_avg("percentofiosusers", "avg_percent_ios_users"),
    _rounded_avg("meanseniority", "avg_mean_seniority"),
    # Assumes isTopPerformer is a 0/1 integer flag (TODO confirm): summing it
    # counts the flagged rows, whereas count() would count every non-null
    # row, including those flagged 0.
    F.sum("isTopPerformer").alias("total_top_performers"),
)

# Aggregate userDf at country level.
# NOTE(review): socialEngagementScore is not in the (truncated) schema output
# shown earlier; confirm it exists in the silver user table.
usersAgg = userDf.groupBy("country").agg(
    # userDf has one identifierHash per row, so count() is the right
    # aggregate for the number of users.
    F.count("identifierHash").alias("total_users"),
    _rounded_avg("socialNbFollowers", "avg_social_followers"),
    F.sum("productsListed").alias("total_products_listed"),
    F.sum("productsSold").alias("total_products_sold_by_users"),
    _rounded_avg("productsPassRate", "avg_products_pass_rate"),
    _rounded_avg("socialEngagementScore", "avg_social_engagement_score"),
)

# Combine all aggregated DataFrames into one table using 'country' as the key.
# Outer joins keep a country even when it is missing from some of the inputs;
# the columns coming from the missing side are null for that country.
finalTable = (
    buyersAgg
    .join(countriesAgg, on="country", how="outer")
    .join(sellersAgg, on="country", how="outer")
    .join(usersAgg, on="country", how="outer")
)





### Saving the data to Azure Data Lake

In [0]:
# Persist the unified table to the gold path in Delta format. "overwrite"
# mode replaces whatever a previous run stored at the same location.
gold_writer = finalTable.write.format("delta").mode("overwrite")
gold_writer.save("/mnt/landing-zone-2/tables/gold/ecom_obt_final")
