## Project: Advanced Data Lake for E-commerce Data Analysis

#### Set up data lake zones

In [5]:
from pathlib import Path

# Define the data lake root and its three zone directories
base_dir = Path('./ecommerce_data_lake')
raw_dir = base_dir / 'raw'
processed_dir = base_dir / 'processed'
cleaned_dir = base_dir / 'cleaned'

# Create the zone directories. parents=True also creates base_dir if needed,
# and exist_ok=True makes this cell safe to re-run.
# The loop variable is deliberately not called `dir`: at notebook top level
# that name would shadow the built-in dir() for the rest of the kernel session.
for zone_dir in (raw_dir, processed_dir, cleaned_dir):
    zone_dir.mkdir(parents=True, exist_ok=True)


#### Ingest data from multiple sources

In [6]:
import pandas as pd
import json

# ---- Sample datasets -------------------------------------------------------

# Customers: column-oriented dict, one list per column
customer_data = {
    "customer_id": [1, 2, 3],
    "name": ["Alice", "Bob", "Charlie"],
    "email": ["alice@example.com", "bob@example.com", "charlie@example.com"],
}

# Transactions: list of records, one dict per transaction
transaction_data = [
    {"transaction_id": 101, "customer_id": 1, "product_id": 1001, "amount": 250.0, "date": "2024-08-24"},
    {"transaction_id": 102, "customer_id": 2, "product_id": 1002, "amount": 150.0, "date": "2024-08-24"},
    {"transaction_id": 103, "customer_id": 1, "product_id": 1003, "amount": 300.0, "date": "2024-08-25"},
]

# Products: column-oriented dict, one list per column
product_data = {
    "product_id": [1001, 1002, 1003],
    "name": ["Laptop", "Smartphone", "Tablet"],
    "price": [1000, 600, 400],
}

# ---- Write everything into the raw zone ------------------------------------

# Customers -> CSV (index=False keeps the pandas row index out of the file)
customer_df = pd.DataFrame(customer_data)
customer_df.to_csv(raw_dir / 'customers.csv', index=False)

# Transactions -> JSON (the whole list is serialized as a single JSON array)
with (raw_dir / 'transactions.json').open('w') as transactions_file:
    json.dump(transaction_data, transactions_file)

# Products -> CSV
product_df = pd.DataFrame(product_data)
product_df.to_csv(raw_dir / 'products.csv', index=False)


#### Process data (ETL)

In [7]:
from pyspark.sql import SparkSession

# Initialize Spark session (getOrCreate reuses an already-running session if there is one)
spark = SparkSession.builder.appName('EcommerceDataLake').getOrCreate()

# Load the raw-zone files into Spark DataFrames.
# header=True takes column names from the first CSV row; inferSchema=True lets
# Spark pick column types from the data instead of reading everything as strings.
customers_df = spark.read.csv(str(raw_dir / 'customers.csv'), header=True, inferSchema=True)
transactions_df = spark.read.json(str(raw_dir / 'transactions.json'))
products_df = spark.read.csv(str(raw_dir / 'products.csv'), header=True, inferSchema=True)

# Both customers and products have a `name` column; rename them so the joined
# result has unambiguous column names.
customers_df = customers_df.withColumnRenamed('name', 'customer_name')
products_df = products_df.withColumnRenamed('name', 'product_name')

# Join transactions with customer and product details.
# Left joins keep every transaction even when no matching customer/product row exists.
transaction_enriched_df = transactions_df \
    .join(customers_df, on='customer_id', how='left') \
    .join(products_df, on='product_id', how='left')

# Save the processed data.
# mode('overwrite') keeps this cell re-runnable: with the default save mode the
# write raises an error as soon as the target path already exists.
transaction_enriched_df.write.mode('overwrite').parquet(str(processed_dir / 'transactions_enriched.parquet'))


                                                                                

#### Clean Data

In [8]:
# Load processed data
enriched_df = spark.read.parquet(str(processed_dir / 'transactions_enriched.parquet'))

# Filter out small transactions: keep only rows whose amount is strictly
# greater than the threshold.
MIN_TRANSACTION_AMOUNT = 200
cleaned_df = enriched_df.filter(enriched_df.amount > MIN_TRANSACTION_AMOUNT)

# Save cleaned data partitioned by date (one sub-directory per distinct `date` value).
# mode('overwrite') keeps this cell re-runnable: with the default save mode the
# write raises an error as soon as the target path already exists.
cleaned_df.write.mode('overwrite').partitionBy("date").parquet(str(cleaned_dir / 'transactions_cleaned.parquet'))


#### Query Data

In [9]:
# Read the cleaned, date-partitioned dataset back from the cleaned zone
cleaned_data_df = spark.read.parquet(str(cleaned_dir / 'transactions_cleaned.parquet'))

# Example query 1: total revenue per customer.
# The dict form of agg names the result column `sum(amount)`.
customer_groups = cleaned_data_df.groupBy("customer_id", "customer_name")
revenue_per_customer_df = customer_groups.agg({"amount": "sum"})
revenue_per_customer_df.show()

# Example query 2: most popular products, i.e. number of transactions per
# product, highest count first.
product_counts_df = cleaned_data_df.groupBy("product_id", "product_name").count()
popular_products_df = product_counts_df.orderBy(product_counts_df["count"].desc())
popular_products_df.show()


+-----------+-------------+-----------+
|customer_id|customer_name|sum(amount)|
+-----------+-------------+-----------+
|          1|        Alice|      550.0|
+-----------+-------------+-----------+

+----------+------------+-----+
|product_id|product_name|count|
+----------+------------+-----+
|      1001|      Laptop|    1|
|      1003|      Tablet|    1|
+----------+------------+-----+

