In [1]:
%pip install pyspark pandas 

Collecting pyspark
  Using cached pyspark-4.0.0-py2.py3-none-any.whl
Collecting py4j==0.10.9.9 (from pyspark)
  Using cached py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Using cached py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.9 pyspark-4.0.0

[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


## Synthetic Data

Creating the product catalog: 100 randomly generated products, written to `marketing_data/products/product_catalog.csv`.

In [4]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd

# Generate 100 products
PRODUCT_CATALOG_PATH = Path("marketing_data/products/product_catalog.csv")

# Dedicated, seeded generator: re-running the cell reproduces the same catalog
# and does not disturb the global `random` state used elsewhere.
product_rng = random.Random(42)

# One record per product; IDs are zero-padded to five digits (PRD00001..PRD00100).
# Name and category are drawn independently, so combinations such as
# "Laptop"/"Accessories" are expected in this mock data.
products = [
    {
        "product_id": f"PRD{i:05d}",
        "product_name": product_rng.choice(["Laptop", "Phone", "Tablet", "Headphones", "Monitor"]),
        "category": product_rng.choice(["Electronics", "Accessories", "Computers"]),
        "price": round(product_rng.uniform(50, 2000), 2),
    }
    for i in range(1, 101)
]

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
PRODUCT_CATALOG_PATH.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(products).to_csv(PRODUCT_CATALOG_PATH, index=False)

Creating transaction data for each sales channel (web, mobile, in-store), written to one CSV file per channel.

In [7]:
# Create transactions
CHANNELS = ["web", "mobile", "instore"]
TRANSACTIONS_DIR = Path("marketing_data/transactions")

# Dedicated, seeded generator so the mock transactions are reproducible on re-run.
txn_rng = random.Random(42)
start_date = datetime(2024, 1, 1)

transactions = []
for i in range(1, 10001):
    channel = txn_rng.choice(CHANNELS)
    transactions.append({
        "transaction_id": f"TXN{i:08d}",
        # Product IDs use the same PRD + five-digit format as the product catalog.
        "product_id": f"PRD{txn_rng.randint(1, 100):05d}",
        "customer_id": f"CUST{txn_rng.randint(1, 500):05d}",
        "amount": round(txn_rng.uniform(10, 5000), 2),
        "channel": channel,
        # Dates fall within 0..180 days (inclusive) after start_date.
        "transaction_date": (start_date + timedelta(days=txn_rng.randint(0, 180))).strftime("%Y-%m-%d"),
        # None models a transaction with no campaign; it is written as an empty CSV field.
        "campaign_id": txn_rng.choice(["CAMPAIGN1", "CAMPAIGN2", None])
    })

# Convert to DataFrame and split by channel
transactions_df = pd.DataFrame(transactions)
for channel in CHANNELS:
    # to_csv does not create missing directories, so create each channel folder first.
    channel_dir = TRANSACTIONS_DIR / channel
    channel_dir.mkdir(parents=True, exist_ok=True)
    transactions_df[transactions_df["channel"] == channel]\
        .to_csv(channel_dir / f"transactions_{channel}_2024.csv", index=False)

In [8]:
# Read the catalog back from disk and show its first rows as a sanity check.
product_catalog_preview = pd.read_csv("marketing_data/products/product_catalog.csv")
product_catalog_preview.head()

Unnamed: 0,product_id,product_name,category,price
0,PRD00001,Tablet,Accessories,1100.37
1,PRD00002,Headphones,Accessories,50.51
2,PRD00003,Laptop,Electronics,51.95
3,PRD00004,Monitor,Computers,609.96
4,PRD00005,Laptop,Accessories,725.24


Uploading the generated data to cloud storage (the cell below uploads to S3 via `s3fs`).

In [3]:
%pip install azure-storage-file-datalake dotenv adlfs s3fs


Collecting s3fs
  Downloading s3fs-2025.5.1-py3-none-any.whl.metadata (1.9 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.23.0-py3-none-any.whl.metadata (24 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting botocore<1.38.28,>=1.38.23 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.38.27-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Using cached wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Downloading s3fs-2025.5.1-py3-none-any.whl (30 kB)
Downloading aiobotocore-2.23.0-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [4]:
import s3fs
from dotenv import load_dotenv
import os
from pathlib import Path

# Load environment variables
load_dotenv()

# Resolve the target bucket first and fail fast when it is missing: otherwise
# None would be formatted into every S3 key as the literal text "None".
bucket_name = os.getenv("AWS_BUCKET_NAME")
if not bucket_name:
    raise RuntimeError(
        "AWS_BUCKET_NAME is not set; define it in the environment or in the .env file"
    )

# Initialize S3 filesystem
# NOTE(review): key, secret and region are passed through as None when the
# corresponding variables are unset — confirm that s3fs's default credential
# and region lookup is acceptable in that case.
fs = s3fs.S3FileSystem(
    key=os.getenv("AWS_ACCESS_KEY_ID"),
    secret=os.getenv("AWS_SECRET_ACCESS_KEY"),
    client_kwargs={'region_name': os.getenv("AWS_REGION")}
)

local_base = "marketing_data"  # Your local mock data directory

# S3 has no real directories, so a folder is represented by an empty ".keep" object.
def create_s3_dir(s3_path):
    """Create an empty ``.keep`` marker object under ``s3_path`` in the bucket.

    Args:
        s3_path: Folder path inside the bucket, without the bucket name and
            without a trailing slash.
    """
    marker_key = f"{s3_path}/.keep"
    marker_location = f"{bucket_name}/{marker_key}"
    fs.touch(marker_location)
    # The log line shows the key relative to the bucket, not the full location.
    print(f"Created S3 dir (via .keep): {marker_key}")

# Upload function
def upload_to_s3(local_path: str):
    """Mirror one local path under ``marketing_data/`` in the S3 bucket.

    A directory is represented by a ``.keep`` marker created via
    ``create_s3_dir``; a regular file is uploaded with ``fs.put``.

    Args:
        local_path: Path to a file or directory located under ``local_base``.

    Raises:
        ValueError: If ``local_path`` is not located under ``local_base``
            (raised by ``Path.relative_to``).
    """
    relative_path = Path(local_path).relative_to(local_base)
    s3_key = f"marketing_data/{relative_path.as_posix()}"

    if os.path.isdir(local_path):
        create_s3_dir(s3_key)
        return

    s3_path = f"{bucket_name}/{s3_key}"
    # fs.put takes the local path itself, so the file does not need to be
    # opened here (the original opened a handle that was never used).
    fs.put(local_path, s3_path)
    print(f"Uploaded: {local_path} → s3://{s3_path}")

# Traverse the local tree top-down. Within each folder the sub-directory
# markers are created first, then the files are uploaded.
for current_dir, subdir_names, file_names in os.walk(local_base):
    for entry_name in subdir_names + file_names:
        upload_to_s3(os.path.join(current_dir, entry_name))

# Verification: list what now exists under marketing_data/ in the bucket,
# descending one level into anything S3 reports as a directory.
print("\nFinal S3 Structure:")
for top_level_entry in fs.ls(f"{bucket_name}/marketing_data", detail=False):
    print(f" - {top_level_entry}")
    if not fs.isdir(top_level_entry):
        continue
    for child_entry in fs.ls(top_level_entry, detail=False):
        print(f"   ├─ {child_entry}")


Created S3 dir (via .keep): marketing_data/transactions/.keep
Created S3 dir (via .keep): marketing_data/products/.keep
Created S3 dir (via .keep): marketing_data/transactions/instore/.keep
Created S3 dir (via .keep): marketing_data/transactions/mobile/.keep
Created S3 dir (via .keep): marketing_data/transactions/web/.keep
Uploaded: marketing_data/transactions/instore/transactions_instore_2024.csv → s3://marketing-data-bucket-09/marketing_data/transactions/instore/transactions_instore_2024.csv
Uploaded: marketing_data/transactions/mobile/transactions_mobile_2024.csv → s3://marketing-data-bucket-09/marketing_data/transactions/mobile/transactions_mobile_2024.csv
Uploaded: marketing_data/transactions/web/transactions_web_2024.csv → s3://marketing-data-bucket-09/marketing_data/transactions/web/transactions_web_2024.csv
Uploaded: marketing_data/products/product_catalog.csv → s3://marketing-data-bucket-09/marketing_data/products/product_catalog.csv

Final S3 Structure:
 - marketing-data-buck