In [0]:
# Prints a fixed greeting (looks like a kernel smoke test — confirm before removing).
greeting = "hello"
print(greeting)

hello


## Sample ETL Pipeline

In [0]:
import requests
# Source CSV and the path it is saved to.
csv_url = "https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/Restaurant%20customer%20data.csv"
local_path = "/Volumes/workspace/default/data/restaurant_customer_data.csv"

# (connect, read) timeouts in seconds: without them a stalled connection
# would block the notebook indefinitely.
response = requests.get(csv_url, timeout=(10, 60))

# Fail fast on 4xx/5xx responses; otherwise the error body would be written
# to disk as if it were the CSV and the success message below would be wrong.
response.raise_for_status()

with open(local_path, "wb") as f:
    f.write(response.content)

print(f"✅ CSV file saved to {local_path}")


✅ CSV file saved to /Volumes/workspace/default/data/restaurant_customer_data.csv


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType, FloatType

# Patterns a raw CSV string must match before it is cast to a number.
# Both accept unsigned values only (no sign, no surrounding whitespace).
UNSIGNED_INT_PATTERN = "^[0-9]+$"
UNSIGNED_DECIMAL_PATTERN = "^[0-9]+(\\.[0-9]+)?$"

# Input CSV and output Delta location.
raw_csv_path = "/Volumes/workspace/default/data/restaurant_customer_data.csv"
silver_table_path = "/Volumes/workspace/default/data/restaurant_customers_delta"


def _cast_if_matches(column_name, pattern, data_type):
    """Build a column expression that casts only values matching a pattern.

    Args:
        column_name: Name of the column to convert.
        pattern: Regular expression the value must match to be cast.
        data_type: Spark data type to cast matching values to.

    Returns:
        A Column holding the cast value where ``pattern`` matches and null
        everywhere else.
    """
    source = col(column_name)
    return when(source.rlike(pattern), source.cast(data_type)).otherwise(None)


spark = SparkSession.builder.appName("ETL Pipeline").getOrCreate()

# Only the header option is set; no schema or inferSchema is supplied, so the
# typed columns are produced by the explicit casts below.
df = spark.read.format("csv").option("header", "true").load(raw_csv_path)

# Replace each numeric column with its typed version; values that do not match
# the expected pattern become null instead of being cast.
df_transformed = (
    df.withColumn("birth_year", _cast_if_matches("birth_year", UNSIGNED_INT_PATTERN, IntegerType()))
    .withColumn("weight", _cast_if_matches("weight", UNSIGNED_DECIMAL_PATTERN, FloatType()))
    .withColumn("budget", _cast_if_matches("budget", UNSIGNED_DECIMAL_PATTERN, FloatType()))
    .withColumn("height", _cast_if_matches("height", UNSIGNED_DECIMAL_PATTERN, FloatType()))
)

# Drop rows where weight, budget or height is null; birth_year may stay null.
# NOTE(review): a column whose values never match the numeric pattern (e.g. a
# categorical column) becomes all-null above, so this would drop every row —
# verify each of these columns is numeric in the source CSV.
df_cleaned = df_transformed.na.drop(subset=["weight", "budget", "height"])

# Overwrite any previous output at the target location with the cleaned data.
df_cleaned.write.format("delta").mode("overwrite").save(silver_table_path)

print("✅ Data processed and saved to Delta format in:", silver_table_path)


✅ Data processed and saved to Delta format in: /Volumes/workspace/default/data/restaurant_customers_delta
