In [0]:
from pyspark.sql.functions import col, regexp_replace, split, explode, from_json, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Glob matching every object under the 2025/08/22/04 prefix of the bucket
path = "s3://weather-streaming-proj-bucket/2025/08/22/04/*"

# Schema used to parse each JSON record.
# Every field is declared as a nullable string; the numeric fields are
# converted to double in a later step of this cell.
_weather_field_names = [
    "date_time",
    "maxtempC", "mintempC", "totalSnow_cm", "sunHour", "uvIndex",
    "moon_illumination", "moonrise", "moonset", "sunrise", "sunset",
    "DewPointC", "FeelsLikeC", "HeatIndexC", "WindChillC", "WindGustKmph",
    "cloudcover", "humidity", "precipMM", "pressure", "tempC",
    "visibility", "winddirDegree", "windspeedKmph",
    "City",
]
weather_schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in _weather_field_names]
)

# Step 1 - load every matched file as plain text into a single "value" column
df_raw = spark.read.text(path)

# Step 2 - remove literal newline characters from the text
# NOTE(review): spark.read.text normally yields one row per line, so this may
# be a no-op — confirm against the actual file layout.
newline_free_value = regexp_replace(col("value"), r"\n", "")
df_clean = df_raw.withColumn("value", newline_free_value)

# Step 3 - records arrive back-to-back as {...}{...}; put a "|||" marker
# between adjacent objects and split on it to get an array of JSON strings
marked_value = regexp_replace(col("value"), r"}\{", "}|||{")
json_fragments = split(marked_value, r"\|\|\|")
df_split = df_clean.withColumn("json_str", json_fragments)

# Step 4 - one output row per array element
df_exploded = df_split.withColumn("json_str", explode("json_str"))

# Step 5 - keep only strings that begin with "{" and end with "}"
# (surrounding whitespace allowed)
looks_like_json = col("json_str").rlike(r"^\s*\{.*\}\s*$")
df_valid = df_exploded.where(looks_like_json)

# Step 6 - parse each string with weather_schema and promote the struct's
# fields to top-level columns
parsed_struct = from_json(col("json_str"), weather_schema)
df_parsed = df_valid.withColumn("parsed", parsed_struct).select("parsed.*")

# 7. Cast numeric columns to double.
# The parsed values are still strings. A value is cast only when it looks like
# a plain decimal number; anything else becomes null. The pattern allows an
# optional leading minus sign so sub-zero readings such as "-5" or "-3.2"
# are kept instead of being turned into nulls.
numeric_cols = ["maxtempC","mintempC","totalSnow_cm","sunHour","uvIndex",
                "DewPointC","FeelsLikeC","HeatIndexC","WindChillC",
                "WindGustKmph","cloudcover","humidity","precipMM","pressure",
                "tempC","visibility","winddirDegree","windspeedKmph"]

numeric_pattern = r"^-?\d+(\.\d+)?$"

# Build every cast in a single select() rather than calling withColumn() in a
# loop (each withColumn adds another projection to the plan). Iterating over
# df_parsed.columns keeps the original column order; non-numeric columns pass
# through untouched.
df_parsed = df_parsed.select(*[
    (
        when(col(name).rlike(numeric_pattern), col(name).cast(DoubleType()))
        .otherwise(None)
        .alias(name)
    )
    if name in numeric_cols
    else col(name)
    for name in df_parsed.columns
])

# Show results
display(df_parsed)
df_parsed.printSchema()
print("Total records:", df_parsed.count())


In [0]:
from pyspark.sql.functions import col, regexp_replace, split, explode, from_json, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Glob matching every object under the 2025/08/22/01 prefix of the bucket
path = "s3://weather-streaming-proj-bucket/2025/08/22/01/*"

# Step 1 - load every matched file as plain text into a single "value" column
df_raw = spark.read.text(path)

# Step 2 - remove literal newline characters from the text
# NOTE(review): spark.read.text normally yields one row per line, so this may
# be a no-op — confirm against the actual file layout.
newline_free_value = regexp_replace(col("value"), r"\n", "")
df_clean = df_raw.withColumn("value", newline_free_value)

# Step 3 - records arrive back-to-back as {...}{...}; put a "|||" marker
# between adjacent objects and split on it to get an array of JSON strings
marked_value = regexp_replace(col("value"), r"}\{", "}|||{")
json_fragments = split(marked_value, r"\|\|\|")
df_split = df_clean.withColumn("json_str", json_fragments)

# Step 4 - one output row per array element
df_exploded = df_split.withColumn("json_str", explode("json_str"))

# Step 5 - keep only strings that begin with "{" and end with "}"
# (surrounding whitespace allowed)
looks_like_json = col("json_str").rlike(r"^\s*\{.*\}\s*$")
df_valid = df_exploded.where(looks_like_json)

# Step 6 - parse each string with weather_schema (defined in the first cell)
# and promote the struct's fields to top-level columns
parsed_struct = from_json(col("json_str"), weather_schema)
df_parsed1 = df_valid.withColumn("parsed", parsed_struct).select("parsed.*")

# 7. Cast numeric columns to double safely.
# The parsed values are still strings. A value is cast only when it looks like
# a plain decimal number; anything else becomes null. The pattern allows an
# optional leading minus sign so sub-zero readings such as "-5" or "-3.2"
# are kept instead of being turned into nulls.
numeric_cols = ["maxtempC","mintempC","totalSnow_cm","sunHour","uvIndex",
                "DewPointC","FeelsLikeC","HeatIndexC","WindChillC",
                "WindGustKmph","cloudcover","humidity","precipMM","pressure",
                "tempC","visibility","winddirDegree","windspeedKmph"]

numeric_pattern = r"^-?\d+(\.\d+)?$"

# Build every cast in a single select() rather than calling withColumn() in a
# loop (each withColumn adds another projection to the plan). Iterating over
# df_parsed1.columns keeps the original column order; non-numeric columns pass
# through untouched.
df_parsed1 = df_parsed1.select(*[
    (
        when(col(name).rlike(numeric_pattern), col(name).cast(DoubleType()))
        .otherwise(None)
        .alias(name)
    )
    if name in numeric_cols
    else col(name)
    for name in df_parsed1.columns
])

# Show results
display(df_parsed1)
df_parsed1.printSchema()
print("Total records:", df_parsed1.count())


In [0]:
from pyspark.sql.functions import col

# Stack the two parsed partitions, matching columns by name; a column present
# in only one input is filled with nulls for the other.
df_merged = df_parsed.unionByName(
    df_parsed1,
    allowMissingColumns=True,
)

# Inspect the combined result: rows, schema, then the row count
display(df_merged)
df_merged.printSchema()
merged_total = df_merged.count()
print("Total records:", merged_total)

In [0]:
# Persist the merged data as a Delta table, replacing any existing contents
bronze_writer = df_merged.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("weather_catalog.raw.weather_bronze")

# Show the DataFrame that was just written
display(df_merged)


In [0]:
# Print the column names and types of df_merged again (the merge cell already
# prints this schema once).
df_merged.printSchema()