In [0]:
%sql
-- Set the current catalog and schema for the statements in this notebook.
USE CATALOG workspace;
USE SCHEMA default;

In [0]:
%sql
-- Drop the demo table if it already exists, so the notebook starts from a clean state.
DROP TABLE IF EXISTS default.my_csv_table;

Datei einlesen und als Tabelle abspeichern

In [0]:
# Load the semicolon-delimited CSV (the first line holds the column names)
# from the volume and store it as a Delta table.
df = spark.read.options(header=True, sep=";").csv(
    "/Volumes/workspace/default/volume/sample_data.csv"
)
df.write.saveAsTable("default.my_csv_table", format="delta")

In [0]:
%sql
-- Inspect the freshly created table.
SELECT * FROM default.my_csv_table

id,name,age,city
1,Alice,25,Zurich
2,Bob,30,Geneva
3,Charlie,35,Bern


Daten anhängen

In [0]:
# Read the same CSV file again and append its rows to the existing Delta table.
# Every run of this cell adds the file's rows once more.
df = spark.read.options(header=True, sep=";").csv(
    "/Volumes/workspace/default/volume/sample_data.csv"
)
df.write.saveAsTable("default.my_csv_table", format="delta", mode="append")

In [0]:
%sql
-- Inspect the table after the append: the rows of the CSV file now appear twice.
SELECT * FROM default.my_csv_table

id,name,age,city
1,Alice,25,Zurich
2,Bob,30,Geneva
3,Charlie,35,Bern
1,Alice,25,Zurich
2,Bob,30,Geneva
3,Charlie,35,Bern


Merge

In [0]:
# Starting point for the merge demo: replace the table contents with the
# rows of the original CSV file.
df = spark.read.options(header=True, sep=";").csv(
    "/Volumes/workspace/default/volume/sample_data.csv"
)
df.write.saveAsTable("default.my_csv_table", format="delta", mode="overwrite")

In [0]:
%sql
-- Inspect the table before the merge.
SELECT * FROM default.my_csv_table

id,name,age,city
1,Alice,25,Zurich
2,Bob,30,Geneva
3,Charlie,35,Bern


In [0]:
from delta.tables import DeltaTable

# Step 1: read the second CSV file (semicolon-delimited, with a header line);
# its rows are the updates that get merged into the table further down.
updates_df = spark.read.options(header=True, sep=";").csv(
    "/Volumes/workspace/default/volume/sample_data2.csv"
)

In [0]:
# Data exploration: show the contents of the new CSV file.
display(updates_df)

id,name,age,city
1,Alice,100000,Zurich
2,Bob,30,Geneva
77,Hans,91,Windisch


In [0]:
 # Step 2: get a DeltaTable handle for the existing table (the merge target).
target_table = DeltaTable.forName(spark, "default.my_csv_table")

In [0]:
# Step 3: run the MERGE, matching target and update rows on id.
# Matched rows take all column values from the update row;
# update rows without a match are inserted as new rows.
(
    target_table.alias("target")
    .merge(updates_df.alias("updates"), "target.id = updates.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
%sql
-- Inspect the table after the merge, ordered by id.
-- The CSV files were read without a schema or inferSchema, so id is stored as a
-- string. Ordering by the raw column would sort lexicographically ("10" before "2"),
-- so order by the numeric value instead. TRY_CAST yields NULL for a non-numeric id
-- rather than failing; the raw id is kept as a tie-breaker.
SELECT *
FROM default.my_csv_table
ORDER BY TRY_CAST(id AS BIGINT), id

-- Expected result:
-- Alice:   UPDATE of the "age" attribute
-- Bob:     values unchanged
-- Charlie: unchanged (not contained in the update file)
-- Hans:    INSERT

id,name,age,city
1,Alice,100000,Zurich
2,Bob,30,Geneva
3,Charlie,35,Bern
77,Hans,91,Windisch


# Nested JSON Data

In [0]:
%sql
-- Remove the nested-JSON demo table if it already exists.
DROP TABLE IF EXISTS default.nestedjson;

In [0]:
from pyspark.sql.types import DoubleType, StringType, StructField, StructType

# Hand-written nested schema, assembled from the innermost struct outwards.
# Address: three nullable string fields.
address_struct = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("street", "city", "country")
])

# Profile: personal details plus the nested address struct.
profile_struct = StructType([
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("address", address_struct),
])

# Top level: scalar columns plus the nested profile struct.
nested_schema = StructType([
    StructField("email", StringType(), True),
    StructField("gpa", DoubleType(), True),
    StructField("stu", StringType(), True),
    StructField("profile", profile_struct),
])

# Read the JSON file with the explicit schema above.
df_nested = spark.read.json(
    "/Volumes/workspace/default/volume/nested_json.json",
    schema=nested_schema,
)

# Store the result as a Delta table, overwriting existing contents.
df_nested.write.saveAsTable("default.nestedjson", format="delta", mode="overwrite")

In [0]:
%sql
-- Inspect the table; the nested profile struct is shown as a single column.
SELECT * FROM default.nestedjson

email,gpa,stu,profile
e@aol.com,1.95,S0060,"List(Susana, Gonnely, Female, List(760 Express Court, Obrenovac, Serbia))"
r@nb.com,3.33,S0060,"List(Ronna, Gonning, Non-binary, List(48 Grim Way, Metsomtholaba, Botswana))"
r@gov.com,1.08,S0060,"List(Reade, Goode, Male, List(975 Mendota Center, Seabra, Brazil))"
r@sk.com,1.97,S0060,"List(Row, Goodier, Female, List(..., ..., ...))"


In [0]:
%sql
-- Pull individual nested fields up into top-level columns using dot notation.
SELECT
  s.stu,
  s.profile.first_name      AS first_name,
  s.profile.last_name       AS last_name,
  s.profile.address.country AS country
FROM default.nestedjson AS s

stu,first_name,last_name,country
S0060,Susana,Gonnely,Serbia
S0060,Ronna,Gonning,Botswana
S0060,Reade,Goode,Brazil
S0060,Row,Goodier,...


# Aufräumen

In [0]:
%sql
-- Clean up: drop both demo tables created in this notebook.
DROP TABLE IF EXISTS default.nestedjson;
DROP TABLE IF EXISTS default.my_csv_table;
