
# Great Expectations (GX) Setup Notebook

Dieses Notebook dient zur Konfiguration und Definition der Datenvalidierung mit Hilfe der Bibliothek Great Expectations. Die Ergebnisse des Notebooks (Validation_Definition, Checkpoint) werden für die Simulation zur automatisierten Validierung weiterverwendet

## Voraussetzungen
- Abhängigkeiten aus requirements.txt installiert
- Datenbanksystem installiert (siehe README)

## Inhalt
1. **Setup und Datenbankverbindung**  
2. **Definition der Datenvalidierung**  
3. **Definition des Checkpoints**  
4. **Test-Validierung** 

### 1. Setup und Datenbankverbindung

#### Imports

In [165]:
# Great Expectations Funktionen importieren
import great_expectations as gx
from great_expectations.checkpoint import UpdateDataDocsAction

# Konstanten für die Datenbankverbindung importieren
from constants import (
    DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME,
    DATA_SOURCE, DATA_ASSET, DB_TABLE, BATCH_DEFINITION,
    EXPECTATION_SUITE, VALIDATION_DEFINITION, CHECKPOINT_NAME
)

#### Kontext laden

In [166]:
# GX-Verzeichnis mit allen relevanten Dateien erstellen
context = gx.get_context(mode="file", project_root_dir="../")

#### Connection-String für Datenbankverbindung generieren

In [167]:
db_connection_string = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

#### Datenquelle hinzufügen

In [168]:
data_source = context.data_sources.add_or_update_postgres(
    name = DATA_SOURCE, 
    connection_string = db_connection_string
)

#### Daten-Asset (Tabelle) hinzufügen

In [169]:
if not data_source.assets:
    # Asset existiert noch nicht – daher wird es hinzugefügt
    data_asset = data_source.add_table_asset(table_name=DB_TABLE, name=DATA_ASSET)    
else:
    # Asset existiert bereits
    data_asset = data_source.assets[DATA_ASSET]

#### Batch-Definition hinzufügen

In [170]:
if not data_asset.batch_definitions:
   # Batch-Definition existiert noch nicht – daher wird es hinzugefügt
   batch_definition = data_asset.add_batch_definition_daily(name=BATCH_DEFINITION, column="timestamp")
else:
   # Batch existiert bereits
   batch_definition = data_asset.get_batch_definition(name=BATCH_DEFINITION)

#### Verbindung testen

In [171]:
# Batch laden und die ersten 10 Datensätze ausgeben
daily_batch = batch_definition.get_batch()
daily_batch.head()

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 38.57it/s]


   id                  timestamp sensor_id location  temperature   humidity  \
0   1 2025-02-24 17:47:36.533778    WS_001   Berlin    22.500000  45.000000   
1   2 2025-02-24 18:48:29.727524    WS_003  München    20.870524  68.242824   
2   3 2025-02-24 18:48:29.813122    WS_002  Hamburg    20.814600  53.738292   
3   4 2025-02-24 18:48:29.894257    WS_001   Berlin    23.578637  51.845766   
4   5 2025-02-24 18:48:29.979320    WS_004     Köln    22.553310  82.891698   

      pressure  wind_speed  wind_direction  
0  1013.250000    5.500000      180.000000  
1  1016.179505    1.832683      192.848921  
2   889.039772    5.873184      192.258300  
3  1016.591158    7.674495      157.488590  
4  1015.365444    7.246051      153.757616  

### Definition der Datenvalidierung

#### Expectation Suite hinzufügen

In [172]:
gx_suite = context.suites.add_or_update(gx.ExpectationSuite(name=EXPECTATION_SUITE))

#### Expectations definieren

In [173]:
# Validerung von NULL-Werten
exp_timestamp = gx.expectations.ExpectColumnValuesToNotBeNull(column="timestamp")

# Validierung von Wertebereichen
exp_temperature = gx.expectations.ExpectColumnValuesToBeBetween(column="temperature", min_value=-50, max_value=60)
exp_humidity = gx.expectations.ExpectColumnValuesToBeBetween(column="humidity", min_value=0, max_value=100)
exp_pressure = gx.expectations.ExpectColumnValuesToBeBetween(column="pressure", min_value=870, max_value=1080)
exp_wind_speed = gx.expectations.ExpectColumnValuesToBeBetween(column="wind_speed", min_value=0, max_value=50)
exp_wind_direction = gx.expectations.ExpectColumnValuesToBeBetween(column="wind_direction", min_value=0, max_value=360)

# Validierung von kategorischen Werten
exp_location = gx.expectations.ExpectColumnValuesToBeInSet(column="location", value_set=["Berlin", "Hamburg", "München", "Köln", "Frankfurt"])
exp_sensor_id = gx.expectations.ExpectColumnValuesToBeInSet(column="sensor_id", value_set=["WS_001", "WS_002", "WS_003", "WS_004", "WS_005"])

# Validierung, ob die Durchschnittstemperatur in Berlin im Sommer ≥ 15°C ist
exp_temperature_location_season = gx.expectations.ExpectColumnValuesToBeBetween(
    column="temperature",
    min_value=15,
    max_value=30,
    condition_parser="pandas",
    row_condition='location == "Berlin" and season == "Summer"'
)


#### Expectations zum GX-Kontext hinzufügen

In [174]:
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_timestamp)
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_temperature)
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_humidity)
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_pressure)
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_wind_speed)
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_wind_direction)
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_location)
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_sensor_id)
context.suites.get(name=EXPECTATION_SUITE).add_expectation(exp_temperature_location_season)

ExpectColumnValuesToBeBetween(id='c6252c4c-2856-4538-a276-a4e7451f9b89', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='temperature', mostly=1, row_condition='location == "Berlin" and season == "Summer"', condition_parser='pandas', min_value=15.0, max_value=30.0, strict_min=False, strict_max=False)

#### Validation-Definition zum Kontext hinzufügen

In [175]:
validation_definition = gx.ValidationDefinition(
    data=batch_definition, 
    suite=context.suites.get(name=EXPECTATION_SUITE), 
    name=VALIDATION_DEFINITION
)
validation_definition = context.validation_definitions.add_or_update(validation_definition)

### Definition des Checkpoints

In [176]:
action_list = [    
    # Diese Aktion aktualisiert die Data Docs statische Website mit den Validierungsergebnissen,
    # nachdem der Checkpoint ausgeführt wurde.
    UpdateDataDocsAction(
        name="update_all_data_docs",
    ),
]

# Checkpoint erstellen
checkpoint = gx.Checkpoint(
    name=CHECKPOINT_NAME,
    validation_definitions=[context.validation_definitions.get(name=VALIDATION_DEFINITION)],
    actions=action_list,
    result_format={"result_format": "COMPLETE"},
)

# Checkpoint zum Kontext hinzufügen
context.checkpoints.add_or_update(checkpoint)

Checkpoint(name='weather_data_validation_checkpoint', validation_definitions=[ValidationDefinition(name='weather_data_validation_definition', data=BatchDefinition(id=UUID('decbb21e-6232-49aa-b5db-7828e4664e10'), name='last_day_batch', partitioner=ColumnPartitionerDaily(column_name='timestamp', sort_ascending=True, method_name='partition_on_year_and_month_and_day')), suite={
  "name": "weather_data_expectation_suite",
  "id": "37e91db3-ee51-4135-83bb-254aff274071",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "timestamp"
      },
      "meta": {},
      "id": "9c6e0dc8-9dca-4cf7-9b7a-56780867f3ce"
    },
    {
      "type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "temperature",
        "min_value": -50.0,
        "max_value": 60.0
      },
      "meta": {},
      "id": "6f303f97-7310-462e-a02e-64a8ef9fc727"
    },
    {
      "type": "expect_column_values_to_be_between",
      "kwargs

### Test-Validierung

In [177]:
# Vorhandenen Checkpoint anhand seines Namens abrufen und ausführen
checkpoint_test = context.checkpoints.get(name=CHECKPOINT_NAME)
checkpoint_test.run()

Calculating Metrics:  10%|█         | 9/90 [00:00<00:00, 102.51it/s]


CheckpointResult(run_id={"run_name": null, "run_time": "2025-02-24T18:57:25.556405+01:00"}, run_results={ValidationResultIdentifier::weather_data_expectation_suite/__none__/20250224T175725.556405Z/postgres_weather_db-measurement-year_2025-month_2-day_24: {
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "timestamp",
          "batch_id": "postgres_weather_db-measurement-year_2025-month_2-day_24"
        },
        "meta": {},
        "id": "9c6e0dc8-9dca-4cf7-9b7a-56780867f3ce"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "MetricConfigurationID(metric_name='table.row_count', metric_domain_kwargs_id='e13455050ab3daa63745efcbd5eda273', metric_value_kwargs_id=())": {
          "exception_traceback": "Traceback (most recent call last):\n  File \"c:\\VOLLMER\\Studium-Repositories\\Data-Management-Fundamentals\\gx-io