# Synthetic Data Generator

Dieses Notebook definiert die Struktur für dein synthetisches Datenset basierend auf dem Mockaroo-JSON-Schema, ohne das JSON zur Laufzeit einzulesen.

In [None]:
# Field definitions for the synthetic data set, transcribed from a
# Mockaroo-style JSON schema. Each entry has an "id" (column name), a "type"
# (generator kind) and an "options" dict with the generator settings.
# "Formula" entries reference other fields by their "id".
schema = [
    {
        "id": "id",
        "type": "Row Number",
        "options": {
            "start": 1124000,
            "step": 1
        }
    },
    {
        "id": "entryDate",
        "type": "Date",
        "options": {
            "min": "2024-05-21",
            "max": "2025-05-21",
            "format": "%d.%m.%Y"
        }
    },
    {
        "id": "materialVerfugbarkeit",
        "type": "Custom List",
        "options": {
            # Spelling must match the string literals used in the formulas
            # below ('Niedrig' in runZeit and termintreue).
            # NOTE(review): the formulas of materialLiegezeit and runZeit also
            # test for 'Mittel', which is not one of these values, while
            # 'Fehlt' is never tested explicitly -- confirm the intended set.
            "values": [
                "Hoch",
                "Niedrig",
                "Fehlt"
            ],
            "weights": [
                0.6,
                0.3,
                0.1
            ]
        }
    },
    {
        "id": "materialBereitstellungszeit",
        "type": "Number",
        "options": {
            "min": 0.1,
            "max": 2.5,
            "decimals": 2
        }
    },
    {
        "id": "materialLiegezeit",
        "type": "Formula",
        "options": {
            # NOTE(review): the 'Mittel' branch is unreachable with the
            # current materialVerfugbarkeit values.
            "formula": "if materialVerfugbarkeit = 'Hoch' then random(0.1, 0.5) else if materialVerfugbarkeit = 'Mittel' then random(0.5, 1.2) else random(1.2, 3)"
        }
    },
    {
        "id": "planRustUndBearbeitungszeit",
        "type": "Number",
        "options": {
            "min": 0.1,
            "max": 1.5,
            "decimals": 2
        }
    },
    {
        "id": "planMaterialBereitstellungszeit",
        "type": "Number",
        "options": {
            "min": 0.1,
            "max": 1.0,
            "decimals": 2
        }
    },
    {
        "id": "arbeitsplatzAuslastung",
        "type": "Number",
        "options": {
            "min": 60,
            "max": 95,
            "decimals": 0
        }
    },
    {
        "id": "nachbearbeitung",
        "type": "Boolean",
        "options": {
            "weights": [
                0.2,
                0.8
            ]
        }
    },
    {
        "id": "produktionsqualitat",
        "type": "Formula",
        "options": {
            "formula": "if nachbearbeitung = true then random(85, 93) else random(94, 100)"
        }
    },
    {
        "id": "setupZeit",
        "type": "Formula",
        "options": {
            "formula": "if arbeitsplatzAuslastung > 85 then random(0.2, 0.9) else random(0, 0.5)"
        }
    },
    {
        "id": "runZeit",
        "type": "Formula",
        "options": {
            # NOTE(review): the 'Mittel' branch is unreachable with the
            # current materialVerfugbarkeit values.
            "formula": "if materialVerfugbarkeit = 'Niedrig' then random(0.8, 2) else if materialVerfugbarkeit = 'Mittel' then random(0.5, 1.5) else random(0.1, 1)"
        }
    },
    {
        "id": "setupAndRuntime",
        "type": "Formula",
        "options": {
            "formula": "setupZeit + runZeit"
        }
    },
    {
        "id": "durchlaufzeit",
        "type": "Formula",
        "options": {
            "formula": "materialLiegezeit + setupAndRuntime"
        }
    },
    {
        "id": "quantity",
        "type": "Number",
        "options": {
            "min": 1,
            "max": 500,
            "decimals": 0
        }
    },
    {
        "id": "durchlaufzeitSoll",
        "type": "Formula",
        "options": {
            "formula": "(planRustUndBearbeitungszeit + planMaterialBereitstellungszeit) * (1 - (arbeitsplatzAuslastung - 75) / 100)"
        }
    },
    {
        "id": "zeitraumBisLiefertermin",
        "type": "Number",
        "options": {
            "min": 5,
            "max": 60,
            "decimals": 0
        }
    },
    {
        "id": "termintreue",
        "type": "Formula",
        "options": {
            "formula": "if durchlaufzeit <= durchlaufzeitSoll AND materialVerfugbarkeit != 'Niedrig' then random(85, 100) else random(60, 85)"
        }
    },
    {
        "id": "liefertreue",
        "type": "Formula",
        "options": {
            "formula": "if termintreue > 90 AND produktionsqualitat > 95 AND zeitraumBisLiefertermin > 20 then random(90, 100) else if termintreue > 75 then random(75, 90) else random(60, 75)"
        }
    },
    {
        "id": "planedTimeFabrication",
        "type": "Formula",
        "options": {
            "formula": "quantity * random(5, 50)"
        }
    },
    {
        "id": "actualTimeFabrication",
        "type": "Formula",
        "options": {
            "formula": "if produktionsqualitat > 95 then planedTimeFabrication * random(0.9, 1.1) else planedTimeFabrication * random(1.1, 1.4)"
        }
    },
    {
        "id": "FTPercent",
        "type": "Formula",
        "options": {
            "formula": "((actualTimeFabrication - planedTimeFabrication) / planedTimeFabrication) * 100"
        }
    },
    {
        "id": "durchlaufZeitH",
        "type": "Number",
        "options": {
            "min": 0,
            "max": 0.01,
            "decimals": 2
        }
    }
]

len(schema)  # number of fields in the schema

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from faker import Faker

In [None]:
# Shared Faker instance; generate_field uses it for names, e-mail addresses
# and dates.
fake = Faker()

def generate_field(field, n):
    """Return a list of ``n`` synthetic values for one schema field.

    Args:
        field: Schema entry. ``field['type']`` selects the generator.
            Settings are read from the nested ``field['options']`` dict (the
            layout used by the schema in this notebook); settings placed
            directly on the field are used as a fallback.
        n: Number of values to generate.

    Returns:
        A list of length ``n``. Types without a generator here (including
        ``'Formula'``, which depends on other columns) yield ``None``
        placeholders.
    """
    ftype = field['type']
    # Nested options win over same-named keys on the field itself.
    opts = {**field, **(field.get('options') or {})}

    if ftype == 'Full Name':
        return [fake.name() for _ in range(n)]

    if ftype == 'Email Address':
        return [fake.email() for _ in range(n)]

    if ftype == 'Integer':
        # np.random.randint excludes the upper bound.
        return list(np.random.randint(opts.get('min', 0), opts.get('max', 100), size=n))

    if ftype == 'Row Number':
        start = opts.get('start', 1)
        step = opts.get('step', 1)
        return [start + i * step for i in range(n)]

    if ftype == 'Number':
        decimals = int(opts.get('decimals', 0))
        drawn = np.random.uniform(opts.get('min', 0), opts.get('max', 100), size=n)
        rounded = np.round(drawn, decimals)
        # Whole-number fields are returned as ints rather than floats like 73.0.
        if decimals <= 0:
            return rounded.astype(int).tolist()
        return rounded.tolist()

    if ftype == 'Custom List':
        values = list(opts.get('values') or [])
        if not values:
            return [None] * n
        weights = opts.get('weights')
        probs = None
        if weights:
            # Normalise so weights need not sum to exactly 1.
            w = np.asarray(weights, dtype=float)
            probs = w / w.sum()
        # Draw indices (not values) so the original Python objects are returned.
        picks = np.random.choice(len(values), size=n, p=probs)
        return [values[i] for i in picks]

    if ftype == 'Boolean':
        # NOTE(review): assumes weights are ordered [True, False] -- confirm
        # against the schema's intent.
        weights = opts.get('weights')
        p_true = 0.5
        if weights:
            total = float(sum(weights))
            if total > 0:
                p_true = weights[0] / total
        return (np.random.random(n) < p_true).tolist()

    if ftype == 'Date':
        start = datetime.strptime(opts.get('min', '2000-01-01'), '%Y-%m-%d')
        end = datetime.strptime(opts.get('max', '2020-12-31'), '%Y-%m-%d')
        dates = [fake.date_between(start_date=start, end_date=end) for _ in range(n)]
        fmt = opts.get('format')
        # With a "format" option the dates are rendered as strings via strftime.
        return [d.strftime(fmt) for d in dates] if fmt else dates

    # Default placeholder for unhandled types
    return [None] * n

# Number of rows to generate
n = 1000

# Build one column per schema field. The schema entries carry their column
# name under "id"; entries keyed by "name" are still accepted.
data = {}
for field in schema:
    column = field['id'] if 'id' in field else field['name']
    data[column] = generate_field(field, n)
df = pd.DataFrame(data)

# Show the first five rows
df.head()
