In [10]:
import pandas as pd
from pathlib import Path

In [11]:
BASE_DIR = Path("s:/dev/Lakehouse Projekt/lakehouse-demo")
SILVER = BASE_DIR / "data" / "silver"


# Dim Customer

In [12]:

customers = pd.read_parquet(SILVER / "sales" / "customers.parquet")
cities = pd.read_parquet(SILVER / "dimensions" / "cities.parquet")
provinces = pd.read_parquet(SILVER / "dimensions" / "provinces.parquet")
countries = pd.read_parquet(SILVER / "dimensions" / "countries.parquet")
delivery_methods = pd.read_parquet(SILVER / "dimensions" / "delivery_methods.parquet")
people = pd.read_parquet(SILVER / "dimensions" / "people.parquet")


In [13]:
print(f"customers:{customers.shape}")
print(f"cities:{cities.shape}")
print(f"provinces:{provinces.shape}")
print(f"countries:{countries.shape}")
print(f"delivery_methods:{delivery_methods.shape}")
print(f"people:{people.shape}")

customers:(625, 29)
cities:(37940, 8)
provinces:(53, 10)
countries:(190, 14)
delivery_methods:(10, 5)
people:(906, 16)


In [14]:
# provinces -> countries
responsible_market = provinces.merge(
    countries[["country_id", "country_name", "iso_alpha3_code","continent","region","subregion"]],
    on="country_id",
    how="left"
)

responsible_market = responsible_market.merge(
    cities[["state_province_id","city_id", "city_name"]],
    on ="state_province_id",
    how="left"

)

print(f"Shape after Join: {responsible_market.shape}")


Shape after Join: (37940, 17)


In [15]:
df = customers.merge(
    delivery_methods[["delivery_method_id", "delivery_method_name"]],
    on="delivery_method_id",
    how = "left"
)
print(f"Shape nach Join: {df.shape}")


Shape nach Join: (625, 30)


In [16]:
df = df.merge(
    responsible_market[["city_id", "city_name", "country_name","state_province_code","state_province_name","iso_alpha3_code", "continent", "region", "subregion","sales_territory"]],
    left_on="delivery_city_id",
    right_on="city_id",
    how="left"
)
print(f"Shape nach Join: {df.shape}")

Shape nach Join: (625, 40)


In [17]:
df = df.merge(
    people[["person_id","full_name","email_address"]],
    left_on="primary_contact_person_id",
    right_on="person_id",
    how="left"
)

In [19]:
dim_customers = df[[
    "customer_id",
    "customer_name",
    "full_name",
    "email_address",
    "phone_number",
    "fax_number",
    "website_url",
    "bill_to_customer_id",
    "credit_limit",
    "delivery_method_name",
    "delivery_address_line1",
    "delivery_address_line2",
    "delivery_postal_code",
    "postal_address_line1",
    "postal_address_line2",
    "postal_postal_code",
    "city_name",
    "state_province_code",
    "state_province_name",
    "sales_territory",
    "country_name",
    "iso_alpha3_code",
    "continent",
    "region",
    "subregion"
]]

print(f"Shape dim_customers: {dim_customers.shape}")
dim_customers.head()

Shape dim_customers: (625, 25)


Unnamed: 0,customer_id,customer_name,full_name,email_address,phone_number,fax_number,website_url,bill_to_customer_id,credit_limit,delivery_method_name,...,postal_postal_code,city_name,state_province_code,state_province_name,sales_territory,country_name,iso_alpha3_code,continent,region,subregion
0,1,Tailspin Toys (Head Office),,,(308) 555-0100,(308) 555-0101,http://www.tailspintoys.com,1,,Delivery Van,...,90410,Lisco,NE,Nebraska,Plains,United States,USA,North America,Americas,Northern America
1,2,"Tailspin Toys (Sylvanite, MT)",Lorena Cindric,lorena@tailspintoys.com,(406) 555-0100,(406) 555-0101,http://www.tailspintoys.com/Sylvanite,1,,Delivery Van,...,90216,Sylvanite,MT,Montana,Rocky Mountain,United States,USA,North America,Americas,Northern America
2,3,"Tailspin Toys (Peeples Valley, AZ)",Bhaargav Rambhatla,bhaargav@tailspintoys.com,(480) 555-0100,(480) 555-0101,http://www.tailspintoys.com/PeeplesValley,1,,Delivery Van,...,90205,Peeples Valley,AZ,Arizona,Southwest,United States,USA,North America,Americas,Northern America
3,4,"Tailspin Toys (Medicine Lodge, KS)",Daniel Roman,daniel@tailspintoys.com,(316) 555-0100,(316) 555-0101,http://www.tailspintoys.com/MedicineLodge,1,,Delivery Van,...,90152,Medicine Lodge,KS,Kansas,Plains,United States,USA,North America,Americas,Northern America
4,5,"Tailspin Toys (Gasport, NY)",Johanna Huiting,johanna@tailspintoys.com,(212) 555-0100,(212) 555-0101,http://www.tailspintoys.com/Gasport,1,,Delivery Van,...,90261,Gasport,NY,New York,Mideast,United States,USA,North America,Americas,Northern America


In [24]:
# ===== IMPUTATION: FEHLENDE WERTE FÜLLEN =====

# 1. credit_limit: Mit Hauptkundenkredit füllen
print("=== IMPUTATION CREDIT_LIMIT ===")
print(f"Vorher: {dim_customers['credit_limit'].isna().sum()} missing")

# Merge mit Hauptkunde um deren Kreditlimit zu bekommen
main_customer_credits = dim_customers[['customer_id', 'credit_limit']].rename(
    columns={'customer_id': 'bill_to_customer_id', 'credit_limit': 'parent_credit_limit'}
)
dim_customers = dim_customers.merge(main_customer_credits, on='bill_to_customer_id', how='left')

# Füllen: Erst mit Hauptkundenkredit, dann mit Durchschnitt
avg_credit_limit = dim_customers['credit_limit'].mean()
dim_customers['credit_limit'] = (
    dim_customers['credit_limit']
    .fillna(dim_customers['parent_credit_limit'])
    .fillna(avg_credit_limit)
)

print(f"Nachher: {dim_customers['credit_limit'].isna().sum()} missing")
print(f"Durchschnittlicher credit_limit: {avg_credit_limit:.2f}\n")

# 2. full_name: Mit 'Head Office' füllen
print("=== IMPUTATION FULL_NAME & EMAIL_ADDRESS ===")
print(f"Vorher - full_name: {dim_customers['full_name'].isna().sum()} missing")
print(f"Vorher - email_address: {dim_customers['email_address'].isna().sum()} missing")

dim_customers['full_name'] = dim_customers['full_name'].fillna('Head Office')

# 3. email_address: Mit generischer Adresse basierend auf Kundennamen füllen
# Format: info@customername.com (in lowercase, spaces entfernt)
dim_customers['email_address'] = dim_customers.apply(
    lambda row: row['email_address'] if pd.notna(row['email_address'])
    else 'info@' + row['customer_name'].replace(' (Head Office)', '').replace(' ', '').lower() + '.com',
    axis=1
)

print(f"Nachher - full_name: {dim_customers['full_name'].isna().sum()} missing")
print(f"Nachher - email_address: {dim_customers['email_address'].isna().sum()} missing")

# Beispiele anzeigen
print("\nBeispiele der gefüllten Daten:")
filled_mask = dim_customers['customer_name'].str.contains('Head Office', na=False)
print(dim_customers[filled_mask][['customer_name', 'full_name', 'email_address', 'credit_limit']].head(5))

=== IMPUTATION CREDIT_LIMIT ===
Vorher: 0 missing
Nachher: 0 missing
Durchschnittlicher credit_limit: 2607.68

=== IMPUTATION FULL_NAME & EMAIL_ADDRESS ===
Vorher - full_name: 0 missing
Vorher - email_address: 76 missing
Nachher - full_name: 0 missing
Nachher - email_address: 0 missing

Beispiele der gefüllten Daten:
                   customer_name         full_name          email_address  \
0    Tailspin Toys (Head Office)       Head Office  info@tailspintoys.com   
201   Wingtip Toys (Head Office)  Olga Alexandrova   olga@wingtiptoys.com   

     credit_limit  
0     2607.680493  
201   2607.680493  


In [None]:

cols_to_drop = [col for col in dim_customers.columns if 'parent_credit_limit' in col]
if cols_to_drop:
    dim_customers = dim_customers.drop(columns=cols_to_drop)
    print(f"Entfernt: {cols_to_drop} ✓")
else:
    print("Keine parent_credit_limit Spalten gefunden")

Entfernt: ['parent_credit_limit_x', 'parent_credit_limit_y'] ✓

Finale Spaltenanzahl: 25
Spalten: ['customer_id', 'customer_name', 'full_name', 'email_address', 'phone_number', 'fax_number', 'website_url', 'bill_to_customer_id', 'credit_limit', 'delivery_method_name', 'delivery_address_line1', 'delivery_address_line2', 'delivery_postal_code', 'postal_address_line1', 'postal_address_line2', 'postal_postal_code', 'city_name', 'state_province_code', 'state_province_name', 'sales_territory', 'country_name', 'iso_alpha3_code', 'continent', 'region', 'subregion']


In [None]:

column_mapping = {
    # Customer Info
    'customer_id': 'customer_id',
    'customer_name': 'customer_name',
    'bill_to_customer_id': 'parent_customer_id',
    'credit_limit': 'credit_limit_amount',
    
    # Contact Info
    'full_name': 'contact_full_name',
    'email_address': 'contact_email',
    'phone_number': 'contact_phone',
    'fax_number': 'contact_fax',
    'website_url': 'customer_website',
    
    # Delivery Address
    'delivery_address_line1': 'delivery_address_additional',
    'delivery_address_line2': 'delivery_street_address',
    'delivery_postal_code': 'delivery_postal_code',
    'delivery_method_name': 'delivery_method',
    
    # Postal Address
    'postal_address_line1': 'postal_address_additional',
    'postal_address_line2': 'postal_street_address',
    'postal_postal_code': 'postal_code',
    
    # Geographic Info
    'city_name': 'city',
    'state_province_code': 'state_code',
    'state_province_name': 'state_name',
    'country_name': 'country',
    'iso_alpha3_code': 'country_code_iso3',
    'continent': 'continent',
    'region': 'region',
    'subregion': 'subregion',
    
    # Sales Info
    'sales_territory': 'sales_territory'
}

dim_customers_gold = dim_customers.rename(columns=column_mapping)

column_order = [
    # Customer Basics
    'customer_id', 'customer_name', 'parent_customer_id',
    # Contact Info
    'contact_full_name', 'contact_email', 'contact_phone', 'contact_fax',
    'customer_website',
    # Credit & Delivery
    'credit_limit_amount', 'delivery_method',
    # Delivery Address
    'delivery_postal_code', 'delivery_street_address', 'delivery_address_additional'
    # Postal Address
     'postal_code','postal_street_address','postal_address_additional',
    # Geographic
    'city', 'state_code', 'state_name', 'country', 'country_code_iso3',
    'continent', 'region', 'subregion',
    # Sales Territory
    'sales_territory'
]

dim_customers_gold = dim_customers_gold[column_order]


=== COLUMN RENAME - GOLD LAYER STANDARDS ===

Neue Spalten (25):

   1. customer_id
   2. customer_name
   3. parent_customer_id
   4. contact_full_name
   5. contact_email
   6. contact_phone
   7. contact_fax
   8. customer_website
   9. credit_limit_amount
  10. delivery_method
  11. delivery_address_additional
  12. delivery_street_address
  13. delivery_postal_code
  14. postal_address_additional
  15. postal_street_address
  16. postal_code
  17. city
  18. state_code
  19. state_name
  20. country
  21. country_code_iso3
  22. continent
  23. region
  24. subregion
  25. sales_territory

✓ Gold Layer Dimension erfolgreich erstellt!
  Shape: (625, 25)

Vorschau:
   customer_id                  customer_name  parent_customer_id  \
0            1    Tailspin Toys (Head Office)                   1   
1            2  Tailspin Toys (Sylvanite, MT)                   1   

  contact_full_name            contact_email   contact_phone     contact_fax  \
0       Head Office    info@tailspi