In [1]:
import pandas as pd
from pathlib import Path

In [2]:
BASE_DIR = Path("s:/dev/Lakehouse Projekt/lakehouse-demo")
SILVER = BASE_DIR / "data" / "silver"


# Dim Customer

In [3]:

customers = pd.read_parquet(SILVER / "sales" / "customers.parquet")
cities = pd.read_parquet(SILVER / "dimensions" / "cities.parquet")
provinces = pd.read_parquet(SILVER / "dimensions" / "provinces.parquet")
countries = pd.read_parquet(SILVER / "dimensions" / "countries.parquet")
delivery_methods = pd.read_parquet(SILVER / "dimensions" / "delivery_methods.parquet")
people = pd.read_parquet(SILVER / "dimensions" / "people.parquet")


In [4]:
print(f"customers:{customers.shape}")
print(f"cities:{cities.shape}")
print(f"provinces:{provinces.shape}")
print(f"countries:{countries.shape}")
print(f"delivery_methods:{delivery_methods.shape}")
print(f"people:{people.shape}")

customers:(625, 29)
cities:(37940, 8)
provinces:(53, 10)
countries:(190, 14)
delivery_methods:(10, 5)
people:(906, 16)


In [5]:
# provinces -> countries
responsible_market = provinces.merge(
    countries[["country_id", "country_name", "iso_alpha3_code","continent","region","subregion"]],
    on="country_id",
    how="left"
)

responsible_market = responsible_market.merge(
    cities[["state_province_id","city_id", "city_name"]],
    on ="state_province_id",
    how="left"

)

print(f"Shape after Join: {responsible_market.shape}")


Shape after Join: (37940, 17)


In [8]:
df = customers.merge(
    delivery_methods[["delivery_method_id", "delivery_method_name"]],
    on="delivery_method_id",
    how = "left"
)
print(f"Shape nach Join: {df.shape}")


Shape nach Join: (625, 30)


In [10]:
df = df.merge(
    responsible_market[["city_id", "city_name", "country_name","state_province_code","state_province_name","iso_alpha3_code", "continent", "region", "subregion","sales_territory"]],
    left_on="delivery_city_id",
    right_on="city_id",
    how="left"
)
print(f"Shape nach Join: {df.shape}")

Shape nach Join: (625, 50)


In [None]:
dim_customers = df[[
    "customer_id",
    "customer_name",
    "bill_to_customer_id",
    "primary_contact_person_id",
    "alternate_contact_person_id",
    "credit_limit",
    "phone_number",
    "fax_number",
    "website_url",
    "delivery_method_name",
    "delivery_address_line1",
    "delivery_address_line2",
    "delivery_postal_code",
    "postal_adress_line1",
    "postal_adress_line2",
    "postal_postal_code",
    "city_name",
    "state_province_code",
    "state_province_name",
    "sales_territory",
    "country_name",
    "iso_alpha3_code",
    "continent",
    "region",
    "subregion"
]]

print(f"Shape dim_customers: {dim_customers.shape}")
dim_customers.head()