## Medallion Architecture
- https://www.databricks.com/glossary/medallion-architecture
- https://learn.microsoft.com/en-us/fabric/onelake/onelake-medallion-lakehouse-architecture
- ![](https://www.databricks.com/sites/default/files/inline-images/building-data-pipelines-with-delta-lake-120823.png)

## Data Source
- A data source can be a database, CSV files, an API, etc.
- In this demo, five data sources are generated with Faker, a Python library for synthetic data generation.
- They contain semi-structured (JSON-like) data.

In [0]:
%pip install faker-food

Collecting faker-food
  Downloading faker_food-0.3.0-py3-none-any.whl.metadata (2.2 kB)
Collecting Faker>=12.0.0 (from faker-food)
  Downloading faker-37.8.0-py3-none-any.whl.metadata (15 kB)
Downloading faker_food-0.3.0-py3-none-any.whl (11 kB)
Downloading faker-37.8.0-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker, faker-food
Successfully installed Faker-37.8.0 faker-food-0.3.0
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from faker import Faker
from faker_food import FoodProvider

# Seed Faker before creating any instance so the generated values repeat
# when the notebook is run from a fresh kernel.
Faker.seed(111)

fake = Faker("en_CA")
fake.add_provider(FoodProvider)  # adds food generators such as fake.fruit()

# Pool of five fruit names that the recipe generators below pick from.
fruits = []
for _ in range(5):
    fruits.append(fake.fruit())
fruits

['Incaberries', 'Tangelo', 'Limes', 'Kiwi Fruit', 'Longan']

## Five Data Sources
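- Synthetic data with a linear regression pattern: each source shifts the fruit-weight and calorie ranges by a larger offset (`incremental_pattern`), so calories rise together with the weights across the five sources.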

In [0]:
import random

# Column names for the generated rows, in the same order as each row tuple.
schema = [
    "drink_name",
    "creation_date",
    "ingredients",
    "calories",
]

NUMBER_OF_ROWS = 200
# Offset added to the lower and upper bound of every weight/calorie range
# for this data source.
incremental_pattern = 0

data = []
for _ in range(NUMBER_OF_ROWS):
    data.append(
        (
            fake.word() + "_" + fake.word(),
            # Relative bounds: the dates depend on the day the cell is run.
            fake.date_between(start_date="-1y", end_date="-1d"),
            {
                # Pick fruits with Faker's own generator (seeded via Faker.seed)
                # instead of the unseeded global `random` module, so a fresh
                # run reproduces the same rows. choice() also avoids
                # hard-coding the last index of `fruits`.
                "fruit1": fake.random.choice(fruits),
                "fruit1_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
                "fruit2": fake.random.choice(fruits),
                "fruit2_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
            },
            round(fake.random.uniform(100 + incremental_pattern, 300 + incremental_pattern), 4),
        )
    )

df = spark.createDataFrame(schema=schema, data=data)
# Expose the rows to SQL cells under the name vRecipe1.
df.createOrReplaceTempView("vRecipe1")
display(df)

drink_name,creation_date,ingredients,calories
error_ipsam,2025-03-16,"Map(fruit1 -> Longan, fruit1_weight -> 89.76, fruit2 -> Longan, fruit2_weight -> 111.72)",133.9259
earum_voluptatem,2025-02-17,"Map(fruit1 -> Incaberries, fruit1_weight -> 72.97, fruit2 -> Incaberries, fruit2_weight -> 117.04)",183.2463
molestiae_hic,2025-02-02,"Map(fruit1 -> Incaberries, fruit1_weight -> 130.63, fruit2 -> Kiwi Fruit, fruit2_weight -> 122.46)",254.5222
praesentium_aliquid,2025-01-09,"Map(fruit1 -> Tangelo, fruit1_weight -> 126.04, fruit2 -> Kiwi Fruit, fruit2_weight -> 73.88)",213.7422
nemo_at,2025-07-31,"Map(fruit1 -> Kiwi Fruit, fruit1_weight -> 126.13, fruit2 -> Longan, fruit2_weight -> 60.78)",119.8595
officia_esse,2025-01-19,"Map(fruit1 -> Longan, fruit1_weight -> 137.94, fruit2 -> Incaberries, fruit2_weight -> 100.03)",260.7561
voluptates_adipisci,2025-02-17,"Map(fruit1 -> Tangelo, fruit1_weight -> 100.32, fruit2 -> Limes, fruit2_weight -> 116.71)",290.2142
incidunt_possimus,2025-03-12,"Map(fruit1 -> Longan, fruit1_weight -> 98.98, fruit2 -> Incaberries, fruit2_weight -> 135.69)",121.5037
vero_sit,2024-11-12,"Map(fruit1 -> Kiwi Fruit, fruit1_weight -> 127.04, fruit2 -> Longan, fruit2_weight -> 66.85)",293.6089
odit_quibusdam,2025-04-18,"Map(fruit1 -> Longan, fruit1_weight -> 147.39, fruit2 -> Tangelo, fruit2_weight -> 86.7)",140.8213


In [0]:
%sql
-- Materialize the vRecipe1 temp view as a table, replacing the table if it already exists.
CREATE OR REPLACE TABLE odp_hackathon25.source_drink_recipe.drink_recipe_1 AS
SELECT *
FROM vRecipe1;

num_affected_rows,num_inserted_rows


In [0]:
import random

# Column names for the generated rows, in the same order as each row tuple.
schema = [
    "drink_name",
    "creation_date",
    "ingredients",
    "calories",
]

NUMBER_OF_ROWS = 200
# Offset added to the lower and upper bound of every weight/calorie range
# for this data source.
incremental_pattern = 50

data = []
for _ in range(NUMBER_OF_ROWS):
    data.append(
        (
            fake.word() + "_" + fake.word(),
            # Relative bounds: the dates depend on the day the cell is run.
            fake.date_between(start_date="-1y", end_date="-1d"),
            {
                # Pick fruits with Faker's own generator (seeded via Faker.seed)
                # instead of the unseeded global `random` module, so a fresh
                # run reproduces the same rows. choice() also avoids
                # hard-coding the last index of `fruits`.
                "fruit1": fake.random.choice(fruits),
                "fruit1_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
                "fruit2": fake.random.choice(fruits),
                "fruit2_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
            },
            round(fake.random.uniform(100 + incremental_pattern, 300 + incremental_pattern), 4),
        )
    )

df = spark.createDataFrame(schema=schema, data=data)
# Expose the rows to SQL cells under the name vRecipe2.
df.createOrReplaceTempView("vRecipe2")
display(df)

drink_name,creation_date,ingredients,calories
accusantium_laboriosam,2024-11-16,"Map(fruit1 -> Longan, fruit1_weight -> 137.06, fruit2 -> Tangelo, fruit2_weight -> 152.28)",233.9525
ducimus_facilis,2025-02-11,"Map(fruit1 -> Limes, fruit1_weight -> 162.14, fruit2 -> Tangelo, fruit2_weight -> 196.92)",317.9603
perferendis_eveniet,2025-07-14,"Map(fruit1 -> Kiwi Fruit, fruit1_weight -> 184.29, fruit2 -> Limes, fruit2_weight -> 185.82)",201.782
ducimus_accusantium,2024-11-20,"Map(fruit1 -> Incaberries, fruit1_weight -> 185.51, fruit2 -> Limes, fruit2_weight -> 172.62)",307.496
fuga_harum,2025-01-28,"Map(fruit1 -> Longan, fruit1_weight -> 184.5, fruit2 -> Tangelo, fruit2_weight -> 110.74)",271.8141
at_nihil,2024-11-02,"Map(fruit1 -> Tangelo, fruit1_weight -> 127.16, fruit2 -> Longan, fruit2_weight -> 181.97)",225.6337
cumque_rerum,2025-08-22,"Map(fruit1 -> Kiwi Fruit, fruit1_weight -> 165.45, fruit2 -> Incaberries, fruit2_weight -> 126.99)",277.9897
saepe_itaque,2025-04-22,"Map(fruit1 -> Limes, fruit1_weight -> 114.71, fruit2 -> Kiwi Fruit, fruit2_weight -> 193.79)",237.3546
quos_magni,2025-04-03,"Map(fruit1 -> Incaberries, fruit1_weight -> 154.98, fruit2 -> Incaberries, fruit2_weight -> 137.11)",286.3694
illo_saepe,2025-02-02,"Map(fruit1 -> Incaberries, fruit1_weight -> 115.33, fruit2 -> Longan, fruit2_weight -> 124.25)",269.9168


In [0]:
%sql
-- Materialize the vRecipe2 temp view as a table, replacing the table if it already exists.
CREATE OR REPLACE TABLE odp_hackathon25.source_drink_recipe.drink_recipe_2 AS
SELECT *
FROM vRecipe2;

num_affected_rows,num_inserted_rows


In [0]:
import random

# Column names for the generated rows, in the same order as each row tuple.
schema = [
    "drink_name",
    "creation_date",
    "ingredients",
    "calories",
]

NUMBER_OF_ROWS = 200
# Offset added to the lower and upper bound of every weight/calorie range
# for this data source.
incremental_pattern = 75

data = []
for _ in range(NUMBER_OF_ROWS):
    data.append(
        (
            fake.word() + "_" + fake.word(),
            # Relative bounds: the dates depend on the day the cell is run.
            fake.date_between(start_date="-1y", end_date="-1d"),
            {
                # Pick fruits with Faker's own generator (seeded via Faker.seed)
                # instead of the unseeded global `random` module, so a fresh
                # run reproduces the same rows. choice() also avoids
                # hard-coding the last index of `fruits`.
                "fruit1": fake.random.choice(fruits),
                "fruit1_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
                "fruit2": fake.random.choice(fruits),
                "fruit2_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
            },
            round(fake.random.uniform(100 + incremental_pattern, 300 + incremental_pattern), 4),
        )
    )

df = spark.createDataFrame(schema=schema, data=data)
# Expose the rows to SQL cells under the name vRecipe3.
df.createOrReplaceTempView("vRecipe3")
display(df)

drink_name,creation_date,ingredients,calories
blanditiis_explicabo,2024-10-27,"Map(fruit1 -> Longan, fruit1_weight -> 182.37, fruit2 -> Tangelo, fruit2_weight -> 130.44)",260.416
numquam_rerum,2025-01-13,"Map(fruit1 -> Tangelo, fruit1_weight -> 141.53, fruit2 -> Tangelo, fruit2_weight -> 210.1)",358.5102
beatae_sequi,2024-09-17,"Map(fruit1 -> Longan, fruit1_weight -> 214.8, fruit2 -> Kiwi Fruit, fruit2_weight -> 182.09)",294.4526
sapiente_at,2025-06-09,"Map(fruit1 -> Tangelo, fruit1_weight -> 137.86, fruit2 -> Incaberries, fruit2_weight -> 180.79)",337.7018
magnam_pariatur,2025-05-04,"Map(fruit1 -> Incaberries, fruit1_weight -> 175.76, fruit2 -> Limes, fruit2_weight -> 203.82)",315.774
odio_modi,2024-09-24,"Map(fruit1 -> Incaberries, fruit1_weight -> 130.5, fruit2 -> Limes, fruit2_weight -> 204.88)",320.2639
ad_fuga,2024-11-10,"Map(fruit1 -> Tangelo, fruit1_weight -> 127.0, fruit2 -> Longan, fruit2_weight -> 181.82)",322.3133
officiis_aliquam,2025-08-29,"Map(fruit1 -> Limes, fruit1_weight -> 201.01, fruit2 -> Limes, fruit2_weight -> 173.94)",240.9774
vitae_laboriosam,2025-05-01,"Map(fruit1 -> Kiwi Fruit, fruit1_weight -> 193.35, fruit2 -> Kiwi Fruit, fruit2_weight -> 212.02)",373.8146
veniam_id,2024-11-10,"Map(fruit1 -> Longan, fruit1_weight -> 186.42, fruit2 -> Limes, fruit2_weight -> 186.73)",288.7483


In [0]:
%sql
-- Materialize the vRecipe3 temp view as a table, replacing the table if it already exists.
CREATE OR REPLACE TABLE odp_hackathon25.source_drink_recipe.drink_recipe_3 AS
SELECT *
FROM vRecipe3;

num_affected_rows,num_inserted_rows


In [0]:
import random

# Column names for the generated rows, in the same order as each row tuple.
schema = [
    "drink_name",
    "creation_date",
    "ingredients",
    "calories",
]

NUMBER_OF_ROWS = 200
# Offset added to the lower and upper bound of every weight/calorie range
# for this data source.
incremental_pattern = 200

data = []
for _ in range(NUMBER_OF_ROWS):
    data.append(
        (
            fake.word() + "_" + fake.word(),
            # Relative bounds: the dates depend on the day the cell is run.
            fake.date_between(start_date="-1y", end_date="-1d"),
            {
                # Pick fruits with Faker's own generator (seeded via Faker.seed)
                # instead of the unseeded global `random` module, so a fresh
                # run reproduces the same rows. choice() also avoids
                # hard-coding the last index of `fruits`.
                "fruit1": fake.random.choice(fruits),
                "fruit1_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
                "fruit2": fake.random.choice(fruits),
                "fruit2_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
            },
            round(fake.random.uniform(100 + incremental_pattern, 300 + incremental_pattern), 4),
        )
    )

df = spark.createDataFrame(schema=schema, data=data)
# Expose the rows to SQL cells under the name vRecipe4.
df.createOrReplaceTempView("vRecipe4")
display(df)

drink_name,creation_date,ingredients,calories
sit_officia,2025-05-04,"Map(fruit1 -> Limes, fruit1_weight -> 322.01, fruit2 -> Longan, fruit2_weight -> 282.47)",314.2618
exercitationem_vel,2025-08-10,"Map(fruit1 -> Incaberries, fruit1_weight -> 280.89, fruit2 -> Kiwi Fruit, fruit2_weight -> 285.12)",300.0164
incidunt_quis,2025-04-03,"Map(fruit1 -> Kiwi Fruit, fruit1_weight -> 257.6, fruit2 -> Kiwi Fruit, fruit2_weight -> 329.07)",332.4546
veritatis_id,2025-04-03,"Map(fruit1 -> Tangelo, fruit1_weight -> 311.29, fruit2 -> Limes, fruit2_weight -> 253.67)",403.3771
nemo_quisquam,2025-08-18,"Map(fruit1 -> Tangelo, fruit1_weight -> 252.33, fruit2 -> Incaberries, fruit2_weight -> 288.8)",472.4315
optio_fugit,2025-01-08,"Map(fruit1 -> Limes, fruit1_weight -> 326.84, fruit2 -> Incaberries, fruit2_weight -> 311.37)",309.3367
impedit_voluptates,2025-01-15,"Map(fruit1 -> Limes, fruit1_weight -> 329.27, fruit2 -> Incaberries, fruit2_weight -> 340.0)",472.6027
impedit_doloribus,2024-10-25,"Map(fruit1 -> Incaberries, fruit1_weight -> 302.42, fruit2 -> Tangelo, fruit2_weight -> 313.36)",455.4247
deleniti_fuga,2025-02-01,"Map(fruit1 -> Limes, fruit1_weight -> 321.59, fruit2 -> Limes, fruit2_weight -> 299.27)",453.0334
nulla_expedita,2025-08-11,"Map(fruit1 -> Longan, fruit1_weight -> 346.43, fruit2 -> Incaberries, fruit2_weight -> 327.3)",407.9563


In [0]:
%sql
-- Materialize the vRecipe4 temp view as a table, replacing the table if it already exists.
CREATE OR REPLACE TABLE odp_hackathon25.source_drink_recipe.drink_recipe_4 AS
SELECT *
FROM vRecipe4;

num_affected_rows,num_inserted_rows


In [0]:
import random

# Column names for the generated rows, in the same order as each row tuple.
schema = [
    "drink_name",
    "creation_date",
    "ingredients",
    "calories",
]

NUMBER_OF_ROWS = 200
# Offset added to the lower and upper bound of every weight/calorie range
# for this data source.
incremental_pattern = 210

data = []
for _ in range(NUMBER_OF_ROWS):
    data.append(
        (
            fake.word() + "_" + fake.word(),
            # Relative bounds: the dates depend on the day the cell is run.
            fake.date_between(start_date="-1y", end_date="-1d"),
            {
                # Pick fruits with Faker's own generator (seeded via Faker.seed)
                # instead of the unseeded global `random` module, so a fresh
                # run reproduces the same rows. choice() also avoids
                # hard-coding the last index of `fruits`.
                "fruit1": fake.random.choice(fruits),
                "fruit1_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
                "fruit2": fake.random.choice(fruits),
                "fruit2_weight": round(fake.random.uniform(50 + incremental_pattern, 150 + incremental_pattern), 2),
            },
            round(fake.random.uniform(100 + incremental_pattern, 300 + incremental_pattern), 4),
        )
    )

df = spark.createDataFrame(schema=schema, data=data)
# Expose the rows to SQL cells under the name vRecipe5.
df.createOrReplaceTempView("vRecipe5")
display(df)

drink_name,creation_date,ingredients,calories
rerum_soluta,2025-08-14,"Map(fruit1 -> Longan, fruit1_weight -> 284.22, fruit2 -> Limes, fruit2_weight -> 318.63)",356.6565
ipsum_perferendis,2024-11-01,"Map(fruit1 -> Longan, fruit1_weight -> 319.7, fruit2 -> Tangelo, fruit2_weight -> 295.43)",336.8422
occaecati_aperiam,2025-05-22,"Map(fruit1 -> Incaberries, fruit1_weight -> 336.79, fruit2 -> Longan, fruit2_weight -> 334.34)",436.1212
quia_quam,2025-03-08,"Map(fruit1 -> Incaberries, fruit1_weight -> 338.81, fruit2 -> Kiwi Fruit, fruit2_weight -> 326.34)",352.6457
sint_mollitia,2025-09-03,"Map(fruit1 -> Incaberries, fruit1_weight -> 279.35, fruit2 -> Tangelo, fruit2_weight -> 263.45)",435.3751
quas_dolorem,2025-01-31,"Map(fruit1 -> Kiwi Fruit, fruit1_weight -> 276.03, fruit2 -> Longan, fruit2_weight -> 271.72)",433.5896
illum_sapiente,2025-08-16,"Map(fruit1 -> Limes, fruit1_weight -> 321.12, fruit2 -> Kiwi Fruit, fruit2_weight -> 277.21)",327.3421
consequatur_maxime,2025-05-07,"Map(fruit1 -> Longan, fruit1_weight -> 286.23, fruit2 -> Tangelo, fruit2_weight -> 332.87)",507.1815
voluptatibus_inventore,2025-09-07,"Map(fruit1 -> Kiwi Fruit, fruit1_weight -> 337.77, fruit2 -> Tangelo, fruit2_weight -> 320.23)",411.581
libero_porro,2025-03-29,"Map(fruit1 -> Tangelo, fruit1_weight -> 341.51, fruit2 -> Tangelo, fruit2_weight -> 322.15)",368.0797


In [0]:
%sql
-- Materialize the vRecipe5 temp view as a table, replacing the table if it already exists.
CREATE OR REPLACE TABLE odp_hackathon25.source_drink_recipe.drink_recipe_5 AS
SELECT *
FROM vRecipe5;

num_affected_rows,num_inserted_rows


## End of Data Source Demo