In [12]:
#!pip install faker

In [13]:
# Cell 1: Setup and Configuration
# Imports
import datetime
import random
import pandas as pd
from faker import Faker
import sqlite3
import os

# --- 1. CONSTANTS AND LOOKUP DATA ---
# Configuration for product generation
WINE_CATEGORIES = [
    ("Rouge", ["Merlot", "Cabernet Sauvignon", "Pinot Noir", "Syrah", "Malbec"]),
    ("Blanc", ["Chardonnay", "Sauvignon Blanc", "Riesling", "Viognier"]),
    ("Rosé", ["Grenache Rosé", "Syrah Rosé", "Cinsault Rosé"]),
    ("Effervescent", ["Champagne", "Crémant", "Prosecco"]),
    ("Spiritueux", ["Whisky", "Rhum", "Cognac", "Armagnac"]),
]

ADJECTIVES = ["Réserve", "Tradition", "Sélection", "Grande Cuvée", "Prestige", "Vieilles Vignes", "Édition Limitée"]
BOTTLE_SIZES = [0.375, 0.5, 0.75, 1.0, 1.5]
SALES_CHANNELS = ["E-com", "Boutique Paris", "Boutique Lyon", "Boutique Bordeaux"]

# --- 2. REPRODUCIBILITY SETUP ---
SEED = 11
rng = random.Random(SEED)
Faker.seed(SEED)
fake = Faker("fr_FR") # Use French locale for customer names

# --- 3. DYNAMIC VARIABLES ---
PRODUCTS_COUNT = 50
CUSTOMERS_START_COUNT = 100
SIMULATION_DAYS = 10
REPLENISH_AMOUNT = 50
STOCK_REPLENISH_CYCLE = 7
ERROR_RATE = 0.1

print(f"Configuration loaded. Simulation seed: {SEED}")

Configuration loaded. Simulation seed: 11


In [14]:
# Cell 2: Utility Functions

def gaussian_clamped(rng, mu, sigma, a, b):
    """Generates a number from a Gaussian distribution, clamped between min (a) and max (b)."""
    val = rng.gauss(mu, sigma)
    return max(a, min(b, val))

def inject_data_errors(df: pd.DataFrame, error_rate: float = 0.1, seed: int = None) -> pd.DataFrame:
    """Injects intentional errors (e.g., negative stock, zero price) for testing ETL robustness."""
    rng_err = random.Random(seed)
    df = df.copy()
    num_errors = int(len(df) * error_rate)
    
    for idx in rng_err.sample(range(len(df)), num_errors):
        error_type = rng_err.choice(["stock", "price", "bottle", "channel"])
        
        if error_type == "stock":
            df.at[idx, "stock_quantity"] = -rng_err.randint(1, 50)
        elif error_type == "price":
            df.at[idx, "unit_price"] = rng_err.choice([0, 999])
        elif error_type == "bottle":
            df.at[idx, "bottle_size_l"] = rng_err.choice([-1, 3, 5])
        elif error_type == "channel":
            df.at[idx, "sales_channel"] = "UNKNOWN_CHANNEL"
            
    return df

def update_inventory_with_sales(inventory: pd.DataFrame, sales: pd.DataFrame) -> pd.DataFrame:
    """Calculates stock levels based on daily sales."""
    sales_summary = sales.groupby("product_id")["quantity"].sum().reset_index()
    inventory = inventory.merge(sales_summary, on="product_id", how="left").fillna(0)
    inventory["stock_quantity"] -= inventory["quantity"].astype(int)
    inventory["stock_quantity"] = inventory["stock_quantity"].clip(lower=0) # Stock cannot be negative
    return inventory.drop(columns=["quantity"])

def replenish_inventory(inventory: pd.DataFrame, amount: int):
    """Simulates a stock replenishment event."""
    inventory["stock_quantity"] += amount
    return inventory

In [15]:
# Cell 3: Primary Data Generation Logic

def generate_inventory_data(products: int, rng: random.Random) -> pd.DataFrame:
    """Generates product inventory data with product details and prices."""
    rows = []
    current_year = datetime.datetime.now().year
    
    for pid in range(1000, 1000 + products):
        category, grape_list = rng.choice(WINE_CATEGORIES)
        grape = rng.choice(grape_list)
        adj = rng.choice(ADJECTIVES)
        year = rng.randint(current_year - 15, current_year)
        
        # Base price logic
        price_base = {"Rouge": 18, "Blanc": 15, "Rosé": 12, "Effervescent": 30, "Spiritueux": 45}[category]
        price = round(gaussian_clamped(rng, price_base, price_base * 0.3, price_base * 0.5, price_base * 3), 2)
        stock_qty = int(gaussian_clamped(rng, 50, 40, 0, 300))
        
        rows.append({
            "product_id": pid,
            "product_name": f"{grape} {year} – {adj}",
            "category": category,
            "year": year,
            "unit_price": price,
            "stock_quantity": stock_qty,
            "bottle_size_l": rng.choice(BOTTLE_SIZES),
            "sales_channel": rng.choice(SALES_CHANNELS)
        })
        
    return pd.DataFrame(rows)

def generate_customers(n: int, start_id: int, rng: random.Random, fake: Faker) -> pd.DataFrame:
    """Generates customer data using Faker (French locale)."""
    return pd.DataFrame([{
        "customer_id": i + start_id,
        "name": fake.name(),
        "email": fake.email(),
        "city": fake.city(),
        "channel": rng.choice(["en ligne", "boutique"])
    } for i in range(n)])

def generate_sales_for_day(day: str, n_orders: int, inventory: pd.DataFrame, customers: pd.DataFrame, rng: random.Random) -> pd.DataFrame:
    """Generates sales transactions for a single day."""
    start_dt = datetime.datetime.fromisoformat(day)
    delta = datetime.timedelta(days=1).total_seconds()
    rows = []
    
    for i in range(n_orders):
        pid = rng.choice(inventory["product_id"].tolist())
        cid = rng.choice(customers["customer_id"].tolist())
        
        # Quantity distribution bias towards single unit sales
        qty = rng.choices([1, 2, 3, 6], weights=[0.6, 0.25, 0.1, 0.05])[0]
        price = inventory[inventory["product_id"] == pid]["unit_price"].values[0]
        
        # 25% chance of applying a discount
        discount = round(rng.uniform(0, 10), 2) if rng.random() < 0.25 else 0.0
        sold_at = start_dt + datetime.timedelta(seconds=rng.randint(0, int(delta)))
        
        rows.append({
            "order_id": i + 1,
            "product_id": pid,
            "customer_id": cid,
            "quantity": qty,
            "unit_price": price,
            "discount": discount,
            "sold_at": sold_at
        })
        
    return pd.DataFrame(rows)

In [16]:
# Cell 4: Main Simulation Loop (VERSION MODIFIÉE)

# --- 1. INITIAL DATA GENERATION ---
print("1. Generating initial customer and inventory data...")
customers_df = generate_customers(CUSTOMERS_START_COUNT, start_id=1, rng=rng, fake=fake)
inventory_df = generate_inventory_data(PRODUCTS_COUNT, rng=rng)
inventory_df = inject_data_errors(inventory_df, error_rate=ERROR_RATE, seed=SEED)

# 🆕 NOUVEAU : Sauvegarder l'état INITIAL (avant simulation)
customers_initial_df = customers_df.copy()  # Copie des clients initiaux
inventory_initial_df = inventory_df.copy()  # Copie de l'inventaire initial

print(f"\n📸 Snapshot initial sauvegardé :")
print(f"   - Clients initiaux : {len(customers_initial_df)}")
print(f"   - Stock initial moyen : {inventory_initial_df['stock_quantity'].mean():.1f}")

next_cust_id = CUSTOMERS_START_COUNT + 1 

start_date = datetime.datetime(2025, 9, 1)
days = [(start_date + datetime.timedelta(days=i)).strftime("%Y-%m-%d") for i in range(SIMULATION_DAYS)]

all_sales = []

# --- 2. DAILY SIMULATION ---
print(f"\n2. Starting daily simulation for {SIMULATION_DAYS} days...")
for i, day in enumerate(days):
    print(f"--- Simulating Day {i+1} ({day}) ---")

    # A. Add new customers (1-3 clients per day up to 120 max)
    if len(customers_df) < 120:
        new_clients = rng.randint(1, 3)
        new_df = generate_customers(new_clients, start_id=next_cust_id, rng=rng, fake=fake)
        next_cust_id += new_clients
        customers_df = pd.concat([customers_df, new_df], ignore_index=True)
        print(f"  > Added {new_clients} new customers. Total customers: {len(customers_df)}")

    # B. Generate daily sales
    daily_orders = rng.randint(90, 110)
    sales_df = generate_sales_for_day(day, daily_orders, inventory_df, customers_df, rng=rng) 
    
    # C. Update inventory based on sales
    inventory_df = update_inventory_with_sales(inventory_df, sales_df)

    # D. Replenish inventory every X days
    if i % STOCK_REPLENISH_CYCLE == 0:
        inventory_df = replenish_inventory(inventory_df, REPLENISH_AMOUNT)

    all_sales.append(sales_df)

# --- 3. FINAL AGGREGATION ---
final_sales_df = pd.concat(all_sales, ignore_index=True)

print("\n✅ Simulation complete!")
print(f"\n📊 Comparaison AVANT/APRÈS :")
print(f"   Clients : {len(customers_initial_df)} → {len(customers_df)} (+{len(customers_df) - len(customers_initial_df)})")
print(f"   Stock moyen : {inventory_initial_df['stock_quantity'].mean():.1f} → {inventory_df['stock_quantity'].mean():.1f}")
print(f"   Total ventes : {len(final_sales_df)} commandes")

1. Generating initial customer and inventory data...

📸 Snapshot initial sauvegardé :
   - Clients initiaux : 100
   - Stock initial moyen : 49.7

2. Starting daily simulation for 10 days...
--- Simulating Day 1 (2025-09-01) ---
  > Added 1 new customers. Total customers: 101
--- Simulating Day 2 (2025-09-02) ---
  > Added 2 new customers. Total customers: 103
--- Simulating Day 3 (2025-09-03) ---
  > Added 1 new customers. Total customers: 104
--- Simulating Day 4 (2025-09-04) ---
  > Added 3 new customers. Total customers: 107
--- Simulating Day 5 (2025-09-05) ---
  > Added 2 new customers. Total customers: 109
--- Simulating Day 6 (2025-09-06) ---
  > Added 3 new customers. Total customers: 112
--- Simulating Day 7 (2025-09-07) ---
  > Added 2 new customers. Total customers: 114
--- Simulating Day 8 (2025-09-08) ---
  > Added 3 new customers. Total customers: 117
--- Simulating Day 9 (2025-09-09) ---
  > Added 1 new customers. Total customers: 118
--- Simulating Day 10 (2025-09-10) 

In [17]:
# Cellule de comparaison AVANT/APRÈS

print("="*60)
print("📸 COMPARAISON AVANT/APRÈS LA SIMULATION")
print("="*60)

# 1. Comparaison des clients
print("\n👥 CLIENTS :")
print(f"   Avant : {len(customers_initial_df)} clients")
print(f"   Après : {len(customers_df)} clients")
print(f"   Nouveaux clients : {len(customers_df) - len(customers_initial_df)}")

# 2. Comparaison de l'inventaire
print("\n📦 INVENTAIRE (exemple produit 1000) :")
product_id = 1000
stock_avant = inventory_initial_df[inventory_initial_df['product_id'] == product_id]['stock_quantity'].values[0]
stock_apres = inventory_df[inventory_df['product_id'] == product_id]['stock_quantity'].values[0]
print(f"   Stock avant : {stock_avant}")
print(f"   Stock après : {stock_apres}")
print(f"   Différence : {stock_apres - stock_avant}")

# 3. Vue détaillée : top 5 produits avec plus grande variation de stock
inventory_comparison = inventory_initial_df[['product_id', 'product_name', 'stock_quantity']].copy()
inventory_comparison.columns = ['product_id', 'product_name', 'stock_initial']
inventory_comparison = inventory_comparison.merge(
    inventory_df[['product_id', 'stock_quantity']], 
    on='product_id'
)
inventory_comparison.columns = ['product_id', 'product_name', 'stock_initial', 'stock_final']
inventory_comparison['variation'] = inventory_comparison['stock_final'] - inventory_comparison['stock_initial']

print("\n🔥 Top 5 produits avec plus grande variation de stock :")
print(inventory_comparison.nlargest(5, 'variation')[['product_name', 'stock_initial', 'stock_final', 'variation']])

# 4. Afficher les DataFrames
print("\n" + "="*60)
print("Vous avez maintenant accès à :")
print("   - customers_initial_df : Clients AVANT simulation")
print("   - inventory_initial_df : Inventaire AVANT simulation")
print("   - customers_df : Clients APRÈS simulation")
print("   - inventory_df : Inventaire APRÈS simulation")
print("   - final_sales_df : Toutes les ventes (1000+ lignes)")

📸 COMPARAISON AVANT/APRÈS LA SIMULATION

👥 CLIENTS :
   Avant : 100 clients
   Après : 121 clients
   Nouveaux clients : 21

📦 INVENTAIRE (exemple produit 1000) :
   Stock avant : 78
   Stock après : 157
   Différence : 79

🔥 Top 5 produits avec plus grande variation de stock :
                              product_name  stock_initial  stock_final  \
46               Riesling 2020 – Tradition              0           86   
9                    Whisky 2025 – Réserve             16           96   
0                Syrah Rosé 2012 – Réserve             78          157   
23    Cinsault Rosé 2010 – Édition Limitée             18           97   
41  Cabernet Sauvignon 2010 – Grande Cuvée             40          119   

    variation  
46         86  
9          80  
0          79  
23         79  
41         79  

Vous avez maintenant accès à :
   - customers_initial_df : Clients AVANT simulation
   - inventory_initial_df : Inventaire AVANT simulation
   - customers_df : Clients APRÈS simul

In [18]:
# Cell 6: Data Integration into Local SQLite (Chinook.db)

# --- 1. CONFIGURATION ---
# IMPORTANT: Use the exact path to your Chinook.db file.
DB_PATH = '/Users/mory_jr/Library/DBeaverData/workspace6/.metadata/sample-database-sqlite-1/Chinook.db'

# Check if the DB file exists
if not os.path.exists(DB_PATH):
    raise FileNotFoundError(f"FATAL: Chinook database not found at: {DB_PATH}. Please verify the path.")

# Define DataFrames and their target table names in SQLite
dataframes_to_load = {
    'customers': customers_df, 
    'inventory': inventory_df, 
    'sales': final_sales_df     
}

# --- 2. CONNECTION AND LOADING PROCESS ---
conn = None
try:
    conn = sqlite3.connect(DB_PATH)
    print(f"SQLite connection established: {DB_PATH}")

    print("\nStarting DataFrame load into SQLite...")

    for table_name, df in dataframes_to_load.items():
        print(f"Loading table '{table_name}' ({len(df)} rows)...")

        # Use to_sql() to perform the data loading. 
        # 'replace' ensures a clean slate for the ETL source.
        df.to_sql(
            table_name,
            conn,
            if_exists='replace', # Options: 'replace', 'append', 'fail'
            index=False         
        )
        print(f"✅ Table '{table_name}' loaded successfully.")

except sqlite3.Error as e:
    print(f"\nFATAL: SQLite error during data loading: {e}")
except NameError as e:
    print(f"\nFATAL: A DataFrame is missing (Did you run Cell 4?): {e}")

finally:
    # 3. Close the connection
    if conn:
        conn.close()
        print("\nSQLite connection closed. Generation and loading process completed successfully.")

# Awaiting further instructions to proceed with the ETL Pipeline...

SQLite connection established: /Users/mory_jr/Library/DBeaverData/workspace6/.metadata/sample-database-sqlite-1/Chinook.db

Starting DataFrame load into SQLite...
Loading table 'customers' (121 rows)...
✅ Table 'customers' loaded successfully.
Loading table 'inventory' (50 rows)...
✅ Table 'inventory' loaded successfully.
Loading table 'sales' (1003 rows)...
✅ Table 'sales' loaded successfully.

SQLite connection closed. Generation and loading process completed successfully.


In [19]:
customers_df

Unnamed: 0,customer_id,name,email,city,channel
0,1,Suzanne Sauvage,julienblanchet@example.com,Lemoine-sur-Gauthier,boutique
1,2,Madeleine Blanchard,tanguysuzanne@example.net,Sainte Catherine,boutique
2,3,Constance Ferrand de Evrard,lopezauguste@example.org,Brunel,boutique
3,4,Nicolas Millet,mmaury@example.net,Riou,en ligne
4,5,Paul Thomas de Albert,anouk81@example.net,Gomes,en ligne
...,...,...,...,...,...
116,117,Catherine Pascal,gmartins@example.com,Henry,boutique
117,118,Constance Fleury Le Lévy,adrienneraymond@example.net,Blanchard-sur-Mer,boutique
118,119,Élodie de Techer,philippineperez@example.org,Martel,boutique
119,120,Patricia-Margaret Boulanger,alvesrene@example.com,Sainte Nathalie-les-Bains,en ligne


In [20]:
inventory_df

Unnamed: 0,product_id,product_name,category,year,unit_price,stock_quantity,bottle_size_l,sales_channel
0,1000,Syrah Rosé 2012 – Réserve,Rosé,2012,16.02,157,0.5,E-com
1,1001,Cognac 2024 – Sélection,Spiritueux,2024,57.52,147,1.5,Boutique Paris
2,1002,Champagne 2014 – Vieilles Vignes,Effervescent,2014,27.58,145,1.5,Boutique Paris
3,1003,Sauvignon Blanc 2016 – Vieilles Vignes,Blanc,2016,13.28,86,1.0,E-com
4,1004,Champagne 2013 – Réserve,Effervescent,2013,52.4,131,0.5,Boutique Bordeaux
5,1005,Syrah Rosé 2025 – Édition Limitée,Rosé,2025,11.4,131,0.375,Boutique Paris
6,1006,Viognier 2012 – Prestige,Blanc,2012,13.1,207,0.5,E-com
7,1007,Pinot Noir 2024 – Grande Cuvée,Rouge,2024,18.01,124,0.75,Boutique Lyon
8,1008,Rhum 2021 – Réserve,Spiritueux,2021,54.53,155,0.75,Boutique Paris
9,1009,Whisky 2025 – Réserve,Spiritueux,2025,62.73,96,0.375,E-com


In [21]:
customers_df

Unnamed: 0,customer_id,name,email,city,channel
0,1,Suzanne Sauvage,julienblanchet@example.com,Lemoine-sur-Gauthier,boutique
1,2,Madeleine Blanchard,tanguysuzanne@example.net,Sainte Catherine,boutique
2,3,Constance Ferrand de Evrard,lopezauguste@example.org,Brunel,boutique
3,4,Nicolas Millet,mmaury@example.net,Riou,en ligne
4,5,Paul Thomas de Albert,anouk81@example.net,Gomes,en ligne
...,...,...,...,...,...
116,117,Catherine Pascal,gmartins@example.com,Henry,boutique
117,118,Constance Fleury Le Lévy,adrienneraymond@example.net,Blanchard-sur-Mer,boutique
118,119,Élodie de Techer,philippineperez@example.org,Martel,boutique
119,120,Patricia-Margaret Boulanger,alvesrene@example.com,Sainte Nathalie-les-Bains,en ligne


In [22]:
final_sales_df

Unnamed: 0,order_id,product_id,customer_id,quantity,unit_price,discount,sold_at
0,1,1001,5,2,57.52,0.00,2025-09-01 20:42:26
1,2,1030,19,1,44.67,1.72,2025-09-01 05:45:12
2,3,1018,87,1,24.35,9.51,2025-09-01 16:49:24
3,4,1004,98,1,52.40,0.00,2025-09-01 15:31:34
4,5,1032,46,1,22.81,0.00,2025-09-01 00:28:56
...,...,...,...,...,...,...,...
998,100,1044,117,1,9.36,0.00,2025-09-10 16:20:34
999,101,1009,23,1,62.73,0.00,2025-09-10 01:20:11
1000,102,1006,83,3,13.10,4.11,2025-09-10 02:38:32
1001,103,1040,59,1,9.83,0.00,2025-09-10 17:35:52
