In [11]:
import getpass
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# ---- 1. Connection setup ----
def _mysql_engine(user, password, host, port, database):
    """Return a SQLAlchemy engine for one MySQL schema via the PyMySQL driver.

    The URL is assembled with ``URL.create`` rather than an f-string so that
    special characters in the credentials (``@``, ``/``, ``:`` ...) are
    escaped instead of corrupting the connection string.
    """
    return create_engine(
        URL.create(
            "mysql+pymysql",
            username=user,
            password=password,
            host=host,
            port=port,
            database=database,
        )
    )


# Northwind connection
northwind_host = "127.0.0.1"
northwind_port = 3306
northwind_user = "root"
# The password is no longer stored in the notebook: it is read from the
# NORTHWIND_PASSWORD environment variable, falling back to an interactive
# prompt when the variable is unset or empty.
# NOTE(review): the password that used to be committed here should be rotated.
northwind_pwd = os.environ.get("NORTHWIND_PASSWORD") or getpass.getpass(
    "MySQL password for the Northwind connection: "
)
northwind_db = "northwind"

northwind_engine = _mysql_engine(
    northwind_user, northwind_pwd, northwind_host, northwind_port, northwind_db
)

# shuttle_dw connection
shuttle_host = "127.0.0.1"
shuttle_port = 3306
shuttle_user = "root"
# Both schemas were previously accessed with the same account and password,
# so the Northwind password is reused unless SHUTTLE_DW_PASSWORD overrides it.
shuttle_pwd = os.environ.get("SHUTTLE_DW_PASSWORD") or northwind_pwd
shuttle_db = "shuttle_dw"

shuttle_engine = _mysql_engine(
    shuttle_user, shuttle_pwd, shuttle_host, shuttle_port, shuttle_db
)

# ---- 2. Fetch all customers from Northwind ----
# Load every column of the customers table into a DataFrame and preview
# the first rows.
customers_query = "SELECT * FROM customers;"
df_customers = pd.read_sql(customers_query, northwind_engine)
print("Fetched customers:", df_customers.head(), sep="\n")

# ---- 3. Drop dim_customers if exists ----
# Remove any previous copy of the table; the statement runs inside the
# transaction opened by ``shuttle_engine.begin()``.
drop_dim_customers = text("DROP TABLE IF EXISTS dim_customers;")
with shuttle_engine.begin() as connection:
    connection.execute(drop_dim_customers)
print("dim_customers table dropped if it existed.")

# ---- 4. Generate multiple rides per customer ----
# Each customer gets 2-5 synthetic rides, each with a random date between
# start_date and end_date (both inclusive) and a random route.
routes = ["Green Loop", "Orange Loop", "Gold Line", "Silver Line"]
start_date = datetime(2025, 10, 1)
end_date = datetime.today()

# Column order of the generated table. Passed explicitly to the DataFrame
# constructor so that an empty customer set still produces the expected
# columns instead of a column-less frame.
ride_columns = ["customer_id", "first_name", "last_name", "ride_date", "route_ridden"]

# Whole days between the two bounds; computed once instead of per ride.
date_span_days = (end_date - start_date).days
if date_span_days < 0:
    # Without this check numpy fails later with an opaque "low >= high" error.
    raise ValueError(
        f"end_date ({end_date:%Y-%m-%d}) is before start_date ({start_date:%Y-%m-%d})"
    )

all_rides = []

# Only the three columns that are copied into the output are iterated;
# plain tuples avoid building a Series for every customer row.
customer_rows = df_customers[["id", "first_name", "last_name"]].itertuples(
    index=False, name=None
)
for customer_id, first_name, last_name in customer_rows:
    # randomly assign 2-5 rides per customer (upper bound of randint is exclusive)
    num_rides = np.random.randint(2, 6)
    for _ in range(num_rides):
        day_offset = int(np.random.randint(0, date_span_days + 1))
        ride_date = start_date + timedelta(days=day_offset)
        route = np.random.choice(routes)
        all_rides.append({
            "customer_id": customer_id,
            "first_name": first_name,
            "last_name": last_name,
            "ride_date": ride_date.date(),
            "route_ridden": route,
        })

df_dim_customers = pd.DataFrame(all_rides, columns=ride_columns)

# ---- 5. Write to shuttle_dw as dim_customers ----
# Store the generated rides as table dim_customers, replacing any existing
# table of that name; the DataFrame index is not written as a column.
df_dim_customers.to_sql(
    name="dim_customers",
    con=shuttle_engine,
    if_exists="replace",
    index=False,
)
print("dim_customers table created in shuttle_dw database.")

# ---- 6. Show sample data ----
# Preview the first rows and list the column names of the generated table.
sample_rows = df_dim_customers.head()
column_names = list(df_dim_customers.columns)
print("Sample dim_customers table:", sample_rows, sep="\n")
print("Columns in dim_customers:", column_names)


Fetched customers:
   id    company         last_name first_name email_address  \
0   1  Company A            Bedecs       Anna          None   
1   2  Company B  Gratacos Solsona    Antonio          None   
2   3  Company C              Axen     Thomas          None   
3   4  Company D               Lee  Christina          None   
4   5  Company E         O’Donnell     Martin          None   

                   job_title business_phone home_phone mobile_phone  \
0                      Owner  (123)555-0100       None         None   
1                      Owner  (123)555-0100       None         None   
2  Purchasing Representative  (123)555-0100       None         None   
3         Purchasing Manager  (123)555-0100       None         None   
4                      Owner  (123)555-0100       None         None   

      fax_number         address         city state_province zip_postal_code  \
0  (123)555-0101  123 1st Street      Seattle             WA           99999   
1  (123)555-010