In [2]:
from datetime import datetime, timedelta
import numpy as np

def generate_dates_for_user(user_group, start_date_base, date_range_days):
    """
    Generate simulated order timestamps for a single user's orders.

    The user's first order is placed a random number of days (in
    ``[0, date_range_days)``) after ``start_date_base``. Each later order is
    offset by the running sum of ``days_since_prior_order`` (missing values
    are treated as 14 days; the first row's value is ignored). Every date is
    then moved forward by 0-6 days so that its weekday matches ``order_dow``,
    and ``order_hour_of_day`` plus a random minute and second are applied.

    The weekday shift is computed independently per order, so when
    ``order_dow`` and ``days_since_prior_order`` disagree the gap between two
    consecutive generated dates can differ from ``days_since_prior_order`` by
    up to 6 days.

    Randomness comes from a private ``RandomState`` seeded with
    ``user_id + 42``: results are reproducible per user and NumPy's global
    random state is left untouched.

    Parameters
    ----------
    user_group : DataFrame
        Rows for one user, already in the desired order. Must have:
        user_id, order_dow, order_hour_of_day, days_since_prior_order.
    start_date_base : datetime-like
        Earliest possible date for the user's first order. Must support
        ``+ timedelta`` and return objects with ``weekday()`` / ``replace()``.
    date_range_days : int
        Width (in days) of the window the first order is drawn from; >= 1.

    Returns
    -------
    Series
        One timestamp per row, indexed like ``user_group``.

    Raises
    ------
    ValueError
        If ``date_range_days`` is smaller than 1.
    """
    if date_range_days < 1:
        raise ValueError(
            f"date_range_days must be a positive integer, got {date_range_days!r}"
        )

    # Nothing to generate; return an empty, correctly typed result
    if len(user_group) == 0:
        return pd.Series([], index=user_group.index, dtype='datetime64[ns]')

    # Per-user generator: same stream as np.random.seed(user_id + 42) would
    # give, but without mutating the global NumPy random state
    user_id = int(user_group['user_id'].iloc[0])
    rng = np.random.RandomState(user_id + 42)

    # Random start date for this user
    random_days = rng.randint(0, date_range_days)
    user_start_date = start_date_base + timedelta(days=int(random_days))

    # Fill missing days_since_prior_order with default value
    days_since = user_group['days_since_prior_order'].fillna(14).to_numpy()

    # First order is at day 0, subsequent orders add days_since_prior_order
    cumulative_days = np.concatenate([[0], np.cumsum(days_since[1:])])

    # Pull the per-order columns once instead of building a row Series per order
    target_dows = user_group['order_dow'].to_numpy()
    hours = user_group['order_hour_of_day'].to_numpy()

    order_dates = []
    for offset, target_dow, hour in zip(cumulative_days, target_dows, hours):
        base_date = user_start_date + timedelta(days=int(offset))

        # Convert: Instacart uses Sunday=0, Python uses Monday=0
        target_dow_python = (int(target_dow) + 6) % 7
        # Days to move forward (0-6) to land on the target weekday
        dow_diff = (target_dow_python - base_date.weekday()) % 7
        adjusted_date = base_date + timedelta(days=dow_diff)

        # Minute is drawn before second for every order; keep this order so
        # the generated values stay stable for a given user_id
        minute = rng.randint(0, 60)
        second = rng.randint(0, 60)
        order_dates.append(
            adjusted_date.replace(hour=int(hour), minute=minute, second=second)
        )

    # Return series with the same index as the group
    return pd.Series(order_dates, index=user_group.index)


def generate_order_dates_optimized(orders_df, start_date='2025-01-01', date_range_days=365):
    """
    Generate simulated order dates for all users, one user group at a time.

    How it works:
    1. Each user gets a random start offset of 0 .. date_range_days - 1 days
       after start_date
    2. Subsequent orders use days_since_prior_order to calculate dates
    3. Dates are moved forward (up to 6 days) to match the correct day of
       week (order_dow), so dates can fall slightly past the nominal window
    4. Hour of day is added for complete timestamps

    The input frame is not modified; a sorted copy with a fresh 0..n-1 index
    is returned.

    Parameters:
    -----------
    orders_df : DataFrame
        Must have: user_id, order_number, order_dow, order_hour_of_day, days_since_prior_order
    start_date : str
        Starting date for the simulation (format: 'YYYY-MM-DD')
    date_range_days : int
        Range in days for distributing users' first orders

    Returns:
    --------
    DataFrame with added 'order_date' column
    """
    df = orders_df.copy()

    # Sort to ensure chronological order per user
    df = df.sort_values(['user_id', 'order_number']).reset_index(drop=True)

    start_dt = pd.to_datetime(start_date)

    print(f"Generating dates for {df['user_id'].nunique():,} users and {len(df):,} orders...")
    print(f"Date range: {start_date} to {(start_dt + timedelta(days=date_range_days)).date()}")
    print("Using vectorized groupby approach...")

    # Iterate the groups explicitly instead of using groupby.apply:
    #  - each group frame keeps its 'user_id' column, which the per-user
    #    helper reads (apply is dropping grouping columns in newer pandas)
    #  - the result is always a Series, even when there is only one user
    per_user_dates = [
        generate_dates_for_user(user_orders, start_dt, date_range_days)
        for _, user_orders in df.groupby('user_id', sort=False)
    ]

    if per_user_dates:
        # Each piece is indexed by its rows' labels, so assignment aligns by index
        df['order_date'] = pd.concat(per_user_dates)
    else:
        # No orders at all: still add the column, with a datetime dtype
        df['order_date'] = pd.Series(pd.NaT, index=df.index, dtype='datetime64[ns]')

    print("✓ Date generation complete!")
    return df


In [3]:
import pandas as pd

# Read every order, then keep only rows from the 'prior' evaluation set
orders_df = pd.read_csv('../data/orders.csv')
orders_df = orders_df.loc[orders_df['eval_set'] == 'prior']

# Take the 10,000 smallest distinct user_ids
# (a deterministic subset by id, not a random draw)
sample_user_ids = np.sort(orders_df['user_id'].unique())[:10000]

# Orders placed by the selected users
orders_df_with_dates = orders_df.loc[orders_df['user_id'].isin(sample_user_ids)]

# Shapes of the filtered table and of the sampled subset, one per line
print(orders_df.shape, orders_df_with_dates.shape, sep='\n')
orders_df.head(n=5)

(3214874, 7)
(155676, 7)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [8]:
# Read the line items (one row per product in an order) for the prior orders
prior_order_products = pd.read_csv(filepath_or_buffer='../data/order_products__prior.csv')

# Print the table dimensions, then preview the first five rows
print(prior_order_products.shape)
prior_order_products.head(n=5)

(32434489, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [4]:
# Simulate timestamps for the sampled orders.
# Every user's first order is placed at a random offset within 365 days of
# 2025-01-01, i.e. first orders are spread across 2025.
orders_with_dates = generate_order_dates_optimized(
    orders_df_with_dates,
    start_date='2025-01-01',
    date_range_days=365,
)

# Banner: blank line, rule, title, rule
print("\n" + "="*70, "Sample Results:", "="*70, sep="\n")

# Preview the generated dates next to the columns they were derived from
orders_with_dates.loc[
    :,
    [
        'user_id',
        'order_number',
        'order_dow',
        'order_hour_of_day',
        'days_since_prior_order',
        'order_date',
    ],
].head(20)

Generating dates for 10,000 users and 155,676 orders...
Date range: 2025-01-01 to 2026-01-01
Using vectorized groupby approach...
✓ Date generation complete!

Sample Results:
    user_id  order_number  order_dow  order_hour_of_day  \
0         1             1          2                  8   
1         1             2          3                  7   
2         1             3          3                 12   
3         1             4          4                  7   
4         1             5          4                 15   
5         1             6          2                  7   
6         1             7          1                  9   
7         1             8          1                 14   
8         1             9          1                 16   
9         1            10          4                  8   
10        2             1          2                 11   
11        2             2          5                 10   
12        2             3          1                 10   

In [5]:
# Spot-check: every generated order row for user_id 1
orders_with_dates[orders_with_dates['user_id'].eq(1)]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_date
0,2539329,1,prior,1,2,8,,2025-11-25 08:00:49
1,2398795,1,prior,2,3,7,15.0,2025-12-10 07:21:58
2,473747,1,prior,3,3,12,21.0,2025-12-31 12:16:51
3,2254736,1,prior,4,4,7,29.0,2026-01-29 07:17:59
4,431534,1,prior,5,4,15,28.0,2026-02-26 15:27:02
5,3367565,1,prior,6,2,7,19.0,2026-03-17 07:46:30
6,550135,1,prior,7,1,9,20.0,2026-04-06 09:23:00
7,3108588,1,prior,8,1,14,14.0,2026-04-20 14:35:50
8,2295261,1,prior,9,1,16,0.0,2026-04-20 16:59:11
9,2550362,1,prior,10,4,8,30.0,2026-05-21 08:25:01


In [7]:
# Optional export of the dated orders to CSV (intentionally left disabled):
# orders_with_dates.to_csv('../data/initial_load_orders.csv', index=False)

In [12]:
# Keep only the line items whose order_id appears among the dated orders
order_products = prior_order_products.loc[
    prior_order_products['order_id'].isin(orders_with_dates['order_id'])
]

# Size of the resulting subset
order_products.shape

(1544933, 4)

In [13]:
# Add the project's ../src directory (relative to the notebook's working
# directory) to the import path, then import the loader class that the cells
# below use to open a database connection and insert DataFrames.
import sys
sys.path.append('../src')
from data_generation import InstacartDataLoader

In [14]:
# Load environment variables from the project-level .env file.
# NOTE(review): presumably this is where the DB_* values read with os.getenv
# in the database configuration cell come from — confirm.
from dotenv import load_dotenv
load_dotenv("../.env")

True

In [17]:
# Database connection parameters, each read from an environment variable.
# os.getenv returns None for a variable that is not set.
import os
db_config = dict(
    host=os.getenv('DB_HOST'),
    port=os.getenv('DB_PORT'),
    database=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
)

# Locations handed to the loader: data directory and insertion log file
log_file = '../logs/data_insertion.log'
data_dir = '../data'

# Create data loader instance
loader = InstacartDataLoader(db_config, data_dir, log_file)

In [19]:
# Insert orders data
conn = loader.get_db_connection()

try:
    print("Inserting orders...")
    # Only these columns are handed to the loader for the instacart.orders table
    order_columns = ['order_id', 'user_id', 'order_date']

    loader.insert_dataframe_batch(conn, orders_with_dates, 'instacart.orders', order_columns)
    print(f"✓ Successfully inserted {len(orders_with_dates)} orders")

except Exception as e:
    print(f"Error inserting orders: {e}")
    conn.rollback()
    # Re-raise after rolling back: a failed orders load must stop the run
    # instead of letting the order-products insert proceed without its orders
    raise
finally:
    # Always release the connection, whether the insert succeeded or failed
    conn.close()

Inserting orders...


2025-12-21 14:49:44,865 - INFO - Inserted 155676 rows into instacart.orders


✓ Successfully inserted 155676 orders


In [20]:
# Insert order products data
conn = loader.get_db_connection()

try:
    print("Inserting order products...")
    # Only these columns are handed to the loader for the instacart.order_products table
    order_product_columns = ['order_id', 'product_id', 'add_to_cart_order']

    loader.insert_dataframe_batch(conn, order_products, 'instacart.order_products', order_product_columns)
    print(f"✓ Successfully inserted {len(order_products)} order products")

except Exception as e:
    print(f"Error inserting order products: {e}")
    conn.rollback()
    # Re-raise after rolling back so the failure surfaces as a cell error
    # with its traceback rather than only a one-line message
    raise
finally:
    # Always release the connection, whether the insert succeeded or failed
    conn.close()

Inserting order products...


2025-12-21 14:51:23,815 - INFO - Inserted 1544933 rows into instacart.order_products


✓ Successfully inserted 1544933 order products
