In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Seed for reproducibility
random.seed(42)
np.random.seed(42)

# Define dish pools (120+ veg Indian dishes, categorized)
breakfast_dishes = [
    'Poha', 'Upma', 'Idli', 'Dosa', 'Masala Dosa', 'Pongal', 'Uttapam', 'Dibba Rotti', 'Paniyaram', 'Wheat Upma',
    'Sevai', 'Aloo Paratha', 'Puri Bhaji', 'Chole Bhature', 'Vegetable Sandwich', 'Thepla', 'Oondees', 'Cheera Doi',
    'Hurum', 'Kumol Sawul', 'Bora Sawul', 'Hando Guri', 'Sabudana Khichadi', 'Rava Idli', 'Ragi Dosa', 'Neer Dosa',
    'Appam', 'Akki Rotti', 'Rava Dosa', 'Set Dosa', 'Poori', 'Paratha', 'Medu Vada'
]  # 33 dishes

lunch_dinner_dishes = [
    'Khichdi', 'Dal Tadka', 'Chana Masala', 'Aloo Matar', 'Matar Paneer', 'Palak Paneer', 'Rajma', 'Sambar', 'Rasam',
    'Avial', 'Poriyal', 'Pachadi', 'Theeyal', 'Olan', 'Kadhi Pakoda', 'Dal Makhani', 'Aloo Gobi', 'Bhindi Masala',
    'Gobhi Matar', 'Aloo Shimla Mirch', 'Methi Saag', 'Chaulai Saag', 'Mixed Veg Curry', 'Lobiya', 'Dalithoy',
    'Daal Dhokli', 'Paneer Butter Masala', 'Shahi Paneer', 'Khoya Paneer', 'Navrattan Korma', 'Kadai Paneer',
    'Dum Aloo', 'Aloo Phalliyaan', 'Gatte ki Sabzi', 'Kofta', 'Lauki ke Kofte', 'Aloo Tuk', 'Jalfrezi', 'Karela Bharta',
    'Mushroom Do Pyaza', 'Mushroom Matar', 'Batata Saung', 'Ennai Kathirikkai', 'Kaara Kozhambu',
    'Keerai Koottu', 'Kos Kootu', 'Koottu', 'Kuzhambu', 'Veg Kolhapuri', 'Undhiyu', 'Turiya Patra Vatana Sabji',
    'Mag Dhokli', 'Pav Bhaji', 'Jeera Aloo', 'Shukto', 'Mishti Chholar Dal', 'Veg Biryani', 'Jeera Rice', 'Curd Rice',
    'Kheer', 'Fruit Custard', 'Green Salad', 'Chapati', 'Baingan Bharta', 'Masoor Dal', 'Mix Veg Curry'
]  # 67 dishes (shared for lunch/dinner)

snack_dishes = [
    'Maggi', 'Sandwich', 'Bread Pakoda', 'Tea', 'Coffee', 'Milk', 'Samosa', 'Spring Rolls', 'French Fries', 'Pav Bhaji',
    'Chaat', 'Aloo Tikki', 'Kachori', 'Mirchi Bada', 'Pani Puri', 'Papad', 'Missi Roti', 'Naan Khatai', 'Bonda',
    'Goli Bajje', 'Kodubale', 'Kuzhakkattai', 'Thattai', 'Sandige', 'Bhajji', 'Dahi Vada', 'Dhokla', 'Dabeli', 'Chevdo',
    'Chorafali', 'Khaman', 'Bread Pakora', 'Khakhra', 'Khandvi', 'Methi na Gota', 'Patra', 'Shakarpara', 'Shankarpali',
    'Thalipeeth', 'Chakri', 'Muthiya', 'Rasya Muthia', 'Sev Khamani', 'Sev Tameta', 'Lilva Kachori', 'Farsi Puri'
]  # 46 dishes (light evening)

# Popularity scores (power-law: top few high, tail low)
all_dishes = set(breakfast_dishes + lunch_dinner_dishes + snack_dishes)
popularity = {dish: max(1, int(1000 / (i+1))) for i, dish in enumerate(sorted(all_dishes))}  # Power-law
# Boost popular ones manually
popular_boosts = {
    'Paneer Butter Masala': 1500, 'Veg Biryani': 1400, 'Aloo Paratha': 1300, 'Palak Paneer': 1200,
    'Dal Tadka': 1100, 'Chana Masala': 1000, 'Rajma': 900, 'Sambar': 800, 'Idli': 700, 'Dosa': 600
}
for dish, boost in popular_boosts.items():
    if dish in popularity:
        popularity[dish] = boost
# Low for less popular
low_pop = {'Khichdi': 80, 'Karela Bharta': 150, 'Lauki ke Kofte': 100}
for dish, low in low_pop.items():
    if dish in popularity:
        popularity[dish] = low

# Real Indian holidays/festivals (clustered, for 2022-2025; approx dates, adjust for actual)
holidays = [
    # 2022
    '2022-01-26',  # Republic Day
    '2022-03-18',  # Holi
    '2022-04-14',  # Ambedkar Jayanti
    '2022-08-15',  # Independence Day
    '2022-08-19',  # Janmashtami
    '2022-10-02',  # Gandhi Jayanti
    '2022-10-24',  # Diwali
    '2022-11-08',  # Guru Nanak Jayanti
    '2022-12-25',  # Christmas
    # 2023 similar...
    # (add more for 2023-2025 similarly; for script, I'll use a function to generate)
    '2023-01-26', '2023-03-08', '2023-04-04', '2023-08-15', '2023-08-30', '2023-10-02', '2023-10-24', '2023-11-12', '2023-12-25',
    '2024-01-26', '2024-03-25', '2024-04-11', '2024-08-15', '2024-08-26', '2024-10-02', '2024-10-31', '2024-11-15', '2024-12-25',
    '2025-01-26', '2025-03-14', '2025-04-10', '2025-08-15', '2025-09-06', '2025-10-02', '2025-10-20', '2025-11-05', '2025-12-25'
]

# Vacation periods (summer May-Jun, winter Dec-Jan, plus holidays)
vacation_periods = [
    (datetime(2022, 5, 20), datetime(2022, 6, 20)),
    (datetime(2022, 12, 20), datetime(2023, 1, 10)),
    (datetime(2023, 5, 20), datetime(2023, 6, 20)),
    (datetime(2023, 12, 20), datetime(2024, 1, 10)),
    (datetime(2024, 5, 20), datetime(2024, 6, 20)),
    (datetime(2024, 12, 20), datetime(2025, 1, 10)),
    (datetime(2025, 5, 20), datetime(2025, 6, 20)),
    (datetime(2025, 12, 20), datetime(2025, 12, 31))
]

# Event clusters (fests Feb-Mar, sports Oct, plus festivals)
event_clusters = [
    (datetime(2022, 2, 15), datetime(2022, 3, 5), 0.7),
    (datetime(2022, 10, 1), datetime(2022, 10, 15), 0.5),
    (datetime(2023, 2, 15), datetime(2023, 3, 5), 0.7),
    (datetime(2023, 10, 1), datetime(2023, 10, 15), 0.5),
    (datetime(2024, 2, 15), datetime(2024, 3, 5), 0.7),
    (datetime(2024, 10, 1), datetime(2024, 10, 15), 0.5),
    (datetime(2025, 2, 15), datetime(2025, 3, 5), 0.7),
    (datetime(2025, 10, 1), datetime(2025, 10, 15), 0.5)
]

# Function to check if date is vacation
def is_vacation(date):
    if date.strftime('%Y-%m-%d') in holidays:
        return 1
    for start, end in vacation_periods:
        if start <= date <= end:
            return 1
    return 0

# Function to check if event (higher prob in clusters, overall ~9%)
def is_event(date):
    if random.random() < 0.06:  # Base 6%
        return 1
    for start, end, prob in event_clusters:
        if start <= date <= end and random.random() < prob:
            return 1
    return 0

# Track recent dishes to avoid repetition (>3/week)
recent_dishes = {meal: [] for meal in ['Breakfast', 'Lunch', 'Snacks', 'Dinner']}

# Generate dates
start_date = datetime(2022, 1, 1)
end_date = datetime(2025, 12, 31)
dates = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]

# Data list
data = []

# Yearly enrollment growth
base_students = 2800
growth_rate = 0.03  # 3%/year

for date in dates:
    year = date.year
    current_base = int(base_students * (1 + growth_rate) ** (year - 2022))
    student_count = random.randint(current_base, current_base + 1400)  # 2800-4200 base range

    vacation_flag = is_vacation(date)
    event_flag = is_event(date) if not vacation_flag else 0  # No event on vacation

    # Adjust student count
    if vacation_flag:
        student_count = int(student_count * random.uniform(0.3, 0.6))  # -40-70%
    elif event_flag:
        student_count = int(student_count * random.uniform(1.1, 1.25))  # +10-25%

    day_of_week = date.strftime('%A')

    # Meal configs
    meals = {
        'Breakfast': {'dishes': breakfast_dishes, 'num': random.randint(5, 8), 'participation': (0.48, 0.68), 'avg_dishes': (1.8, 2.5)},
        'Lunch': {'dishes': lunch_dinner_dishes, 'num': random.randint(6, 10), 'participation': (0.78, 0.96), 'avg_dishes': (2.5, 3.5)},
        'Snacks': {'dishes': snack_dishes, 'num': random.randint(1, 2), 'participation': (0.65, 0.80), 'avg_dishes': (1.2, 1.8)},
        'Dinner': {'dishes': lunch_dinner_dishes, 'num': random.randint(5, 9), 'participation': (0.62, 0.88), 'avg_dishes': (2.3, 3.0)}
    }

    # Adjust participation for day type (your rules)
    if event_flag:
        adj_part = (1.1, 1.25)  # +10-25%
    elif vacation_flag:
        adj_part = (0.65, 0.8)  # -20-35%
    else:
        adj_part = (1.0, 1.0)

    # Boost Fri/Mon lunch/dinner
    if day_of_week in ['Friday', 'Monday'] and random.random() > 0.5:
        adj_part = (adj_part[0] * 1.05, adj_part[1] * 1.05)

    for meal_type, config in meals.items():
        part_min, part_max = config['participation']
        part_min *= adj_part[0]
        part_max *= adj_part[1]
        participation = random.uniform(part_min, part_max)
        total_eaters = int(student_count * participation)

        # Select dishes (avoid recent repeats)
        avail_dishes = [d for d in config['dishes'] if recent_dishes[meal_type].count(d) < 3]  # <=2/week, but week approx
        if len(avail_dishes) < config['num']:
            avail_dishes = config['dishes']  # Fallback
        selected_dishes = random.sample(avail_dishes, config['num'])
        recent_dishes[meal_type].extend(selected_dishes)
        if len(recent_dishes[meal_type]) > 21:  # Approx week (3 meals/day x7)
            recent_dishes[meal_type] = recent_dishes[meal_type][-21:]

        # Distribute servings
        avg_dishes_min, avg_dishes_max = config['avg_dishes']
        avg_dishes = random.uniform(avg_dishes_min, avg_dishes_max)
        total_servings_needed = int(total_eaters * avg_dishes)

        pops = np.array([popularity.get(d, 500) for d in selected_dishes])
        pops = pops / pops.sum()  # Normalize to probs
        servings = np.random.multinomial(total_servings_needed, pops)

        # Apply min/max per dish (your rules)
        num_dishes = len(selected_dishes)
        min_serving = int(0.6 * total_eaters / num_dishes)
        max_serving = int(random.uniform(1.8, 2.5) * (total_eaters / num_dishes))
        servings = np.clip(servings, min_serving, max_serving)

        # Normalize to exact total if needed
        diff = total_servings_needed - servings.sum()
        if diff > 0:
            servings += np.random.multinomial(diff, pops)
        elif diff < 0:
            over = np.where(servings > min_serving)[0]
            if len(over):
                servings[over[:abs(diff)]] -= 1

        for i, dish in enumerate(selected_dishes):
            data.append({
                'Date': date.strftime('%Y-%m-%d'),
                'Day': day_of_week,
                'Meal_Type': meal_type,
                'Dish_Name': dish,
                'Servings': max(0, servings[i]),  # Ensure no negative
                'Student_Count': student_count,
                'Event_Flag': event_flag,
                'Vacation_Flag': vacation_flag,
                'Total_Eaters': total_eaters
            })

# Create DF and save
df = pd.DataFrame(data)
df.to_csv('smartbite_cafeteria_demand_2022_2025.csv', index=False)
print(f"SmartBite Dataset Generated {len(df)} rows. Saved to CSV.")


SmartBite Dataset Generated 33567 rows. Saved to CSV.
