# QuickBite Express - Data Exploration

Comprehensive data quality checks and exploration of food delivery datasets.

## Setup

In [23]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
# Silence every Python warning for the rest of the session so cell output stays compact.
# NOTE(review): this is a blanket filter, so library deprecation warnings are hidden too.
warnings.filterwarnings('ignore')

# Directory that holds the dim_*/fact_* CSV files read by the loading cell.
# Set the QUICKBITE_DATA_DIR environment variable to run the notebook against a
# different location; when it is unset, the original path is used unchanged.
DATA_DIR = Path(
    os.environ.get(
        'QUICKBITE_DATA_DIR',
        '/home/parambrata-ghosh/Development/Personal/Projects/Food_Delivery_Startup/input/RPC_18_Datasets',
    )
).expanduser()  # lets an override such as "~/data/RPC_18_Datasets" work

## Load Datasets

In [24]:
def _read_table(file_name):
    """Read one CSV file located directly inside DATA_DIR into a DataFrame."""
    return pd.read_csv(DATA_DIR / file_name)


# Dimension tables (descriptive attributes)
df_customer = _read_table('dim_customer.csv')
df_delivery_partner = _read_table('dim_delivery_partner_.csv')
df_menu_item = _read_table('dim_menu_item.csv')
df_restaurant = _read_table('dim_restaurant.csv')

# Fact tables (orders and the records attached to them)
df_orders = _read_table('fact_orders.csv')
df_order_items = _read_table('fact_order_items.csv')
df_delivery_performance = _read_table('fact_delivery_performance.csv')
df_ratings = _read_table('fact_ratings.csv')

# Quick sanity check on the row counts of the main tables.
print(
    f"Loaded {len(df_customer):,} customers, "
    f"{len(df_restaurant):,} restaurants, "
    f"{len(df_delivery_partner):,} delivery partners"
)
print(f"Loaded {len(df_orders):,} orders, {len(df_ratings):,} ratings")

Loaded 107,776 customers, 19,995 restaurants, 15,000 delivery partners
Loaded 149,166 orders, 68,842 ratings


## Data Structure

In [25]:
# Registry of every loaded table keyed by a short label. Insertion order
# (dimensions first, then facts) is the order the reports are printed in.
datasets = dict(
    customer=df_customer,
    delivery_partner=df_delivery_partner,
    menu_item=df_menu_item,
    restaurant=df_restaurant,
    orders=df_orders,
    order_items=df_order_items,
    delivery_performance=df_delivery_performance,
    ratings=df_ratings,
)

# For each table: a shape headline followed by one line per column showing
# its dtype and the share of null values.
for label, table in datasets.items():
    n_rows, n_cols = table.shape
    print(f"\n{label.upper()}: {n_rows:,} rows × {n_cols} columns")

    # Null share per column, computed for all columns at once.
    null_pct = table.isna().sum() / len(table) * 100

    info_df = pd.DataFrame({
        'Column': table.columns,
        'Type': table.dtypes.values,
        'Null %': [f"{pct:.1f}%" for pct in null_pct],
    })
    print(info_df.to_string(index=False))


CUSTOMER: 107,776 rows × 4 columns
             Column   Type Null %
        customer_id object   0.0%
        signup_date object   0.0%
               city object   0.0%
acquisition_channel object   0.0%

DELIVERY_PARTNER: 15,000 rows × 7 columns
             Column    Type Null %
delivery_partner_id  object   0.0%
       partner_name  object   0.0%
               city  object   0.0%
       vehicle_type  object   0.0%
    employment_type  object   0.0%
         avg_rating float64   0.0%
          is_active  object   0.0%

MENU_ITEM: 342,671 rows × 6 columns
       Column    Type Null %
 menu_item_id  object   0.0%
restaurant_id  object   0.0%
    item_name  object   0.0%
     category  object   0.0%
       is_veg  object   0.0%
        price float64   0.0%

RESTAURANT: 19,995 rows × 7 columns
           Column   Type Null %
    restaurant_id object   0.0%
  restaurant_name object   0.0%
             city object   0.0%
     cuisine_type object   0.0%
     partner_type object   0.0%
av

## Data Quality

In [26]:
# Report only the tables that have a problem: columns containing nulls and/or
# fully duplicated rows. Clean tables produce no output.
for label, table in datasets.items():
    null_counts = table.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    n_duplicates = table.duplicated().sum()

    if null_counts.empty and n_duplicates <= 0:
        continue

    print(f"\n{label.upper()}:")
    # Iterating an empty Series is a no-op, so no extra emptiness check is needed.
    for column, n_missing in null_counts.items():
        print(f"  Missing {column}: {n_missing:,} ({n_missing/len(table)*100:.1f}%)")
    if n_duplicates > 0:
        print(f"  Duplicates: {n_duplicates:,}")


ORDERS:
  Missing delivery_partner_id: 5,635 (3.8%)

RATINGS:
  Missing order_id: 17 (0.0%)
  Missing customer_id: 17 (0.0%)
  Missing restaurant_id: 17 (0.0%)
  Missing rating: 17 (0.0%)
  Missing review_text: 17 (0.0%)
  Missing review_timestamp: 17 (0.0%)
  Missing sentiment_score: 17 (0.0%)
  Duplicates: 16


## City Consistency

In [27]:
# Tables whose `city` values are compared against each other.
city_tables = dict(
    customer=df_customer,
    restaurant=df_restaurant,
    delivery_partner=df_delivery_partner,
)

# Print the per-city row count of each table so spelling variants or
# unexpected cities stand out.
for label, table in city_tables.items():
    if 'city' not in table.columns:
        continue
    city_counts = table['city'].value_counts()
    print(f"\n{label.upper()}:")
    print(city_counts.to_string())


CUSTOMER:
city
Bengaluru    30281
Mumbai       17317
Delhi        15090
Chennai      10823
Hyderabad    10755
Pune          8532
Ahmedabad     7512
Kolkata       7466

RESTAURANT:
city
Bengaluru    4963
Delhi        2966
Mumbai       2963
Hyderabad    2027
Chennai      2021
Kolkata      1835
Ahmedabad    1630
Pune         1590

DELIVERY_PARTNER:
city
Bengaluru    3775
Mumbai       2227
Delhi        2207
Chennai      1559
Hyderabad    1445
Kolkata      1382
Ahmedabad    1212
Pune         1193


## Time Period Analysis

In [28]:
# Parse the order timestamps once, store them back on the frame, and derive
# the calendar fields that the period filters below rely on.
order_ts = pd.to_datetime(df_orders['order_timestamp'])
df_orders['order_timestamp'] = order_ts
df_orders['month'] = order_ts.dt.month
df_orders['year'] = order_ts.dt.year

print("Order Date Range:", order_ts.min(), "to", order_ts.max())

# Number of orders per calendar month.
monthly_orders = df_orders.groupby([order_ts.dt.to_period('M')]).size()
print("\nMonthly Orders:")
print(monthly_orders.to_string())

# Split 2025 into the two periods being compared: January-May and June-September.
in_2025 = df_orders['year'] == 2025
pre_crisis = df_orders[in_2025 & df_orders['month'].between(1, 5)]
crisis = df_orders[in_2025 & df_orders['month'].between(6, 9)]

print(f"\nPre-Crisis (Jan-May 2025): {len(pre_crisis):,} orders")
print(f"Crisis (Jun-Sep 2025): {len(crisis):,} orders")

Order Date Range: 2025-01-01 12:00:00 to 2025-09-30 22:59:00

Monthly Orders:
order_timestamp
2025-01    23539
2025-02    22667
2025-03    23543
2025-04    21466
2025-05    22591
2025-06     9293
2025-07     8818
2025-08     8555
2025-09     8694
Freq: M

Pre-Crisis (Jan-May 2025): 113,806 orders
Crisis (Jun-Sep 2025): 35,360 orders


In [29]:
# Both columns are parsed with explicit day-first formats; signup dates have
# no time component, while review timestamps include hours and minutes.
signup_dates = pd.to_datetime(df_customer['signup_date'], format='%d-%m-%Y')
df_customer['signup_date'] = signup_dates

review_times = pd.to_datetime(df_ratings['review_timestamp'], format='%d-%m-%Y %H:%M')
df_ratings['review_timestamp'] = review_times

# Coverage window of each table.
print("Customer Signups:", signup_dates.min(), "to", signup_dates.max())
print("Ratings Period:", review_times.min(), "to", review_times.max())

Customer Signups: 2024-11-02 00:00:00 to 2025-09-30 00:00:00
Ratings Period: 2025-01-01 14:03:00 to 2025-09-30 23:59:00


## Summary Statistics

In [30]:
# Row counts of every table, grouped the same way the data was loaded.
print("DIMENSIONS:")
for label, table in (
    ('Customers', df_customer),
    ('Restaurants', df_restaurant),
    ('Delivery Partners', df_delivery_partner),
    ('Menu Items', df_menu_item),
):
    print(f"  {label}: {len(table):,}")

print("\nFACTS:")
for label, table in (
    ('Orders', df_orders),
    ('Order Items', df_order_items),
    ('Ratings', df_ratings),
    ('Delivery Records', df_delivery_performance),
):
    print(f"  {label}: {len(table):,}")

# Cancellation share, reported only when the orders table carries the flag.
# An order counts as cancelled when the flag equals 'Y'.
if 'is_cancelled' in df_orders.columns:
    cancelled = df_orders['is_cancelled'].eq('Y').sum()
    print(f"\n  Cancelled Orders: {cancelled:,} ({cancelled/len(df_orders)*100:.1f}%)")

DIMENSIONS:
  Customers: 107,776
  Restaurants: 19,995
  Delivery Partners: 15,000
  Menu Items: 342,671

FACTS:
  Orders: 149,166
  Order Items: 342,994
  Ratings: 68,842
  Delivery Records: 149,166

  Cancelled Orders: 11,112 (7.4%)
