# ==============================================================================
# Project: FreshCart Customer Churn Prediction
# Goal: End-to-End Data Processing & Model Training Pipeline
#
# Purpose:
# This script consolidates all previous steps (Data Loading, Feature Engineering,
# and Modeling) into a single, reproducible pipeline. It simulates a production training run:
# 1. Load Raw Data
# 2. Apply Cutoff Strategy (Prevent Leakage)
# 3. Generate All Features (RFM + Behavioral + Advanced)
# 4. Train Final Model (using Optimized Hyperparameters)
# 5. Export Artifacts (Model & Metadata) for Deployment
# ==============================================================================

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
import json
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, classification_report

In [2]:
# Add src to path to import custom modules
# Adjust the path if you are running this script from a different location
sys.path.append('../src') 
# If running from the root 'FreshCart-Churn-Prediction' folder, use: sys.path.append('src')

In [3]:
from config import RAW_DATA_DIR, PROCESSED_DATA_DIR, MODEL_DIR, RANDOM_STATE
from data.data_loader import InstacartDataLoader

In [4]:
# Import Feature Engineering Modules
from features.rfm_features import RFMFeatureEngineer
from features.behavioral_features import BehavioralFeatureEngineer

print("‚úÖ Environment Setup Complete")

‚úÖ Environment Setup Complete


# ==============================================================================
# 📦 Step 1: Ingest Raw Data
# ==============================================================================

In [None]:
def load_data():
    """Read the raw tables and hand back the three frames the pipeline uses.

    An ``InstacartDataLoader`` is created over ``RAW_DATA_DIR`` and its
    ``load_all_data()`` result is indexed by table name. The
    ``order_products_prior`` and ``order_products_train`` tables are
    concatenated (prior first) into a single frame with a fresh index.

    Returns:
        tuple: ``(orders_df, products_df, order_products)``.
    """
    print("‚è≥ Loading Raw Data...")
    tables = InstacartDataLoader(RAW_DATA_DIR).load_all_data()

    orders_df, products_df = tables['orders'], tables['products']

    # Stack both order-product tables; ignore_index renumbers the rows 0..n-1.
    line_item_keys = ('order_products_prior', 'order_products_train')
    order_products = pd.concat(
        [tables[key] for key in line_item_keys],
        ignore_index=True,
    )

    print(f"‚úÖ Data Loaded. Orders: {len(orders_df):,}, Products: {len(products_df):,}")
    return orders_df, products_df, order_products