# In [None]:  (notebook cell marker left over from the Jupyter export)
"""

Includes:
- Feature Engineering (Hypothesis-driven)
- Baseline vs. Improved Comparison
- XGBoost with Imbalance Handling
- Overfitting & Data Leakage Validation
- Data Drift Monitoring Framework
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy.stats import ks_2samp
from xgboost import XGBClassifier
import joblib
import warnings

# Silence every warning category for the whole process. NOTE(review): this also
# hides numerical warnings (e.g. divide-by-zero) raised by the code below.
warnings.filterwarnings('ignore')

# --- PART 1: DATA LOADING & BASELINE ---

def load_data(train_path='ecommerce_returns_train.csv',
              test_path='ecommerce_returns_test.csv'):
    """Read the training and test datasets from CSV files.

    Args:
        train_path: Location of the training CSV. Defaults to the file name
            the script has always used, so ``load_data()`` is unchanged.
        test_path: Location of the test CSV, with the same kind of default.

    Returns:
        A ``(train, test)`` tuple of DataFrames as parsed by ``pd.read_csv``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test

def _fit_or_apply_encoder(values, encoder=None):
    """Encode *values*, fitting a fresh LabelEncoder only when none is supplied."""
    if encoder is None:
        encoder = LabelEncoder()
        return encoder.fit_transform(values), encoder
    return encoder.transform(values), encoder


def preprocess_baseline(df, le_category=None, le_size=None, most_common_size=None):
    """Build the baseline feature matrix using label-encoded categoricals.

    Call it once on the training frame with the optional arguments left as
    ``None`` to fit the encoders and the size fill value, then pass the
    returned objects back in when preparing any other frame so that both are
    encoded identically.

    Args:
        df: Frame containing the raw feature columns. It is not modified.
        le_category: Encoder already fitted on ``product_category``; a new
            ``LabelEncoder`` is fitted on ``df`` when ``None``.
        le_size: Encoder already fitted on ``size_purchased``; a new
            ``LabelEncoder`` is fitted on ``df`` when ``None``.
        most_common_size: Value used to fill missing ``size_purchased``
            entries; the mode of ``df['size_purchased']`` when ``None``.

    Returns:
        ``(X, y, le_category, le_size, most_common_size)`` where ``X`` holds
        the nine baseline feature columns and ``y`` is the ``is_return``
        column, or ``None`` when ``df`` has no such column (unlabeled data).
    """
    df_p = df.copy()

    # Label Encoding (Baseline approach)
    category_codes, le_category = _fit_or_apply_encoder(df_p['product_category'], le_category)
    df_p['product_category_encoded'] = category_codes

    # Missing sizes must be filled before encoding: the encoder has no class for NaN.
    if most_common_size is None:
        most_common_size = df_p['size_purchased'].mode()[0]
    df_p['size_purchased'] = df_p['size_purchased'].fillna(most_common_size)

    size_codes, le_size = _fit_or_apply_encoder(df_p['size_purchased'], le_size)
    df_p['size_encoded'] = size_codes

    features = ['customer_age', 'customer_tenure_days', 'product_category_encoded', 'product_price',
                'days_since_last_purchase', 'previous_returns', 'product_rating', 'size_encoded', 'discount_applied']

    # Unlabeled frames yield y=None instead of raising KeyError.
    target = df_p['is_return'] if 'is_return' in df_p.columns else None
    return df_p[features], target, le_category, le_size, most_common_size

# --- PART 2: IMPROVED FEATURE ENGINEERING ---

def improved_preprocess(df, is_train=True, train_cols=None, *,
                        size_fill_value=None, price_median=None):
    """Build the engineered feature matrix for the improved model.

    Hypotheses behind the engineered features:
    1. discount_amount: Absolute discount might matter more than percentage.
    2. tenure_ratio: Recency vs total tenure indicates customer loyalty/habit.
    3. is_high_value: Orders priced above the median may be returned more often.
    4. One-Hot Encoding: Product categories and sizes are non-ordinal.

    Args:
        df: Raw frame; it is copied, never modified.
        is_train: When ``False`` and ``train_cols`` is given, the output
            columns are aligned to ``train_cols``.
        train_cols: Column labels of the training matrix. Columns missing
            from this frame are added as 0, extra ones are dropped.
        size_fill_value: Value used to fill missing ``size_purchased``.
            When ``None`` the mode of ``df`` itself is used. Pass the
            training mode for test or inference data.
        price_median: Threshold for ``is_high_value``. When ``None`` the
            median of ``df['product_price']`` is used. Pass the training
            median for test or inference data; otherwise the flag depends on
            the batch being scored.

    Returns:
        ``(X, y)`` where ``y`` is the ``is_return`` column or ``None`` when
        the frame has no target.
    """
    df_p = df.copy()

    # Feature 1: Discount Amount
    df_p['discount_amount'] = df_p['product_price'] * df_p['discount_applied']

    # Feature 2: Tenure Ratio (Recency vs Tenure); +1 avoids division by zero
    df_p['tenure_ratio'] = df_p['days_since_last_purchase'] / (df_p['customer_tenure_days'] + 1)

    # Feature 3: High-value flag relative to a (preferably training-set) median price
    if price_median is None:
        price_median = df_p['product_price'].median()
    df_p['is_high_value'] = (df_p['product_price'] > price_median).astype(int)

    # Handling Categorical with One-Hot
    df_p = pd.get_dummies(df_p, columns=['product_category'], prefix='cat')

    if size_fill_value is None:
        # mode() is empty when every size is missing; leave the NaNs in that
        # case (they produce no dummy columns) instead of failing on [0].
        size_modes = df_p['size_purchased'].mode()
        if not size_modes.empty:
            size_fill_value = size_modes.iloc[0]
    if size_fill_value is not None:
        df_p['size_purchased'] = df_p['size_purchased'].fillna(size_fill_value)
    df_p = pd.get_dummies(df_p, columns=['size_purchased'], prefix='size')

    # Drop IDs and Target
    df_p = df_p.drop(columns=['order_id'], errors='ignore')
    y = df_p['is_return'] if 'is_return' in df_p.columns else None
    X = df_p.drop('is_return', axis=1, errors='ignore')

    # Ensure train and test have same columns
    if not is_train and train_cols is not None:
        X = X.reindex(columns=train_cols, fill_value=0)

    return X, y

# --- PART 3: BUSINESS METRICS & MONITORING ---

def calculate_savings(y_true, y_prob, threshold, save_value=15, loss_cost=3):
    """Net savings from acting on every prediction at or above *threshold*.

    Each true positive (flagged and actually returned) earns ``save_value``;
    each false positive (flagged but not returned) costs ``loss_cost``. The
    defaults reproduce the $15 Save / $3 Loss scenario.

    Args:
        y_true: Array-like of 0/1 labels.
        y_prob: Array-like of predicted probabilities, same length as
            ``y_true``. Values are matched by position, not by index.
        threshold: Probability at or above which an order is flagged.
        save_value: Amount saved per true positive.
        loss_cost: Amount lost per false positive.

    Returns:
        ``tp * save_value - fp * loss_cost`` as a NumPy scalar.

    Raises:
        ValueError: If ``y_true`` and ``y_prob`` differ in shape.
    """
    # Plain arrays make the comparison positional and accept lists as well.
    labels = np.asarray(y_true)
    scores = np.asarray(y_prob)
    if labels.shape != scores.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, got {labels.shape} and {scores.shape}"
        )

    flagged = scores >= threshold
    tp = (flagged & (labels == 1)).sum()
    fp = (flagged & (labels == 0)).sum()
    return tp * save_value - fp * loss_cost

def monitor_data_drift(train_df, new_df, features, alpha=0.05):
    """Flag numeric features whose distribution differs between two frames.

    A two-sample Kolmogorov-Smirnov test is run per feature on the non-null
    values of each frame.

    Args:
        train_df: Reference frame (e.g. the training data).
        new_df: Frame to compare against the reference.
        features: Column names to check. A name is skipped when it is absent
            from either frame, is not a non-boolean numeric column in both
            frames, or has no non-null values in one of them (the test is
            undefined there).
        alpha: Significance level; drift is reported when ``p_value < alpha``.

    Returns:
        ``{feature: {"p_value": float, "drift_detected": bool}}`` for every
        feature that was tested.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype

    drift_report = {}
    for feat in features:
        if feat not in train_df.columns or feat not in new_df.columns:
            continue

        reference, current = train_df[feat], new_df[feat]
        # Only for numerical features (checked on both sides)
        if not all(is_numeric(col) and not is_bool(col) for col in (reference, current)):
            continue

        # Missing values carry no distributional information for the KS test.
        reference, current = reference.dropna(), current.dropna()
        if reference.empty or current.empty:
            continue

        _, p_value = ks_2samp(reference, current)
        drift_report[feat] = {
            "p_value": float(p_value),
            "drift_detected": bool(p_value < alpha),
        }
    return drift_report

# --- MAIN EXECUTION ---

def main():
    """Run the full comparison: baseline vs. improved model, then save artifacts.

    Steps: load the CSVs, fit and score a scaled logistic-regression
    baseline, fit and score an XGBoost model on engineered features, compare
    the best achievable savings of both over a threshold grid, run the drift
    check on two raw features, and dump the improved model package to
    'final_model_package.pkl'.

    Raises:
        ValueError: If the training target contains no positive examples.
    """
    print("=== ShopFlow Advanced Model Improvement ===")
    train, test = load_data()

    # 1. Baseline Evaluation
    X_train_b, y_train_b, le_cat, le_size, common_size = preprocess_baseline(train)
    X_test_b, y_test_b, _, _, _ = preprocess_baseline(test, le_cat, le_size, common_size)

    scaler = StandardScaler()
    X_train_b_scaled = scaler.fit_transform(X_train_b)
    X_test_b_scaled = scaler.transform(X_test_b)

    baseline = LogisticRegression(random_state=42, max_iter=1000)
    baseline.fit(X_train_b_scaled, y_train_b)
    y_prob_b = baseline.predict_proba(X_test_b_scaled)[:, 1]

    print("\n[Baseline] LogReg ROC AUC:", round(roc_auc_score(y_test_b, y_prob_b), 4))

    # 2. Improved Model Implementation
    # Test features must be derived from TRAINING statistics only. Fill the
    # missing test sizes with the training mode up front, and recompute the
    # high-value flag against the training median afterwards, so nothing is
    # estimated from the test set.
    train_price_median = train['product_price'].median()
    test_for_improved = test.copy()
    test_for_improved['size_purchased'] = test_for_improved['size_purchased'].fillna(common_size)

    X_train_i, y_train_i = improved_preprocess(train, is_train=True)
    X_test_i, y_test_i = improved_preprocess(test_for_improved, is_train=False, train_cols=X_train_i.columns)
    X_test_i['is_high_value'] = (test['product_price'] > train_price_median).astype(int)

    # Handle Imbalance: scale_pos_weight = negatives / positives
    n_positive = y_train_i.sum()
    if n_positive == 0:
        raise ValueError("Training data contains no positive (is_return == 1) examples.")
    spw = (len(y_train_i) - n_positive) / n_positive

    improved_model = XGBClassifier(
        n_estimators=500,
        learning_rate=0.01,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        scale_pos_weight=spw
    )

    improved_model.fit(X_train_i, y_train_i)

    # 3. Validation: Overfitting & Leakage
    train_prob = improved_model.predict_proba(X_train_i)[:, 1]
    test_prob = improved_model.predict_proba(X_test_i)[:, 1]

    print("\n--- Model Validation ---")
    print("Train ROC AUC:", round(roc_auc_score(y_train_i, train_prob), 4))
    print("Test ROC AUC :", round(roc_auc_score(y_test_i, test_prob), 4))
    print("Leakage Check: Training features match test features. Preprocessing separate.")

    # 4. Business Impact Comparison
    # NOTE(review): the threshold is tuned on the test set, so the reported
    # maximum savings (and the saved optimal_threshold) are optimistic.
    # Consider selecting it on a held-out validation split instead.
    thresholds = np.linspace(0.1, 0.9, 81)
    b_savings = [calculate_savings(y_test_b, y_prob_b, t) for t in thresholds]
    i_savings = [calculate_savings(y_test_i, test_prob, t) for t in thresholds]

    best_baseline = max(b_savings)
    best_improved = max(i_savings)
    gain = best_improved - best_baseline
    # A relative gain is only meaningful against a positive baseline.
    if best_baseline > 0:
        gain_pct = f"{round(gain / best_baseline * 100, 2)}%"
    else:
        gain_pct = "n/a"

    print("\n--- Business Impact ---")
    print(f"Max Baseline Savings: ${best_baseline}")
    print(f"Max Improved Savings: ${best_improved}")
    print(f"Improvement Gain: ${gain} (approx. {gain_pct})")

    # 5. Drift Monitoring Simulation
    print("\n--- Monitoring Simulation ---")
    drift_report = monitor_data_drift(train, test, ['product_price', 'customer_age'])
    for feat, res in drift_report.items():
        print(f"Drift in {feat:15}: {'YES' if res['drift_detected'] else 'NO'} (p={res['p_value']:.4f})")

    # 6. Save Artifacts
    # The preprocessing statistics are stored with the model so that new data
    # can be featurized exactly as the training data was.
    artifacts = {
        "model": improved_model,
        "features": X_train_i.columns.tolist(),
        "optimal_threshold": thresholds[np.argmax(i_savings)],
        "price_median": float(train_price_median),
        "size_fill_value": common_size,
    }
    joblib.dump(artifacts, 'final_model_package.pkl')
    print("\nFinal model package saved to 'final_model_package.pkl'")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
