In [1]:
import pandas as pd

def load_and_clean_data(path="OnlineRetail.csv"):
    """
    Load and clean Online Retail dataset.
    Removes missing CustomerIDs, negative quantities, and invalid invoices.
    """
    # Load dataset
    df = pd.read_csv(path, encoding='unicode_escape')

    # Remove missing Customer IDs
    df = df.dropna(subset=['CustomerID'])

    # Remove negative quantities (returns)
    df = df[df['Quantity'] > 0]

    # Convert columns
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    df['CustomerID'] = df['CustomerID'].astype(str)

    # Compute total price
    df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

    return df

if __name__ == "__main__":
    data = load_and_clean_data()
    print("‚úÖ Data loaded:", data.shape)
    print(data.head())

‚úÖ Data loaded: (397924, 9)
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice CustomerID         Country  TotalPrice  
0 2010-12-01 08:26:00       2.55    17850.0  United Kingdom       15.30  
1 2010-12-01 08:26:00       3.39    17850.0  United Kingdom       20.34  
2 2010-12-01 08:26:00       2.75    17850.0  United Kingdom       22.00  
3 2010-12-01 08:26:00       3.39    17850.0  United Kingdom       20.34  
4 2010-12-01 08:26:00       3.39    17850.0  United Kingdom       20.34  


In [1]:
import pandas as pd
from datetime import timedelta

def create_customer_features(df):
    """
    Create customer-level features (RFM + behavior features).
    """
    ref_date = df['InvoiceDate'].max() + timedelta(days=1)

    rfm = df.groupby('CustomerID').agg({
        'InvoiceDate': lambda x: (ref_date - x.max()).days,  # Recency
        'InvoiceNo': 'nunique',  # Frequency
        'TotalPrice': 'sum'      # Monetary
    }).reset_index()

    rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']

    # Add churn label: if no purchase in last 90 days ‚Üí churn
    rfm['Churn'] = (rfm['Recency'] > 90).astype(int)

    return rfm


if __name__ == "__main__":
    # ‚úÖ Option 1: If the function exists in etl.py
    from etl import load_and_clean_data   # or from etl import load_data

    df = load_and_clean_data()            # or df = load_data()
    features = create_customer_features(df)
    print(features.head())

   CustomerID  Recency  Frequency  Monetary  Churn
0     12346.0      326          1  77183.60      1
1     12347.0        2          7   4310.00      0
2     12348.0       75          4   1797.24      0
3     12349.0       19          1   1757.55      0
4     12350.0      310          1    334.40      1


In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
from etl import load_and_clean_data
from features import create_customer_features


def train_model():
    # Load and prepare data
    df = load_and_clean_data()
    features = create_customer_features(df)

    X = features[['Recency', 'Frequency', 'Monetary']]
    y = features['Churn']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    # Evaluate
    preds = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, preds)
    print("‚úÖ Accuracy:", acc)
    print(classification_report(y_test, preds))

    # Ensure the models folder exists
    os.makedirs("models", exist_ok=True)

    # Save model & scaler
    joblib.dump(model, "models/churn_model.pkl")
    joblib.dump(scaler, "models/scaler.pkl")
    print("üíæ Model and scaler saved!")


if __name__ == "__main__":
    train_model()

‚úÖ Accuracy: 0.9942396313364056
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       569
           1       1.00      0.98      0.99       299

    accuracy                           0.99       868
   macro avg       1.00      0.99      0.99       868
weighted avg       0.99      0.99      0.99       868

üíæ Model and scaler saved!


In [3]:
# app.py
import streamlit as st
import pandas as pd
import plotly.express as px
import joblib
from etl import load_and_clean_data
from features import create_customer_features

st.set_page_config(page_title="Customer Churn & Sales Dashboard", layout="wide")

st.title("üìä Customer Churn Prediction and Sales Dashboard")

@st.cache_data
def load_data():
    return load_and_clean_data()

@st.cache_data
def compute_features(df):
    return create_customer_features(df)

# Load data
df = load_data()
features = compute_features(df)

# --- Sales Trends ---
st.header("üí∞ Sales Trends Over Time")

df['Month'] = df['InvoiceDate'].dt.to_period('M').astype(str)
monthly_sales = df.groupby('Month')['TotalPrice'].sum().reset_index()

fig_sales = px.line(monthly_sales, x='Month', y='TotalPrice', title="Monthly Sales Trend")
st.plotly_chart(fig_sales, use_container_width=True)

# --- Top Products ---
st.header("üèÜ Top Products")
top_products = df.groupby('Description')['TotalPrice'].sum().nlargest(10).reset_index()
fig_products = px.bar(top_products, x='Description', y='TotalPrice', title="Top 10 Products")
st.plotly_chart(fig_products, use_container_width=True)

# --- Churn Prediction ---
st.header("üîÆ Customer Churn Prediction")

try:
    model = joblib.load("models/churn_model.pkl")
    scaler = joblib.load("models/scaler.pkl")
except:
    st.error("Model not found. Please run train_model.py first.")
    st.stop()

customer_id = st.selectbox("Select Customer ID", features['CustomerID'])
row = features[features['CustomerID'] == customer_id][['Recency', 'Frequency', 'Monetary']]

scaled = scaler.transform(row)
prob = model.predict_proba(scaled)[0][1]

st.metric("Churn Probability", f"{prob:.2f}")

2025-11-15 18:38:24.454 No runtime found, using MemoryCacheStorageManager
2025-11-15 18:38:24.455 No runtime found, using MemoryCacheStorageManager


DeltaGenerator()