# MLOps Pipeline Demo - Data Exploration

This notebook demonstrates exploratory data analysis and model evaluation for the churn prediction pipeline.

## 1. Setup and Imports

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import mlflow

from sklearn.metrics import confusion_matrix, classification_report

# Plot defaults: seaborn white-grid theme and a 10x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Put the pipeline directory on the import path so its modules can be imported here
pipeline_dir = '/workspace/pipeline'
sys.path.append(pipeline_dir)

print("Setup complete!")

## 2. Load and Explore Data

In [None]:
# Read the raw sample CSV into `df`, report its (rows, columns) shape,
# and display the first rows as the cell output
raw_data_path = '/workspace/data/sample_data.csv'
df = pd.read_csv(raw_data_path)
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Data summary: prints each column's name, non-null count and dtype,
# plus the frame's memory usage
df.info()

In [None]:
# Statistical summary: descriptive statistics (count, mean, std, min,
# quartiles, max) displayed as the cell output
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Churn distribution: class counts (left panel) and class percentages (right panel).
#
# value_counts() orders classes by frequency, so the hard-coded bar labels and
# colours were only correct while non-churners outnumbered churners. Pin the
# class order to [0, 1] so each bar always gets the right label.
# Assumes Churn is coded 0 = no churn, 1 = churn -- TODO confirm against the data.
class_labels = ['No Churn', 'Churn']
class_colors = ['#2ecc71', '#e74c3c']

churn_counts = df['Churn'].value_counts().reindex([0, 1], fill_value=0)
churn_pct = df['Churn'].value_counts(normalize=True).reindex([0, 1], fill_value=0) * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (axis, values, title, y-axis label, bar-annotation formatter)
panels = [
    (axes[0], churn_counts, 'Churn Distribution (Count)', 'Count', str),
    (axes[1], churn_pct, 'Churn Distribution (Percentage)', 'Percentage (%)',
     lambda pct: f'{pct:.1f}%'),
]
for ax, values, title, ylabel, fmt in panels:
    ax.bar(class_labels, values.values, color=class_colors)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel(ylabel)
    # Write each bar's value just above the top of the bar
    for i, v in enumerate(values.values):
        ax.text(i, v + 1, fmt(v), ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\nChurn Rate: {df['Churn'].mean():.2%}")

In [None]:
# Age distribution by churn
#
# The class names are attached to the data rather than written over the legend
# afterwards: plt.legend([...]) assigns labels to artists in draw order, which
# is not guaranteed to match seaborn's hue order, so the two classes could end
# up mislabelled. Mapping the labels first lets seaborn build a correct legend.
# Assumes Churn is coded 0 = no churn, 1 = churn -- TODO confirm against the data.
status_order = ['No Churn', 'Churn']
status_palette = {'No Churn': '#2ecc71', 'Churn': '#e74c3c'}
age_plot_df = df.assign(**{'Churn Status': df['Churn'].map({0: 'No Churn', 1: 'Churn'})})

plt.figure(figsize=(12, 6))
sns.histplot(data=age_plot_df, x='Age', hue='Churn Status', hue_order=status_order,
             bins=20, kde=True, palette=status_palette)
plt.title('Age Distribution by Churn Status', fontsize=14, fontweight='bold')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

In [None]:
# Tenure vs Monthly Charges by Churn
#
# The class names are attached to the data rather than written over the legend
# afterwards: plt.legend([...]) assigns labels to artists in draw order, which
# does not reliably correspond to seaborn's hue levels, so the legend could
# show wrong or mismatched entries. Mapping the labels first lets seaborn
# build the legend itself.
# Assumes Churn is coded 0 = no churn, 1 = churn -- TODO confirm against the data.
status_order = ['No Churn', 'Churn']
status_palette = {'No Churn': '#2ecc71', 'Churn': '#e74c3c'}
scatter_plot_df = df.assign(**{'Churn Status': df['Churn'].map({0: 'No Churn', 1: 'Churn'})})

plt.figure(figsize=(12, 6))
sns.scatterplot(data=scatter_plot_df, x='Tenure', y='MonthlyCharges',
                hue='Churn Status', hue_order=status_order,
                palette=status_palette, alpha=0.6, s=100)
plt.title('Tenure vs Monthly Charges by Churn Status', fontsize=14, fontweight='bold')
plt.xlabel('Tenure (months)')
plt.ylabel('Monthly Charges ($)')
plt.show()

In [None]:
# Churn by contract type: row-normalised crosstab, so each contract type's
# churn / no-churn shares add up to 100%
contract_churn = pd.crosstab(df['Contract'], df['Churn'], normalize='index').mul(100)

ax = contract_churn.plot.bar(figsize=(10, 6), color=['#2ecc71', '#e74c3c'])
ax.set_title('Churn Rate by Contract Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Contract Type')
ax.set_ylabel('Percentage (%)')
ax.legend(['No Churn', 'Churn'])
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("\nChurn Rate by Contract Type:")
print(contract_churn)

In [None]:
# Pairwise correlations between the selected columns, drawn as an annotated heatmap
numeric_cols = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']
corr_matrix = df.loc[:, numeric_cols].corr()

# center=0 puts the midpoint of the diverging colormap at zero correlation
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(10, 8))
ax = sns.heatmap(corr_matrix, **heatmap_options)
ax.set_title('Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Model Evaluation (if model exists)

In [None]:
# Load the trained model and the processed test data when the model file is
# present; otherwise print how to run training
model_path = '/workspace/models/churn_model.pkl'
if not os.path.exists(model_path):
    print("Model not found. Please run training first.")
    print("Run: docker-compose run --rm pipeline python pipeline/train.py")
else:
    print("Loading trained model...")
    # NOTE: joblib.load unpickles the file, so only load model files you trust
    model = joblib.load(model_path)

    # Features as a DataFrame; targets flattened to a 1-D array
    X_test = pd.read_csv('/workspace/data/processed/X_test.csv')
    y_test_frame = pd.read_csv('/workspace/data/processed/y_test.csv')
    y_test = y_test_frame.values.ravel()

    print(f"Model type: {type(model).__name__}")
    print(f"Test set size: {len(X_test)}")

In [None]:
# Feature importance (if model exists)
#
# Not every estimator exposes `feature_importances_`, so look the attribute up
# defensively and skip the plot with a message instead of failing with an
# AttributeError when a different model type has been trained.
if os.path.exists(model_path):
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        print(f"{type(model).__name__} does not expose feature_importances_; "
              "skipping feature importance plot.")
    else:
        # One row per feature, most important first
        importance_df = pd.DataFrame({
            'feature': X_test.columns,
            'importance': importances
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(data=importance_df, x='importance', y='feature', palette='viridis')
        plt.title('Feature Importance', fontsize=14, fontweight='bold')
        plt.xlabel('Importance Score')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.show()

        print("\nTop 5 Important Features:")
        print(importance_df.head(5))

In [None]:
# Predict on the test data and print a per-class classification report
if os.path.exists(model_path):
    class_names = ['No Churn', 'Churn']

    y_pred = model.predict(X_test)
    # Second column of predict_proba: probability of the second class
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    report = classification_report(y_test, y_pred, target_names=class_names)
    print("Classification Report:")
    print(report)

In [None]:
# Draw the confusion matrix of true vs. predicted labels as an annotated heatmap
if os.path.exists(model_path):
    class_names = ['No Churn', 'Churn']
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # cell counts rendered as integers
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

## 5. MLflow Experiments

In [None]:
# Connect to the MLflow tracking server and summarise the churn experiment's runs
mlflow.set_tracking_uri("http://mlflow:5000")

# Get experiment (None when no experiment with this name exists)
experiment = mlflow.get_experiment_by_name("churn-prediction")
if experiment is not None:
    print(f"Experiment ID: {experiment.experiment_id}")
    print(f"Experiment Name: {experiment.name}")
    print(f"Artifact Location: {experiment.artifact_location}")

    # Get all runs as a DataFrame
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    print(f"\nTotal runs: {len(runs)}")

    if not runs.empty:
        # The runs table only has a metrics.* / params.* column for keys that
        # at least one run logged, so select just the columns that exist
        # rather than raising KeyError on a missing one.
        wanted_columns = ['run_id', 'metrics.test_accuracy', 'metrics.test_f1',
                          'params.n_estimators', 'params.max_depth']
        available_columns = [col for col in wanted_columns if col in runs.columns]
        missing_columns = [col for col in wanted_columns if col not in runs.columns]

        print("\nLatest runs:")
        print(runs[available_columns].head())
        if missing_columns:
            print(f"\nNot logged by any run (omitted): {', '.join(missing_columns)}")
else:
    print("No experiments found. Please run training first.")

## 6. Next Steps

1. **Experiment with hyperparameters**: Try different values for `n_estimators`, `max_depth`, etc.
2. **Feature engineering**: Create new features to improve model performance
3. **Try different models**: Test SVM, XGBoost, or Neural Networks
4. **Handle class imbalance**: Try SMOTE or class weights
5. **Deploy model**: Create API endpoint for predictions

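Visit the MLflow UI at http://localhost:5000 from your host machine to compare experiments! (The notebook code above reaches the same tracking server at `http://mlflow:5000`.)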