In [None]:
# Cell 1: Install required packages
import subprocess
import sys

def install_package(package):
    """Install one package into the running kernel's environment via pip.

    Args:
        package: Requirement specifier handed to ``pip install``
            (for example ``"pandas"``).

    Returns:
        bool: ``True`` when pip exited successfully, ``False`` when pip
        reported a failure. A failure is printed rather than raised so that a
        loop over several packages can carry on with the next one.
    """
    try:
        # Invoke pip through the current interpreter so the package lands in
        # the same environment this code is running in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to install {package}: {e}")
        return False
    print(f"✅ {package} installed successfully")
    return True

# Third-party dependencies of this notebook, grouped by role
tracking_and_modelling = ["mlflow", "scikit-learn"]
data_and_plotting = ["pandas", "numpy", "matplotlib", "seaborn"]
object_storage = [
    "boto3",  # For S3/MinIO interaction
    "minio",  # MinIO Python client
]
packages = tracking_and_modelling + data_and_plotting + object_storage

# Install the dependencies one at a time, in the order listed above
for requirement in packages:
    install_package(requirement)

In [None]:
# Cell 2: Import libraries and setup
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification
import os
import requests
from minio import Minio
import boto3
from botocore.client import Config
import warnings
warnings.filterwarnings('ignore')

print("✅ All libraries imported successfully!")

In [None]:
# Cell 3: Configure MLflow and test connectivity
# Direct every subsequent MLflow call at the tracking server
mlflow.set_tracking_uri("http://mlflow.local")

# Connectivity probe: ask the server for its experiments and report the count
try:
    client = mlflow.tracking.MlflowClient()
    experiments = client.search_experiments()
    experiment_total = len(experiments)
    print(f"✅ MLflow connection successful! Found {experiment_total} experiments")
    print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
except Exception as err:
    # Report the problem instead of aborting the notebook
    print(f"❌ MLflow connection failed: {err}")

In [None]:
# Cell 4: Test MinIO connectivity
# Exercise MinIO through its S3-compatible interface
try:
    # Connection settings for the boto3 S3 client aimed at the MinIO endpoint
    minio_s3_options = {
        'endpoint_url': 'http://minio.minio:9000',
        'aws_access_key_id': 'minioadmin',
        'aws_secret_access_key': 'minioadmin',
        'config': Config(signature_version='s3v4'),
        'region_name': 'us-east-1',
    }
    s3_client = boto3.client('s3', **minio_s3_options)

    # A successful bucket listing doubles as the connectivity check
    buckets = s3_client.list_buckets()
    print("✅ MinIO connection successful!")
    print("Available buckets:")

    # Print each bucket and collect the names for the existence check below
    bucket_list = []
    for bucket in buckets['Buckets']:
        print(f"  - {bucket['Name']}")
        bucket_list.append(bucket['Name'])

    # Create the bucket used for MLflow artifacts when it is missing
    mlflow_bucket_missing = 'mlflow' not in bucket_list
    if mlflow_bucket_missing:
        print("Creating 'mlflow' bucket...")
        s3_client.create_bucket(Bucket='mlflow')
        print("✅ MLflow bucket created!")

except Exception as err:
    print(f"❌ MinIO connection failed: {err}")

In [None]:
# Cell 5: Test direct MinIO client
try:
    # Native MinIO SDK client; secure=False turns TLS off for this endpoint
    minio_client = Minio(
        'minio.minio:9000',
        access_key='minioadmin',
        secret_key='minioadmin',
        secure=False
    )

    # Make sure the bucket used for MLflow artifacts is present
    if not minio_client.bucket_exists('mlflow'):
        minio_client.make_bucket('mlflow')
        print("✅ Created MLflow bucket in MinIO")
    else:
        print("✅ MLflow bucket exists in MinIO")

    # Round-trip check: write a small local file, then push it to the bucket
    test_data = "This is a test file for MLflow integration"
    local_test_file = '/tmp/test_file.txt'
    with open(local_test_file, 'w') as handle:
        handle.write(test_data)

    minio_client.fput_object('mlflow', 'test/test_file.txt', local_test_file)
    print("✅ Test file uploaded to MinIO")

    # Show everything currently stored in the bucket, walking all prefixes
    objects = minio_client.list_objects('mlflow', recursive=True)
    print("Objects in mlflow bucket:")
    for stored in objects:
        print(f"  - {stored.object_name}")

except Exception as err:
    print(f"❌ MinIO client error: {err}")

In [None]:
# Cell 6: Generate sample dataset
# Settings for the synthetic classification data; the fixed seed makes the
# generated samples repeatable from run to run
synthetic_spec = {
    'n_samples': 1000,
    'n_features': 10,
    'n_informative': 8,
    'n_redundant': 2,
    'n_clusters_per_class': 1,
    'random_state': 42,
}
X, y = make_classification(**synthetic_spec)

# Wrap the feature matrix in a labelled DataFrame and append the target column
feature_names = ['feature_' + str(col) for col in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

print(f"✅ Dataset created with shape: {df.shape}")
target_counts = df['target'].value_counts()
print(f"Target distribution:\n{target_counts}")

# Preview of the leading rows
print("\nFirst 5 rows:")
print(df.head())

In [None]:
# Cell 7: Exploratory Data Analysis
# Four-panel overview of the dataset drawn on one shared 2x2 figure
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Feature correlation heatmap
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[0,0])
axes[0,0].set_title('Feature Correlation Heatmap')

# Target distribution
df['target'].value_counts().plot(kind='bar', ax=axes[0,1])
axes[0,1].set_title('Target Distribution')
axes[0,1].set_ylabel('Count')

# Feature distributions
# DataFrame.hist() draws one subplot per column. Given a single Axes for
# several columns, pandas clears the whole figure to build its own grid, which
# would erase the other panels of this figure. plot(kind='hist') overlays the
# columns on the one Axes it is given, so the 2x2 layout stays intact.
df[['feature_0', 'feature_1', 'feature_2']].plot(
    kind='hist', bins=20, alpha=0.6, ax=axes[1,0]
)
axes[1,0].set_title('Feature Distributions (First 3 features)')
axes[1,0].set_xlabel('Feature value')

# Scatter plot
scatter = axes[1,1].scatter(df['feature_0'], df['feature_1'], c=df['target'], cmap='viridis', alpha=0.6)
axes[1,1].set_xlabel('Feature 0')
axes[1,1].set_ylabel('Feature 1')
axes[1,1].set_title('Feature 0 vs Feature 1 (colored by target)')
plt.colorbar(scatter, ax=axes[1,1])

plt.tight_layout()
plt.show()

print("✅ EDA plots generated successfully!")

In [None]:
# Cell 8: MLflow Experiment Setup (CORRECTED)
import mlflow

# Name of the experiment that should hold the runs of this notebook
experiment_name = "jupyter-minio-integration-test"

try:
    # Look the experiment up by name; the lookup yields None when it does not
    # exist, so no exception is needed to choose between reuse and creation.
    # A lookup failure falls through to the handler below and is reported there.
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is not None:
        experiment_id = experiment.experiment_id
        print(f"✅ Using existing experiment: {experiment_name} (ID: {experiment_id})")
    else:
        # Create the experiment with its artifacts rooted in the mlflow bucket
        experiment_id = mlflow.create_experiment(
            experiment_name,
            artifact_location=f"s3://mlflow/experiments/{experiment_name}"
        )
        print(f"✅ Created new experiment: {experiment_name} (ID: {experiment_id})")

    # Set the experiment - this is crucial!
    mlflow.set_experiment(experiment_name)

    # Verify the experiment is set correctly
    current_experiment = mlflow.get_experiment_by_name(experiment_name)
    print(f"Current active experiment: {current_experiment.name} (ID: {current_experiment.experiment_id})")
    print(f"Artifact location: {current_experiment.artifact_location}")

except Exception as e:
    print(f"❌ Error with experiment setup: {e}")
    # Fallback: use default experiment
    print("Using default experiment instead...")
    mlflow.set_experiment("Default")
    current_experiment = mlflow.get_experiment_by_name("Default")
    print(f"Using experiment: {current_experiment.name} (ID: {current_experiment.experiment_id})")

In [None]:
# Cell 8: MLflow Setup with S3
import mlflow
import os

# Environment variables carrying the S3/MinIO endpoint, credentials and region
s3_environment = {
    'MLFLOW_S3_ENDPOINT_URL': 'http://minio.minio.svc.cluster.local:9000',
    'AWS_ACCESS_KEY_ID': 'minioadmin',
    'AWS_SECRET_ACCESS_KEY': 'minioadmin',
    'AWS_S3_FORCE_PATH_STYLE': 'true',
    'AWS_DEFAULT_REGION': 'us-east-1',
}
for variable, value in s3_environment.items():
    os.environ[variable] = value

print("✅ S3/MinIO environment variables set")

# Direct MLflow at the tracking server
mlflow.set_tracking_uri("http://mlflow.local")

# Check that the server answers before selecting an experiment
try:
    client = mlflow.tracking.MlflowClient()
    experiments = client.search_experiments()
    print(f"✅ MLflow server connected. Found {len(experiments)} experiments")

    # Make "Default" the active experiment; this replaces whichever experiment
    # was active before this cell ran
    mlflow.set_experiment("Default")

    # Report which experiment is in use and where its artifacts are stored
    current_exp = mlflow.get_experiment_by_name("Default")
    exp_name, exp_id = current_exp.name, current_exp.experiment_id
    print(f"Using experiment: {exp_name} (ID: {exp_id})")
    print(f"Artifact location: {current_exp.artifact_location}")

except Exception as err:
    print(f"❌ MLflow setup error: {err}")

In [None]:
# Cell 9: Train and log models with MLflow
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def train_and_log_model(model, model_name, params):
    """Fit a classifier, evaluate it on the hold-out split and record the run in MLflow.

    Relies on the notebook-level ``X_train``, ``X_test``, ``y_train``,
    ``y_test`` and ``feature_names`` variables.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        model_name: Label used for the run name, the artifact file names and
            the registered model name (``f"{model_name}_Model"``).
        params: Mapping of hyper-parameters to log with the run.

    Returns:
        tuple: ``(model, accuracy)`` - the fitted estimator and its accuracy
        on ``X_test``.
    """

    with mlflow.start_run(run_name=f"{model_name}_run") as run:
        print(f"\n🚀 Training {model_name}...")

        # Log parameters
        mlflow.log_params(params)

        # Train model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("train_samples", len(X_train))
        mlflow.log_metric("test_samples", len(X_test))

        print(f"✅ {model_name} Accuracy: {accuracy:.4f}")

        # Log model and register it under "<model_name>_Model"
        mlflow.sklearn.log_model(
            model,
            model_name.lower(),
            registered_model_name=f"{model_name}_Model"
        )

        # Confusion matrix, drawn on an explicit figure so it can be closed
        # once it has been logged
        cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
        cm_ax.set_title(f'{model_name} Confusion Matrix')
        cm_ax.set_ylabel('True Label')
        cm_ax.set_xlabel('Predicted Label')
        cm_fig.tight_layout()
        _log_figure(cm_fig, f"/tmp/{model_name.lower()}_confusion_matrix.png")

        # Log feature importance (only for models that expose it)
        if hasattr(model, 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': feature_names,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
            sns.barplot(data=feature_importance.head(10), x='importance', y='feature', ax=imp_ax)
            imp_ax.set_title(f'{model_name} Feature Importance (Top 10)')
            imp_fig.tight_layout()
            _log_figure(imp_fig, f"/tmp/{model_name.lower()}_feature_importance.png")

            # Log feature importance as CSV
            importance_csv_path = f"/tmp/{model_name.lower()}_feature_importance.csv"
            feature_importance.to_csv(importance_csv_path, index=False)
            mlflow.log_artifact(importance_csv_path)

        # Log every per-label / per-average entry of the classification
        # report; scalar entries of the report are skipped by the dict check
        report = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in report.items():
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)

        print(f"✅ {model_name} logged to MLflow with run ID: {run.info.run_id}")

        return model, accuracy


def _log_figure(fig, path):
    """Save ``fig`` to ``path``, attach it to the active MLflow run, display it, then close it."""
    fig.savefig(path)
    mlflow.log_artifact(path)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)

# Random forest: the same settings are used to build the estimator and are
# logged as the run's parameters
rf_params = dict(n_estimators=100, max_depth=10, random_state=42)
rf_model = RandomForestClassifier(**rf_params)
rf_trained, rf_accuracy = train_and_log_model(rf_model, "RandomForest", rf_params)

# Logistic regression, handled the same way
lr_params = dict(random_state=42, max_iter=1000)
lr_model = LogisticRegression(**lr_params)
lr_trained, lr_accuracy = train_and_log_model(lr_model, "LogisticRegression", lr_params)

In [None]:
# Cell 10: Compare models and log comparison
# Side-by-side accuracy table for the two trained models
comparison_data = dict(
    Model=['RandomForest', 'LogisticRegression'],
    Accuracy=[rf_accuracy, lr_accuracy],
)
comparison_df = pd.DataFrame(comparison_data)
print("Model Comparison:")
print(comparison_df)

# Bar chart of the accuracies, each bar annotated with its exact value
plt.figure(figsize=(10, 6))
sns.barplot(data=comparison_df, x='Model', y='Accuracy')
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
for bar_index, score in enumerate(comparison_df['Accuracy']):
    plt.text(bar_index, score + 0.01, f'{score:.4f}', ha='center', va='bottom')
plt.tight_layout()

# Write the chart to disk so it can be attached to a run below
comparison_path = "/tmp/model_comparison.png"
plt.savefig(comparison_path)
plt.show()

# Record the comparison in a run of its own
with mlflow.start_run(run_name="model_comparison") as run:
    mlflow.log_artifact(comparison_path)
    mlflow.log_metric("best_accuracy", max(rf_accuracy, lr_accuracy))

    # A tie goes to LogisticRegression because the comparison is strict
    if rf_accuracy > lr_accuracy:
        winner = "RandomForest"
    else:
        winner = "LogisticRegression"
    mlflow.log_param("best_model", winner)

    # Attach the comparison table as CSV as well
    comparison_csv_path = "/tmp/model_comparison.csv"
    comparison_df.to_csv(comparison_csv_path, index=False)
    mlflow.log_artifact(comparison_csv_path)

    print(f"✅ Model comparison logged with run ID: {run.info.run_id}")

In [None]:
# Cell 11: Test model loading and prediction
# Load the best model from MLflow (a tie goes to LogisticRegression)
best_model_name = "RandomForest" if rf_accuracy > lr_accuracy else "LogisticRegression"
print(f"Loading best model: {best_model_name}")

try:
    # Find every registered version of the winning model
    client = mlflow.tracking.MlflowClient()
    registered_name = f"{best_model_name}_Model"
    model_versions = client.search_model_versions(f"name='{registered_name}'")

    # Without this guard an empty result would surface as the opaque
    # "max() arg is an empty sequence" error
    if not model_versions:
        raise LookupError(f"No registered versions found for '{registered_name}'")

    # Version numbers are compared as integers to pick the newest one
    latest_version = max(int(mv.version) for mv in model_versions)

    # Load model
    model_uri = f"models:/{registered_name}/{latest_version}"
    loaded_model = mlflow.sklearn.load_model(model_uri)

    print(f"✅ Loaded {best_model_name} model version {latest_version}")

    # Make predictions with loaded model on the first five hold-out samples
    sample_predictions = loaded_model.predict(X_test[:5])
    actual_values = y_test[:5]

    print("\nSample predictions vs actual:")
    for i, (pred, actual) in enumerate(zip(sample_predictions, actual_values)):
        print(f"Sample {i+1}: Predicted={pred}, Actual={actual}, Match={'✅' if pred==actual else '❌'}")

except Exception as e:
    print(f"❌ Error loading model: {e}")

In [None]:
# Cell 12: Verify artifacts in MinIO
print("🔍 Checking artifacts stored in MinIO...")

try:
    print("\nArtifacts in MinIO (mlflow bucket):")

    # List the bucket once and keep the result, so the sample download below
    # can reuse it instead of issuing a second listing request
    objects = list(minio_client.list_objects('mlflow', recursive=True))
    artifact_count = len(objects)

    for obj in objects:
        print(f"  📁 {obj.object_name} ({obj.size} bytes)")

    print(f"\n✅ Total artifacts stored: {artifact_count}")

    # Download a sample artifact to verify that stored objects are readable
    if objects:
        try:
            # The first listed object serves as the sample
            sample_object = objects[0]
            local_path = f"/tmp/downloaded_{sample_object.object_name.split('/')[-1]}"
            minio_client.fget_object('mlflow', sample_object.object_name, local_path)
            print(f"✅ Successfully downloaded sample artifact: {sample_object.object_name}")
        except Exception as e:
            print(f"⚠️ Could not download sample artifact: {e}")

except Exception as e:
    print(f"❌ Error accessing MinIO: {e}")

In [None]:
# Cell 13: Integration Summary and Health Check
print("🎉 INTEGRATION TEST SUMMARY")
print("=" * 50)

# Status line per component; MLflow and MinIO are filled in by the checks below
components = {
    "JupyterHub": "✅ Running (you're using it now!)",
    "MLflow": None,
    "MinIO": None
}

# Test MLflow
# Only ordinary errors are caught (a bare except would also swallow
# KeyboardInterrupt), and the cause is included in the status line
try:
    experiments = mlflow.search_experiments()
    components["MLflow"] = f"✅ Connected - {len(experiments)} experiments found"
except Exception as exc:
    components["MLflow"] = f"❌ Connection failed: {exc}"

# Test MinIO
try:
    buckets = minio_client.list_buckets()
    bucket_count = len(list(buckets))
    components["MinIO"] = f"✅ Connected - {bucket_count} buckets available"
except Exception as exc:
    components["MinIO"] = f"❌ Connection failed: {exc}"

# Print results
for component, status in components.items():
    print(f"{component:12}: {status}")

print("\n📊 EXPERIMENT RESULTS:")
print(f"  • Trained {2} models")
print(f"  • Best accuracy: {max(rf_accuracy, lr_accuracy):.4f}")
print(f"  • Artifacts stored in MinIO: ✅")
print(f"  • Models registered in MLflow: ✅")

print(f"\n🌐 ACCESS URLS:")
print(f"  • JupyterHub: http://jupyter.local")
print(f"  • MLflow UI: http://mlflow.local")
print(f"  • MinIO Console: http://minio-console.local")

print(f"\n🔗 Next Steps:")
print(f"  1. Visit MLflow UI to explore your experiments")
print(f"  2. Check MinIO console to see stored artifacts")
print(f"  3. Try loading and using your models in other notebooks")
print(f"  4. Experiment with different ML algorithms and parameters")

print(f"\n✅ Integration test completed successfully!")