In [None]:
# Notebook parameters.
# Each assignment below is a tunable input of the analysis. The trailing
# `# {...}` comment on each line holds JSON-formatted metadata (description,
# input type, allowed options, min/max validation).
# NOTE(review): that metadata is presumably parsed by a notebook
# parameterization tool - keep it valid JSON and on the same line as the
# assignment; confirm against the tool that consumes this notebook.

# Dataset parameters
# `dataset` selects which bundled scikit-learn dataset is loaded, `test_size`
# is the test fraction passed to train_test_split, and `random_state` seeds
# both the split and every model constructed below.
dataset: str = "iris"  # {"description": "Dataset to analyze", "input_type": "select", "options": ["iris", "wine", "breast_cancer"]}
test_size: float = 0.3  # {"description": "Test set size", "validation": {"min": 0.1, "max": 0.5}}
random_state: int = 42  # {"description": "Random seed for reproducibility"}

# Model parameters
# `model_type` selects the estimator; `n_estimators` and `max_depth` are only
# passed to RandomForestClassifier and are ignored for the other model types.
model_type: str = "random_forest"  # {"description": "Model type", "input_type": "select", "options": ["random_forest", "svm", "logistic_regression"]}
n_estimators: int = 100  # {"description": "Number of estimators (for Random Forest)", "validation": {"min": 10, "max": 500}}
max_depth: int = 5  # {"description": "Maximum tree depth (for Random Forest)", "validation": {"min": 1, "max": 20}}

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and 3.8 removed the old names, so a bare 'seaborn' raises OSError on current
# releases. Prefer the new name and fall back to the legacy one so the cell
# also keeps working on older Matplotlib installs.
plt.style.use(
    'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
)

# Load dataset
print(f"Loading {dataset} dataset...")

# Supported dataset names mapped to their scikit-learn loader functions.
# Nothing is loaded until the requested name has been validated.
_dataset_loaders = {
    "iris": datasets.load_iris,
    "wine": datasets.load_wine,
    "breast_cancer": datasets.load_breast_cancer,
}
if dataset not in _dataset_loaders:
    raise ValueError(f"Unknown dataset: {dataset}")
data = _dataset_loaders[dataset]()

# Create DataFrame
# X holds one named column per feature; y holds the class codes.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

# Display dataset info
print(f"Dataset shape: {X.shape}")
print(f"Number of classes: {np.unique(y).size}")
print(f"Features: {list(X.columns)}")

# Split data
# stratify=y requests a class-stratified split; random_state makes it
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=random_state,
    test_size=test_size,
)

# Scale features
# The scaler is fitted on the training rows only and then applied to both
# partitions, so no test-set statistics leak into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Select and train model
print(f"\nTraining {model_type} model...")

# One zero-argument factory per supported model type. Using factories keeps
# construction lazy: only the selected estimator is ever instantiated.
_model_factories = {
    "random_forest": lambda: RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    ),
    "svm": lambda: SVC(random_state=random_state),
    "logistic_regression": lambda: LogisticRegression(
        random_state=random_state,
        max_iter=1000,
    ),
}
if model_type not in _model_factories:
    raise ValueError(f"Unknown model type: {model_type}")
model = _model_factories[model_type]()

# Train model on the scaled training features
model.fit(X_train_scaled, y_train)

# Make predictions for the held-out (scaled) test features
y_pred = model.predict(X_test_scaled)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel accuracy: {accuracy:.4f}")

# Human-readable class names taken from the loaded dataset. For the bundled
# scikit-learn loaders used in this notebook the targets are integer codes
# 0..k-1 that index into data.target_names, so position i names class i.
class_names = [str(name) for name in data.target_names]
class_labels = list(range(len(class_names)))

# Display classification report
# Explicit labels/target_names make each row show the class name instead of
# a bare integer code.
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=class_names,
    )
)

# Plot confusion matrix
# Passing labels fixes the matrix to one row/column per class in a known
# order (rather than whatever labels happen to occur in y_test/y_pred), which
# keeps the tick labels below aligned with the cells.
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Feature importance (for Random Forest)
# Only the random-forest branch is plotted; the other model types are skipped.
if model_type == "random_forest":
    # Rank features from most to least important before drawing anything.
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    # One bar slot per feature; the same positions place bars and tick labels.
    positions = range(len(indices))

    plt.figure(figsize=(10, 6))
    plt.bar(positions, importances[indices])
    # Vertical labels keep long feature names from overlapping.
    plt.xticks(positions, X.columns[indices], rotation=90)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.show()

# Return results as a dictionary
# Summary of the run: which dataset and model were used, the test accuracy as
# a plain Python float, and the basic dimensions of the data.
results = dict(
    dataset=dataset,
    model=model_type,
    accuracy=float(accuracy),
    n_samples=X.shape[0],
    n_features=X.shape[1],
    n_classes=len(np.unique(y)),
)

# Bare trailing expression: in a notebook this displays the dictionary as the
# cell's output.
results