In [1]:
pip install xgboost scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import pandas as pd

# Fetch the California housing data; features, target and column names are
# read from the returned object's attributes below
data = fetch_california_housing()

# Wrap the raw arrays in pandas containers: an unlabelled target series and a
# feature frame whose columns carry the dataset's feature names
y = pd.Series(data.target)
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
import xgboost as xgb

# Package each split, together with its labels, as an XGBoost DMatrix
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Booster configuration
params = {
    # Regression with a squared-error loss
    'objective': 'reg:squarederror',
    # Cap on the depth of each tree
    'max_depth': 3,
    # Step size applied at every boosting iteration
    'learning_rate': 0.1,
    # Fraction of the training rows sampled for training
    'subsample': 0.8,
    # Fraction of the feature columns sampled per tree
    'colsample_bytree': 0.8,
    # Fixed seed so the sampling above is reproducible
    'seed': 42,
}

# Fit the booster for 100 boosting rounds on the training matrix
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
)

In [5]:
from sklearn.metrics import mean_squared_error

# Score the held-out test matrix with the trained booster
y_pred = model.predict(dtest)

# Measure the mean squared error between the true test targets and the
# predictions, then report it
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.28851653534824134


We'll use the Breast Cancer Dataset to classify whether a tumor is benign (non-cancerous) or malignant (cancerous).