In [10]:
# !pip3 install xgboost
# !pip3 install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp39-cp39-macosx_12_0_arm64.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 KB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.0 threadpoolctl-3.5.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

In [5]:
import xgboost as xgb
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [6]:
# Load the MNIST dataset
#Q1
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"], mnist["target"]
y = y.astype(int)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Set parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # For multi-class classification
    'num_class': 10,                # 10 digits (0-9)
    'eval_metric': 'merror',       # Multiclass error rate
    'eta': 0.1,                     # Learning rate
    'max_depth': 6,                # Maximum depth of trees
    'subsample': 0.8,              # Subsample ratio of training instances
    'colsample_bytree': 0.8,       # Subsample ratio of features
    'seed': 42                      # Random seed for reproducibility
}

# Train the XGBoost model
num_rounds = 5  # Adjust as needed
model = xgb.train(params, dtrain, num_rounds)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9089


In [17]:
def center_data(X):
    mean = np.mean(X, axis=0)
    return X - mean

def covariance_matrix(X):
    n_samples = X.shape[0]
    return (1 / (n_samples - 1)) * np.dot(X.T, X)

def eigen_decomposition(covariance):
    eigenvalues, eigenvectors = np.linalg.eigh(covariance)
    return eigenvalues, eigenvectors

def select_principal_components(eigenvalues, eigenvectors, n_components):
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    return eigenvectors[:, :n_components]

def transform_data(X, principal_components):
    return np.dot(X, principal_components)

In [18]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA

#Load and preprocess data (same as before)
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"], mnist["target"]
y = y.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_centered = center_data(X_train)
X_test_centered = center_data(X_test)

n_components = 50  # Number of principal components to keep

covariance = covariance_matrix(X_train_centered)
eigenvalues, eigenvectors = eigen_decomposition(covariance)
principal_components = select_principal_components(eigenvalues, eigenvectors, n_components)
# principal_components = PCA(n_components=n_components)
X_train_pca = transform_data(X_train_centered, principal_components)
X_test_pca = transform_data(X_test_centered, principal_components)
# X_train_pca = principal_components.fit_transform(X_train_centered)
# X_test_pca = principal_components.fit_transform(X_test_centered)


print("Shape of original data:", X.shape)
print("Shape of transformed data:", X_train_pca.shape)

Shape of original data: (70000, 784)
Shape of transformed data: (56000, 50)


In [19]:
# Convert to DMatrix
#Q2 - a
dtrain_pca = xgb.DMatrix(data=X_train_pca, label=y_train)
dtest_pca = xgb.DMatrix(data=X_test_pca, label=y_test)

# XGBoost parameters (adjust as needed)
params = {
    'objective': 'multi:softmax',
    'num_class': 10,
    'eval_metric': 'merror',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Train XGBoost model
num_rounds = 5
model_pca = xgb.train(params, dtrain_pca, num_rounds)

# Make predictions
y_pred_pca = model_pca.predict(dtest_pca)

# Evaluate
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"Test Accuracy (PCA with 50 components): {accuracy_pca:.4f}")

Test Accuracy (PCA with 50 components): 0.8785


In [21]:
#Q2 - b
import numpy as np
import xgboost as xgb
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import defaultdict


# --- MNIST Data Loading and Preprocessing ---
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"], mnist["target"]
y = y.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Class-Conditional PCA ---
n_components = 50
class_data = defaultdict(list)

for i, label in enumerate(y_train):
#     print(i, label)
#     break
    class_data[label].append(X_train.iloc[i])

principal_components_per_class = {}
for digit, data in class_data.items():
    data_array = np.array(data)
    data_centered = center_data(data_array)
    covariance = covariance_matrix(data_centered)
    eigenvalues, eigenvectors = eigen_decomposition(covariance)
    principal_components_per_class[digit] = select_principal_components(eigenvalues, eigenvectors, n_components)

# Transform data
X_train_transformed = np.zeros((len(X_train), n_components))
for i, label in enumerate(y_train):
#     df_i = pd.DataFrame(X_train.iloc[i], columns=X_train.columns)
    X_train_transformed[i] = transform_data(X_train.iloc[i].values.reshape(1,-1), principal_components_per_class[label]).flatten()

X_test_transformed = np.zeros((len(X_test), n_components))
for i, label in enumerate(y_test):
#     df_i = pd.DataFrame(X_test.iloc[i], columns=X_train.columns)
    X_test_transformed[i] = transform_data(X_test.iloc[i].values.reshape(1,-1), principal_components_per_class[label]).flatten()

# --- XGBoost Training and Evaluation ---
dtrain_pca = xgb.DMatrix(data=X_train_transformed, label=y_train)
dtest_pca = xgb.DMatrix(data=X_test_transformed, label=y_test)

params = {  #Keep the same XGBoost parameters as before
    'objective': 'multi:softmax',
    'num_class': 10,
    'eval_metric': 'merror',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

num_rounds = 5
model_pca = xgb.train(params, dtrain_pca, num_rounds)
y_pred_pca = model_pca.predict(dtest_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"Test Accuracy (Class-Conditional PCA with 50 components per class): {accuracy_pca:.4f}")

Test Accuracy (Class-Conditional PCA with 50 components per class): 0.9627


In [35]:
#Q3
import xgboost as xgb
import numpy as np
from tensorflow.keras.datasets import cifar10

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
y_train = y_train.ravel()
y_test = y_test.ravel()
x_train = x_train.reshape(-1,32* 32*3)
x_test = x_test.reshape(-1,32* 32 *3)

params = {
    'objective': 'multi:softmax',
    'num_class': 10,
    'eval_metric': 'merror',
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test, label=y_test)

num_round = 10  # Adjust as needed

model = xgb.train(params, dtrain, num_boost_round=num_round)

predictions = model.predict(dtest)
accuracy = np.sum(predictions == y_test) / len(y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.3572


In [38]:
#Q4 - a
import xgboost as xgb
import numpy as np
from tensorflow.keras.datasets import cifar10
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Preprocess data
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
y_train = y_train.ravel()
y_test = y_test.ravel()

# Reshape to (samples, features)
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

# Apply PCA
n_components = 50
covariance = covariance_matrix(x_train)
eigenvalues, eigenvectors = eigen_decomposition(covariance)
principal_components = select_principal_components(eigenvalues, eigenvectors, n_components)
x_train_pca = transform_data(x_train, principal_components)
x_test_pca = transform_data(x_test, principal_components)

# Split data into training and validation sets (optional, for better evaluation)
x_train_pca, x_val_pca, y_train, y_val = train_test_split(x_train_pca, y_train, test_size=0.2, random_state=42)

# XGBoost parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 10,
    'eval_metric': 'merror',  # Use multiclass classification error
    'eta': 0.1,
    'max_depth': 6,  # Experiment with different depths
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Create DMatrices
dtrain = xgb.DMatrix(data=x_train_pca, label=y_train)
dval = xgb.DMatrix(data=x_val_pca, label=y_val)  # Validation set
dtest = xgb.DMatrix(data=x_test_pca, label=y_test)

# Train XGBoost model with early stopping
num_round = 100  # Experiment with the number of rounds
evallist = [(dval, 'eval')]  # Use validation set for early stopping

model = xgb.train(params, dtrain, num_boost_round=num_round, evals=evallist, early_stopping_rounds=10)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy (PCA with {n_components} components): {accuracy:.4f}")


[0]	eval-merror:0.67160
[1]	eval-merror:0.64560
[2]	eval-merror:0.63230
[3]	eval-merror:0.62450
[4]	eval-merror:0.61790
[5]	eval-merror:0.61190
[6]	eval-merror:0.61150
[7]	eval-merror:0.60520
[8]	eval-merror:0.60360
[9]	eval-merror:0.60050
[10]	eval-merror:0.59870
[11]	eval-merror:0.59470
[12]	eval-merror:0.59270
[13]	eval-merror:0.59130
[14]	eval-merror:0.59020
[15]	eval-merror:0.58710
[16]	eval-merror:0.58610
[17]	eval-merror:0.58500
[18]	eval-merror:0.58270
[19]	eval-merror:0.58130
[20]	eval-merror:0.58060
[21]	eval-merror:0.57670
[22]	eval-merror:0.57510
[23]	eval-merror:0.57440
[24]	eval-merror:0.57240
[25]	eval-merror:0.57270
[26]	eval-merror:0.57060
[27]	eval-merror:0.56880
[28]	eval-merror:0.56920
[29]	eval-merror:0.56770
[30]	eval-merror:0.56650
[31]	eval-merror:0.56470
[32]	eval-merror:0.56290
[33]	eval-merror:0.56320
[34]	eval-merror:0.56320
[35]	eval-merror:0.56190
[36]	eval-merror:0.56150
[37]	eval-merror:0.56110
[38]	eval-merror:0.56080
[39]	eval-merror:0.55890
[40]	eval-

In [43]:
#Q4 - b

(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
y_train = y_train.ravel()
y_test = y_test.ravel()

# Reshape to (samples, features)
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

# Separate data by class
n_classes = 10
x_train_by_class = [x_train[y_train == i] for i in range(n_classes)]

# Apply PCA per class
n_components = 50
pca_models = []
x_train_pca = np.zeros((len(x_train), n_components))
for i in range(n_classes):
    covariance = covariance_matrix(x_train_by_class[i])
    eigenvalues, eigenvectors = eigen_decomposition(covariance)
    pca_models.append(select_principal_components(eigenvalues, eigenvectors, n_components))
    # pca_models.append(transform_data(x_train_by_class[i], principal_components))
    x_train_pca[y_train == i, :] = transform_data(x_train[y_train == i], pca_models[i])
    # x_train_pca[y_train == i, :] = pca_models[i].transform(x_train[y_train == i])

# Apply PCA to the test data using the fitted PCA models
x_test_pca = np.zeros((len(x_test), n_components))
for i in range(n_classes):
    x_test_pca[y_test == i, :] = transform_data(x_test[y_test == i], pca_models[i])

# Split data into training and validation sets
x_train_pca, x_val_pca, y_train_split, y_val = train_test_split(x_train_pca, y_train, test_size=0.2, random_state=42)
# XGBoost parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 10,
    'eval_metric': 'merror',  # Use multiclass classification error
    'eta': 0.1,
    'max_depth': 6,  # Experiment with different depths
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Create DMatrices
dtrain = xgb.DMatrix(data=x_train_pca, label=y_train)
dval = xgb.DMatrix(data=x_val_pca, label=y_val)  # Validation set
dtest = xgb.DMatrix(data=x_test_pca, label=y_test)

# Train XGBoost model with early stopping
num_round = 100  # Experiment with the number of rounds
evallist = [(dval, 'eval')]  # Use validation set for early stopping

model = xgb.train(params, dtrain, num_boost_round=num_round, evals=evallist, early_stopping_rounds=10)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy (PCA with {n_components} components): {accuracy:.4f}")


XGBoostError: [17:42:10] /Users/runner/work/xgboost/xgboost/src/data/data.cc:508: Check failed: this->labels.Size() % this->num_row_ == 0 (10000 vs. 0) : Incorrect size for labels.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000162638428 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x000000016279635c xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const&, xgboost::StringView, xgboost::Json) + 2532
  [bt] (2) 3   libxgboost.dylib                    0x00000001627957ec xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView) + 464
  [bt] (3) 4   libxgboost.dylib                    0x000000016264fa60 XGDMatrixSetInfoFromInterface + 228
  [bt] (4) 5   libffi.dylib                        0x00000001b9c00050 ffi_call_SYSV + 80
  [bt] (5) 6   libffi.dylib                        0x00000001b9c089e4 ffi_call_int + 948
  [bt] (6) 7   _ctypes.cpython-39-darwin.so        0x00000001051ce944 _ctypes_callproc + 1404
  [bt] (7) 8   _ctypes.cpython-39-darwin.so        0x00000001051c7c10 PyCFuncPtr_call + 1168
  [bt] (8) 9   Python                              0x0000000102a90560 _PyObject_MakeTpCall + 360

