In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.decomposition import PCA
import time
from sklearn import metrics

In [2]:
# Load the "Labeled Faces in the Wild" people dataset through scikit-learn,
# keeping only people with at least 70 images and using a 0.4 resize factor.
lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# Feature matrix, integer class labels, and the name associated with each label.
X, y = lfw_people.data, lfw_people.target
target_names = lfw_people.target_names

# Basic dataset dimensions, reused by later cells (e.g. `num_classes`).
num_samples, num_features = X.shape
num_classes = target_names.shape[0]

# Summarise the dataset, one fact per line.
print(
    f'Number of samples: {num_samples}',
    f'Number of features: {num_features}',
    f'target names: {target_names}',
    f'Number of classes: {num_classes}',
    sep='\n',
)

Number of samples: 1288
Number of features: 1850
target names: ['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
 'Gerhard Schroeder' 'Hugo Chavez' 'Tony Blair']
Number of classes: 7


In [3]:
# Hold out 25% of the samples for testing; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=24)

# Project the pixel features onto the first `n_components` principal components.
# The PCA is fitted on the training split only and then applied to both splits,
# so no information from the test set influences the projection.
n_components = 200

# Seed the PCA as well: with data of this size scikit-learn's default 'auto'
# solver may use a randomized SVD, which would otherwise make the projected
# features (and everything trained on them) vary from run to run.
pca = PCA(n_components=n_components, random_state=24).fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)


In [16]:
# Hyper-parameters used to build the XGBoost classifier below.
# `num_class` comes from the number of people found in the dataset.
params = {'objective': 'multi:softmax',
          'eval_metric': 'mlogloss',
          'num_class': num_classes,
          'n_estimators': 50,
          'booster': 'gbtree',
          'nthread': -1,
          'eta': 0.1,
          'gamma': 0,
          'max_depth': 6,
          'min_child_weight': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.8,}
model_original = xgb.sklearn.XGBClassifier(**params)

# Time the fit on the raw (un-projected) pixel features.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() is wall-clock time and can jump
# if the system clock is adjusted.
# NOTE: the eval_set is the training data itself, so the logged
# "validation_0-mlogloss" values are training loss, not held-out loss.
t = time.perf_counter()
model_original.fit(X_train, y_train, eval_set=[(X_train, y_train)], verbose=True)
print(f'Training time before performing PCA: {time.perf_counter() - t:.2f} seconds')

# Evaluate on the held-out test split and print per-class metrics.
y_pred = model_original.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

[0]	validation_0-mlogloss:1.73093
[1]	validation_0-mlogloss:1.55367
[2]	validation_0-mlogloss:1.40094
[3]	validation_0-mlogloss:1.27297
[4]	validation_0-mlogloss:1.16054
[5]	validation_0-mlogloss:1.06282
[6]	validation_0-mlogloss:0.97411
[7]	validation_0-mlogloss:0.89331
[8]	validation_0-mlogloss:0.82055
[9]	validation_0-mlogloss:0.75577
[10]	validation_0-mlogloss:0.69694
[11]	validation_0-mlogloss:0.64377
[12]	validation_0-mlogloss:0.59633
[13]	validation_0-mlogloss:0.55242
[14]	validation_0-mlogloss:0.51255
[15]	validation_0-mlogloss:0.47620
[16]	validation_0-mlogloss:0.44223
[17]	validation_0-mlogloss:0.41183
[18]	validation_0-mlogloss:0.38481
[19]	validation_0-mlogloss:0.35875
[20]	validation_0-mlogloss:0.33524
[21]	validation_0-mlogloss:0.31316
[22]	validation_0-mlogloss:0.29242
[23]	validation_0-mlogloss:0.27347
[24]	validation_0-mlogloss:0.25652
[25]	validation_0-mlogloss:0.24039
[26]	validation_0-mlogloss:0.22551
[27]	validation_0-mlogloss:0.21158
[28]	validation_0-mlogloss:0.1

In [17]:
# Train again, this time on the PCA-projected features, and time the fit so it
# can be compared with the timing printed for the raw pixel features.
# time.perf_counter() is used because it is a monotonic clock meant for
# measuring elapsed intervals (time.time() can jump with system clock changes).
# NOTE(review): this re-fits the same `model_original` object, so after this
# cell the name refers to the PCA-trained model. Per the XGBoost docs, fit()
# without `xgb_model` retrains from scratch rather than continuing -- confirm
# for the installed version.
# NOTE: the eval_set is the training data, so the logged mlogloss is training loss.
t = time.perf_counter()
model_original.fit(X_train_pca, y_train, eval_set=[(X_train_pca, y_train)], verbose=True)
print(f'Training time after performing PCA: {time.perf_counter() - t:.2f} seconds')

# Evaluate on the PCA-projected test split and print per-class metrics.
y_pred = model_original.predict(X_test_pca)
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

[0]	validation_0-mlogloss:1.75716
[1]	validation_0-mlogloss:1.59657
[2]	validation_0-mlogloss:1.46392
[3]	validation_0-mlogloss:1.34017
[4]	validation_0-mlogloss:1.23517
[5]	validation_0-mlogloss:1.14045
[6]	validation_0-mlogloss:1.05204
[7]	validation_0-mlogloss:0.97258
[8]	validation_0-mlogloss:0.90574
[9]	validation_0-mlogloss:0.84310
[10]	validation_0-mlogloss:0.78494
[11]	validation_0-mlogloss:0.73133
[12]	validation_0-mlogloss:0.68074
[13]	validation_0-mlogloss:0.63400
[14]	validation_0-mlogloss:0.59335
[15]	validation_0-mlogloss:0.55519
[16]	validation_0-mlogloss:0.52022
[17]	validation_0-mlogloss:0.48761
[18]	validation_0-mlogloss:0.45759
[19]	validation_0-mlogloss:0.42852
[20]	validation_0-mlogloss:0.40228
[21]	validation_0-mlogloss:0.37860
[22]	validation_0-mlogloss:0.35528
[23]	validation_0-mlogloss:0.33462
[24]	validation_0-mlogloss:0.31535
[25]	validation_0-mlogloss:0.29804
[26]	validation_0-mlogloss:0.28152
[27]	validation_0-mlogloss:0.26587
[28]	validation_0-mlogloss:0.2