In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [None]:
# Seed used for the synthetic data generation and the train/test split.
RANDOM_STATE = 42

## Generating classification data

In [None]:
# Synthetic classification data set: 50,000 samples with 500 features, of
# which 300 are informative and the remaining 200 are redundant.
X, y = make_classification(
    random_state=RANDOM_STATE,
    n_samples=50_000,
    n_features=500,
    n_informative=300,
    n_redundant=200,
)

# Hold out 30% of the samples for evaluation; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=0.3,
)

## Creating model

In [None]:
# Hyperparameters shared by every random forest trained in this notebook.
# The forest is seeded with RANDOM_STATE (like the data generation and the
# train/test split) so that the reported scores are reproducible and the
# comparison between the models is not blurred by run-to-run randomness.
model_hyperparameters = {
    "n_estimators": 20,
    "max_depth": 10,
    "min_samples_split": 5,
    "max_features": 25,
    "random_state": RANDOM_STATE,
}

In [None]:
# Baseline classifier, trained later on the original (non-reduced) features.
model = RandomForestClassifier(**model_hyperparameters)

## Fitting model without reducing dimensions

### Fitting model on all data

In [None]:
# Train the baseline forest on the training split with all original features.
model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score

# F1 score of the baseline model on the test split.
print(f1_score(y_test, y_pred))

0.7843189368770764


## Implementing PCA from Scratch

In [None]:
import numpy as np

In [None]:
class PCAProcessor:
  """
  Principal Component Analysis via eigendecomposition of the covariance matrix.

  Usage: `PCAProcessor(n_components=k).fit(X_train).transform(X)`.
  Neither `fit` nor `transform` modifies the array passed in.
  """

  def __init__(self, n_components: int):
    # Number of components to find
    self.n_components = n_components

    # Array of shape (n_components, n_features); each row holds the weights
    # of one principal component, ordered by decreasing explained variance
    self.components = None

    # Per-feature mean observed in the training data
    self.mean = None

    # Proportion of variance explained by the kept principal components
    self.variance_share = None

  def fit(self, X, *args, **kwargs):
    """
    Finds Principal Components.

    1. Centers the data by subtracting the mean value of each feature,
      and stores these means in `self.mean`.
    2. Calculates eigenvectors and eigenvalues of the covariance matrix.
    3. Sorts eigenvalues and eigenvectors in decreasing order of eigenvalue.
    4. Stores the top `self.n_components` eigenvectors (one per row) as
      `self.components`, and stores `self.variance_share`.

    Parameters:
      X: array-like of shape (n_samples, n_features). It is not modified.
      *args, **kwargs: accepted and ignored.

    Returns:
      self, so that calls can be chained.
    """

    # 1. Centering data. The subtraction builds a new array, so the caller's
    # data is left untouched (an in-place `-=` would silently center it and
    # would also fail for integer input).
    X = np.asarray(X, dtype=float)
    self.mean = np.mean(X, axis=0)
    centered = X - self.mean

    # 2. Calculate eigenvalues and eigenvectors.
    # The covariance matrix is symmetric, so `eigh` applies: it returns real
    # eigenvalues and orthonormal eigenvectors. `atleast_2d` covers the
    # single-feature case, where `np.cov` returns a 0-d array.
    cov_matrix = np.atleast_2d(np.cov(centered, rowvar=False))
    eigen_val, eigen_vec = np.linalg.eigh(cov_matrix)

    # 3. Sort eigenvectors and eigenvalues (largest eigenvalue first)
    sort_idx = np.argsort(eigen_val)[::-1]
    eigen_val = eigen_val[sort_idx]
    eigen_vec = eigen_vec[:, sort_idx]

    # 4. Store principal components and variance share.
    # Eigenvectors are the COLUMNS of `eigen_vec`; take the leading columns
    # and transpose so that every row of `self.components` is one component.
    self.components = eigen_vec[:, :self.n_components].T

    total_variance = np.sum(eigen_val)
    if total_variance > 0:
      self.variance_share = (
        np.sum(eigen_val[:self.n_components]) / total_variance
      )
    else:
      # Constant data has no variance to explain; avoid dividing by zero.
      self.variance_share = 0.0

    return self

  def transform(self, X, *args, **kwargs):
    """
    Transforms new data.

    1. Centers the data with the same mean found during `self.fit`.
    2. Multiplies the data matrix with the transpose of the matrix of
      selected components.

    Parameters:
      X: array-like of shape (n_samples, n_features). It is not modified.
      *args, **kwargs: accepted and ignored.

    Returns:
      Array of shape (n_samples, n_components) with the projected data.
    """

    # 1. Centering data (on a new array, leaving the caller's data untouched)
    centered = np.asarray(X, dtype=float) - self.mean

    # 2. Decomposition
    return np.dot(centered, self.components.T)

### Fitting PCAProcessor to data

In [None]:
# Fit the PCA on the training split only, requesting 300 components.
pca_processor = PCAProcessor(n_components=300).fit(X_train)

In [None]:
# Project both splits with the processor fitted on the training split.
new_X_train = pca_processor.transform(X_train)
new_X_test = pca_processor.transform(X_test)

In [None]:
# Share of the total variance attributed to the kept components
# (sum of the kept eigenvalues divided by the sum of all eigenvalues).
pca_processor.variance_share

1.0

### Fitting model to post-processed data

In [None]:
# Second forest with the same hyperparameters, for the PCA-projected features.
simpler_model = RandomForestClassifier(**model_hyperparameters)

In [None]:
# Train on the projected training split (labels are unchanged).
simpler_model.fit(new_X_train, y_train)

In [None]:
# Predict class labels for the projected test split.
new_y_pred = simpler_model.predict(new_X_test)

In [None]:
# F1 score of the model trained on the PCA-projected features.
print(f1_score(y_test, new_y_pred))

0.7324552781600157
