# Principal Component Analysis

Principal Component Analysis (PCA) is a dimensionality reduction technique used in the field of machine learning and statistics. It aims to simplify the complexity of high-dimensional data by transforming it into a lower-dimensional space while preserving as much of the original information as possible. 

In [34]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Load the digits dataset and pull out the feature matrix and label vector.
dataset = load_digits()
X, y = dataset.data, dataset.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [35]:
# Sanity-check the split: shapes of the train/test features and labels.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((1437, 64), (360, 64), (1437,), (360,))

In [36]:
# Baseline model: a Random Forest trained on the original, untransformed features.
# NOTE(review): the recorded output shows n_estimators=10; the default forest
# size depends on the installed scikit-learn version, so a re-run may not
# reproduce the recorded accuracy exactly.
rf_classifier_orig = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_classifier_orig  # echo the fitted estimator so the cell displays its parameters



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [50]:
# Perform PCA to reduce dimensionality: 64 dimensions -> 15 dimensions.
# n_components is set to 15 so the code agrees with the recorded shapes
# (1437, 15) / (360, 15) and the PCA accuracy reported further down.
# random_state only matters when a randomized SVD solver is selected
# (svd_solver='auto' may do so); pinning it keeps the projection reproducible.
pca = PCA(n_components=15, random_state=42)

# Fit the projection on the training split only, then apply that same
# projection to the test split so no test data influences the components.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [51]:
# Confirm the reduced feature dimensionality of both splits.
tuple(projected.shape for projected in (X_train_pca, X_test_pca))

((1437, 15), (360, 15))

# Let the magic begin !!!!

In [52]:
# Same Random Forest configuration as the baseline, trained on the PCA-projected features.
rf_classifier_pca = RandomForestClassifier(random_state=42).fit(X_train_pca, y_train)
rf_classifier_pca  # echo the fitted estimator so the cell displays its parameters




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [53]:
# Predict test-set labels with both classifiers; each model receives the
# feature representation it was trained on.
y_pred_orig, y_pred_pca = (
    rf_classifier_orig.predict(X_test),
    rf_classifier_pca.predict(X_test_pca),
)

In [54]:
# Score both sets of predictions against the held-out test labels.
accuracy_orig = accuracy_score(y_true=y_test, y_pred=y_pred_orig)
accuracy_pca = accuracy_score(y_true=y_test, y_pred=y_pred_pca)

In [55]:
# Report the two accuracies, one per line (labels kept exactly as recorded).
print(
    " ".join(("Accuracy on orignal data:", str(accuracy_orig))),
    " ".join(("Accuracy on pca transformed data:", str(accuracy_pca))),
    sep="\n",
)

Accuracy on orignal data: 0.9583333333333334
Accuracy on pca transformed data: 0.925


# Lab Task: Perform the PCA and compare accuracy for all the classifiers we have implemented so far. Also discuss the results.