# Step 4 - Dimensionality Reduction

Use PCA (Principal Component Analysis) and Factor Analysis as techniques to reduce the number of features while retaining most of the variance.

In [1]:
import pandas as pd
from sklearn.decomposition import PCA

## 4.0 - Load the Data

In [15]:
# Read both inputs in one pass:
#   - previous years (file 07) -> data the PCA is fitted on
#   - present year   (file 06) -> data the fitted PCA is later applied to
previous_years_SMOTEd, present_scaled = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/07_previous_years_SMOTEd.csv',
        '../data/06_present_year_scaled.csv',
    )
)

## 4.1 - Apply the PCA

PCA (Principal Component Analysis) is used to reduce the dimensionality of the dataset while preserving as much variance as possible. The PCA must be fitted on the training data only; the same fitted transformation is then applied, without refitting, to the test data so that no information from the test set leaks into the projection.

**Previous Years - Training Data**

In [27]:
# Separate target and features for training data
y_train_SMOTEd = previous_years_SMOTEd['class']
X_train_SMOTEd = previous_years_SMOTEd.drop('class', axis=1)

# Number of principal components to keep; change it here only, the column
# labels below are derived from the fitted model.
N_COMPONENTS = 2

# Initialize PCA. A fixed random_state keeps the projection reproducible in
# case scikit-learn selects a randomized SVD solver for this data size.
pca = PCA(n_components=N_COMPONENTS, random_state=42)

# Fit on the training features only and project them
X_train_pca = pca.fit_transform(X_train_SMOTEd)

# PCA results back to a DataFrame, labelled PC1..PCk
pc_columns = [f'PC{i + 1}' for i in range(pca.n_components_)]
previous_years_PCA = pd.DataFrame(X_train_pca, columns=pc_columns)
previous_years_PCA['class'] = y_train_SMOTEd.values

# Report how much variance the kept components actually retain, so the
# choice of N_COMPONENTS can be judged rather than assumed.
print(f'Explained variance ratio per component: {pca.explained_variance_ratio_}')
print(f'Total variance retained: {pca.explained_variance_ratio_.sum():.2%}')

previous_years_PCA

Unnamed: 0,PC1,PC2,class
0,-11.063584,-1.567663,0
1,-13.752753,-0.614154,0
2,-14.278052,-0.260361,0
3,-16.118603,0.194279,0
4,-12.965438,-0.643940,0
...,...,...,...
117995,29.461699,-15.638997,1
117996,47.899332,-20.215931,1
117997,19.338496,11.651469,1
117998,10.457556,9.882504,1


**Present Year - Test Data**

In [23]:
# Separate target and features for test data
y_test_scaled = present_scaled['class']
X_test_scaled = present_scaled.drop('class', axis=1)

# Project the test data with the PCA fitted on the training data (transform
# only, no refit).
# NOTE(review): assumes present_scaled has the same feature columns, in the
# same order, as the training features the PCA was fitted on - confirm.
X_test_pca = pca.transform(X_test_scaled)

# Label the components PC1..PCk from the width of the transformed array, so
# this cell keeps working if the number of components kept by `pca` changes.
pc_columns_test = [f'PC{i + 1}' for i in range(X_test_pca.shape[1])]
present_year_PCA = pd.DataFrame(X_test_pca, columns=pc_columns_test)
present_year_PCA['class'] = y_test_scaled.values
present_year_PCA

Unnamed: 0,PC1,PC2,class
0,-15.901969,0.166094,0
1,-15.957528,0.160321,0
2,-12.694033,0.661840,0
3,-13.221928,-0.548579,0
4,-15.926757,0.161080,0
...,...,...,...
15995,-12.062013,-0.949061,0
15996,-15.980245,0.170760,0
15997,-11.942178,-1.404452,0
15998,-15.933016,0.162912,0


## 4.2 - PCA Data Saving


In [25]:
# Persist the PCA-projected training set (previous years), without the index.
previous_years_PCA.to_csv(
    path_or_buf='../data/08_previous_years_PCA.csv',
    index=False,
)

# Persist the PCA-projected test set (present year), without the index.
# NOTE(review): this file holds the *present year* data but is named
# "09_previous_years_PCA.csv" - looks like a copy-paste slip. The name is left
# unchanged here because other notebooks may read it; confirm and rename
# together with its consumers.
present_year_PCA.to_csv(
    path_or_buf='../data/09_previous_years_PCA.csv',
    index=False,
)