# 02 - PCA Analysis and Dimensionality Reduction

This notebook covers:
1. Loading preprocessed data
2. Applying Principal Component Analysis (PCA)
3. Determining optimal number of components
4. Visualizing PCA results
5. Comparing original vs PCA-transformed data


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the un-prefixed 'seaborn' name. Pick whichever one
# this installation actually provides so the notebook does not fail in its
# first cell on an older environment.
preferred_style = 'seaborn-v0_8'
plt.style.use(preferred_style if preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

print("Libraries imported successfully!")


In [None]:
# Load the preprocessed train/test splits written by an earlier step.
# NOTE(review): joblib.load unpickles arbitrary objects -- only run this
# against files produced by a trusted pipeline.
print("Loading preprocessed data...")
split_paths = (
    '../data/X_train.pkl',
    '../data/X_test.pkl',
    '../data/y_train.pkl',
    '../data/y_test.pkl',
)
# map() is lazy, but tuple-unpacking consumes it immediately and in order
X_train, X_test, y_train, y_test = map(joblib.load, split_paths)

n_train_shape, n_test_shape = X_train.shape, X_test.shape
print(f"Training data shape: {n_train_shape}")
print(f"Test data shape: {n_test_shape}")
# X_train must expose .columns / .head(), i.e. behave like a DataFrame
print(f"Feature names: {list(X_train.columns)}")

# Quick look at the first rows of the training features
print("\nFirst 5 rows of training data:")
print(X_train.head())


In [None]:
# Fit a PCA with no limit on the number of components so the variance
# contribution of every principal component can be inspected.
pca_full = PCA()
X_train_pca_full = pca_full.fit_transform(X_train)

# Per-component share of the variance, and its running total
explained_variance_ratio = pca_full.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

print("Explained variance ratio for each component:")
for pc_number, ratio in enumerate(explained_variance_ratio, start=1):
    print(f"PC{pc_number}: {ratio:.4f} ({ratio*100:.2f}%)")

# The last cumulative entry is the variance captured by all components together
total_variance = cumulative_variance_ratio[-1]
print(f"\nTotal variance explained: {total_variance:.4f} ({total_variance*100:.2f}%)")


In [None]:
# Visualize explained variance: three side-by-side panels built from the
# ratios computed for the unrestricted PCA.
fig, (ax_individual, ax_cumulative, ax_scree) = plt.subplots(1, 3, figsize=(15, 5))
component_numbers = range(1, len(explained_variance_ratio) + 1)

# 1. Individual explained variance
ax_individual.bar(component_numbers, explained_variance_ratio,
                  color='skyblue', alpha=0.7, edgecolor='black')
ax_individual.set_xlabel('Principal Component')
ax_individual.set_ylabel('Explained Variance Ratio')
ax_individual.set_title('Explained Variance by Component')
ax_individual.set_xticks(component_numbers)

# 2. Cumulative explained variance, with the 90% / 95% reference lines
ax_cumulative.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio,
                   marker='o', linewidth=2, markersize=6, color='red')
ax_cumulative.axhline(y=0.95, color='green', linestyle='--', label='95% Variance')
ax_cumulative.axhline(y=0.90, color='orange', linestyle='--', label='90% Variance')
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Explained Variance Ratio')
ax_cumulative.set_title('Cumulative Explained Variance')
ax_cumulative.legend()
ax_cumulative.grid(True, alpha=0.3)

# 3. Scree plot
ax_scree.plot(component_numbers, explained_variance_ratio,
              marker='o', linewidth=2, markersize=6, color='purple')
ax_scree.set_xlabel('Principal Component')
ax_scree.set_ylabel('Explained Variance Ratio')
ax_scree.set_title('Scree Plot')
ax_scree.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Smallest component count whose cumulative variance reaches each threshold.
# argmax on a boolean array yields the index of the first True; +1 converts
# the 0-based index into a count. (If a threshold were never reached argmax
# would return 0, i.e. a count of 1 -- not expected here because the full
# PCA's cumulative ratio ends at ~1.0.)
reaches_95 = cumulative_variance_ratio >= 0.95
reaches_90 = cumulative_variance_ratio >= 0.90
n_components_95 = np.argmax(reaches_95) + 1
n_components_90 = np.argmax(reaches_90) + 1

print(f"Number of components for 90% variance: {n_components_90}")
print(f"Number of components for 95% variance: {n_components_95}")
variance_at_95 = cumulative_variance_ratio[n_components_95-1]
print(f"Variance explained with {n_components_95} components: {variance_at_95:.4f}")


In [None]:
# Re-fit PCA keeping only the number of components selected for the
# 95% cumulative-variance threshold.
n_components = n_components_95  # Using 95% variance
pca = PCA(n_components=n_components)

# Learn the projection on the training split only, then reuse it for test
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

retained_variance = pca.explained_variance_ratio_.sum()
original_width = X_train.shape[1]
reduced_width = X_train_pca.shape[1]
print(f"PCA applied with {n_components} components")
print(f"Original feature space: {original_width} features")
print(f"PCA feature space: {reduced_width} features")
print(f"Variance explained: {retained_variance:.4f} ({retained_variance*100:.2f}%)")

# Wrap the projected arrays in DataFrames with PC1..PCn column labels.
# NOTE(review): both frames get a fresh default RangeIndex rather than the
# index of X_train / X_test -- keep that in mind before aligning them with
# y_train / y_test by index.
pc_columns = [f'PC{k}' for k in range(1, n_components + 1)]
X_train_pca_df = pd.DataFrame(X_train_pca, columns=pc_columns)
X_test_pca_df = pd.DataFrame(X_test_pca, columns=pc_columns)

print(f"\nPCA-transformed training data shape: {X_train_pca_df.shape}")
print(f"PCA-transformed test data shape: {X_test_pca_df.shape}")


In [None]:
# Visualize PCA results
#
# Six-panel overview of the fitted `pca`:
#   row 1 - pairwise scatter plots of PC1/PC2/PC3, colored by `y_train`
#   row 2 - feature loadings, per-component variance, cumulative variance
#
# The number of retained components is data-dependent (it comes from the 95%
# variance threshold), so it can be smaller than 3. Every panel that needs
# PC2/PC3 therefore checks what is available instead of indexing blindly,
# which previously raised KeyError/IndexError when fewer than 3 were kept.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
variance_ratios = pca.explained_variance_ratio_
n_available = len(variance_ratios)
component_numbers = range(1, n_available + 1)

# 1-3. Pairwise scatter plots of the leading components, colored by target
pair_panels = [
    ((1, 2), 'PCA: First Two Components'),
    ((1, 3), 'PCA: PC1 vs PC3'),
    ((2, 3), 'PCA: PC2 vs PC3'),
]
for ax, ((pc_x, pc_y), panel_title) in zip(axes[0], pair_panels):
    ax.set_title(panel_title)
    highest_pc = max(pc_x, pc_y)
    if highest_pc > n_available:
        # Not enough components for this pair: leave an explanatory placeholder
        ax.text(0.5, 0.5,
                f'PC{highest_pc} not available\n({n_available} component(s) retained)',
                ha='center', va='center', transform=ax.transAxes)
        ax.set_xticks([])
        ax.set_yticks([])
        continue
    scatter = ax.scatter(X_train_pca_df[f'PC{pc_x}'], X_train_pca_df[f'PC{pc_y}'],
                         c=y_train, cmap='viridis', alpha=0.7)
    ax.set_xlabel(f'PC{pc_x} ({variance_ratios[pc_x - 1]:.2%} variance)')
    ax.set_ylabel(f'PC{pc_y} ({variance_ratios[pc_y - 1]:.2%} variance)')
    fig.colorbar(scatter, ax=ax, label='Heart Disease')

# 4. Component loadings (feature contributions) for up to the first 3 PCs
ax_loadings = axes[1, 0]
components = pca.components_[:3]  # at most 3 rows; fewer if fewer were kept
feature_names = X_train.columns
x_pos = np.arange(len(feature_names))
bar_width = 0.2
n_loading_pcs = len(components)
for pc_idx, loadings in enumerate(components):
    # Center the group of bars on each tick; with 3 PCs the offsets are
    # -0.2, 0, +0.2
    offset = (pc_idx - (n_loading_pcs - 1) / 2) * bar_width
    ax_loadings.bar(x_pos + offset, loadings, bar_width,
                    label=f'PC{pc_idx + 1}', alpha=0.7)
ax_loadings.set_xlabel('Features')
ax_loadings.set_ylabel('Component Loading')
ax_loadings.set_title(f'Feature Contributions to First {n_loading_pcs} PCs')
ax_loadings.set_xticks(x_pos)
ax_loadings.set_xticklabels(feature_names, rotation=45, ha='right')
ax_loadings.legend()

# 5. Explained variance by component
ax_variance = axes[1, 1]
ax_variance.bar(component_numbers, variance_ratios,
                color='lightcoral', alpha=0.7, edgecolor='black')
ax_variance.set_xlabel('Principal Component')
ax_variance.set_ylabel('Explained Variance Ratio')
ax_variance.set_title('Variance Explained by Each Component')
ax_variance.set_xticks(component_numbers)

# 6. Cumulative variance
ax_cumulative = axes[1, 2]
cumulative_var = np.cumsum(variance_ratios)
ax_cumulative.plot(component_numbers, cumulative_var,
                   marker='o', linewidth=2, markersize=6, color='green')
ax_cumulative.axhline(y=0.95, color='red', linestyle='--', label='95% Variance')
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Explained Variance')
ax_cumulative.set_title('Cumulative Variance Explained')
ax_cumulative.legend()
ax_cumulative.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()


In [None]:
# Persist the PCA-projected splits and the fitted PCA model, then print a
# short summary of the reduction.
import os

# Make sure both output folders are present before writing
for out_dir in ('../data', '../models'):
    os.makedirs(out_dir, exist_ok=True)

# Destination path -> object written there (dict order is the write order)
artifacts = {
    '../data/X_train_pca.pkl': X_train_pca_df,
    '../data/X_test_pca.pkl': X_test_pca_df,
    '../models/pca_model.pkl': pca,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("PCA analysis completed and data saved!")
print("Files saved:")
for artifact_path in artifacts:
    print(f"- {artifact_path}")

# Display summary
n_original_features = X_train.shape[1]
retained_variance = pca.explained_variance_ratio_.sum()
# Share of the original feature dimensions removed by the projection
reduction_pct = (n_original_features - n_components) / n_original_features * 100
print("\nPCA Summary:")
print(f"- Original features: {n_original_features}")
print(f"- PCA components: {n_components}")
print(f"- Variance explained: {retained_variance:.4f} ({retained_variance*100:.2f}%)")
print(f"- Dimensionality reduction: {reduction_pct:.1f}%")
