# PCA Analysis Template: Feature Engineering + Dimensionality Reduction

This notebook template applies a PCA strategy to any dataset. You can customize:
- Which features to treat as categorical (one-hot encoding)
- Which features to treat as numerical
- Number of PCA components to generate
- Models to evaluate performance

**Configuration needed:**
- Update file paths
- Specify categorical and numerical features
- Set number of PCA components
- Choose evaluation models


In [14]:
# ================================
# CONFIGURATION SECTION
# ================================
# The functions in the cells below read these module-level names directly,
# so edit the values here rather than inside the functions.

# --- File locations: UPDATE THESE FOR YOUR PROJECT ---
# CSV file that is loaded as the input dataset.
INPUT_FILE_PATH = (
    "/Users/vonguyenkien/Workspaces/EEET2485-ResearchMethod/"
    "heart_datasets_comimp_binary_cleaned_for_modeling.csv"
)
# CSV file that the transformed dataset is written to.
OUTPUT_FILE_NAME = "your_output_dataset_with_pca.csv"

# --- Feature selection: UPDATE THESE FOR YOUR DATASET ---
# Columns that are one-hot encoded before PCA.
CATEGORICAL_FEATURES = ["thal"]
# Columns that are fed to PCA as numerical values.
NUMERICAL_FEATURES = ["ca"]
# Name of the target column.
TARGET_COLUMN = "target"
# How many PCA components to generate.
N_PCA_COMPONENTS = 2



In [15]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")


All libraries imported successfully!


## 1. Data Transformation Function (Template)

This template function can be customized for any dataset by modifying the configuration section above.


In [16]:
def create_pca_dataset():
    """
    Build a dataset in which the configured features are replaced by PCA components.

    Reads the CSV at INPUT_FILE_PATH, one-hot encodes CATEGORICAL_FEATURES,
    appends NUMERICAL_FEATURES unchanged, standardizes the combined matrix and
    projects it onto N_PCA_COMPONENTS principal components. Columns that are
    neither PCA inputs nor TARGET_COLUMN are carried over untouched, and the
    target is appended as the last column.

    Returns:
        tuple: (final_df, pca, scaler) - the transformed DataFrame, the fitted
        PCA and the fitted StandardScaler. Returns None (after printing a
        message) when the file cannot be loaded, required columns are missing,
        there are no features to process, the PCA inputs contain missing
        values, or N_PCA_COMPONENTS does not fit the feature matrix.
    """
    print("="*80)
    print(f"PCA ANALYSIS: {len(CATEGORICAL_FEATURES)} CATEGORICAL + {len(NUMERICAL_FEATURES)} NUMERICAL → {N_PCA_COMPONENTS} PCA COMPONENTS")
    print("="*80)

    try:
        # Load the dataset
        df = pd.read_csv(INPUT_FILE_PATH)
        print(f"Original dataset loaded: {df.shape}")

        # Validate required columns exist
        required_columns = CATEGORICAL_FEATURES + NUMERICAL_FEATURES + [TARGET_COLUMN]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing columns in dataset: {missing_columns}")

    except FileNotFoundError:
        print(f"Error: File not found at {INPUT_FILE_PATH}")
        print("Please update INPUT_FILE_PATH in the configuration section")
        return None
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None

    # Separate target and other features.
    # The reserved names are collected once so the membership test below does
    # not rebuild a list for every column.
    y = df[TARGET_COLUMN]
    reserved_columns = set(required_columns)
    other_features = [col for col in df.columns if col not in reserved_columns]

    print(f"\nTRANSFORMATION PROCESS:")
    print("-" * 50)

    # Step 1: Process categorical features (one-hot encoding)
    processed_features = []
    if CATEGORICAL_FEATURES:
        print(f"1. One-hot encoding categorical features: {CATEGORICAL_FEATURES}")
        for feature in CATEGORICAL_FEATURES:
            column = df[feature]
            feature_onehot = pd.get_dummies(column, prefix=feature, drop_first=False)
            processed_features.append(feature_onehot)
            print(f"   {feature}: {list(feature_onehot.columns)} ({feature_onehot.shape[1]} features)")

            # A float column with fractional values (e.g. left behind by
            # numeric imputation) gets one dummy column per distinct value,
            # which is rarely what "categorical" was meant to express.
            if pd.api.types.is_float_dtype(column):
                n_fractional = int((column.dropna() % 1 != 0).sum())
                if n_fractional:
                    print(f"   WARNING: {feature} contains {n_fractional} non-integer values; "
                          f"each distinct value becomes its own one-hot column. "
                          f"Consider rounding or re-binning it before encoding.")

    # Step 2: Process numerical features
    if NUMERICAL_FEATURES:
        print(f"2. Adding numerical features: {NUMERICAL_FEATURES}")
        numerical_df = df[NUMERICAL_FEATURES].copy()
        processed_features.append(numerical_df)
        print(f"   Numerical features: {NUMERICAL_FEATURES} ({len(NUMERICAL_FEATURES)} features)")

    # Step 3: Combine all processed features
    if processed_features:
        combined_features = pd.concat(processed_features, axis=1)
        print(f"3. Combined features: {combined_features.shape[1]} total features")
    else:
        print("No features to process!")
        return None

    # PCA does not accept missing values; report the offending columns here
    # instead of letting the fit fail with a generic error.
    nan_columns = combined_features.columns[combined_features.isna().any()].tolist()
    if nan_columns:
        print(f"Error: Missing values found in PCA input columns: {nan_columns}")
        print("Please impute or drop missing values before running PCA")
        return None

    # The component count must be an integer (it is also used with range()
    # below) and cannot exceed min(n_samples, n_features).
    n_samples, n_features = combined_features.shape
    max_components = min(n_samples, n_features)
    is_integer = (isinstance(N_PCA_COMPONENTS, (int, np.integer))
                  and not isinstance(N_PCA_COMPONENTS, bool))
    if not is_integer or not 1 <= N_PCA_COMPONENTS <= max_components:
        print(f"Error: N_PCA_COMPONENTS={N_PCA_COMPONENTS} is not valid for "
              f"{n_samples} samples x {n_features} features")
        print(f"Please set N_PCA_COMPONENTS to an integer between 1 and {max_components}")
        return None

    # Step 4: Apply PCA
    print(f"4. Applying PCA to get {N_PCA_COMPONENTS} components...")
    # Standardize first so that no single feature dominates because of its scale.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(combined_features)

    pca = PCA(n_components=N_PCA_COMPONENTS)
    features_pca = pca.fit_transform(features_scaled)

    variance_explained = np.sum(pca.explained_variance_ratio_)
    print(f"   PCA: {combined_features.shape[1]} features → {N_PCA_COMPONENTS} components")
    print(f"   Variance explained: {variance_explained:.4f} ({variance_explained*100:.1f}%)")

    # Display individual component variance
    for i, var_ratio in enumerate(pca.explained_variance_ratio_, start=1):
        print(f"   PC{i}: {var_ratio:.4f} ({var_ratio*100:.1f}%)")

    # Step 5: Create final dataset
    print("5. Creating final dataset...")

    # Create PCA DataFrame; reuse df's index so the concat below aligns row by row.
    pca_columns = [f'PC{i+1}' for i in range(N_PCA_COMPONENTS)]
    pca_df = pd.DataFrame(features_pca, columns=pca_columns, index=df.index)

    # Add other features (not processed by PCA)
    final_components = [pca_df]
    if other_features:
        other_df = df[other_features].copy()
        final_components.append(other_df)
        print(f"   Other features kept: {other_features} ({len(other_features)} features)")

    # Combine all components; the target is assigned last so it is the final column.
    final_df = pd.concat(final_components, axis=1)
    final_df[TARGET_COLUMN] = y

    print(f"   Final dataset: {final_df.shape}")
    print(f"   Features: {list(final_df.columns[:-1])}")  # Exclude target

    return final_df, pca, scaler


## 3. Save Dataset Function (Template)

This function saves the transformed dataset with a configurable filename.


In [17]:
def save_final_dataset(final_df, output_path=None):
    """
    Save the final transformed DataFrame as CSV and print a short report.

    Args:
        final_df: DataFrame to write. The index is not written to the file.
        output_path: Destination file. When omitted, OUTPUT_FILE_NAME from the
            configuration section is used.

    Returns:
        bool: True when the file was written, False when saving failed. A
        failure is printed rather than raised, so callers that ignore the
        return value behave exactly as before.
    """
    print(f"\n  SAVING FINAL DATASET:")
    print("-" * 50)

    try:
        # Resolve the default inside the try block so that a missing
        # configuration value is reported like any other save error.
        target_path = OUTPUT_FILE_NAME if output_path is None else output_path

        final_df.to_csv(target_path, index=False)
        print(f"   Dataset saved successfully as: {target_path}")
        print(f"   Shape: {final_df.shape}")
        print(f"   Columns: {list(final_df.columns)}")

        # Display sample of the saved data
        print(f"\n📋 Sample of saved data:")
        print(final_df.head())

    except Exception as e:
        # Best effort by design: report the problem and signal it to the caller.
        print(f"Error saving dataset: {str(e)}")
        return False

    return True


## 4. Main Execution Block (Template)

Run this cell to execute the entire workflow. Make sure to update the configuration section first!


In [18]:
def main():
    """
    Run the whole workflow: report the configuration, load the input file,
    build the PCA-transformed dataset and save it.

    Returns:
        The (final_df, pca_model, scaler_model) tuple obtained from
        create_pca_dataset() on success, or None when the dataset could not
        be created or an exception was raised along the way.
    """
    # Echo the active configuration before doing any work.
    for line in (
        "  Starting PCA Analysis Pipeline...",
        "  Configuration check:",
        f"   Input file: {INPUT_FILE_PATH}",
        f"   Categorical features: {CATEGORICAL_FEATURES}",
        f"   Numerical features: {NUMERICAL_FEATURES}",
        f"   PCA components: {N_PCA_COMPONENTS}",
    ):
        print(line)

    try:
        # Read the input file once up front to report its shape.
        print("\n  Loading original dataset...")
        source_shape = pd.read_csv(INPUT_FILE_PATH).shape
        print(f"  Original dataset loaded: {source_shape}")

        # Build the PCA-transformed dataset from the configured features.
        print("\n  Creating PCA-transformed dataset...")
        outcome = create_pca_dataset()

        if outcome is not None:
            transformed_df, fitted_pca, fitted_scaler = outcome

            # Write the transformed dataset to disk.
            print("\n Saving results...")
            save_final_dataset(transformed_df)

            print("\n  Pipeline completed successfully!")
            return transformed_df, fitted_pca, fitted_scaler

        # create_pca_dataset() signals failure by returning None.
        print("  Failed to create PCA dataset. Please check your configuration.")
        return None

    except Exception as exc:
        print(f"  Pipeline failed: {exc!s}")
        return None

# Execute the main pipeline
result = main()


  Starting PCA Analysis Pipeline...
  Configuration check:
   Input file: /Users/vonguyenkien/Workspaces/EEET2485-ResearchMethod/heart_datasets_comimp_binary_cleaned_for_modeling.csv
   Categorical features: ['thal']
   Numerical features: ['ca']
   PCA components: 2

  Loading original dataset...
  Original dataset loaded: (1894, 14)

  Creating PCA-transformed dataset...
PCA ANALYSIS: 1 CATEGORICAL + 1 NUMERICAL → 2 PCA COMPONENTS
Original dataset loaded: (1894, 14)

TRANSFORMATION PROCESS:
--------------------------------------------------
1. One-hot encoding categorical features: ['thal']
   thal: ['thal_0.0', 'thal_1.0', 'thal_1.997440975751684', 'thal_2.0', 'thal_2.0165613251602665', 'thal_2.024763624577878', 'thal_2.027429803802093', 'thal_2.038969688525873', 'thal_2.0475837782595074', 'thal_2.056134606990952', 'thal_2.0567884944241595', 'thal_2.0689851720141657', 'thal_2.07204706420601', 'thal_2.074720955617156', 'thal_2.075245934606187', 'thal_2.076947416938626', 'thal_2.07793