In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings

# Ignore every warning category for the rest of the session (no category or
# module filter is given, so nothing is exempt).
# NOTE(review): this also hides deprecation and numerical warnings raised by
# pandas / scikit-learn — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Global plotting defaults: matplotlib style sheet plus seaborn colour palette.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

def load_and_analyze_dataset(
    csv_path='/Users/vonguyenkien/Workspaces/RM_Training/heart_3datasets_comimp_columntransformer_cleaned_for_modeling.csv',
    target_col='target',
):
    """Load the modelling dataset and split it into features and target.

    Prints a short overview (row/column counts and column names) of the
    loaded file before splitting it.

    Args:
        csv_path: Location of the CSV file to read. Defaults to the path that
            used to be hard-coded, so zero-argument calls behave as before.
        target_col: Name of the label column to separate from the features.

    Returns:
        A ``(df, X, y)`` tuple: the full DataFrame as loaded, the feature
        DataFrame (every column except ``target_col``) and the target Series.

    Raises:
        KeyError: If ``target_col`` is not a column of the loaded file.
    """
    print("=" * 80)
    print("PROPER PCA ANALYSIS FOLLOWING BEST PRACTICES")
    print("=" * 80)

    # Load the cleaned dataset
    df = pd.read_csv(csv_path)

    print("\n📊 DATASET OVERVIEW:")
    print("-" * 40)
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"Features: {list(df.columns)}")

    # Fail with an explicit message instead of a bare pandas KeyError so the
    # caller can see which columns are actually available.
    if target_col not in df.columns:
        raise KeyError(
            f"Target column {target_col!r} not found; "
            f"available columns: {list(df.columns)}"
        )

    # Separate target from features
    y = df[target_col]
    X = df.drop(columns=[target_col])

    return df, X, y

def apply_proper_pca(X, n_components=11):
    """Standardize the features and project them onto principal components.

    Every column of ``X`` is passed through ``StandardScaler`` before PCA is
    fitted on the scaled matrix.

    Args:
        X: Feature matrix accepted by ``StandardScaler.fit_transform``.
            NOTE(review): all columns are scaled alike, including any encoded
            categorical ones — confirm this is intended for this dataset.
        n_components: Passed unchanged to ``PCA(n_components=...)``. Defaults
            to 11, the value that used to be hard-coded.

    Returns:
        A ``(X_pca, pca)`` tuple: the transformed data returned by
        ``pca.fit_transform`` and the fitted ``PCA`` object.
    """
    print("\n🔧 APPLYING PROPER PCA STRATEGY:")
    print("-" * 50)

    # Standardize all features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit PCA on the scaled features and project them in one step
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    # Report how many components were kept and their cumulative variance share
    variance_explained = np.sum(pca.explained_variance_ratio_)
    print(f"Retained {pca.n_components_} PCA components, variance explained: {variance_explained:.4f}")

    return X_pca, pca

def save_best_pca_dataset(X_pca, y, pca, filename='heart_3datasets_proper_pca_BEST.csv'):
    """Write the PCA-transformed features plus the target column to a CSV file.

    Args:
        X_pca: 2-D array of component scores, one row per sample.
        y: Target values, one per row of ``X_pca``. A pandas Series is used
            positionally (its index is ignored); lists and arrays are
            accepted as well.
        pca: Fitted PCA object; only ``explained_variance_ratio_`` is read,
            for the summary line.
        filename: Destination CSV path. Defaults to the name that used to be
            hard-coded, so three-argument calls behave as before.

    Returns:
        None. The file is written as a side effect and a summary is printed.
    """
    print("\n💾 SAVING BEST PCA DATASET:")

    # Name the component columns PC1..PCk in the order PCA produced them
    component_names = [f'PC{i + 1}' for i in range(X_pca.shape[1])]
    pca_df = pd.DataFrame(X_pca, columns=component_names)

    # Use raw values so a Series with a non-default index is not re-aligned
    # against the fresh RangeIndex of pca_df.
    pca_df['target'] = y.values if isinstance(y, pd.Series) else np.asarray(y)

    # Save dataset
    pca_df.to_csv(filename, index=False)

    print(f"✅ Saved: {filename}")
    print(f"   Shape: {pca_df.shape}")
    print(f"   Features: {pca_df.columns.tolist()}")
    print(f"   Quality: {np.sum(pca.explained_variance_ratio_):.2%} variance retained")

def main():
    """Run the pipeline end to end: load the data, apply PCA, save the result."""
    banner = "=" * 80

    print("🚀 STARTING PROPER PCA ANALYSIS WITH BEST PRACTICES")
    print(banner)

    # Step 1: read the dataset; the full frame is not needed past this point
    _, features, target = load_and_analyze_dataset()

    # Step 2: scale the features and reduce them with PCA
    components, fitted_pca = apply_proper_pca(features)

    # Step 3: persist the component scores together with the target
    save_best_pca_dataset(components, target, fitted_pca)

    print("\n✅ PROPER PCA ANALYSIS COMPLETE!")
    print(banner)

# Run the pipeline only when this module is the entry point (``__name__`` is
# "__main__"); importing the module does not trigger it.
if __name__ == "__main__":
    main()


🚀 STARTING PROPER PCA ANALYSIS WITH BEST PRACTICES
PROPER PCA ANALYSIS FOLLOWING BEST PRACTICES

📊 DATASET OVERVIEW:
----------------------------------------
Shape: 2,919 rows × 14 columns
Features: ['age', 'ca', 'chol', 'cp', 'exang', 'fbs', 'oldpeak', 'restecg', 'sex', 'slope', 'thal', 'thalach', 'trestbps', 'target']

🔧 APPLYING PROPER PCA STRATEGY:
--------------------------------------------------
Retained 11 PCA components, variance explained: 0.9208

💾 SAVING BEST PCA DATASET:
✅ Saved: heart_3datasets_proper_pca_BEST.csv
   Shape: (2919, 12)
   Features: ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'target']
   Quality: 92.08% variance retained

✅ PROPER PCA ANALYSIS COMPLETE!
