In [1]:
# Phase 4: Data Reduction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../src')

from data_reduction import execute_data_reduction_pipeline

print("ðŸš€ STARTING PHASE 4: DATA REDUCTION")
print("=" * 50)

# Load the transformed data from Phase 3
df_transformed = pd.read_csv('../data/processed/diabetes_transformed.csv')
print(f"Loaded transformed data: {df_transformed.shape}")

# Display feature overview
print(f"\nðŸ“Š DATASET OVERVIEW:")
print(f"â€¢ Total features: {df_transformed.shape[1]}")
print(f"â€¢ Samples: {df_transformed.shape[0]}")

feature_categories = {
    'Original Numerical': [col for col in df_transformed.columns if col in 
                          ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
                           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']],
    'Scaled Features': [col for col in df_transformed.columns if '_Scaled' in col],
    'Clinical Categories': [col for col in df_transformed.columns if 'Category' in col or 'Group' in col],
    'Interaction Features': [col for col in df_transformed.columns if 'Interaction' in col or 'Score' in col],
    'Encoded Features': [col for col in df_transformed.columns if 'Encoded' in col]
}

for category, features in feature_categories.items():
    print(f"â€¢ {category}: {len(features)} features")

# Execute reduction pipeline
reduced_df, selected_features, reduction_report = execute_data_reduction_pipeline(
    df_transformed, 
    target_col='Outcome',
    n_final_features=15
)

# Save reduced dataset
reduced_df.to_csv('../data/processed/diabetes_reduced.csv', index=False)
print(f"\nðŸ’¾ Reduced data saved to: ../data/processed/diabetes_reduced.csv")
print(f"ðŸ“‰ Final dataset: {reduced_df.shape}")

# Save feature selection report
import json
feature_report = {
    'selected_features': selected_features,
    'feature_count_original': df_transformed.shape[1],
    'feature_count_reduced': reduced_df.shape[1],
    'reduction_percentage': f"{((df_transformed.shape[1] - reduced_df.shape[1]) / df_transformed.shape[1]) * 100:.1f}%"
}

with open('../data/processed/feature_selection_report.json', 'w') as f:
    json.dump(feature_report, f, indent=2)

print(f"\nðŸ“‹ Feature selection report saved")

# Final comparison
print("\nðŸŽ¯ PHASE 4 COMPLETION SUMMARY")
print("=" * 40)
print(f"Original features: {df_transformed.shape[1]}")
print(f"Selected features: {len(selected_features)}")
print(f"Reduction: {feature_report['reduction_percentage']}")
print(f"Final dataset shape: {reduced_df.shape}")

print("\nâœ… PHASE 4: DATA REDUCTION COMPLETE!")

ðŸš€ STARTING PHASE 4: DATA REDUCTION
Loaded transformed data: (768, 32)

ðŸ“Š DATASET OVERVIEW:
â€¢ Total features: 32
â€¢ Samples: 768
â€¢ Original Numerical: 9 features
â€¢ Scaled Features: 8 features
â€¢ Clinical Categories: 12 features
â€¢ Interaction Features: 3 features
â€¢ Encoded Features: 3 features
ðŸ”— COMPREHENSIVE FEATURE CORRELATION ANALYSIS


ValueError: could not convert string to float: '50-59'