In [None]:
# # Importing necessary libraries
# import os
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from imblearn.over_sampling import SMOTE
# import joblib
# import pickle

In [None]:
# # Paths
# processed_data_path = 'D:/healthcare_analytics_project/data/processed/framingham_preprocessed.csv'
# preprocessed_data_path = 'D:/healthcare_analytics_project/data/processed/'
# model_data_path = 'D:/healthcare_analytics_project/models/'

In [None]:
# # Create directories if they don't exist
# os.makedirs(preprocessed_data_path, exist_ok=True)
# os.makedirs(model_data_path, exist_ok=True)

In [None]:
# # Load the processed dataset
# data = pd.read_csv(processed_data_path)

# # Display basic information
# print(f"Shape of the dataset: {data.shape}")
# print(f"Columns: {data.columns.tolist()}")

Shape of the dataset: (3658, 17)
Columns: ['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD', 'age_group']


In [None]:
# # Features and target
# X = data.drop('TenYearCHD', axis=1)
# y = data['TenYearCHD']

# # Identify categorical and numerical columns
# categorical_features = ['education']
# numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()

# # Remove categorical features from numerical list
# numerical_features = [col for col in numerical_features if col not in categorical_features]


In [None]:
# # Display identified features
# print(f"Categorical features: {categorical_features}")
# print(f"Numerical features: {numerical_features}")


Categorical features: ['education']
Numerical features: ['male', 'age', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']


In [None]:
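# # Handle class imbalance using SMOTE
# # NOTE: this commented-out draft is superseded by the active cell further down.
# # It resamples the full dataset before the train/test split, so the test set
# # contains synthetic samples, and the concat below only supplies categorical
# # values for the original rows (the synthetic rows end up with NaN).
# # Select only numeric features for SMOTE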
# X_numeric = X[numerical_features]

# # Apply SMOTE
# smote = SMOTE(random_state=42)
# X_resampled_numeric, y_resampled = smote.fit_resample(X_numeric, y)

# # Add back categorical features after SMOTE
# X_resampled = pd.concat(
#     [pd.DataFrame(X_resampled_numeric, columns=numerical_features), 
#      X.reset_index(drop=True).iloc[:len(y_resampled)][categorical_features]],
#     axis=1
# )


In [None]:
# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# # Preprocessing pipeline
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features),
#         ('cat', OneHotEncoder(drop='first'), categorical_features)
#     ]
# )

In [None]:
# # Fit and transform the training data
# X_train_preprocessed = preprocessor.fit_transform(X_train)
# X_test_preprocessed = preprocessor.transform(X_test)

# # Save the preprocessor for future use
# preprocessor_path = os.path.join(model_data_path, 'preprocessor.pkl')
# with open(preprocessor_path, 'wb') as f:
#     pickle.dump(preprocessor, f)
# print(f"Preprocessor saved to {preprocessor_path}")

Preprocessor saved to D:/healthcare_analytics_project/models/preprocessor.pkl


In [None]:
# # Save preprocessed data
# train_data_path = os.path.join(preprocessed_data_path, 'train_data.npz')
# test_data_path = os.path.join(preprocessed_data_path, 'test_data.npz')

# np.savez_compressed(train_data_path, X_train=X_train_preprocessed, y_train=y_train)
# np.savez_compressed(test_data_path, X_test=X_test_preprocessed, y_test=y_test)

# print(f"Preprocessed training data saved to {train_data_path}")
# print(f"Preprocessed testing data saved to {test_data_path}")

Preprocessed training data saved to D:/healthcare_analytics_project/data/processed/train_data.npz
Preprocessed testing data saved to D:/healthcare_analytics_project/data/processed/test_data.npz


In [4]:
# %% [code]
# Importing necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
import pickle

# Paths
# The project root can be overridden through the HEALTHCARE_PROJECT_ROOT
# environment variable so the notebook is not tied to one machine's drive
# layout. When the variable is unset (or empty) the original location is used.
project_root = (
    os.environ.get('HEALTHCARE_PROJECT_ROOT') or 'D:/healthcare_analytics_project'
).rstrip('/\\')

# Forward slashes are used on purpose (instead of os.path.join) so the default
# values are character-for-character the same as the previous hard-coded paths.
processed_data_path = f'{project_root}/data/processed/framingham_preprocessed.csv'
preprocessed_data_dir = f'{project_root}/data/processed/'
model_data_dir = f'{project_root}/models/'

# Create directories if they don't exist
os.makedirs(preprocessed_data_dir, exist_ok=True)
os.makedirs(model_data_dir, exist_ok=True)

# %% [code]
# Load the processed dataset
data = pd.read_csv(processed_data_path)

# Display basic information about the dataset
print(f"Dataset shape: {data.shape}")
print(f"Columns: {data.columns.tolist()}")

# Define features and target
X = data.drop('TenYearCHD', axis=1)
y = data['TenYearCHD']

# Identify categorical and numerical features.
# 'education' has a numeric dtype but is treated as categorical, so it is
# removed from the automatically detected numeric columns.
categorical_features = ['education']
numerical_features = [
    col
    for col in X.select_dtypes(include=[np.number]).columns.tolist()
    if col not in categorical_features
]

print(f"Categorical features: {categorical_features}")
print(f"Numerical features: {numerical_features}")

# Any column that is neither numeric nor listed in categorical_features ends up
# in no feature list, and the ColumnTransformer used later only keeps the
# columns it is given. Report such columns so the omission is visible instead
# of silent (the saved output shows 'age_group' is one of them).
unassigned_features = [
    col
    for col in X.columns
    if col not in categorical_features and col not in numerical_features
]
if unassigned_features:
    print(f"Columns not used by the preprocessing pipeline: {unassigned_features}")

# %% [code]
# Hold out 20% of the rows for testing, stratified on the target so both splits
# keep the same class ratio. The split is done before SMOTE is applied.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

# %% [code]
# Build the preprocessing steps individually, then combine them in a
# ColumnTransformer: numeric columns are standardised, categorical columns are
# one-hot encoded with the first level dropped.
numeric_step = ('num', StandardScaler(), numerical_features)
# sparse_output=False is the spelling used by newer versions of scikit-learn
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', sparse_output=False),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# The preprocessor is fitted on the training split only; the test split is
# transformed with the parameters learned from the training data.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# %% [code]
# Address class imbalance by oversampling with SMOTE. Only the preprocessed
# training split is resampled; the test split is not touched here.
smote = SMOTE(random_state=42)
resampled_train = smote.fit_resample(X_train_preprocessed, y_train)
X_train_resampled, y_train_resampled = resampled_train

print(f"Resampled training data shape: {X_train_resampled.shape}")

# %% [code]
# Output locations for the fitted preprocessor and the two data splits
preprocessor_path = os.path.join(model_data_dir, 'preprocessor.pkl')
train_data_path = os.path.join(preprocessed_data_dir, 'train_data.npz')
test_data_path = os.path.join(preprocessed_data_dir, 'test_data.npz')

# Pickle the fitted preprocessor so it can be reused later (e.g., during inference)
with open(preprocessor_path, 'wb') as pickle_file:
    pickle.dump(preprocessor, pickle_file)
print(f"Preprocessor saved to {preprocessor_path}")

# Store the resampled training split and the preprocessed test split as
# compressed .npz archives
np.savez_compressed(train_data_path, X_train=X_train_resampled, y_train=y_train_resampled)
np.savez_compressed(test_data_path, X_test=X_test_preprocessed, y_test=y_test)

print(f"Preprocessed training data saved to {train_data_path}")
print(f"Preprocessed testing data saved to {test_data_path}")


Dataset shape: (3658, 17)
Columns: ['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD', 'age_group']
Categorical features: ['education']
Numerical features: ['male', 'age', 'currentSmoker', 'cigsPerDay', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
Training data shape: (2926, 16), Testing data shape: (732, 16)
Resampled training data shape: (4960, 17)
Preprocessor saved to D:/healthcare_analytics_project/models/preprocessor.pkl
Preprocessed training data saved to D:/healthcare_analytics_project/data/processed/train_data.npz
Preprocessed testing data saved to D:/healthcare_analytics_project/data/processed/test_data.npz
