# Cholesterol-lowering Supplement Classification

This notebook builds and compares several machine learning classifiers to predict whether a patient needs a cholesterol-lowering supplement, based on the patient's health attributes.

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any warnings raised by the
# grid searches and plots further down — consider narrowing the filter.
warnings.filterwarnings("ignore")

## 1. Load the Data

In [None]:
# Read both workbooks: the labelled sheet used for fitting first, then the
# sheet that is scored at the end of the notebook.
# NOTE(review): "choloesterol" in the training filename looks like a typo —
# confirm it matches the file on disk before changing it.
training_data, prediction_data = (
    pd.read_excel(workbook)
    for workbook in ("data/train_choloesterol.xlsx", "data/predict_cholesterol.xlsx")
)

# Report the size of the training frame, then preview its first rows
# (the bare last expression is what the notebook renders).
training_shape = training_data.shape
print("Training data shape:", training_shape)
training_data.head(5)

In [None]:
# Report the size of the prediction frame, then preview its first rows.
prediction_shape = prediction_data.shape
print("Prediction data shape:", prediction_shape)
prediction_data.head(5)

## 2. Data Exploration and Preprocessing

In [None]:
# Number of missing cells in each training column.
missing_per_column = training_data.isna().sum()
print("Missing values in training data:")
print(missing_per_column)

# Dtype pandas assigned to each training column.
column_dtypes = training_data.dtypes
print("\nData types in training data:")
print(column_dtypes)

In [None]:
# Get statistical summary of training data.
# describe() is called with its default arguments; the bare expression is
# rendered as the cell output.
training_data.describe()

In [None]:
# Bar chart of how many training rows fall in each target class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=training_data, x="Need Supplement", ax=ax)
ax.set_title("Distribution of Target Variable")
plt.show()

# The same information as numbers: absolute counts, then shares in percent.
target_column = training_data["Need Supplement"]
print("Target variable distribution:")
print(target_column.value_counts())
print("Percentage:")
print(target_column.value_counts(normalize=True) * 100)

In [None]:
# Categorical columns to compare against the target.
categorical_features = ["Gender", "Physical Activity", "Dietary Habits", "Family History"]

# One panel per feature on a 2x2 grid; bars are split by target class.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.flatten()

for ax, feature in zip(axes, categorical_features):
    sns.countplot(data=training_data, x=feature, hue="Need Supplement", ax=ax)
    ax.set_title(f"Distribution of {feature} by Need Supplement")
    # Rotate the x tick labels by 45 degrees.
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Numeric columns to compare against the target.
numerical_features = [
    "Age",
    "BMI",
    "Total Cholesterol",
    "LDL Cholesterol",
    "HDL Cholesterol",
    "Triglycerides",
]

# One box plot per feature on a 3x2 grid, with one box per target class.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
axes = axes.flatten()

for ax, feature in zip(axes, numerical_features):
    sns.boxplot(data=training_data, x="Need Supplement", y=feature, ax=ax)
    ax.set_title(f"Distribution of {feature} by Need Supplement")

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations among the numeric inputs and the target column.
corr_columns = [*numerical_features, "Need Supplement"]
correlation_matrix = training_data[corr_columns].corr()

# Annotated heatmap of the matrix, values shown with two decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
fig.tight_layout()
plt.show()

## 3. Feature Engineering and Preprocessing

In [None]:
# Separate the label column from the inputs: y is the target series, X is
# every other column of the training frame.
y = training_data["Need Supplement"]
X = training_data.drop(columns="Need Supplement")

# Column groups, named by the kind of preprocessing each one receives in the
# ColumnTransformer defined in the next cell.
numerical_features = [
    "Age",
    "BMI",
    "Total Cholesterol",
    "LDL Cholesterol",
    "HDL Cholesterol",
    "Triglycerides",
]
categorical_features = ["Gender", "Physical Activity", "Dietary Habits", "Family History"]

In [None]:
# Preprocessing shared by every model pipeline in Section 4:
#   - numerical columns are standardised with StandardScaler
#   - categorical columns are one-hot encoded; handle_unknown="ignore" keeps
#     transform from raising on a category that was not seen during fit
# No `remainder` is passed, so ColumnTransformer's default applies to any
# column of X that appears in neither list.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Hold out 20% of the rows for the final evaluation (seeded for repeatability).
# stratify=y keeps the class proportions of the target the same in the train
# and test parts, so the test metrics are not skewed by a lopsided split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 4. Model Building and Evaluation

### 4.1 Random Forest Classifier

In [None]:
# Pipeline: shared preprocessing followed by a seeded random forest.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# Hyper-parameter candidates; the "classifier__" prefix addresses the
# pipeline step named "classifier".
rf_param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid: 5-fold CV on the training split, scored by
# accuracy, with n_jobs=-1 for parallel fitting.
rf_grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
rf_grid_search.fit(X_train, y_train)

# Winning parameter combination.
print("Best parameters for Random Forest:")
print(rf_grid_search.best_params_)

# Pipeline carrying the winning parameters; reused in the comparison cells.
rf_best = rf_grid_search.best_estimator_

# Score the tuned pipeline on the held-out rows.
y_pred_rf = rf_best.predict(X_test)
print("\nRandom Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

### 4.2 Gradient Boosting Classifier

In [None]:
# Pipeline: shared preprocessing followed by a seeded gradient-boosting model.
gb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier(random_state=42)),
    ]
)

# Hyper-parameter candidates; the "classifier__" prefix addresses the
# pipeline step named "classifier".
gb_param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__max_depth=[3, 5, 7],
)

# Exhaustive search over the grid: 5-fold CV on the training split, scored by
# accuracy, with n_jobs=-1 for parallel fitting.
gb_grid_search = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=gb_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
gb_grid_search.fit(X_train, y_train)

# Winning parameter combination.
print("Best parameters for Gradient Boosting:")
print(gb_grid_search.best_params_)

# Pipeline carrying the winning parameters; reused in the comparison cells.
gb_best = gb_grid_search.best_estimator_

# Score the tuned pipeline on the held-out rows.
y_pred_gb = gb_best.predict(X_test)
print("\nGradient Boosting Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))

### 4.3 Logistic Regression

In [None]:
# Pipeline: shared preprocessing followed by a seeded logistic regression
# that is allowed up to 1000 solver iterations.
lr_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(random_state=42, max_iter=1000)),
    ]
)

# Hyper-parameter candidates: regularisation parameter C crossed with two
# solvers; the "classifier__" prefix addresses the "classifier" step.
lr_param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10, 100],
    classifier__solver=["liblinear", "saga"],
)

# Exhaustive search over the grid: 5-fold CV on the training split, scored by
# accuracy, with n_jobs=-1 for parallel fitting.
lr_grid_search = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
lr_grid_search.fit(X_train, y_train)

# Winning parameter combination.
print("Best parameters for Logistic Regression:")
print(lr_grid_search.best_params_)

# Pipeline carrying the winning parameters; reused in the comparison cells.
lr_best = lr_grid_search.best_estimator_

# Score the tuned pipeline on the held-out rows.
y_pred_lr = lr_best.predict(X_test)
print("\nLogistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))

### 4.4 Support Vector Machine

In [None]:
# Pipeline: shared preprocessing followed by a seeded SVM.
# probability=True is required because the ROC and prediction cells call
# predict_proba on the tuned pipeline.
svm_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", SVC(random_state=42, probability=True)),
    ]
)

# Parameter grid for SVM, given as one sub-grid per kernel.
# gamma only affects the rbf kernel here, so it is searched for rbf alone;
# crossing it with the linear kernel would fit every linear candidate twice
# with identical results. This searches 9 candidates instead of 12.
svm_param_grid = [
    {
        "classifier__kernel": ["linear"],
        "classifier__C": [0.1, 1, 10],
    },
    {
        "classifier__kernel": ["rbf"],
        "classifier__C": [0.1, 1, 10],
        "classifier__gamma": ["scale", "auto"],
    },
]

# Exhaustive search over the grid: 5-fold CV on the training split, scored by
# accuracy, with n_jobs=-1 for parallel fitting.
svm_grid_search = GridSearchCV(
    svm_pipeline, svm_param_grid, cv=5, scoring="accuracy", n_jobs=-1
)

svm_grid_search.fit(X_train, y_train)

# Best parameters (no gamma entry is reported when the linear kernel wins)
print("Best parameters for SVM:")
print(svm_grid_search.best_params_)

# Pipeline carrying the winning parameters; reused in the comparison cells.
svm_best = svm_grid_search.best_estimator_

# Evaluate on test set
y_pred_svm = svm_best.predict(X_test)
print("\nSVM Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))

### 4.5 Model Comparison

In [None]:
# Tuned pipelines to compare, keyed by the name used in printouts and plots.
models = {
    "Random Forest": rf_best,
    "Gradient Boosting": gb_best,
    "Logistic Regression": lr_best,
    "SVM": svm_best,
}

# 5-fold cross-validated accuracy of every tuned pipeline.
# Only the training split is used: the result drives the choice of the final
# model, so the held-out test rows must not take part in it.
cv_results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    cv_results[name] = scores
    print(f"{name} - Mean CV Accuracy: {scores.mean():.4f} (±{scores.std():.4f})")

# Box plot of the per-fold accuracies, one box per model.
# Tick labels are set on the axes after plotting rather than through
# boxplot(labels=...), which Matplotlib 3.9 deprecated in favour of
# tick_labels; this form works on both older and newer versions.
model_names = list(models.keys())
box_data = [cv_results[model_name] for model_name in model_names]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(box_data)
ax.set_xticklabels(model_names)
ax.set_title("Model Comparison - Cross-Validation Accuracy")
ax.set_ylabel("Accuracy")
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# One ROC curve per tuned pipeline, all drawn on a shared set of axes and
# computed on the held-out test rows.
fig, ax = plt.subplots(figsize=(10, 8))

for name, model in models.items():
    # Column 1 of predict_proba is used as the score passed to roc_curve.
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

# Dashed diagonal as the chance-level reference.
ax.plot([0, 1], [0, 1], "k--", label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves")
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Pick the model with the highest mean cross-validated accuracy
# (max() keeps the first entry on an exact tie).
best_model_name, _best_scores = max(
    cv_results.items(), key=lambda item: item[1].mean()
)
best_model = models[best_model_name]

print(f"The best model is: {best_model_name}")

# The importance chart is drawn only when one of the two tree ensembles won.
if best_model_name in ("Random Forest", "Gradient Boosting"):
    fitted_preprocessor = best_model.named_steps["preprocessor"]

    # Second entry of transformers_ is the ("cat", OneHotEncoder, columns)
    # triple; its encoder supplies the expanded one-hot column names.
    _cat_name, onehot_encoder, _cat_columns = fitted_preprocessor.transformers_[1]
    onehot_names = onehot_encoder.get_feature_names_out(categorical_features)

    # Numeric names first, then one-hot names, matching the transformer order.
    preprocessor_output_feature_names = numerical_features + list(onehot_names)

    # Importance of each preprocessed column according to the fitted classifier.
    importances = best_model.named_steps["classifier"].feature_importances_

    # Pair names with importances and sort from most to least important.
    feature_importance_df = pd.DataFrame(
        {"Feature": preprocessor_output_feature_names, "Importance": importances}
    )
    feature_importance_df = feature_importance_df.sort_values(
        by="Importance", ascending=False
    )

    # Horizontal bar chart of the 15 largest importances.
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(
        data=feature_importance_df.head(15), x="Importance", y="Feature", ax=ax
    )
    ax.set_title(f"Top 15 Feature Importances - {best_model_name}")
    fig.tight_layout()
    plt.show()

## 5. Make Predictions on New Data

In [None]:
# Display the first few rows of the prediction data
# (the frame loaded in Section 1, shown again right before it is scored).
prediction_data.head()

In [None]:
# Score the unlabelled patients with the selected pipeline: predicted class
# labels, plus column 1 of predict_proba as the accompanying probability.
predictions = best_model.predict(prediction_data)
prediction_probabilities = best_model.predict_proba(prediction_data)[:, 1]

# Build the result frame as a new object so prediction_data stays as loaded.
prediction_results = prediction_data.assign(
    **{
        "Need Supplement": predictions,
        "Probability": prediction_probabilities,
    }
)

# Show the full result table.
# NOTE(review): the message hard-codes 20; the real row count is whatever
# prediction_data contains.
print("Predictions for the 20 patients:")
display(prediction_results)

In [None]:
# Bar chart of how many patients were assigned to each predicted class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=prediction_results, x="Need Supplement", ax=ax)
ax.set_title("Distribution of Predictions")
# NOTE(review): relabelling tick positions 0 and 1 assumes both classes were
# predicted and that they are plotted in the order 0, 1 — confirm.
ax.set_xticks([0, 1])
ax.set_xticklabels(["No Need for Supplement", "Need Supplement"])
plt.show()

# The same information as numbers: absolute counts, then shares in percent.
predicted_labels = prediction_results["Need Supplement"]
print("Prediction distribution:")
print(predicted_labels.value_counts())
print("Percentage:")
print(predicted_labels.value_counts(normalize=True) * 100)

In [None]:
# Save the predictions to Excel file.
# The path is relative, and index=False leaves the DataFrame index out of the sheet.
# NOTE(review): "surname_name" in the filename looks like a placeholder —
# confirm the intended output name.
prediction_results.to_excel("predict_cholesterol_surname_name.xlsx", index=False)

## 6. Conclusion

In this project, we developed a machine learning classification model to predict whether a patient needs a cholesterol-lowering supplement based on various health attributes.

Key findings:
1. We explored the data and found relationships between health metrics and the need for supplements
2. We built and compared four different machine learning models:
   - Random Forest
   - Gradient Boosting
   - Logistic Regression
   - Support Vector Machine
3. The best performing model was selected based on cross-validation results
4. We applied this model to predict supplement needs for 20 new patients

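The most influential features in determining whether a patient needs a supplement are likely to be cholesterol levels (particularly LDL and Total Cholesterol), along with other risk factors such as age, BMI, and family history. This should be confirmed against the feature-importance chart in Section 4.5, which is only produced when the selected model is Random Forest or Gradient Boosting.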

This model can be used as a decision support tool for healthcare providers when determining if a patient would benefit from cholesterol-lowering supplements.