In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Load the dataset
heart_data_df = pd.read_csv('Resources/heart_attack_risk.csv')
heart_data_df

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,MSV9918,60,Male,121,94/76,61,1,1,1,0,...,10.806373,235420,19.655895,67,7,7,Thailand,Asia,Northern Hemisphere,0
8759,QSV6764,28,Female,120,157/102,73,1,0,0,1,...,3.833038,217881,23.993866,617,4,9,Canada,North America,Northern Hemisphere,0
8760,XKA5925,47,Male,250,161/75,105,0,1,1,1,...,2.375214,36998,35.406146,527,4,4,Brazil,South America,Southern Hemisphere,1
8761,EPE6801,36,Male,178,119/67,60,1,0,1,0,...,0.029104,209943,27.294020,114,2,8,Brazil,South America,Southern Hemisphere,0


In [6]:
# Preprocess the data
# Select features and target
X = heart_data_df.drop(['Patient ID', 'Heart Attack Risk', 'Blood Pressure'], axis=1)
y = heart_data_df['Heart Attack Risk']

In [7]:
# Handling categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

categorical_features
numerical_features

['Age',
 'Cholesterol',
 'Heart Rate',
 'Diabetes',
 'Family History',
 'Smoking',
 'Obesity',
 'Alcohol Consumption',
 'Exercise Hours Per Week',
 'Previous Heart Problems',
 'Medication Use',
 'Stress Level',
 'Sedentary Hours Per Day',
 'Income',
 'BMI',
 'Triglycerides',
 'Physical Activity Days Per Week',
 'Sleep Hours Per Day']

In [8]:
# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

preprocessor

ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['Age', 'Cholesterol', 'Heart Rate',
                                  'Diabetes', 'Family History', 'Smoking',
                                  'Obesity', 'Alcohol Consumption',
                                  'Exercise Hours Per Week',
                                  'Previous Heart Problems', 'Medication Use',
                                  'Stress Level', 'Sedentary Hours Per Day',
                                  'Income', 'BMI', 'Triglycerides',
                                  'Physical Activity Days Per Week',
                                  'Sleep Hours Per Day']),
                                ('cat', OneHotEncoder(),
                                 ['Sex', 'Diet', 'Country', 'Continent',
                                  'Hemisphere'])])

In [9]:
# Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

model_pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'Cholesterol',
                                                   'Heart Rate', 'Diabetes',
                                                   'Family History', 'Smoking',
                                                   'Obesity',
                                                   'Alcohol Consumption',
                                                   'Exercise Hours Per Week',
                                                   'Previous Heart Problems',
                                                   'Medication Use',
                                                   'Stress Level',
                                                   'Sedentary Hours Per Day',
                                                   'Income', 'BMI',
                                                   'Triglycerides',
                    

In [10]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train the model
model_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'Cholesterol',
                                                   'Heart Rate', 'Diabetes',
                                                   'Family History', 'Smoking',
                                                   'Obesity',
                                                   'Alcohol Consumption',
                                                   'Exercise Hours Per Week',
                                                   'Previous Heart Problems',
                                                   'Medication Use',
                                                   'Stress Level',
                                                   'Sedentary Hours Per Day',
                                                   'Income', 'BMI',
                                                   'Triglycerides',
                    

In [12]:
# Predictions
y_pred = model_pipeline.predict(X_test)

In [13]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print(classification_report(y_test, y_pred))

Accuracy: 64.18%
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1125
           1       0.00      0.00      0.00       628

    accuracy                           0.64      1753
   macro avg       0.32      0.50      0.39      1753
weighted avg       0.41      0.64      0.50      1753



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Results

The logistic regression model achieved an accuracy of 64.18%. However, the classification report reveals significant performance issues, particularly with class `1` (indicating a heart attack risk), where both precision and recall are 0.00. This suggests the model is heavily biased towards predicting class `0` (no heart attack risk), likely due to class imbalance.

## Suggestions for Improvement

1. **Address Class Imbalance**: Apply techniques like SMOTE or adjust class weights in the logistic regression model. Consider evaluation metrics suited for imbalanced data, like AUC-ROC.

2. **Feature Engineering**: Explore more complex feature engineering to uncover patterns missed by the current model.

3. **Model Selection**: Consider using more complex models such as Random Forest, Gradient Boosting, or neural networks that might capture complex relationships in the data more effectively.

4. **Hyperparameter Tuning**: Optimize the logistic regression model's performance through hyperparameter tuning.

5. **Cross-Validation**: Employ cross-validation techniques for a robust evaluation of the model's performance across different subsets of the dataset.

6. **Evaluation Metrics**: Focus on metrics like F1-score, Precision-Recall AUC, or confusion matrix for a nuanced understanding of model performance, especially in identifying the minority class.

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Define the Random Forest model pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [16]:
# Train the Random Forest model
rf_pipeline.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_pipeline.predict(X_test)

In [17]:
# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf*100:.2f}%")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 64.00%
              precision    recall  f1-score   support

           0       0.64      0.98      0.78      1125
           1       0.45      0.02      0.04       628

    accuracy                           0.64      1753
   macro avg       0.55      0.50      0.41      1753
weighted avg       0.57      0.64      0.51      1753



## Results

- The Random Forest model achieved an accuracy of 64.00%, which seem moderate but requires context (e.g., baseline accuracy, class distribution) to fully evaluate.

- The classification report indicates a substantial imbalance in model performance across classes:
    - High precision (0.64) and recall (0.98) for class `0`, suggesting the model is effective at identifying the negative class.
    - Low precision (0.45) and especially low recall (0.02) for class `1`, indicating the model struggles significantly to correctly identify positive cases.
- The macro-average and weighted-average scores further highlight the model's biased performance towards the majority class.

## Suggestions for Improvement

1. **Addressing Class Imbalance**: Implement strategies to handle class imbalance, such as oversampling the minority class, undersampling the majority class, or adjusting class weights in the Random Forest classifier.

2. **Hyperparameter Tuning**: Utilize grid search or random search to fine-tune the hyperparameters of the Random Forest model (e.g., `n_estimators`, `max_depth`, `min_samples_split`) for potentially better performance.

3. **Feature Engineering**: Investigate the dataset further to identify opportunities for feature engineering that could improve model performance, such as creating new features or selecting a subset of features.

4. **Alternative Models**: Consider evaluating other machine learning models or ensemble methods that might perform better on this particular dataset, including gradient boosting machines (e.g., XGBoost, LightGBM) or neural networks if appropriate.

5. **Evaluation Metrics**: Given the class imbalance, focus on evaluation metrics beyond accuracy that better capture the model's performance on the minority class, such as the F1-score, Precision-Recall AUC, or the confusion matrix.

These steps aim to address the observed performance issues, especially the model's inability to effectively identify positive cases, and guide towards a more balanced and overall improved model performance.

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
# Define the Gradient Boosting model pipeline
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

In [20]:
# Train the Gradient Boosting model
gb_pipeline.fit(X_train, y_train)

# Predictions
y_pred_gb = gb_pipeline.predict(X_test)

In [21]:
# Evaluate the Gradient Boosting model
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Gradient Boosting Accuracy: {accuracy_gb*100:.2f}%")
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Accuracy: 63.89%
              precision    recall  f1-score   support

           0       0.64      0.98      0.78      1125
           1       0.42      0.02      0.04       628

    accuracy                           0.64      1753
   macro avg       0.53      0.50      0.41      1753
weighted avg       0.56      0.64      0.51      1753



## Results

- The Gradient Boosting model achieved an accuracy of 63.89%, which seem reasonable but warrants deeper analysis.

- The classification report reveals significant disparities in model performance across different classes:
    - High precision (0.64) and recall (0.98) for class `0`, indicating effective identification of this class.
    - Conversely, both precision (0.42) and recall (0.02) for class `1` are notably low, particularly the recall, suggesting the         model is largely ineffectual at identifying positive instances of this class.


- These results imply a model heavily biased toward the majority class, leading to a high number of false negatives for class `1`.

## Suggestions for Improvement

1. **Class Imbalance Mitigation**: Implement strategies to better handle class imbalance, which could include techniques such as oversampling the minority class or adjusting the class weights parameter in the model.

2. **Hyperparameter Optimization**: Utilize methods like grid search or random search to fine-tune the hyperparameters of the Gradient Boosting classifier. Parameters of interest could include `n_estimators`, `learning_rate`, `max_depth`, among others.

3. **Feature Engineering**: Investigate the dataset for additional feature engineering opportunities. This could entail creating interaction terms, deriving new features, or applying more sophisticated encoding techniques for categorical variables.

4. **Alternative Models and Ensembles**: Experiment with other models or advanced ensemble techniques, such as XGBoost or LightGBM, which might offer better performance or handle class imbalance more effectively.

5. **Evaluation Metric Selection**: Given the class imbalance issue, prioritize metrics that offer more insight into model performance for minority classes, such as the F1-score, Precision-Recall curves, or even utilizing cost-sensitive learning approaches.

In [22]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Assuming you have your data in X and y arrays
# X contains your features, and y contains your labels (0 or 1)

# Check the distribution of classes before oversampling
print("Before oversampling:", Counter(y))

# Define the oversampler with 'minority' sampling strategy
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Oversample the minority class (class 1)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Check the distribution of classes after oversampling
print("After oversampling:", Counter(y_resampled))

Before oversampling: Counter({0: 5624, 1: 3139})
After oversampling: Counter({0: 5624, 1: 5624})


In [23]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [24]:
# Train the model
model_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'Cholesterol',
                                                   'Heart Rate', 'Diabetes',
                                                   'Family History', 'Smoking',
                                                   'Obesity',
                                                   'Alcohol Consumption',
                                                   'Exercise Hours Per Week',
                                                   'Previous Heart Problems',
                                                   'Medication Use',
                                                   'Stress Level',
                                                   'Sedentary Hours Per Day',
                                                   'Income', 'BMI',
                                                   'Triglycerides',
                    

In [25]:
# Predictions
y_pred = model_pipeline.predict(X_test)

In [26]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print(classification_report(y_test, y_pred))

Accuracy: 49.96%
              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1120
           1       0.50      0.49      0.49      1130

    accuracy                           0.50      2250
   macro avg       0.50      0.50      0.50      2250
weighted avg       0.50      0.50      0.50      2250



In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Define the Random Forest model pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [29]:
# Train the Random Forest model
pipeline = rf_pipeline.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_pipeline.predict(X_test)

In [30]:
# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf*100:.2f}%")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 79.16%
              precision    recall  f1-score   support

           0       0.74      0.89      0.81      1120
           1       0.86      0.69      0.77      1130

    accuracy                           0.79      2250
   macro avg       0.80      0.79      0.79      2250
weighted avg       0.80      0.79      0.79      2250

