In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv(r'C:\Users\b84266591\Desktop\Tech\MJ TINGs\cardio_train.csv', sep=';')
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [3]:
df['height']=df['height']*0.01
df['bmi'] = (df['weight'] / (df['height'] ** 2)).astype('int')
df['age'] = (df['age'] / 365).round().astype('int')

# Define BMI categories
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Normal'
    elif 25 <= bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

# Apply the function to create a new feature
df['bmi_category'] = df['bmi'].apply(categorize_bmi)

# Define age groups
def categorize_age(age):
    if age < 40:
        return 'Young'
    elif 40 <= age < 60:
        return 'Middle-aged'
    else:
        return 'Senior'

# Apply the function to create a new feature
df['age_group'] = df['age'].apply(categorize_age)

# Define blood pressure categories
def categorize_bp(ap_hi, ap_lo):
    if ap_hi < 120 and ap_lo < 80:
        return 'Normal'
    elif ap_hi >= 140 or ap_lo >= 90:
        return 'Hypertension'
    else:
        return 'High-Normal'

# Apply the function to create a new feature
df['bp_category'] = df.apply(lambda row: categorize_bp(row['ap_hi'], row['ap_lo']), axis=1)

<h2> Building The Model </h2>

<h3> Decision Tree Classifier </h3>

In [5]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [6]:
df = df.drop(columns=['id'])

In [7]:
df=pd.get_dummies(df)

In [8]:
# Define features (X) and target (y)
X = df.drop(columns=['cardio'])  # Exclude the target variable
y = df['cardio']

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Create a Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)

In [11]:
# Train the model
decision_tree.fit(X_train, y_train)

In [12]:
# Make predictions on the test set
y_pred = decision_tree.predict(X_test)

In [13]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [14]:
# Print the results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.6381428571428571
Confusion Matrix:
[[4575 2413]
 [2653 4359]]
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.65      0.64      6988
           1       0.64      0.62      0.63      7012

    accuracy                           0.64     14000
   macro avg       0.64      0.64      0.64     14000
weighted avg       0.64      0.64      0.64     14000



We have gotten an accuracy of 63.8% which is lower by almost 10% of the model that we worked with earlier which was 72.44% using logistic regression. Let us perform Cross-Validation

<h3> GridSearch Cross Validation </h3>

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Define the hyperparameters to tune
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [17]:
# Create a Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)

In [18]:
# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(decision_tree, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [19]:
# Get the best parameters
best_params = grid_search.best_params_

In [20]:
# Train a new model with the best parameters
best_decision_tree = DecisionTreeClassifier(**best_params, random_state=42)
best_decision_tree.fit(X_train, y_train)

In [21]:
# Make predictions and evaluate the model
y_pred_best = best_decision_tree.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)

# Print the results
print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy with Best Model: {accuracy_best}")

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Accuracy with Best Model: 0.7243571428571428


It looks like the hyperparameter tuning has led to an improvement in accuracy. The best hyperparameters found by the grid search are:

max_depth: 10
min_samples_leaf: 1
min_samples_split: 10

And the accuracy achieved with the best model is approximately 72.4%. This indicates a modest improvement over the initial Decision Tree model.

We are still looking to further enhance performance, we might consider exploring other algorithms or ensemble methods like Random Forests. Additionally, feature engineering, addressing data imbalances, or trying more advanced techniques like gradient boosting could contribute to better results.

<h3> Ensemble Methods using Random Forest </h3>

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create a Random Forest classifier
random_forest = RandomForestClassifier(random_state=42)

# Train the model
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = random_forest.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

# Print the results
print(f"Random Forest Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print("Classification Report:")
print(classification_rep_rf)

Random Forest Accuracy: 0.7031428571428572
Confusion Matrix:
[[4919 2069]
 [2087 4925]]
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.70      0.70      6988
           1       0.70      0.70      0.70      7012

    accuracy                           0.70     14000
   macro avg       0.70      0.70      0.70     14000
weighted avg       0.70      0.70      0.70     14000



The Random Forest achieved an accuracy of 70.3%. It's worth noting that the Random Forest accuracy is slightly lower than the Decision Tree with tuned hyperparameters. However, the Random Forest model might still offer advantages in terms of robustness and generalization to new data.

<h3> Hyperparameter Tuning For Random Forest </h3> We will use a similar approach as that of Decision Trees 

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Define the hyperparameters to tune
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [25]:
# Create a Random Forest classifier
random_forest = RandomForestClassifier(random_state=42)

In [26]:
# Use GridSearchCV to find the best hyperparameters
grid_search_rf = GridSearchCV(random_forest, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

In [28]:
# Get the best parameters
best_params_rf = grid_search_rf.best_params_

In [29]:
# Train a new Random Forest model with the best parameters
best_random_forest = RandomForestClassifier(**best_params_rf, random_state=42)
best_random_forest.fit(X_train, y_train)

In [30]:
# Make predictions and evaluate the model
y_pred_rf_best = best_random_forest.predict(X_test)
accuracy_rf_best = accuracy_score(y_test, y_pred_rf_best)

# Print the results
print(f"Best Hyperparameters for Random Forest: {best_params_rf}")
print(f"Accuracy with Best Random Forest Model: {accuracy_rf_best}")

Best Hyperparameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy with Best Random Forest Model: 0.7389285714285714


 It seems like the hyperparameter tuning for the Random Forest model has resulted in improved performance. The best hyperparameters found are:

max_depth: 10
min_samples_leaf: 1
min_samples_split: 5
n_estimators: 200
And the accuracy achieved with the best Random Forest model is approximately 73.9%, which is a notable improvement.

<h2> Gradient Boosting </h2>

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a Gradient Boosting classifier
gradient_boosting = GradientBoostingClassifier(random_state=42)

# Train the model
gradient_boosting.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred_gb = gradient_boosting.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)

# Print the results
print(f"Gradient Boosting Accuracy: {accuracy_gb}")