In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder,OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import kagglehub
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
import kagglehub
import warnings

In [2]:
# Download latest version
path = kagglehub.dataset_download("alphiree/cardiovascular-diseases-risk-prediction-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/alphiree/cardiovascular-diseases-risk-prediction-dataset?dataset_version_number=3...


100%|██████████| 4.87M/4.87M [00:00<00:00, 73.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/alphiree/cardiovascular-diseases-risk-prediction-dataset/versions/3


In [3]:
url=path+'/CVD_cleaned.csv'
heart = pd.read_csv(url)

In [4]:
# Clean column names
heart.columns = heart.columns.str.lower().str.replace(" ", "_")

# Rename columns
heart.rename(columns={
    'height_(cm)': 'height',
    'weight_(kg)': 'weight',
    'green_vegetables_consumption': 'vegetables_consumption',
    'friedpotato_consumption': 'potato_consumption'
}, inplace=True)

# Create BMI group column
bmi_bins = [12.02, 18.3, 26.85, 31.58, 37.8, 100]
bmi_labels = ['Underweight', 'Normal weight', 'Overweight', 'Obese I', 'Obese II']
heart['bmi_group'] = pd.cut(heart['bmi'], bins=bmi_bins, labels=bmi_labels, right=False)

# Move bmi_group column
column_to_move = heart.pop('bmi_group')
heart.insert(14, 'bmi_group', column_to_move)
heart['bmi_group'] = heart['bmi_group'].astype('object')

# Encode target
heart['heart_disease'] = heart['heart_disease'].map({'Yes': 1, 'No': 0})

# Label Encoding for nominal features
from sklearn.preprocessing import LabelEncoder

cat = ['sex', 'smoking_history']
le = LabelEncoder()

for col in cat:
    heart[col] = le.fit_transform(heart[col])

In [5]:
# Label Encoding for ordinal categorical features
categorical_columns = ['general_health', 'checkup', 'exercise', 'skin_cancer', 'other_cancer',
                       'depression', 'diabetes', 'arthritis', 'age_category', 'bmi_group']

# Store LabelEncoders in a dictionary to access their classes later
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    heart[col] = le.fit_transform(heart[col])
    label_encoders[col] = le  # Save the encoder

# Print mappings for each column
print("=== Label Encoding Mappings ===")
for col, encoder in label_encoders.items():
    print(f"\nColumn: {col}")
    for value, encoded_value in zip(encoder.classes_, range(len(encoder.classes_))):
        print(f"  {value} → {encoded_value}")

# Check class distribution (optional)
print("Class distribution:\n", heart['heart_disease'].value_counts())




=== Label Encoding Mappings ===

Column: general_health
  Excellent → 0
  Fair → 1
  Good → 2
  Poor → 3
  Very Good → 4

Column: checkup
  5 or more years ago → 0
  Never → 1
  Within the past 2 years → 2
  Within the past 5 years → 3
  Within the past year → 4

Column: exercise
  No → 0
  Yes → 1

Column: skin_cancer
  No → 0
  Yes → 1

Column: other_cancer
  No → 0
  Yes → 1

Column: depression
  No → 0
  Yes → 1

Column: diabetes
  No → 0
  No, pre-diabetes or borderline diabetes → 1
  Yes → 2
  Yes, but female told only during pregnancy → 3

Column: arthritis
  No → 0
  Yes → 1

Column: age_category
  18-24 → 0
  25-29 → 1
  30-34 → 2
  35-39 → 3
  40-44 → 4
  45-49 → 5
  50-54 → 6
  55-59 → 7
  60-64 → 8
  65-69 → 9
  70-74 → 10
  75-79 → 11
  80+ → 12

Column: bmi_group
  Normal weight → 0
  Obese I → 1
  Obese II → 2
  Overweight → 3
  Underweight → 4
Class distribution:
 heart_disease
0    283883
1     24971
Name: count, dtype: int64


In [6]:

# Feature scaling - only for numerical features (not needed for tree-based models but good practice)
scaler = StandardScaler()
num_features = ['height', 'weight', 'bmi', 'alcohol_consumption', 'fruit_consumption',
                'vegetables_consumption', 'potato_consumption']
heart[num_features] = scaler.fit_transform(heart[num_features])

In [7]:
# Features and target
X = heart.drop("heart_disease", axis=1)
y = heart['heart_disease']

heart['heart_disease'].fillna(heart['heart_disease'].mode()[0], inplace=True)

#Reset index after dropping rows to avoid potential issues.
X = heart.drop("heart_disease", axis=1).reset_index(drop=True)
y = heart['heart_disease'].reset_index(drop=True)


In [8]:
# Handle imbalance
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [9]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)


In [10]:
from sklearn.feature_selection import SelectKBest, f_classif

# Feature selection using ANOVA F-value
selector = SelectKBest(f_classif, k=11)
X_new = selector.fit_transform(X_balanced, y_balanced)

# Get selected feature names
selected_features = X_balanced.columns[selector.get_support()]
print("Selected features:", selected_features.tolist())

# Update X_train and X_test with selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

Selected features: ['general_health', 'checkup', 'exercise', 'diabetes', 'arthritis', 'sex', 'age_category', 'weight', 'bmi', 'smoking_history', 'alcohol_consumption']


In [11]:
# Train XGBoost with best parameters
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=200,
    reg_alpha=0,
    reg_lambda=1,
    subsample=0.8
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [12]:
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"\nXGBoost Accuracy after tuning: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


XGBoost Accuracy after tuning: 94.90%

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     56701
           1       0.99      0.91      0.95     56853

    accuracy                           0.95    113554
   macro avg       0.95      0.95      0.95    113554
weighted avg       0.95      0.95      0.95    113554



In [13]:
import pickle
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Save the model to Google Drive
model_path = '/content/drive/MyDrive/GP/Heart_disease/new_XGB_2.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(model, file)
with open('/content/drive/MyDrive/GP/Heart_disease/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save the feature names
with open('/content/drive/MyDrive/GP/Heart_disease/feature_names.pkl', 'wb') as f:
    pickle.dump(selected_features.tolist(), f)

Mounted at /content/drive


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import StratifiedKFold
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0.5, 1.0, 2.0],
    'min_child_weight': [1, 3, 5]
}

# Create the XGBoost model
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# Create a custom scorer (F1 score is often good for imbalanced datasets)
scorer = make_scorer(f1_score, average='weighted')

# Set up the grid search
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=scorer,
    cv=StratifiedKFold(3),  # Using stratified KFold for imbalanced data
    n_jobs=-1,  # Use all available cores
    verbose=2  # Show progress
)

# Run the grid search
print("Starting grid search...")
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("\nBest parameters found:")
print(grid_search.best_params_)

Starting grid search...
Fitting 3 folds for each of 19683 candidates, totalling 59049 fits


PicklingError: Could not pickle the task to send it to the workers.