age                                int64
sex                               object
chest_pain_type                   object
resting_blood_pressure             int64
cholestoral                        int64
fasting_blood_sugar               object
rest_ecg                          object
Max_heart_rate                     int64
exercise_induced_angina           object
oldpeak                          float64
slope                             object
vessels_colored_by_flourosopy     object
thalassemia                       object
target                             int64
dtype: object
Index(['age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
       'cholestoral', 'fasting_blood_sugar', 'rest_ecg', 'Max_heart_rate',
       'exercise_induced_angina', 'oldpeak', 'slope',
       'vessels_colored_by_flourosopy', 'thalassemia'],
      dtype='object')


In [20]:
import warnings

import pandas as pd

# Load data
df = pd.read_csv("HeartDiseaseTrain-Test.csv")

# Dictionary to store mappings: column name -> {category label: integer code}
enum_mapping = {}

# Convert all object columns to integer codes.
# pd.factorize numbers the categories in order of first appearance, so the
# codes depend on the row order of the CSV; enum_mapping is the only record
# of which label received which code.
for col in df.select_dtypes(include=['object']).columns:
    codes, uniques = pd.factorize(df[col])

    # factorize encodes missing values as -1, a code that never appears in
    # `uniques` (and therefore not in enum_mapping). Surface that instead of
    # letting it pass silently as if it were a regular category.
    n_missing = int((codes == -1).sum())
    if n_missing:
        warnings.warn(
            f"Column {col!r} has {n_missing} missing value(s); "
            "they are encoded as -1 and are absent from enum_mapping."
        )

    df[col] = codes
    enum_mapping[col] = {category: code for code, category in enumerate(uniques)}

# Ensure every non-float column is int64; float columns (like oldpeak) are
# kept as they are.
# - is_float_dtype recognises every float width, whereas comparing the dtype
#   to 'float' only matches float64.
# - "int64" is spelled out because astype(int) can yield a 32-bit integer on
#   some platforms.
for col in df.columns:
    if not pd.api.types.is_float_dtype(df[col]):
        df[col] = df[col].astype("int64")

print("Enum mapping:")
for col, mapping in enum_mapping.items():
    print(f"{col}: {mapping}")

print("\nConverted DataFrame dtypes:")
print(df.dtypes)


Enum mapping:
sex: {'Male': 0, 'Female': 1}
chest_pain_type: {'Typical angina': 0, 'Atypical angina': 1, 'Non-anginal pain': 2, 'Asymptomatic': 3}
fasting_blood_sugar: {'Lower than 120 mg/ml': 0, 'Greater than 120 mg/ml': 1}
rest_ecg: {'ST-T wave abnormality': 0, 'Normal': 1, 'Left ventricular hypertrophy': 2}
exercise_induced_angina: {'No': 0, 'Yes': 1}
slope: {'Downsloping': 0, 'Upsloping': 1, 'Flat': 2}
vessels_colored_by_flourosopy: {'Two': 0, 'Zero': 1, 'One': 2, 'Three': 3, 'Four': 4}
thalassemia: {'Reversable Defect': 0, 'Fixed Defect': 1, 'Normal': 2, 'No': 3}

Converted DataFrame dtypes:
age                                int64
sex                                int64
chest_pain_type                    int64
resting_blood_pressure             int64
cholestoral                        int64
fasting_blood_sugar                int64
rest_ecg                           int64
Max_heart_rate                     int64
exercise_induced_angina            int64
oldpeak                    

In [21]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import numpy as np

# The label vector is the "target" column; every remaining column is a feature.
y = df.loc[:, "target"]
X = df.drop("target", axis=1)

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
# NOTE(review): the split is purely random; if the CSV contains duplicated
# rows, copies can land on both sides of the split and inflate test scores.
# Worth checking with df.duplicated().sum().
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the column dtypes of the full frame and the feature names used for training.
print(df.dtypes)
print(X_train.columns)


age                                int64
sex                                int64
chest_pain_type                    int64
resting_blood_pressure             int64
cholestoral                        int64
fasting_blood_sugar                int64
rest_ecg                           int64
Max_heart_rate                     int64
exercise_induced_angina            int64
oldpeak                          float64
slope                              int64
vessels_colored_by_flourosopy      int64
thalassemia                        int64
target                             int64
dtype: object
Index(['age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
       'cholestoral', 'fasting_blood_sugar', 'rest_ecg', 'Max_heart_rate',
       'exercise_induced_angina', 'oldpeak', 'slope',
       'vessels_colored_by_flourosopy', 'thalassemia'],
      dtype='object')


In [24]:
# Hyperparameters for the XGBoost classifier, grouped by what they control.
xgb_params = {
    # --- core ---
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "logloss",      # set explicitly for the binary objective
    # --- tree complexity ---
    "max_depth": 4,                # depth limit per tree
    "min_child_weight": 3,         # minimum summed instance weight in a child
    "gamma": 0.2,                  # minimum loss reduction required to split
    # --- sampling ---
    "subsample": 0.8,              # fraction of rows drawn per tree
    "colsample_bytree": 0.8,       # fraction of features drawn per tree
    # --- regularization ---
    "reg_alpha": 0.1,              # L1 penalty
    "reg_lambda": 1.0,             # L2 penalty
    # --- boosting ---
    "n_estimators": 400,           # number of boosting rounds
    "learning_rate": 0.03,         # shrinkage applied to each round
    "scale_pos_weight": 1,         # raise if the positive class is rare
    # --- reproducibility / parallelism ---
    "random_state": 42,
    "n_jobs": -1,                  # use every available core
}
xgb_model = XGBClassifier(**xgb_params)

# Fit on the training split, then predict labels for the held-out rows.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Score the held-out predictions: full per-class report, confusion matrix,
# and the two headline numbers.
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print(acc, f1, cm, report)
print("done")

0.9463414634146341 0.9463414634146341 [[97  5]
 [ 6 97]]               precision    recall  f1-score   support

           0       0.94      0.95      0.95       102
           1       0.95      0.94      0.95       103

    accuracy                           0.95       205
   macro avg       0.95      0.95      0.95       205
weighted avg       0.95      0.95      0.95       205

done


In [25]:
import joblib

# Serialize the fitted classifier to "jn_model.pkl" in the working directory.
# NOTE(review): only the model object is written; the enum_mapping used to
# encode the categorical columns is not stored with it. Confirm that whatever
# loads this file applies the same codes.
joblib.dump(
    value=xgb_model,
    filename="jn_model.pkl",
)
print("✅ model created successfully!")

✅ model created successfully!
