In [None]:
# CELL 1: IMPORTS AND DATA PREPARATION

import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# --- 1. Read the dataset from disk ---
csv_file = "heartattack_dataset_no_continents.csv"
df = pd.read_csv(csv_file)
print(f"Loaded CSV: {csv_file}")
print(list(df.columns))


# --- 2. Separate the label column from the predictors ---
target_col = 'Heart Attack Risk'
y = df[target_col]
X = df.drop(columns=[target_col])

# --- 3. Stratified 60% / 20% / 20% split ---
# A 40% holdout is carved off first and then halved into validation and test.
# Both calls stratify on the label and share one seed.
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=split_seed, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed, stratify=y_temp
)

print(f"Train: {X_train.shape} Validation: {X_val.shape} Test: {X_test.shape}")
print(list(X.columns))

Loaded CSV: heartattack_dataset_no_continents.csv
['Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet', 'Previous Heart Problems', 'Medication Use', 'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides', 'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Hemisphere', 'Heart Attack Risk', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Country_Germany', 'Country_Argentina', 'Country_Brazil', 'Country_United Kingdom', 'Country_Australia', 'Country_Nigeria', 'Country_France', 'Country_Canada', 'Country_China', 'Country_New Zealand', 'Country_Japan', 'Country_Italy', 'Country_Spain', 'Country_Colombia', 'Country_Thailand', 'Country_South Africa', 'Country_Vietnam', 'Country_United States', 'Country_India', 'Country_South Korea']
Train: (5257, 38) Validation: (1753, 38) Test: (1753, 38)
['Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family Histo

In [None]:
# CELL 2: BASELINE (DEFAULT HYPERPARAMETERS)
#
# Fits an XGBoost classifier with only the evaluation metric and the seed set,
# then reports accuracy and per-class metrics on the training and validation
# splits as a reference point for later models.

# Initialize baseline XGBoost classifier.
# `use_label_encoder` is deliberately not passed: the installed XGBoost
# ignores it and emits a "Parameters: { "use_label_encoder" } are not used."
# warning on every fit.
model_baseline = XGBClassifier(
    eval_metric="logloss",
    random_state=42
)

# Train the model on the training split only
model_baseline.fit(X_train, y_train)

# Predict hard class labels for the training and validation splits
y_train_pred = model_baseline.predict(X_train)
y_val_pred = model_baseline.predict(X_val)

# --- Printing results ---
# Training metrics come first so they can be compared with the validation
# metrics printed directly below.
print("Baseline Training Results without hyperparameter tuning")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
print()
print("Baseline Validation Results without hyperparameter tuning")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Baseline Training Results wihtout hyperparameter tuning
Accuracy: 0.9902986494198212
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3374
           1       1.00      0.97      0.99      1883

    accuracy                           0.99      5257
   macro avg       0.99      0.99      0.99      5257
weighted avg       0.99      0.99      0.99      5257


Baseline Validation Results without hyperparameter tuning
Accuracy: 0.6012549914432401
              precision    recall  f1-score   support

           0       0.65      0.82      0.72      1125
           1       0.40      0.22      0.28       628

    accuracy                           0.60      1753
   macro avg       0.52      0.52      0.50      1753
weighted avg       0.56      0.60      0.57      1753



In [None]:
# XGBoost with hand-picked hyperparameters and class weighting

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Ratio of negative (0) to positive (1) training labels, handed to XGBoost
# as scale_pos_weight.
pos = y_train.eq(1).sum()
neg = y_train.eq(0).sum()
scale_pos_weight = neg / pos

# All model settings gathered in one mapping.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "scale_pos_weight": scale_pos_weight,
}
model = XGBClassifier(**xgb_params)

# Fit on the training split only.
model.fit(X_train, y_train)

# Hard class predictions for both splits.
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

# Accuracy and per-class report, training split first and validation second.
for heading, y_true, y_pred in (
    ("Training results With hyper parameters", y_train, y_train_pred),
    ("Validation results with hyperparameters", y_val, y_val_pred),
):
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))


Training results With hyper parameters
Accuracy: 0.7378733117747764
              precision    recall  f1-score   support

           0       0.83      0.74      0.78      3374
           1       0.61      0.73      0.67      1883

    accuracy                           0.74      5257
   macro avg       0.72      0.74      0.73      5257
weighted avg       0.75      0.74      0.74      5257

Validation results with hyperparameters
Accuracy: 0.5270964061608671
              precision    recall  f1-score   support

           0       0.65      0.58      0.61      1125
           1       0.36      0.43      0.39       628

    accuracy                           0.53      1753
   macro avg       0.50      0.51      0.50      1753
weighted avg       0.54      0.53      0.53      1753



In [61]:
import pandas as pd
import xgboost as xgb

# Feature-importance table (gain / cover / split frequency) taken from an
# XGBoost classifier fitted with library-default hyperparameters.
#
# The classifier and the resulting table are stored under their own names so
# that running this cell does not replace `model` (the classifier built in an
# earlier cell) or `df` (the dataset loaded from CSV, which the feature/target
# split reads from).
importance_model = xgb.XGBClassifier(random_state=42)  # seeded like the other cells
importance_model.fit(X_train, y_train)

booster = importance_model.get_booster()
booster.feature_names = list(X_train.columns)

# Each call returns a {feature name: score} dict for one importance type.
gain = booster.get_score(importance_type='gain')
cover = booster.get_score(importance_type='cover')
freq = booster.get_score(importance_type='weight')

# Align the keys of the three dicts (a feature may be missing from a given
# importance type): the DataFrame constructor takes the union of the inner
# keys as the index, and entries a dict lacks are filled with 0.
importance_df = (
    pd.DataFrame({"gain": gain, "cover": cover, "frequency": freq})
    .fillna(0)
    .rename_axis("feature")
    .reset_index()
    # Highest gain first; the feature name breaks ties so the row order is
    # reproducible from run to run.
    .sort_values(["gain", "feature"], ascending=[False, True])
    .reset_index(drop=True)
)
print(importance_df)


                            feature      gain        cover  frequency
0                 Country_Australia  3.961110    55.540985        8.0
1                  Country_Thailand  3.476732   275.832306        7.0
2                   Country_Nigeria  3.055968   666.454956        7.0
3                     Country_Japan  2.728905   396.655457        8.0
4            Country_United Kingdom  2.678047   235.508331        7.0
5                              Diet  2.675431    47.066818       67.0
6                        Heart Rate  2.664414    94.464058      262.0
7                     Country_India  2.555101   444.575195        7.0
8               Country_South Korea  2.543792   750.372009        6.0
9               Country_New Zealand  2.485161   436.091705        6.0
10                              Age  2.435272   104.327148      320.0
11          Systolic Blood Pressure  2.426494   129.494736      277.0
12              Sleep Hours Per Day  2.407400    63.962494      106.0
13                  