In [1]:
import pandas as pd
import kagglehub
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split

# 1. Load datasets
df_train = pd.read_csv("train(43).csv")
path = kagglehub.dataset_download("fedesoriano/stroke-prediction-dataset")
df_kaggle = pd.read_csv(f"{path}/healthcare-dataset-stroke-data.csv")

# 2. Add 'stroke' column to df_test if not present
if 'stroke' not in df_kaggle.columns:
    df_kaggle['stroke'] = None  # (In this case, Kaggle data already has 'stroke')

# 3. Align columns in both dataframes
common_cols = list(set(df_train.columns) | set(df_kaggle.columns))
df_train = df_train.reindex(columns=common_cols)
df_kaggle = df_kaggle.reindex(columns=common_cols)

# 4. Concatenate dataframes
df = pd.concat([df_train, df_kaggle], ignore_index=True)

# 5. Convert categorical features to numeric codes
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
df['ever_married'] = df['ever_married'].map({'Yes': 1, 'No': 0})
df['Residence_type'] = df['Residence_type'].map({'Urban': 1, 'Rural': 0})
df['smoking_status'] = df['smoking_status'].map({
    'never smoked': 0,
    'formerly smoked': 0.5,
    'smokes': 1
})

# 6. One-hot encode the work_type feature
df = pd.get_dummies(df, columns=['work_type'], prefix='work')

# 7. Remove any rows without a stroke label
df = df[df['stroke'].notna()]
df['stroke'] = df['stroke'].astype(int)

# 8. Impute missing numeric values using IterativeImputer (MICE)
mice_imputer = IterativeImputer(random_state=42)
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('stroke')
df[numeric_cols] = mice_imputer.fit_transform(df[numeric_cols])

# 9. Train-test split (80/20)
X = df.drop(columns=['stroke'])
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 10. Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 11. Balance the training set using SMOTE + Tomek Links
sampler = SMOTETomek(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 12. Train Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_resampled, y_resampled)

# 13. Evaluate on test set
y_pred_lr = lr_model.predict(X_test_scaled)
print("📊 Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred_lr, digits=4))


📊 Accuracy: 0.7468563182848897

🧩 Confusion Matrix:
 [[7094 2408]
 [  48  152]]

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9933    0.7466    0.8524      9502
           1     0.0594    0.7600    0.1101       200

    accuracy                         0.7469      9702
   macro avg     0.5263    0.7533    0.4813      9702
weighted avg     0.9740    0.7469    0.8371      9702



In [3]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)

y_pred_rf = rf_model.predict(X_test_scaled)
print("📊 Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred_rf, digits=4))


📊 Accuracy: 0.9727891156462585

🧩 Confusion Matrix:
 [[9348  154]
 [ 110   90]]

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9884    0.9838    0.9861      9502
           1     0.3689    0.4500    0.4054       200

    accuracy                         0.9728      9702
   macro avg     0.6786    0.7169    0.6957      9702
weighted avg     0.9756    0.9728    0.9741      9702



In [4]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_resampled, y_resampled)

y_pred_gb = gb_model.predict(X_test_scaled)
print("📊 Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred_gb, digits=4))


📊 Accuracy: 0.8297258297258298

🧩 Confusion Matrix:
 [[7932 1570]
 [  82  118]]

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9898    0.8348    0.9057      9502
           1     0.0699    0.5900    0.1250       200

    accuracy                         0.8297      9702
   macro avg     0.5298    0.7124    0.5153      9702
weighted avg     0.9708    0.8297    0.8896      9702



In [5]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_resampled, y_resampled)

y_pred_xgb = xgb_model.predict(X_test_scaled)
print("📊 Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred_xgb, digits=4))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Accuracy: 0.9575345289631004

🧩 Confusion Matrix:
 [[9236  266]
 [ 146   54]]

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9844    0.9720    0.9782      9502
           1     0.1688    0.2700    0.2077       200

    accuracy                         0.9575      9702
   macro avg     0.5766    0.6210    0.5929      9702
weighted avg     0.9676    0.9575    0.9623      9702



In [6]:
from lightgbm import LGBMClassifier

lgb_model = LGBMClassifier(random_state=42)
lgb_model.fit(X_resampled, y_resampled)

y_pred_lgb = lgb_model.predict(X_test_scaled)
print("📊 Accuracy:", accuracy_score(y_test, y_pred_lgb))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgb))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred_lgb, digits=4))


[LightGBM] [Info] Number of positive: 37941, number of negative: 37941
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2156
[LightGBM] [Info] Number of data points in the train set: 75882, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
📊 Accuracy: 0.9475365903937333

🧩 Confusion Matrix:
 [[9148  354]
 [ 155   45]]

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9833    0.9627    0.9729      9502
           1     0.1128    0.2250    0.1503       200

    accuracy                         0.9475      9702
   macro avg     0.5481    0.5939    0.5616      9702
weighted avg     0.9654    0.9475    0.9560      9702





In [7]:
   # install NGBoost library
from ngboost import NGBClassifier

ngb_model = NGBClassifier(random_state=42)
ngb_model.fit(X_resampled, y_resampled)

y_pred_ngb = ngb_model.predict(X_test_scaled)
print("📊 Accuracy:", accuracy_score(y_test, y_pred_ngb))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ngb))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred_ngb, digits=4))


[iter 0] loss=0.6931 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.4302 val_loss=0.0000 scale=2.0000 norm=3.3765
[iter 200] loss=0.3917 val_loss=0.0000 scale=1.0000 norm=1.6860
[iter 300] loss=0.3801 val_loss=0.0000 scale=1.0000 norm=1.6839
[iter 400] loss=0.3672 val_loss=0.0000 scale=1.0000 norm=1.6727
📊 Accuracy: 0.7692228406514121

🧩 Confusion Matrix:
 [[7313 2189]
 [  50  150]]

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9932    0.7696    0.8672      9502
           1     0.0641    0.7500    0.1182       200

    accuracy                         0.7692      9702
   macro avg     0.5287    0.7598    0.4927      9702
weighted avg     0.9741    0.7692    0.8518      9702



In [8]:
  # install the PyTorch TabNet implementation
from pytorch_tabnet.tab_model import TabNetClassifier

tabnet_model = TabNetClassifier(seed=42, verbose=0)
tabnet_model.fit(X_resampled, y_resampled, eval_set=[(X_test_scaled, y_test)], eval_metric=['accuracy'], patience=10, max_epochs=100)

y_pred_tab = tabnet_model.predict(X_test_scaled)
print("📊 Accuracy:", accuracy_score(y_test, y_pred_tab))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tab))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred_tab, digits=4))



Early stopping occurred at epoch 61 with best_epoch = 51 and best_val_0_accuracy = 0.87333




📊 Accuracy: 0.8733250876108019

🧩 Confusion Matrix:
 [[8359 1143]
 [  86  114]]

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9898    0.8797    0.9315      9502
           1     0.0907    0.5700    0.1565       200

    accuracy                         0.8733      9702
   macro avg     0.5403    0.7249    0.5440      9702
weighted avg     0.9713    0.8733    0.9155      9702



In [11]:
from sklearn.ensemble import VotingClassifier

# 14. Create a voting ensemble of all trained models
ensemble_model = VotingClassifier(
    estimators=[
        ('lr', lr_model),
        ('rf', rf_model),
        ('gb', gb_model),
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        # ('ngb', ngb_model),  # исключён
        # ('tabnet', tabnet_model)
    ],
    voting='soft'
)

ensemble_model.fit(X_resampled, y_resampled)

# 15. Evaluate the ensemble on the test set
y_pred_ensemble = ensemble_model.predict(X_test_scaled)
print("📊 Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ensemble))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred_ensemble, digits=4))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 37941, number of negative: 37941
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2156
[LightGBM] [Info] Number of data points in the train set: 75882, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
📊 Accuracy: 0.935374149659864

🧩 Confusion Matrix:
 [[8986  516]
 [ 111   89]]

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9878    0.9457    0.9663      9502
           1     0.1471    0.4450    0.2211       200

    accuracy                         0.9354      9702
   macro avg     0.5675    0.6953    0.5937      9702
weighted avg     0.9705    0.9354    0.9509      9702



