# Idea:

* Use 3 original datasets, but only the rows with stroke = 1
* MinMaxScaler for columns: age, avg_glucose_level, bmi
* OrdinalEncoder for others
* RepeatedKFold with 12 folds
* LGBMClassifier
* CatBoostClassifier
* RandomForest
* Ensemble
* Easily toggle some of these parameters in the configuration

Please upvote if you find this notebook useful.

# Imports

In [None]:
# import all libraries that could be used
import os
import numpy as np
from numpy import arange
import pandas as pd
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler, RobustScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_predict, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, RidgeCV ,LassoCV, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import matplotlib.pyplot as plt
import matplotlib as mpl
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier

%matplotlib inline

# Config

In [None]:
class conf:
    """Notebook-wide settings, read elsewhere as plain class attributes."""

    # Column names: the row identifier and the prediction target.
    index = "id"
    target = "stroke"

    # Random seed.
    random = 2023

    # External-data switches: whether to load the original datasets, and
    # whether to keep only their positive (target == 1) rows.
    load_original = True
    only_positive = True

    # Number of splits handed to the K-fold splitter.
    folds = 12


# Seed NumPy's global RNG from the configured value.
np.random.seed(conf.random)

# Load data

In [None]:
# Competition data: labelled training set and unlabelled test set, both indexed by id.
train_full = pd.read_csv("/kaggle/input/playground-series-s3e2/train.csv", index_col=conf.index)
test_full = pd.read_csv("/kaggle/input/playground-series-s3e2/test.csv", index_col=conf.index)
train = train_full.copy()
if conf.load_original:
    print("Load external data...")
    original = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv', index_col=conf.index)
    original1 = pd.read_csv('/kaggle/input/brain-stroke-dataset/brain_stroke.csv')
    original2 = pd.read_csv('/kaggle/input/full-filled-brain-stroke-dataset/full_data.csv')
    if conf.only_positive:
        # Stack only the target == 1 rows of each external set on top of the
        # competition data; ignore_index gives the result a fresh 0..n-1 index.
        train = pd.concat([original[original[conf.target] == 1], train_full], ignore_index=True)
        train0 = pd.concat([original1[original1[conf.target] == 1], train], ignore_index=True)
        train1 = pd.concat([original2[original2[conf.target] == 1], train0], ignore_index=True)
    else:
        # Use every row of the first external set alongside the competition data.
        train1 = pd.concat([original, train_full])
else:
    # No external data requested: train on the competition set alone. Without
    # this branch train1 would be undefined and the cells below would raise
    # NameError.
    train1 = train
train1.info()

In [None]:
# Replace missing BMI values with the sentinel -9. The fill must read from
# train1 itself: pandas aligns an assigned Series on index labels, so taking
# the column from a different frame would put values on the wrong rows and
# leave NaN wherever that frame has no matching label.
train1["bmi"] = train1["bmi"].fillna(-9)

# Prepare Data

In [None]:
# Separate the label from the features; the label becomes a NumPy vector.
x_frame = train1.copy()
y_full = x_frame.pop(conf.target).to_numpy()

# The three continuous columns are min-max scaled; every remaining column is
# ordinal-encoded.
num_cols = ["age", "avg_glucose_level", "bmi"]
cat_cols = x_frame.columns.difference(num_cols)
print(cat_cols)

num_pipe = Pipeline(steps=[("scaler", MinMaxScaler())])

tr = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", OrdinalEncoder(), cat_cols),
    ]
)

# Fit on the training features, then apply the fitted transformer to the test set.
x_full = tr.fit_transform(x_frame)
x_test = tr.transform(test_full)
print("train shape = ", x_full.shape)
print("test shape = ", x_test.shape)

# Train

In [None]:
# Repeated (non-stratified) K-fold splitter reused by every training cell;
# the number of repeats is left at the library default.
skf = RepeatedKFold(random_state=conf.random, n_splits=conf.folds)
# Collects every fitted fold model so they can be averaged at prediction time.
models = []

# LGBMClassifier

In [None]:
# Cross-validate LightGBM: fit one model per split, keep it for the ensemble,
# and record its ROC AUC on the held-out rows.
scores = []
for train_idx, val_idx in skf.split(x_full, y_full):
    # Seed comes from the shared config rather than a duplicated literal.
    m = LGBMClassifier(n_estimators=24, random_state=conf.random)
    m.fit(x_full[train_idx], y_full[train_idx])
    models.append(m)

    # Column 1 of predict_proba is the probability of the second class.
    val_proba = m.predict_proba(x_full[val_idx])[:, 1]
    scores.append(roc_auc_score(y_full[val_idx], val_proba))
print(f'mean score: {np.mean(scores):.4f}')

# CatBoostClassifier

In [None]:
# Cross-validate CatBoost: fit one model per split, keep it for the ensemble,
# and record its ROC AUC on the held-out rows.
scores = []
for train_idx, val_idx in skf.split(x_full, y_full):
    # Seed comes from the shared config rather than a duplicated literal.
    # verbose=0 on the constructor already silences training output, so it is
    # not repeated in fit().
    m = CatBoostClassifier(iterations=150, verbose=0, random_state=conf.random)
    m.fit(x_full[train_idx], y_full[train_idx])
    models.append(m)

    # Column 1 of predict_proba is the probability of the second class.
    val_proba = m.predict_proba(x_full[val_idx])[:, 1]
    scores.append(roc_auc_score(y_full[val_idx], val_proba))
print(f'mean score: {np.mean(scores):.4f}')

# Random Forest

In [None]:
# Cross-validate a random forest: fit one model per split, keep it for the
# ensemble, and record its ROC AUC on the held-out rows.
scores = []
for train_idx, val_idx in skf.split(x_full, y_full):
    # Seed comes from the shared config rather than a duplicated literal.
    m = RandomForestClassifier(
        n_estimators=50,
        min_samples_leaf=50,
        max_depth=10,
        max_samples=None,
        class_weight='balanced',
        random_state=conf.random,
    )
    m.fit(x_full[train_idx], y_full[train_idx])
    models.append(m)

    # Column 1 of predict_proba is the probability of the second class.
    val_proba = m.predict_proba(x_full[val_idx])[:, 1]
    scores.append(roc_auc_score(y_full[val_idx], val_proba))
print(f'mean score: {np.mean(scores):.4f}')

# Predict

In [None]:
# One vector of second-class probabilities on the test set per fitted model.
test_preds = [fitted.predict_proba(x_test)[:, 1] for fitted in models]

In [None]:
# Collapse the per-model predictions into one vector by averaging over the
# model axis (axis 0).
test_preds = np.mean(np.array(test_preds), axis=0)

# Histogram of the averaged predictions.
pd.DataFrame(test_preds).hist(figsize=(16, 9), bins=25)

# Submission

In [None]:
# Load the sample submission as a template, overwrite its target column with
# the averaged predictions, and write it out.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv; confirm.
ss = pd.read_csv(
    "/kaggle/input/playground-series-s3e2/sample_submission.csv",
    index_col=conf.index,
)
ss[conf.target] = test_preds
ss.to_csv("submission.csv")
ss.head()