In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reference notebook: https://www.kaggle.com/code/edouardo/my-stroke-of-insight/notebook#Training-models

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
# Apply the 'fivethirtyeight' matplotlib style to all figures drawn below.
plt.style.use('fivethirtyeight')

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LassoCV
import catboost as cb
from scipy.stats import rankdata

In [None]:
# Load the competition files. train and test use the `id` column as their index;
# the sample submission is read without index_col.
train = pd.read_csv("/kaggle/input/playground-series-s3e2/train.csv", index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s3e2/test.csv', index_col='id')
submission = pd.read_csv('/kaggle/input/playground-series-s3e2/sample_submission.csv')

In [None]:
# Load the stroke-prediction dataset; read with the default index, so `id`
# remains a regular column here.
original = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [None]:
# Side-by-side column means of the competition train set and the original data.
# numeric_only=True restricts the mean to numeric columns: the frames also hold
# string columns (e.g. gender, smoking_status), on which DataFrame.mean()
# raises TypeError in pandas >= 2.0.
pd.DataFrame(
    dict(
        train=train.mean(numeric_only=True),
        original=original.mean(numeric_only=True),
    )
)

In [None]:
# Same comparison restricted to the positive class (stroke == 1).
# numeric_only=True restricts the mean to numeric columns: the frames also hold
# string columns (e.g. gender, smoking_status), on which DataFrame.mean()
# raises TypeError in pandas >= 2.0.
pd.DataFrame(
    dict(
        train_Stroke=train.query('stroke == 1').mean(numeric_only=True),
        original_Stroke=original.query('stroke == 1').mean(numeric_only=True),
    )
)

In [None]:
# Tag every row with its provenance: 0 for the competition frames,
# 1 for the original stroke-prediction dataset.
for frame, flag in ((train, 0), (test, 0), (original, 1)):
    frame['Original'] = flag

In [None]:
# Append only the positive (stroke == 1) rows of the original dataset to train.
# Its `id` column is dropped first because train carries ids in the index, and
# the combined frame gets a fresh 0..n-1 index.
original_strokes = original.query('stroke == 1').drop(columns=['id'])
train = pd.concat([train, original_strokes], ignore_index=True)

In [None]:
# Number of missing values per column after the original rows were appended.
train.isna().sum()

## Fill missing bmi

In [None]:
# Impute missing BMI values with a decision tree regressor fitted on age and
# gender, using the rows of train where BMI is known.
dtr = DecisionTreeRegressor(random_state=42)

X = train[['age', 'gender', 'bmi']].copy()
# .map yields the same 0 / 1 / -1 encoding as the former .replace call, but
# avoids the downcasting FutureWarning that a str -> int replace raises on
# recent pandas.
X['gender'] = X['gender'].map({
    'Male': 0,
    'Female': 1,
    'Other': -1
}).astype(np.int8)

bmi_is_missing = X['bmi'].isna()
Missing = X[bmi_is_missing]
X = X[~bmi_is_missing]
y = X.pop('bmi')
dtr.fit(X, y)

# Guard: predicting on zero rows raises in scikit-learn, which would break a
# re-run of this cell once every BMI has already been filled in.
if len(Missing) > 0:
    preds_bmi = pd.Series(dtr.predict(Missing[['age', 'gender']]), index=Missing.index)
    train.loc[Missing.index, 'bmi'] = preds_bmi

In [None]:
# Stack train and test so feature engineering is applied to both at once.
# train carries a fresh 0..n-1 index while test is indexed by `id`, so the two
# sets of labels can collide; ignore_index=True gives the combined frame a
# unique index. The splits further down are positional (first len(train) rows /
# last len(test) rows), so they are unaffected.
df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# Number of distinct values per column of the combined frame.
df.nunique()

In [None]:
# Overall stroke rate. df also contains the test rows, so dividing the label
# sum by len(df) put every row without a label into the denominator and
# understated the rate; .mean() averages over labelled rows only.
df['stroke'].mean()

In [None]:
# Stroke rate per gender. The previous denominator was the per-gender count of
# non-null ages, which includes rows that have no stroke label; .mean()
# averages over labelled rows only.
df.groupby('gender')['stroke'].mean()

In [None]:
# Fold the 'Other' gender into 'Female', then replace the gender column by its
# dummy encoding (drop_first=True keeps a single indicator column).
# The replacement is assigned back explicitly: calling .replace(inplace=True)
# on df['gender'] is chained assignment, which warns on pandas 2.x and leaves
# df unchanged under Copy-on-Write.
gender = df['gender'].replace('Other', 'Female')
df = pd.concat(
    [df.drop(columns='gender'), pd.get_dummies(gender, drop_first=True)],
    axis=1,
)

In [None]:
# Frequency of each smoking_status category in the combined frame.
df['smoking_status'].value_counts()

## Why do we need these two operations?

In [None]:
# Binary BMI indicators: 1 when BMI is strictly above the threshold, else 0.
# NOTE(review): 30 and 40 presumably follow the usual obesity classes - confirm.
df['morbid'] = (df['bmi'] > 40).astype(int)
df['obese'] = (df['bmi'] > 30).astype(int)

In [None]:
def feature_risk_factors(df):
    """Add a ``risk_factors`` column counting how many risk criteria each row meets.

    One point is scored for each of: avg_glucose_level > 99, age > 45,
    bmi > 24.99, hypertension == 1, heart_disease == 1, and smoking_status
    being 'formerly smoked' or 'smokes'. Missing values never satisfy a
    criterion.

    The criteria are evaluated column-wise rather than through a row-by-row
    ``apply``, which gives the same counts without a Python call per row and
    is also well-defined for an empty frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the six columns listed above. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the integer ``risk_factors`` column added.
    """
    criteria = (
        df['avg_glucose_level'] > 99,
        df['age'] > 45,
        df['bmi'] > 24.99,
        df['hypertension'] == 1,
        df['heart_disease'] == 1,
        df['smoking_status'].isin(['formerly smoked', 'smokes']),
    )
    # Each criterion is a boolean Series; cast to int and add them up.
    df['risk_factors'] = sum(flag.astype('int64') for flag in criteria)
    return df

In [None]:
# Adds the `risk_factors` column to df in place; the returned frame is shown
# as the cell output.
feature_risk_factors(df)

In [None]:
# Percentage of strokes among everyone strictly younger than each integer age.
# .mean() averages over labelled rows only: df also holds rows without a stroke
# label, and the former sum()/len() put them in the denominator. It also yields
# NaN instead of a 0/0 division for the first threshold, where nobody is
# younger. The range is unchanged so rate[k] still maps to Age.min() + k.
Age = df['age'].astype('int')
rate = []
for i in range(Age.min(), Age.max()):
    younger_labels = df.loc[Age < i, 'stroke']
    rate.append(younger_labels.mean() * 100.0)

In [None]:
# Line plot of the cumulative stroke percentages held in `rate`.
fig, ax = plt.subplots()
ax.plot(rate)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('% of strokes', fontsize=12)
plt.show()

In [None]:
# One-hot encode the columns get_dummies converts by default (no `columns`
# argument is given); the encoded frame replaces df.
df = pd.get_dummies(df)

In [None]:
# Standardise the continuous columns. The scaler is fitted on the combined
# frame df, and the scaled values overwrite the originals.
num_cols = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

In [None]:
# Column dtypes and non-null counts of the encoded, scaled frame.
df.info()

In [None]:
# The first len(train) rows of df are the training rows: take the target from
# them and drop the columns that are not features.
n_train_rows = len(train)
train_part = df.iloc[:n_train_rows]
y = train_part['stroke']
X = train_part.drop(columns=['stroke', 'Original'])

In [None]:
# The last len(test) rows of df are the test rows; drop the non-feature columns.
X_test = df.tail(len(test)).drop(columns=['stroke', 'Original'])

In [None]:
# Sanity check: X and y should have the same row count, X and X_test the same columns.
X.shape, y.shape, X_test.shape

## Lasso

In [None]:
# LassoCV evaluated with 20-fold stratified cross-validation.
# Every fold model also predicts X_test; those predictions accumulate in
# `preds`, and the fold AUCs accumulate in `scores`.
preds, scores = [], []

skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

model_params = dict(
    precompute='auto',
    fit_intercept=True,
    max_iter=100000,
    verbose=False,
    eps=1e-04,
    n_alphas=1000,
    n_jobs=-1,
)

for fit_rows, holdout_rows in skf.split(X, y):
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_valid, y_valid = X.iloc[holdout_rows], y.iloc[holdout_rows]

    model = LassoCV(**model_params)
    model.fit(X_train, y_train)

    # Test-set prediction of this fold's model, kept for the final blend.
    preds.append(model.predict(X_test))

    # AUC on the held-out fold.
    s = roc_auc_score(y_valid, model.predict(X_valid))
    print(f'Best val auc: {s: .4f}')
    scores.append(s)

print(f'mean scores: {np.mean(scores): .4f}')

## CatBoost

In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of ``z``.

    ``z`` may be a scalar or a NumPy array; arrays are transformed element-wise.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
# CatBoost evaluated with 10-fold stratified cross-validation.
# Every fold model also predicts X_test; those predictions are appended to the
# shared `preds` list so they are blended with the ones already collected.
skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

cb_params = {
    'depth': 3,
    'learning_rate': 0.01,
    'rsm': 0.5,
    'subsample': 0.931,
    'l2_leaf_reg': 69,
    'min_data_in_leaf': 20,
    'random_strength': 0.175,
    'use_best_model': True,
    'task_type': 'CPU',
    'bootstrap_type': 'Bernoulli',
    'grow_policy': 'SymmetricTree',
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'scale_pos_weight': 5
}

# Fold AUCs of this model only. `scores` still holds the fold scores of the
# model evaluated earlier, so averaging it would mix the two models.
cb_scores = []

for train_index, val_index in skf.split(X, y):
    cb_train = cb.Pool(data=X.iloc[train_index], label=y.iloc[train_index])
    cb_valid = cb.Pool(data=X.iloc[val_index], label=y.iloc[val_index])

    model = cb.train(
        params=cb_params,
        dtrain=cb_train,
        num_boost_round=10000,
        evals=cb_valid,
        early_stopping_rounds=500,
        verbose=False
    )

    # val_index holds positions, so select with .iloc rather than by label.
    s = roc_auc_score(y.iloc[val_index], model.predict(cb_valid))

    print(f"Best val auc: {s:.4f}")
    cb_scores.append(s)
    scores.append(s)  # keep the combined list populated as before
    # NOTE(review): predict presumably returns raw scores here, hence the
    # sigmoid - confirm against the CatBoost predict documentation.
    preds.append(sigmoid(model.predict(X_test)))

print(f'mean scores: {np.mean(cb_scores): .4f}')

In [None]:
# Blend all collected test predictions by rank: rank each prediction vector,
# average the ranks per test row, then divide by the number of test rows.
rankpreds = [rankdata(pred) for pred in preds]

finalrank = np.asarray(rankpreds).mean(axis=0) / len(X_test)

In [None]:
# Should be one blended value per test row.
finalrank.shape

In [None]:
# Store the blended rank scores in the submission frame and write it to the
# working directory, without the DataFrame index.
# NOTE(review): assumes the submission rows are in the same order as test - confirm.
submission['stroke'] = finalrank
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission that was just written.
submission.head(10)