In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The title comes from 'My Stroke of Insight', a wonderful book by [Jill Bolte Taylor](https://www.youtube.com/watch?v=UyyjU8fzEYU).

## Importing Libraries

In [None]:
#import numpy as np
#import pandas as pd 

import matplotlib.pylab as plt
import seaborn as sns

## Load Data

In [None]:
# Competition data (Playground Series S3E2); train and test are indexed by their `id` column.
train_df = pd.read_csv("/kaggle/input/playground-series-s3e2/train.csv",index_col='id')
test_df = pd.read_csv('/kaggle/input/playground-series-s3e2/test.csv',index_col='id')
# The sample submission is read with its default index; its `stroke` column is
# overwritten with the blended predictions before the file is saved.
submission = pd.read_csv('/kaggle/input/playground-series-s3e2/sample_submission.csv')

## Add Original data to Train_df?

The original data seems statistically different from the train data: people in the train data are more prone to stroke than those in the original data (they have strokes at lower levels of every risk indicator) - see https://www.kaggle.com/competitions/playground-series-s3e2/discussion/377199

In [None]:
original = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")


In [None]:
# Side-by-side column means of the competition train set and the original dataset.
# numeric_only=True restricts the mean to numeric columns: both frames also hold
# string columns (e.g. gender, smoking_status), for which a mean is undefined and
# which make DataFrame.mean() raise a TypeError on pandas >= 2.0.
pd.DataFrame(dict(
    train = train_df.mean(numeric_only=True),
    original = original.mean(numeric_only=True)
))

More women in train data than in original:

In [None]:
# Share of rows whose gender is 'Female'. The category is selected by its label:
# value_counts()[0] only means "Female" when that happens to be the most frequent
# value, and integer-position lookup on a label-indexed Series is deprecated in pandas.
print(f'percentage of women in train data: {100*(train_df.gender == "Female").mean():.2f}%')
print(f'percentage of women in original data: {100*(original.gender == "Female").mean():.2f}%')

But taking only men doesn't solve the problem:

In [None]:
# Column means restricted to male rows, train vs. original.
# numeric_only=True skips the string columns, which DataFrame.mean() cannot
# average (it raises a TypeError on pandas >= 2.0).
pd.DataFrame(dict(
    train_Male = train_df.query('gender=="Male"').mean(numeric_only=True),
    original_Male = original.query('gender=="Male"').mean(numeric_only=True)
))

In the end I will use the extra data, but only the rows with stroke, as it gives a big jump in the leaderboard score - though I suspect the private score will not improve in the same way:

In [None]:
# Column means restricted to stroke-positive rows, train vs. original.
# numeric_only=True skips the string columns, which DataFrame.mean() cannot
# average (it raises a TypeError on pandas >= 2.0).
pd.DataFrame(dict(
    train_Stroke = train_df.query('stroke==1').mean(numeric_only=True),
    original_Stroke = original.query('stroke==1').mean(numeric_only=True)
))

Add an 'Original' flag to differentiate the two sources before concatenating them. The feature 'Original' should be removed before training the models to avoid data leakage.

In [None]:
# Provenance flag: 1 for rows of the original dataset, 0 for competition rows.
# The column is dropped again when the model matrices X / X_test are built.
train_df['Original'] = 0
original['Original'] = 1
test_df['Original'] = 0

In [None]:
# Append only the stroke-positive rows of the original dataset (its `id` column is dropped first).
# ignore_index=True renumbers the result 0..n-1, so train_df no longer carries the competition ids.
train_df = pd.concat([train_df,original.query('stroke==1').drop('id',axis=1)],ignore_index = True)

Now we have NaN coming from the original data:

In [None]:
train_df.isnull().sum()

Replace the NaN values in bmi using a Decision Tree, as a function of age and gender
(probably not necessary, but interesting and reusable with other dataframes)

from [thomas konstantin](https://www.kaggle.com/code/thomaskonstantin/analyzing-and-modeling-stroke-data): 

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Impute missing bmi values with a decision tree trained on age and gender.
Dtr = DecisionTreeRegressor(random_state=42)

X = train_df[['age','gender','bmi']].copy()
# Signed dtype on purpose: 'Other' is encoded as -1, which an unsigned type
# (uint8) cannot represent - it would wrap around to 255 or overflow.
X.gender = X.gender.replace({'Male':0,'Female':1,'Other':-1}).astype(np.int8)

# Rows with a missing bmi are the ones to predict; the rest is the training set.
Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
Dtr.fit(X,Y)
# Nothing to impute when bmi has no gaps; predicting on zero rows would raise.
if not Missing.empty:
    predicted_bmi = pd.Series(Dtr.predict(Missing[['age','gender']]),index=Missing.index)
    train_df.loc[Missing.index,'bmi'] = predicted_bmi



## Feature engineering

In [None]:
# Stack train and test so feature engineering is applied to both at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index: train_df was
# renumbered when the original rows were appended, while test_df still carries its
# `id` labels, so keeping both indexes can produce duplicate labels. Later cells
# split the frame by position (first len(train_df) rows / last len(test_df) rows),
# so the labels themselves are not needed.
df = pd.concat([train_df, test_df], axis = 0, ignore_index = True)

In [None]:
df.nunique()

In [None]:
df.stroke.sum()/len(df)

In [None]:
df.groupby('gender').stroke.sum()/df.groupby('gender').age.count()

I will replace 'Other' with 'Female' (no offense intended: there are more females than males), and then one-hot encode the feature:

In [None]:
# Fold the 'Other' category into 'Female'. The result is assigned back to the
# column: replace(..., inplace=True) called on `df.gender` acts on an intermediate
# Series and is not guaranteed to modify `df` (it does not under pandas copy-on-write).
df['gender'] = df['gender'].replace('Other','Female')
# One-hot encode gender; drop_first=True drops the first category's indicator column.
df = pd.concat([df,pd.get_dummies(df.gender, drop_first=True)],axis=1)
df = df.drop('gender',axis=1)

not a lot to do with Unknown in smoking_status:

In [None]:
df.smoking_status.value_counts()

we will put 2 extra features with bmi: (from Brenden Siekman)

In [None]:
# Binary BMI flags. The thresholds nest: every row flagged `morbid` (bmi > 40)
# is also flagged `obese` (bmi > 30).
df['morbid'] = np.where(df.bmi>40,1,0)
df['obese'] = np.where(df.bmi>30,1,0)

from CRAIG THOMAS:

In [None]:
def feature_risk_factors(df):
    """Add a ``risk_factors`` column counting how many risk criteria each row meets.

    One point is given for each of:

    * ``avg_glucose_level`` > 99
    * ``age`` > 45
    * ``bmi`` > 24.99
    * ``hypertension`` == 1
    * ``heart_disease`` == 1
    * ``smoking_status`` is ``"formerly smoked"`` or ``"smokes"``

    The count is built from vectorised column comparisons rather than a
    row-wise ``apply``: the result is the same, but it avoids a Python-level
    loop over every row and also works on an empty frame. Missing values
    compare as False and therefore add no point.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the six columns listed above. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the integer ``risk_factors`` column added.
    """
    criteria = [
        df["avg_glucose_level"] > 99,
        df["age"] > 45,
        df["bmi"] > 24.99,
        df["hypertension"] == 1,
        df["heart_disease"] == 1,
        df["smoking_status"].isin(["formerly smoked", "smokes"]),
    ]
    # Each criterion is a boolean Series; summing their 0/1 casts gives the count.
    df["risk_factors"] = sum(flag.astype(int) for flag in criteria)
    return df

In [None]:
feature_risk_factors(df)

from ALEXANDER SHUMILIN, PS S3, E2: Ensemble model + addition data

from Josh: Predicting a Stroke

I like this idea (maybe because I am 65):

In [None]:
# Cumulative stroke rate: for every age threshold i, the share of stroke cases
# among people younger than i.
Age = df['age'].astype(int)
# Only rows that actually have a stroke label are counted. `df` also holds rows
# whose `stroke` is missing (the appended test rows); including them in the
# denominator would understate every rate.
labelled = df['stroke'].notna()
rate = []
for i in range(Age.min(), Age.max()):
    # mean of the 0/1 labels = strokes / labelled people (NaN when nobody is younger than i)
    rate.append(df.loc[labelled & (Age < i), 'stroke'].mean())

In [None]:
# Plot the cumulative stroke rate against the age threshold it was computed for:
# the x values start at the youngest age instead of the list position 0, and the
# fractions are converted to percentages so the curve matches the y-axis label.
ages = range(Age.min(), Age.min() + len(rate))
plt.plot(ages, [100 * r for r in rate])
plt.xlabel('Age', fontsize=12)
plt.ylabel('% of strokes', fontsize=12)
plt.show()

In [None]:
# One-hot encode the remaining non-numeric columns of the combined frame.
df = pd.get_dummies(df)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Continuous columns to standardise; all other columns are left untouched.
num_cols = ['age', 'avg_glucose_level', 'bmi']

# NOTE(review): the scaler is fitted on the combined train + test frame, so test
# rows influence the scaling statistics - confirm this is intended.
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
df.info()

## Training models


Ideas from Dmitry Uarov, Tilii...

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score

I will try SMOTE: (credit [Craig Thomas](https://www.kaggle.com/competitions/playground-series-s3e2/discussion/377373))

In [None]:
#X = df[:len(train_df)].query('Original==0')
#y = X.stroke
#X = X.drop('stroke',axis=1)

In [None]:
#y.sum() / len(y)

will try to add more men, but not with the original dataset:

In [None]:
#train_df_men = df[:len(train_df)].query('Original==0 & Male ==1')
#train_df_women = df[:len(train_df)].query('Original==0 & Male ==0')

In [None]:
#from sklearn.utils import resample
#train_df_men_upsample = resample(train_df_men,
#             replace=True,
#             n_samples=len(train_df_women),
#             random_state=42)

In [None]:
#train_df_macho = pd.concat([train_df_women,train_df_men_upsample],axis=0,ignore_index=True)

In [None]:
#train_df_macho.shape

In [None]:
#len(df)-len(test_df)

In [None]:
#from imblearn.over_sampling import SMOTE

#X_oversampled, y_oversampled = SMOTE(
#    random_state=2023, 
#    sampling_strategy=0.05
#     ).fit_resample(X, y)

In [None]:
#len(y_oversampled) - len(y)

eventually add now the data coming from the original dataset:

In [None]:
#X_original = df[:len(train_df)].query('Original==1')
#y_original = X_original.stroke
#X_original = X_original.drop('stroke',axis=1)

In [None]:
#y_original.shape

In [None]:
#y_oversampled.shape

In [None]:

#y = train_df_macho.stroke
#X = train_df_macho.drop(['stroke','Original'], axis = 1)

In [None]:
X.shape

In [None]:
#X_original.shape

In [None]:
#X = pd.concat([X, X_original],axis=0, ignore_index=True)
#y = pd.concat([y,y_original],axis=0,ignore_index=True)

In [None]:
#X = pd.concat([X_oversampled, X_original],axis=0, ignore_index=True)
#y = pd.concat([y_oversampled,y_original],axis=0,ignore_index=True)

In [None]:
# The first len(train_df) rows of `df` are the training rows (train was placed first in the concat).
X = df[:len(train_df)]
y = X.stroke
# Drop the target and the provenance flag so neither is used as a feature.
X = X.drop(['stroke','Original'],axis=1)

In [None]:
#df[-len(test_df):].shape

In [None]:
# The last len(test_df) rows of `df` are the test rows; drop the same columns as for X
# so train and test matrices have identical features.
X_test = df.tail(len(test_df))
X_test = X_test.drop(['stroke','Original'], axis=1)

In [None]:
print(f'train_df shape: {train_df.shape}')
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'X_test shape: {X_test.shape}')

### lasso 

In [None]:
# Accumulators shared by the Lasso, CatBoost and Keras sections that follow.
eval_predsLR = []  # not filled by any of the cells in this notebook section
predsLR = []  # one test-set prediction array per fold, across all models
scores = []  # one validation AUC per fold, across all models
models = []  # last-fold model of each section, appended in order: Lasso, CatBoost, Keras

In [None]:
from sklearn.linear_model import LassoCV


n_folds = 20  #20
skf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
#rskf = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=10,
#    random_state=42)

MODEL_PARAMS = {
                       'precompute': "auto",
                       'fit_intercept': True,
                       'max_iter': 100000, #1000 before
                       'verbose': False,
                       'eps': 1e-04, 
                        #'cv': 5,
                        'n_alphas': 1000,
                       'n_jobs': -1,
}

# AUCs of the Lasso folds only. The shared `scores` list also receives the other
# models' folds (and anything left from earlier runs of this cell), so the mean
# reported for Lasso is computed from this local list instead.
lasso_scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = LassoCV(**MODEL_PARAMS)

    model.fit(X=X_train, y=y_train,)
    # Lasso is a regressor: its outputs are used as ranking scores, which is all AUC needs.
    predsLR.append(model.predict(X_test))
    s = roc_auc_score(y_valid, model.predict(X_valid))
    print(f"Best val auc: {s:.4f}")
    lasso_scores.append(s)
    scores.append(s)


print(f'mean scores:  {np.mean(lasso_scores):.4f}')
# Only the model of the last fold is kept for the plots further down.
models.append(model)

### catboost

In fact, we don't need the probabilities to compute the AUC, just the ordering of the predictions. Likewise, we don't need probabilities to blend the models if we blend them through that ordering (i.e. through their ranks).

In [None]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise.

    Uses ``scipy.special.expit``, which evaluates the same function without
    overflowing: the direct formula computes ``exp(-z)``, which overflows (and
    emits a RuntimeWarning) for large negative ``z``.

    Parameters
    ----------
    z : scalar or array-like
        Raw scores (log-odds).

    Returns
    -------
    float or numpy.ndarray
        Values in [0, 1], with the same shape as ``z``.
    """
    # Local import so the function does not rely on an import made in another cell.
    from scipy.special import expit
    return expit(z)

In [None]:
import catboost as cb
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)

cb_params = {
    'depth': 3,
    'learning_rate': 0.01,
    'rsm': 0.5,
    'subsample': 0.931,
    'l2_leaf_reg': 69,
    'min_data_in_leaf': 20,
    'random_strength': 0.175,
    'use_best_model': True,
    'task_type': 'CPU',
    'bootstrap_type': 'Bernoulli',
    'grow_policy': 'SymmetricTree',
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'scale_pos_weight': 5
}

# AUCs of the CatBoost folds only. The shared `scores` list already contains the
# folds of the previous model(s), so averaging it would not describe CatBoost.
cb_scores = []

for train_index, val_index in skf.split(X, y):
    cb_train = cb.Pool(data=X.iloc[train_index], label = y.iloc[train_index])
    cb_valid = cb.Pool(data=X.iloc[val_index], label = y.iloc[val_index])

    model = cb.train(params=cb_params,
                     dtrain=cb_train,
                     num_boost_round=10000,
                     evals=cb_valid, 
                     early_stopping_rounds=500,
                     verbose=False)

    # skf.split yields row positions, so the validation labels are selected with
    # .iloc; y[val_index] would look the positions up as index labels.
    s = roc_auc_score(y.iloc[val_index], model.predict(cb_valid))
    print(f"Best val auc: {s:.4f}")
    cb_scores.append(s)
    scores.append(s)
    # The model's predict output is passed through the sigmoid before being stored for blending.
    predsLR.append(sigmoid(model.predict(X_test)))
# Only the model of the last fold is kept for the plots further down.
models.append(model)
print(f'mean scores:  {np.mean(cb_scores):.4f}')

### Keras - Binary Classification Model
from https://keras.io/examples/structured_data/imbalanced_classification/ and THARUN_NAYAK for callbacks

In [None]:
from tensorflow import keras

# Fully connected binary classifier: three 256-unit ReLU layers (dropout 0.3 after
# the second and third) followed by a single sigmoid output unit.
model = keras.Sequential(
    [
        keras.layers.Dense(
            # the input width is the number of feature columns in X
            256, activation="relu", input_shape=(X.shape[-1],)
        ),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()

In [None]:
# Class counts of the training labels. np.bincount only accepts non-negative
# integers, and `y` can be float-typed at this point (a label column turns float
# once it has been concatenated with rows that have no label), so it is cast first.
# minlength=2 guarantees that counts[0] and counts[1] both exist.
counts = np.bincount(y.astype(int), minlength=2)
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(y)
    )
)

# Inverse-frequency class weights: the rarer class gets the larger weight.
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

In [None]:
model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=[keras.metrics.AUC(name="AUC")]
)

# The metric is registered under the name "AUC", so its validation value is logged
# as "val_AUC". The callbacks must monitor exactly that key: with "val_auc" the
# metric is never found, so neither LR reduction nor early stopping can trigger.
# NOTE(review): early stopping (patience=3) fires before the LR schedule
# (patience=7) can act - confirm the two patience values are intended.
plat = keras.callbacks.ReduceLROnPlateau(monitor="val_AUC", mode='max', patience=7, factor=0.1, min_lr=1e-6, min_delta=0.0001)
es = keras.callbacks.EarlyStopping(monitor="val_AUC", mode='max', patience=3, min_delta=0.0001, restore_best_weights = True)
#callbacks = [keras.callbacks.ModelCheckpoint("stroke_model_at_epoch_{epoch}.h5")]
callbacks = [plat,es]
# Per-class loss weights computed from the class counts.
class_weight = {0: weight_for_0, 1: weight_for_1}


In [None]:
n_folds = 5  #20
skf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)

# Weights the model has when this cell starts (untrained on a fresh "Run All").
# They are restored at the start of every fold: otherwise each fold keeps training
# the network from the previous fold, which has already seen the current
# validation rows, and the validation AUC becomes optimistic.
# NOTE: optimizer state (Adam moments, a learning rate lowered by
# ReduceLROnPlateau) is not reset by this and still carries over between folds.
initial_weights = model.get_weights()

# AUCs of the Keras folds only; the shared `scores` list also holds other models' folds.
keras_scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model.set_weights(initial_weights)
    model.fit(
        X_train,
        y_train,
        batch_size=2048,
        epochs=20,
        verbose=0,
        callbacks=callbacks,
        validation_data=(X_valid, y_valid),
        class_weight=class_weight,
    )

    # Keras returns predictions with shape (n, 1); flatten them so they have the
    # same 1-D shape as the other models' predictions stored in predsLR.
    predsLR.append(model.predict(X_test).ravel())
    s = roc_auc_score(y_valid, model.predict(X_valid).ravel())
    print(f"Best val auc: {s:.4f}")
    keras_scores.append(s)
    scores.append(s)

print(f'mean scores:  {np.mean(keras_scores):.4f}')
# The model as trained on the last fold is kept for the plots further down.
models.append(model)

Just to see the curves:

In [None]:
from sklearn.metrics import roc_curve


In [None]:
def plot_roc_curve(fpr, tpr, label=None, color='blue',model_name='model 1'):
    """Draw one ROC curve on the current matplotlib axes.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates of the curve.
    label : unused
        Accepted for compatibility; the legend entry comes from ``model_name``.
    color : str
        Line colour of the curve.
    model_name : str
        Text shown for this curve in the legend.

    The chance diagonal, axis labels, legend and grid are (re)drawn on every
    call, so several curves can be layered by calling this repeatedly before
    ``plt.show()``.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, linewidth=1, color=color, label=model_name)
    # dashed diagonal = performance of a random classifier
    axes.plot([0, 1], [0, 1], 'k--')
    axes.set_xlabel('false positive rate', fontsize=12)
    axes.set_ylabel('true positive rate', fontsize=12)
    axes.legend(loc="lower right")
    axes.grid(True)

In [None]:
# One colour and legend name per entry of `models`; the order must match the
# order in which the model sections appended to `models`.
colors = ['blue', 'red','green']
model_names = ['Lasso','Catboost','NeuralNet']
for i,model in enumerate(models):
    # NOTE(review): the curves are computed on the full training matrix X, which
    # includes rows each model was trained on, so these AUCs are in-sample and
    # not comparable to the cross-validated scores printed above.
    model_pred = model.predict(X)
    fpr, tpr, thresholds = roc_curve(y,model_pred)
    AUC = roc_auc_score(y, model_pred)
    # legend text: model name followed by its AUC
    model_AUC = f'{model_names[i]} {AUC:.4f}'
                        
    
    plot_roc_curve(fpr, tpr,color=colors[i], model_name=model_AUC)
plt.show()

In [None]:
#with ranking:
from scipy.stats import rankdata
# Convert every stored fold prediction into ranks, so models with different
# output scales can be averaged on a common footing.
rankpreds = [rankdata(fold_pred) for fold_pred in predsLR]
# Count the last five prediction sets twice (adding more NN just to see what happens).
rankpreds.extend(rankpreds[-5:])
# Mean rank per test row, divided by the number of test rows.
finalrank = np.average(np.array(rankpreds), axis=0) / len(X_test)


In [None]:
len(predsLR)

In [None]:
#predLR = np.average(np.array(predsLR),axis=0)
#submission['stroke'] = predLR

In [None]:
submission['stroke'] = finalrank

In [None]:
submission.to_csv('submission.csv', index=False)

from TILII

In [None]:
# Coefficients of models[0] (the first model appended to `models`, i.e. the Lasso
# model of the last CV fold), one row per feature column of X.
coef = pd.DataFrame(models[0].coef_, columns=["LassoCV_score"])
coef["Feature"] = X.columns
# NOTE(review): the coefficients are divided by their signed sum; if positive and
# negative coefficients nearly cancel, this ratio becomes very large - confirm
# that a signed sum (rather than a sum of absolute values) is intended.
coef["Relative score"] = coef["LassoCV_score"] / coef["LassoCV_score"].sum()
coef = coef.sort_values("Relative score", ascending=False)
coef = coef[["Feature", "LassoCV_score", "Relative score"]]

# Horizontal bar chart of the raw coefficients, in the sorted order above.
coef.plot(kind="barh", x="Feature", y="LassoCV_score", legend=False, figsize=(6, 12))
plt.title("Features Coefficients")
plt.xlabel("LassoCV score")

plt.show()

In [None]:

# Feature importances of models[1] (the second model appended to `models`, i.e.
# the CatBoost model of the last CV fold), plotted in ascending order.
feature_importance = models[1].feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
# X_test has the same feature columns as X, so its column names label the bars.
plt.yticks(range(len(sorted_idx)), np.array(X_test.columns)[sorted_idx])
plt.title('Feature Importance CatBoost')
plt.show()


In [None]:
a=[1,2,3,4,5]
b=a+a[-2:]

In [None]:
a[-2:]
