In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import joblib
import os




In [8]:
# 1. Load Data
# Read the stroke dataset from the working directory into `df_stroke`.
# Every later cell reads `df_stroke`, so a failed load is reported and then
# re-raised: stopping here is clearer than an unrelated NameError further down.
try:
    df_stroke = pd.read_csv('stroke_data.csv')
except FileNotFoundError:
    print("File stroke_data.csv not found.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    # Only reached when read_csv returned without raising.
    print("Stroke data loaded successfully.")


Stroke data loaded successfully.


In [5]:
# 2. EDA
# Quick look at the loaded frame: first rows, per-column dtype / non-null
# summary, missing values per column, and the counts of each `stroke` value.
print(df_stroke.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df_stroke.info()
print(df_stroke.isnull().sum())
print(df_stroke['stroke'].value_counts())



   sex   age  hypertension  heart_disease  ever_married  work_type  \
0  1.0  63.0             0              1             1          4   
1  1.0  42.0             0              1             1          4   
2  0.0  61.0             0              0             1          4   
3  1.0  41.0             1              0             1          3   
4  1.0  85.0             0              0             1          4   

   Residence_type  avg_glucose_level   bmi  smoking_status  stroke  
0               1             228.69  36.6               1       1  
1               0             105.92  32.5               0       1  
2               1             171.23  34.4               1       1  
3               0             174.12  24.0               0       1  
4               1             186.21  29.0               1       1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40910 entries, 0 to 40909
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ---

In [6]:
# 3. Preprocessing
# Impute `sex`, separate features from the target, split into train/test, and
# only then fit the scaler / one-hot encoder -- on the training rows alone.

# Fill missing `sex` values with the most frequent value of the column.
# NOTE(review): the mode is still computed over all rows (train and test);
# move this after the split if strict isolation of the test set is required.
if 'sex' in df_stroke.columns and df_stroke['sex'].isnull().any():
    mode_sex = df_stroke['sex'].mode()[0]
    df_stroke['sex'] = df_stroke['sex'].fillna(mode_sex)

# Features are every column except the `stroke` target.
X_stroke = df_stroke.drop('stroke', axis=1)
y_stroke = df_stroke['stroke']

categorical_cols_stroke = ['sex', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numerical_cols_stroke = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

preprocessor_stroke = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols_stroke),
        # Categories not seen during fit are encoded as all zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_stroke)
    ],
    remainder='passthrough',
    # Always return a dense array: the Keras model below is fed this matrix directly.
    sparse_threshold=0
)

# Split the raw rows first so that the scaler's mean/variance and the encoder's
# category lists are learned from the training rows only. Fitting on the full
# dataset before splitting would leak test-set statistics into training.
X_train_raw_stroke, X_test_raw_stroke, y_train_stroke, y_test_stroke = train_test_split(
    X_stroke, y_stroke, test_size=0.2, random_state=42, stratify=y_stroke
)

X_train_stroke = preprocessor_stroke.fit_transform(X_train_raw_stroke)
X_test_stroke = preprocessor_stroke.transform(X_test_raw_stroke)

# Kept so any cell that still refers to the full processed matrix keeps working;
# it is produced with the train-fitted preprocessor (transform only, no refit).
X_processed_stroke = preprocessor_stroke.transform(X_stroke)

print(f"Stroke X_train shape: {X_train_stroke.shape}")
print(f"Stroke X_test shape: {X_test_stroke.shape}")

Stroke X_train shape: (32728, 18)
Stroke X_test shape: (8182, 18)


In [None]:
# 4. Build and train the model
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling repeat across
# Restart & Run All. (This call alone does not guarantee bit-identical results
# on every hardware backend.)
tf.keras.utils.set_random_seed(42)

# Fully connected binary classifier: three ReLU hidden layers (128 -> 64 -> 32),
# dropout of 0.3 after the first two, and a single sigmoid output unit.
model_stroke = Sequential([
    # Input width is taken from the preprocessed training matrix.
    Dense(128, activation='relu', input_shape=(X_train_stroke.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output; accuracy is the only
# tracked metric, so evaluate() returns exactly [loss, accuracy].
model_stroke.compile(optimizer=Adam(learning_rate=0.001),
                     loss='binary_crossentropy',
                     metrics=['accuracy'])

print("\nStroke Model Summary:")
model_stroke.summary()

print("\nTraining Stroke Model...")
# validation_split=0.1 holds out 10% of the training data for validation;
# verbose=0 suppresses the per-epoch progress output.
history_stroke = model_stroke.fit(
    X_train_stroke, y_train_stroke,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    verbose=0
)

# Report the metrics recorded for the final epoch.
print(f"Stroke Training Accuracy: {history_stroke.history['accuracy'][-1]:.4f}")
print(f"Stroke Validation Accuracy: {history_stroke.history['val_accuracy'][-1]:.4f}")



Stroke Model Summary:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               2560      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 12929 (50.50 KB)
T

In [None]:
# 7. Evaluate the model
# Score the trained network on the held-out test split. evaluate() returns the
# loss followed by the compiled metric, which are unpacked into the two names.
test_scores_stroke = model_stroke.evaluate(X_test_stroke, y_test_stroke, verbose=0)
loss_stroke, accuracy_stroke = test_scores_stroke
print(
    f"\nStroke Test Loss: {loss_stroke:.4f}",
    f"Stroke Test Accuracy: {accuracy_stroke:.4f}",
    sep="\n",
)

# 8. Save the model
# Both artefacts live under ./saved_models: the Keras model in its own
# directory, the fitted preprocessor as a joblib pickle next to it.
model_dir_stroke = './saved_models/stroke_model'
preprocessor_stroke_path = './saved_models/stroke_preprocessor.pkl'

# makedirs also creates the parent ./saved_models directory that the
# preprocessor file is written into below.
os.makedirs(model_dir_stroke, exist_ok=True)
model_stroke.save(model_dir_stroke)
print(f"Stroke model saved to {model_dir_stroke}")

joblib.dump(preprocessor_stroke, preprocessor_stroke_path)
print(f"Stroke preprocessor saved to {preprocessor_stroke_path}")


Stroke Test Loss: 0.1950
Stroke Test Accuracy: 0.9190
INFO:tensorflow:Assets written to: ./saved_models/stroke_model\assets


INFO:tensorflow:Assets written to: ./saved_models/stroke_model\assets


Stroke model saved to ./saved_models/stroke_model
Stroke preprocessor saved to ./saved_models/stroke_preprocessor.pkl
