In [71]:
import streamlit as st
from streamlit_timeline import timeline
import json
import numpy as np
from datetime import datetime, timedelta
import pandas as pd
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

In [72]:
df = pd.read_csv('stroke.csv', index_col=0)
display(df)

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,Female,42.0,0,0,Yes,Private,Rural,103.00,40.3,Unknown,0
3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0
...,...,...,...,...,...,...,...,...,...,...,...
15299,Female,22.0,0,0,No,Govt_job,Urban,72.63,19.5,never smoked,0
15300,Female,46.0,1,0,Yes,Private,Urban,101.19,32.1,never smoked,0
15301,Female,75.0,0,0,Yes,Self-employed,Urban,87.69,26.2,never smoked,0
15302,Male,46.0,0,0,Yes,Private,Rural,101.13,22.5,Unknown,0


In [77]:
# Custom imputation using random values between min and max for each column
def random_impute(X):
    for i in range(X.shape[1]):
        col = X[:, i]
        missing = np.isnan(col)
        col_min, col_max = np.nanmin(col), np.nanmax(col)
        col[missing] = np.random.uniform(col_min, col_max, size=missing.sum())
    return X
        
def clean_data(X, imputation_strategy='most_frequent', scaling_method='minmax'):
    # Separate numerical and non-numerical columns
    numerical_cols = X.select_dtypes(include=[np.number]).columns
    categorical_cols = X.select_dtypes(exclude=[np.number]).columns

    # Handle missing values based on the chosen imputation strategy
    if imputation_strategy in ['mean', 'median','most_frequent']:
        imputer = SimpleImputer(strategy=imputation_strategy)
        X_imputed = imputer.fit_transform(X)
    elif imputation_strategy == 'random':        
        X_imputed = random_impute(X.copy())  # Apply random imputation before scaling
    else:
        raise ValueError("Invalid imputation strategy. Choose 'mean', 'median', 'most_frequent' or 'random'.")
    
    # Encode non-numerical columns with integer encoding
    df[categorical_cols] = df[categorical_cols].apply(LabelEncoder().fit_transform)


    # Apply scaling based on the chosen scaling method
    if scaling_method == 'minmax':
        scaler = MinMaxScaler()
    elif scaling_method == 'standard':
        scaler = StandardScaler()
    else:
        raise ValueError("Invalid scaling method. Choose 'minmax' or 'standard'.")
    
    # Fit and transform the data
    X_scaled = scaler.fit_transform(X_imputed)
    X_cleaned = pd.DataFrame.from_records(data=X_scaled, columns=X.columns)
    return X_cleaned

In [83]:
df = clean_data(df)
print(df)

       gender       age  hypertension  heart_disease  ever_married  work_type  \
0         0.5  0.340820           0.0            0.0           1.0       0.50   
1         0.5  0.401855           0.0            0.0           1.0       0.50   
2         0.0  0.511719           0.0            0.0           1.0       0.50   
3         0.5  0.682617           0.0            0.0           1.0       0.50   
4         0.0  0.291992           0.0            0.0           0.0       0.50   
...       ...       ...           ...            ...           ...        ...   
15299     0.0  0.267578           0.0            0.0           0.0       0.00   
15300     0.0  0.560547           1.0            0.0           1.0       0.50   
15301     0.0  0.914551           0.0            0.0           1.0       0.75   
15302     0.5  0.560547           0.0            0.0           1.0       0.50   
15303     0.0  0.169922           0.0            0.0           0.0       0.50   

       Residence_type  avg_

In [79]:
def train_model(model, train_test_split, folds):
    X_train, X_test, y_train, y_test = train_test_split
    metric = model["metric"]
    param_grid = model["params"]
    method = model["method"]

    # Create a GridSearchCV object with cross-validation
    grid_search = GridSearchCV(method, param_grid, cv=folds, n_jobs=-1, scoring=metric, return_train_score=True)

    # Fit the model on the training data
    grid_search.fit(X_train, y_train)
    
    # Evaluate the model on the test set
    test_score = grid_search.score(X_test, y_test)
    
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_, test_score

In [80]:
def ML_Pipeline(train_test_split, models, train_config):
    
    stats = np.zeros(shape=(2,len(models)))
    hyperparams = []
    trained_models = []
    
    folds = train_config["folds"]

    for i,model in enumerate(models):               
        trained_model, best_params, val_score, test_score = train_model(model, train_test_split, folds)
        stats[0,i] = val_score
        stats[1,i] = test_score
        trained_models.append(trained_model)
        hyperparams.append(best_params)
    return stats, trained_models,hyperparams

def get_best_model(stats, trained_models):
    index = np.argmin(stats[1,:])
    best_value = np.min(stats[1,:])
    model= trained_models[index]
    return model,best_value,index

In [81]:
# train and test split
target = "stroke"
df = clean_data(df)
X = df.drop(target, axis=1)
y = df[target]

split = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state=4)

In [88]:
param_grid_logreg = {
    'penalty': ['l1', 'l2'],    # Regularization type
    'C': [0.1, 1, 10],                    # Inverse of regularization strength
    'solver': ['liblinear'],  # Optimization algorithm
    'l1_ratio': [0.5, 0.75, 1]              # For elasticnet, mix of L1 and L2 regularization
}

param_grid_rf = {
    'n_estimators': [100, 200, 500, 1000],           # Number of trees in the forest
    'max_depth': [None, 10, 20],             # Maximum depth of each tree
    'min_samples_split': [2, 5],                 # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],       # Number of features to consider for the best split
    'bootstrap': [True, False],                      # Whether bootstrap samples are used when building trees
    'criterion': ['gini', 'entropy', 'log_loss']     # Function to measure the quality of a split
}


rf = {"method": RandomForestClassifier(), "params": param_grid_rf, "metric": "f1_macro"}
svc = {"method": LogisticRegression(), "params": param_grid_logreg, "metric": "accuracy"}
train_settings = {"folds": 5}

ML_Pipeline(split, [svc, rf], train_settings)



KeyboardInterrupt: 