# Hearth Care Stroke data Machine Learning Analysing

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import plotly.express as px
import numpy as np
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from functions import *

## Cleaning Data

Import dataset:

In [2]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")

In [3]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


Check how many NA valuse do we have in the data:

In [4]:
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

because the number of NAs were not many, so we droped all the rows containing NA value:

In [5]:
data = data.dropna()
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [6]:
data.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

Check if the data type of all columns are correct:

In [7]:
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

### EDA for data exploration

In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def eda_plots(data):
    # Setting up the plot style
    sns.set(style="whitegrid")

    # Loop through each column
    for column in data.columns:
        print(f"\nPlotting for Column: {column}")
        print("-" * 40)

        # Categorical Data (includes object type or columns with fewer than 20 unique values)
        if data[column].dtype == 'object' or data[column].nunique() < 20:
            plt.figure(figsize=(10, 6))
            sns.countplot(x=data[column], palette="Set2")
            plt.title(f'Distribution of {column}')
            plt.xlabel(column)
            plt.ylabel('Count')
            plt.xticks(rotation=45)
            plt.show()

        # Numerical Data
        elif np.issubdtype(data[column].dtype, np.number):
            # Histogram with KDE
            plt.figure(figsize=(10, 6))
            sns.histplot(data[column].dropna(), kde=True, color="blue", bins=30)
            plt.title(f'Distribution of {column}')
            plt.xlabel(column)
            plt.ylabel('Frequency')
            plt.show()

            # Boxplot for detecting outliers
            plt.figure(figsize=(10, 6))
            sns.boxplot(x=data[column], color="lightblue")
            plt.title(f'Boxplot of {column}')
            plt.xlabel(column)
            plt.show()

        print("\n" + "="*80)

# Example usage
# eda_plots(data)


Transform the "AgeGroup" column into differernt bins and make it numerical:

In [9]:
data = clean_age_column(data)

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['AgeGroup'] = pd.cut(data['age'], bins=[0, 18, 35, 50, 65, 100], labels=[0, 1, 2, 3, 4])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['AgeGroup'] = data['AgeGroup'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=["age"], inplace=True)


Unnamed: 0,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,9046,Male,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,4
2,31112,Male,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,4
3,60182,Female,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,2
4,1665,Female,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,4
5,56669,Male,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1,4


In [10]:
data['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

Transform the "gender" column from categorical to numerical:

In [11]:
# Drop rows where 'gender' is 'Other'
data = data[data['gender'] != 'Other']

In [12]:
data= pd.get_dummies(data, columns=['gender', 'ever_married', 'work_type', 'Residence_type'])
data.head()

Unnamed: 0,id,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,9046,0,1,228.69,36.6,formerly smoked,1,4,0,1,0,1,0,0,1,0,0,0,1
2,31112,0,1,105.92,32.5,never smoked,1,4,0,1,0,1,0,0,1,0,0,1,0
3,60182,0,0,171.23,34.4,smokes,1,2,1,0,0,1,0,0,1,0,0,0,1
4,1665,1,0,174.12,24.0,never smoked,1,4,1,0,0,1,0,0,0,1,0,1,0
5,56669,0,0,186.21,29.0,formerly smoked,1,4,0,1,0,1,0,0,1,0,0,0,1


The "Category" column is our Target column. so we want to make sure how to categorize it in the best form.

In [13]:
data["stroke"].unique()

array([1, 0])

In [14]:
data.drop(columns="id", inplace=True)
data.head()

Unnamed: 0,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,0,1,228.69,36.6,formerly smoked,1,4,0,1,0,1,0,0,1,0,0,0,1
2,0,1,105.92,32.5,never smoked,1,4,0,1,0,1,0,0,1,0,0,1,0
3,0,0,171.23,34.4,smokes,1,2,1,0,0,1,0,0,1,0,0,0,1
4,1,0,174.12,24.0,never smoked,1,4,1,0,0,1,0,0,0,1,0,1,0
5,0,0,186.21,29.0,formerly smoked,1,4,0,1,0,1,0,0,1,0,0,0,1


In [15]:
data["smoking_status"].value_counts()

never smoked       1852
Unknown            1483
formerly smoked     836
smokes              737
Name: smoking_status, dtype: int64

In [16]:
data['smoking_status'] = data['smoking_status'].map({'never smoked': 0,'Unknown': 1,'formerly smoked': 2,'smokes': 3})
data['smoking_status'].astype(int)
data.head()

Unnamed: 0,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,0,1,228.69,36.6,2,1,4,0,1,0,1,0,0,1,0,0,0,1
2,0,1,105.92,32.5,0,1,4,0,1,0,1,0,0,1,0,0,1,0
3,0,0,171.23,34.4,3,1,2,1,0,0,1,0,0,1,0,0,0,1
4,1,0,174.12,24.0,0,1,4,1,0,0,1,0,0,0,1,0,1,0
5,0,0,186.21,29.0,2,1,4,0,1,0,1,0,0,1,0,0,0,1


In [17]:
# change the data type of the 'bmi' column to float
data = data.apply(pd.to_numeric, errors='coerce')

## Feature Selection

In [18]:
# Calculate the correlation matrix
correlation_matrix = np.abs(data.corr())

# Create the heatmap using Plotly Express
fig = px.imshow(correlation_matrix,
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale='RdBu_r',  # Red-Blue diverging color scale
                zmin=-1,
                zmax=1,
                aspect="auto",
                title='Correlation Heatmap of Numerical Variables')

# Update the layout for better readability
fig.update_layout(
    xaxis_title="",
    yaxis_title="",
    xaxis={'side': 'top'},  # Move x-axis labels to the top
    width=800,
    height=700
)

# Add correlation values as text annotations
for i, row in enumerate(correlation_matrix.values):
    for j, value in enumerate(row):
        fig.add_annotation(
            x=correlation_matrix.columns[j],
            y=correlation_matrix.columns[i],
            text=f"{value:.2f}",
            showarrow=False,
            font=dict(size=8)
        )

# Show the plot
fig.show()

In [19]:
data.columns

Index(['hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke', 'AgeGroup', 'gender_Female', 'gender_Male',
       'ever_married_No', 'ever_married_Yes', 'work_type_Govt_job',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban'],
      dtype='object')

In [20]:
data = data.drop(['gender_Female', 'gender_Male',
       'ever_married_No', 'ever_married_Yes'], axis=1)

## Split Data into Train and Test

In [21]:
features = data.drop(columns = ["stroke"])
target = data["stroke"]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

### Standardize The Data

In [23]:
# Standardize the features
X_train_scaled, X_test_scaled = Standardizer(X_train, X_test)

### Normalize The Data

In [24]:
# Normalize the features
X_train_norm, X_test_norm = Normalizer(X_train, X_test)

Based on the result of Heatmap we can drop these two not important columns : 

In [25]:
X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns)
# X_train_reduced = X_train_norm.drop(columns = ["gender","Residence_type"])

X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns)
# X_test_reduced = X_test_norm.drop(columns = ["gender","Residence_type"])

In [26]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)

X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

## Modeling

#### Define and Evaluate Models

In [27]:
models = {
    'Random Forest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'max_depth': [None, 10, 20]
    }),
    'KNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7]
    }),
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=42), {
        'C': [0.1, 1, 10]
    }),
    'SVM': (SVC(probability=True, random_state=42), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    }),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [None, 10, 20]
    })
}


In [36]:
ensemble_models = {
    'Bagging with Random Forest': (BaggingClassifier(estimator=RandomForestClassifier(random_state=42), random_state=42), {
        'n_estimators': [10, 20],
        'base_estimator__n_estimators': [50, 100],
        'base_estimator__max_depth': [None, 10, 20],
        'max_samples': [0.8, 1.0],
        'max_features': [0.8, 1.0]
    }),
    'Bagging with KNN': (BaggingClassifier(estimator=KNeighborsClassifier(), random_state=42), {
        'n_estimators': [10, 20],
        'base_estimator__n_neighbors': [3, 5, 7],
        'max_samples': [0.8, 1.0],
        'max_features': [0.8, 1.0]
    }),
    'Bagging with Logistic Regression': (BaggingClassifier(estimator=LogisticRegression(max_iter=1000, random_state=42), random_state=42), {
        'n_estimators': [10, 20],
        'base_estimator__C': [0.1, 1, 10],
        'max_samples': [0.8, 1.0],
        'max_features': [0.8, 1.0]
    }),
    'Bagging with SVM': (BaggingClassifier(estimator=SVC(probability=True, random_state=42), random_state=42), {
        'n_estimators': [10, 20],
        'base_estimator__C': [0.1, 1, 10],
        'base_estimator__kernel': ['linear', 'rbf'],
        'max_samples': [0.8, 1.0],
        'max_features': [0.8, 1.0]
    }),
    'Bagging with Decision Tree': (BaggingClassifier(estimator=DecisionTreeClassifier(random_state=42), random_state=42), {
        'n_estimators': [10, 20],
        'base_estimator__max_depth': [None, 10, 20],
        'max_samples': [0.8, 1.0],
        'max_features': [0.8, 1.0]
    }),
    'AdaBoost with Decision Tree Stump': (AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1, random_state=42), random_state=42), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1, 1]
    }),
    'AdaBoost with Logistic Regression': (AdaBoostClassifier(estimator=LogisticRegression(max_iter=1000, random_state=42), random_state=42), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1, 1]
    }),
    'AdaBoost with KNN': (AdaBoostClassifier(estimator=KNeighborsClassifier(), random_state=42), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1, 1]
    }),
    'Gradient Boosting with Decision Tree': (GradientBoostingClassifier(init=DecisionTreeClassifier(random_state=42), random_state=42), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [3, 5, 7]
    })
}


#### Function for preprocessing and evalution

In [37]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def preprocess_and_evaluate(model, model_name, param_distributions, X_train, y_train, X_test, y_test, scaler, n_iter=100):
    # Apply scaling if needed
    if scaler:
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled, X_test_scaled = X_train, X_test

    # Set up RandomizedSearchCV
    random_search = RandomizedSearchCV(model, param_distributions, n_iter=n_iter, cv=5, n_jobs=-1, scoring='accuracy', random_state=42)
    random_search.fit(X_train_scaled, y_train)

    # Get the best model
    best_model = random_search.best_estimator_

    # Predict and evaluate
    y_pred = best_model.predict(X_test_scaled)
    result = {
        'Model': model_name,
        'Scaler': scaler.__class__.__name__ if scaler else 'None',
        'Best Params': random_search.best_params_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted')
    }
    
    return result


#### Define Model Evaluation Function for models without ensembles

In [55]:
def evaluate_all_models(models, X_train, y_train, X_test, y_test):
    results = []
    scalers = [None, StandardScaler(), MinMaxScaler()]
    
    for model_name, (model, param_grid) in models.items():
        for scaler in scalers:
            result = preprocess_and_evaluate(model, model_name, param_grid, X_train, y_train, X_test, y_test, scaler)
            results.append(result)
    
    results_df = pd.DataFrame(results)
    results_df.to_csv('individual_model_results.csv', index=False)
    print(results_df)
    return results_df

# Evaluate individual models
individual_results_df = evaluate_all_models(models, X_train, y_train, X_test, y_test)



The total space of parameters 6 is smaller than n_iter=100. Running 6 iterations. For exhaustive searches, use GridSearchCV.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The total space of parameters 6 is smaller than n_iter=100. Running 6 iterations. For exhaustive searches, use GridSearchCV.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The total space of parameters 6 is smaller than n_iter=100. Running 6 iterations. For exhaustive searches, use GridSearchCV.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The total space of parameters 3 is smaller than n_iter=100. Running 3 iterations. For exhaustive searches, use GridSearchCV.


Precision is ill-defined and being set to 0.0 in labels with no pred

#### Define Model Evaluation Function for models with ensembles

In [31]:
def evaluate_all_ensemble_models(ensemble_models, X_train, y_train, X_test, y_test):
    results = []
    scalers = [None, StandardScaler(), MinMaxScaler()]
    
    for model_name, (model, param_grid) in ensemble_models.items():
        for scaler in scalers:
            result = preprocess_and_evaluate(model, model_name, param_grid, X_train, y_train, X_test, y_test, scaler)
            results.append(result)
    
    results_df = pd.DataFrame(results)
    results_df.to_csv('ensemble_model_results.csv', index=False)
    print(results_df)
    return results_df

# Evaluate ensemble models
ensemble_results_df = evaluate_all_ensemble_models(ensemble_models, X_train, y_train, X_test, y_test)



The total space of parameters 48 is smaller than n_iter=100. Running 48 iterations. For exhaustive searches, use GridSearchCV.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The total space of parameters 48 is smaller than n_iter=100. Running 48 iterations. For exhaustive searches, use GridSearchCV.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The total space of parameters 48 is smaller than n_iter=100. Running 48 iterations. For exhaustive searches, use GridSearchCV.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


Precision is ill-defined and being set to 0.0 in labels with no

#### Save results of models as csv and dataframe and Choose the Best Model

In [None]:
# Load individual and ensemble results
individual_results_df = pd.read_csv('individual_model_results.csv')
ensemble_results_df = pd.read_csv('ensemble_model_results.csv')

# Combine results
combined_results_df = pd.concat([individual_results_df, ensemble_results_df], ignore_index=True)

# Save combined results
combined_results_df.to_csv('combined_model_results.csv', index=False)
print(combined_results_df)

# Choose the best model based on a chosen metric, e.g., highest F1 Score
best_model_result = combined_results_df.loc[combined_results_df['F1 Score'].idxmax()]
print("Best Model:")
print(best_model_result)


#### Make a sample dataframe to test your model

In [None]:
sample_data = {
    'id': [10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10009, 10010],
    'gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'age': [45, 34, 56, 62, 50, 41, 37, 55, 70, 65],
    'hypertension': [0, 0, 1, 0, 1, 0, 0, 1, 0, 1],
    'heart_disease': [0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
    'ever_married': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes'],
    'work_type': ['Private', 'Self-employed', 'Govt_job', 'Private', 'Private', 'Self-employed', 'Private', 'Govt_job', 'Private', 'Private'],
    'Residence_type': ['Urban', 'Rural', 'Urban', 'Urban', 'Rural', 'Urban', 'Urban', 'Rural', 'Urban', 'Rural'],
    'avg_glucose_level': [140.55, 95.30, 180.50, 210.30, 125.45, 150.70, 110.20, 95.00, 200.80, 180.00],
    'bmi': [30, 22, 28, 27, 32, 29, 31, 23, 30, 26],
    'smoking_status': ['never smoked', 'smokes', 'formerly smoked', 'never smoked', 'smokes', 'never smoked', 'formerly smoked', 'never smoked', 'smokes', 'formerly smoked'],
    'stroke': [0, 0, 1, 0, 1, 1, 0, 1, 0, 1]
}

# Create DataFrame
df_sample = pd.DataFrame(sample_data)

# Save to CSV
df_sample.to_csv('sample_data.csv', index=False)

# Visualization Performance of different Models

In [52]:
import pandas as pd
import plotly.express as px

# Assuming 'combined_results_df' contains the results with columns like 'Model', 'Accuracy', 'Precision', etc.
combined_results_df = pd.read_csv("individual_model_results.csv")

# Melt the dataframe to long format for easy plotting with Plotly
results_melted_df = combined_results_df.melt(id_vars=['Model', 'Scaler'], 
                                    value_vars=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
                                    var_name='Metric', value_name='Value')

# Create the Plotly bar chart
fig = px.bar(results_melted_df, x='Metric', y='Value', color='Model', barmode='group',
             facet_col='Scaler', title='Model Performance Metrics')

# Update layout for better visualization
fig.update_layout(xaxis_title='Metric',
                  yaxis_title='Value',
                  legend_title='Model',
                  yaxis=dict(range=[0.9, 1]))  # Scaled y-axis

# Show the figure
fig.show()


-----------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------

In [53]:
import numpy as np

def plot_feature_importance(model, feature_names):
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(10, 6))
        plt.title("Feature Importances")
        plt.bar(range(len(feature_names)), importances[indices], align="center")
        plt.xticks(range(len(feature_names)), np.array(feature_names)[indices], rotation=90)
        plt.xlim([-1, len(feature_names)])
        plt.xlabel('Feature')
        plt.ylabel('Importance')
        plt.tight_layout()
        plt.show()

# Assuming `best_model` is the model with feature importance
plot_feature_importance(best_model, X_train.columns)


NameError: name 'best_model' is not defined

In [54]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, classes):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

# Assuming `y_test` and `y_pred` are your true and predicted labels, and `class_names` is a list of class names
plot_confusion_matrix(y_test, y_pred, class_names)


NameError: name 'y_pred' is not defined

In [None]:
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_true, y_score):
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')
    plt.show()

# Assuming `y_test` and `y_score` (model's prediction probabilities) are available
plot_roc_curve(y_test, y_score)


NameError: name 'y_score' is not defined

In [None]:
# Summarize the best model results
summary_df = combined_results_df.loc[combined_results_df['Accuracy'].idxmax()]
print("Summary of Best Model:")
print(summary_df)
