# HCV data Machine Learning Analysing

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import plotly.express as px
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

from functions import *

## Cleaning Data

Import dataset:

In [3]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")

In [4]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


Check how many NA valuse do we have in the data:

In [5]:
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

because the number of NAs were not many, so we droped all the rows containing NA value:

In [6]:
data = data.dropna()
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [7]:
data.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

Check if the data type of all columns are correct:

In [8]:
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

Transform the "AgeGroup" column into differernt bins and make it numerical:

In [9]:
data = clean_age_column(data)

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['AgeGroup'] = pd.cut(data['age'], bins=[0, 18, 35, 50, 65, 100], labels=[0, 1, 2, 3, 4])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['AgeGroup'] = data['AgeGroup'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=["age"], inplace=True)


Unnamed: 0,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,9046,Male,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,4
2,31112,Male,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,4
3,60182,Female,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,2
4,1665,Female,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,4
5,56669,Male,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1,4


In [10]:
data['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

Transform the "gender" column from categorical to numerical:

In [11]:
# Drop rows where 'gender' is 'Other'
data = data[data['gender'] != 'Other']

In [12]:
# data['gender'] = data['gender'].map({'Male': 0,'Female': 1})
# data['gender'].astype(int)
# data.head()
data= pd.get_dummies(data, columns=['gender', 'ever_married', 'work_type', 'Residence_type'])
data.head()

Unnamed: 0,id,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,9046,0,1,228.69,36.6,formerly smoked,1,4,0,1,0,1,0,0,1,0,0,0,1
2,31112,0,1,105.92,32.5,never smoked,1,4,0,1,0,1,0,0,1,0,0,1,0
3,60182,0,0,171.23,34.4,smokes,1,2,1,0,0,1,0,0,1,0,0,0,1
4,1665,1,0,174.12,24.0,never smoked,1,4,1,0,0,1,0,0,0,1,0,1,0
5,56669,0,0,186.21,29.0,formerly smoked,1,4,0,1,0,1,0,0,1,0,0,0,1


The "Category" column is our Target column. so we want to make sure how to categorize it in the best form.

In [13]:
data["stroke"].unique()

array([1, 0])

In [14]:
data.drop(columns="id", inplace=True)
data.head()

Unnamed: 0,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,0,1,228.69,36.6,formerly smoked,1,4,0,1,0,1,0,0,1,0,0,0,1
2,0,1,105.92,32.5,never smoked,1,4,0,1,0,1,0,0,1,0,0,1,0
3,0,0,171.23,34.4,smokes,1,2,1,0,0,1,0,0,1,0,0,0,1
4,1,0,174.12,24.0,never smoked,1,4,1,0,0,1,0,0,0,1,0,1,0
5,0,0,186.21,29.0,formerly smoked,1,4,0,1,0,1,0,0,1,0,0,0,1


In [15]:
data['smoking_status'] = data['smoking_status'].map({'never smoked': 0,'Unknown': 1,'formerly smoked': 2,'smokes': 3})
data['smoking_status'].astype(int)
data.head()

Unnamed: 0,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,0,1,228.69,36.6,2,1,4,0,1,0,1,0,0,1,0,0,0,1
2,0,1,105.92,32.5,0,1,4,0,1,0,1,0,0,1,0,0,1,0
3,0,0,171.23,34.4,3,1,2,1,0,0,1,0,0,1,0,0,0,1
4,1,0,174.12,24.0,0,1,4,1,0,0,1,0,0,0,1,0,1,0
5,0,0,186.21,29.0,2,1,4,0,1,0,1,0,0,1,0,0,0,1


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4908 entries, 0 to 5109
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   hypertension             4908 non-null   int64  
 1   heart_disease            4908 non-null   int64  
 2   avg_glucose_level        4908 non-null   float64
 3   bmi                      4908 non-null   float64
 4   smoking_status           4908 non-null   int64  
 5   stroke                   4908 non-null   int64  
 6   AgeGroup                 4908 non-null   int64  
 7   gender_Female            4908 non-null   uint8  
 8   gender_Male              4908 non-null   uint8  
 9   ever_married_No          4908 non-null   uint8  
 10  ever_married_Yes         4908 non-null   uint8  
 11  work_type_Govt_job       4908 non-null   uint8  
 12  work_type_Never_worked   4908 non-null   uint8  
 13  work_type_Private        4908 non-null   uint8  
 14  work_type_Self-employed 

In [17]:
data = data.apply(pd.to_numeric, errors='coerce')

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4908 entries, 0 to 5109
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   hypertension             4908 non-null   int64  
 1   heart_disease            4908 non-null   int64  
 2   avg_glucose_level        4908 non-null   float64
 3   bmi                      4908 non-null   float64
 4   smoking_status           4908 non-null   int64  
 5   stroke                   4908 non-null   int64  
 6   AgeGroup                 4908 non-null   int64  
 7   gender_Female            4908 non-null   uint8  
 8   gender_Male              4908 non-null   uint8  
 9   ever_married_No          4908 non-null   uint8  
 10  ever_married_Yes         4908 non-null   uint8  
 11  work_type_Govt_job       4908 non-null   uint8  
 12  work_type_Never_worked   4908 non-null   uint8  
 13  work_type_Private        4908 non-null   uint8  
 14  work_type_Self-employed 

## Feature Selection

In [19]:
# Calculate the correlation matrix
correlation_matrix = np.abs(data.corr())

# Create the heatmap using Plotly Express
fig = px.imshow(correlation_matrix,
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale='RdBu_r',  # Red-Blue diverging color scale
                zmin=-1,
                zmax=1,
                aspect="auto",
                title='Correlation Heatmap of Numerical Variables')

# Update the layout for better readability
fig.update_layout(
    xaxis_title="",
    yaxis_title="",
    xaxis={'side': 'top'},  # Move x-axis labels to the top
    width=800,
    height=700
)

# Add correlation values as text annotations
for i, row in enumerate(correlation_matrix.values):
    for j, value in enumerate(row):
        fig.add_annotation(
            x=correlation_matrix.columns[j],
            y=correlation_matrix.columns[i],
            text=f"{value:.2f}",
            showarrow=False,
            font=dict(size=8)
        )

# Show the plot
fig.show()

## Split Data into Train and Test

In [20]:
features = data.drop(columns = ["stroke"])
target = data["stroke"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

### Standardize The Data

In [22]:
# Standardize the features
X_train_scaled, X_test_scaled = Standardizer(X_train, X_test)

### Normalize The Data

In [23]:
# Normalize the features
X_train_norm, X_test_norm = Normalizer(X_train, X_test)

Based on the result of Heatmap we can drop these two not important columns : 

In [24]:
X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns)
# X_train_reduced = X_train_norm.drop(columns = ["gender","Residence_type"])

X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns)
# X_test_reduced = X_test_norm.drop(columns = ["gender","Residence_type"])

In [25]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)

X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

# SVM

In [26]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

### Original Train and Test Data

In [31]:
# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001]
}

# Initialize the GridSearchCV with the SVM classifier
grid_search = GridSearchCV(estimator=SVC(random_state=42),
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy',  # you can choose other scoring metrics
                           n_jobs=-1)  # use all available cores

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_best)
classification_rep = classification_report(y_test, y_pred_best, output_dict=True)

report_original = pd.DataFrame(classification_rep).transpose()
best_params, best_score, test_accuracy


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



({'C': 0.1, 'gamma': 1}, 0.957718027260498, 0.9562118126272913)

### Normalized Train and Test Data

In [32]:
# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001]
}

# Initialize the GridSearchCV with the SVM classifier
grid_search = GridSearchCV(estimator=SVC(random_state=42),
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy',  # you can choose other scoring metrics
                           n_jobs=-1)  # use all available cores

# Fit the grid search to the training data
grid_search.fit(X_train_norm, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_norm)
test_accuracy = accuracy_score(y_test, y_pred_best)
classification_rep = classification_report(y_test, y_pred_best, output_dict=True)

report_norm = pd.DataFrame(classification_rep).transpose()
best_params, best_score, test_accuracy


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



({'C': 0.1, 'gamma': 1}, 0.957718027260498, 0.9562118126272913)

### Standardized Train and Test Data

In [33]:
# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001]
}

# Initialize the GridSearchCV with the SVM classifier
grid_search = GridSearchCV(estimator=SVC(random_state=42),
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy',  # you can choose other scoring metrics
                           n_jobs=-1)  # use all available cores

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_best)
classification_rep = classification_report(y_test, y_pred_best, output_dict=True)

report_scaled = pd.DataFrame(classification_rep).transpose()
best_params, best_score, test_accuracy


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



({'C': 0.1, 'gamma': 1}, 0.957718027260498, 0.9562118126272913)

In [34]:
combined_report = pd.concat([report_original, report_norm, report_scaled], axis=1)
combined_report
#TODO

Unnamed: 0,precision,recall,f1-score,support,precision.1,recall.1,f1-score.1,support.1,precision.2,recall.2,f1-score.2,support.2
0,0.956212,1.0,0.977616,939.0,0.956212,1.0,0.977616,939.0,0.956212,1.0,0.977616,939.0
1,0.0,0.0,0.0,43.0,0.0,0.0,0.0,43.0,0.0,0.0,0.0,43.0
accuracy,0.956212,0.956212,0.956212,0.956212,0.956212,0.956212,0.956212,0.956212,0.956212,0.956212,0.956212,0.956212
macro avg,0.478106,0.5,0.488808,982.0,0.478106,0.5,0.488808,982.0,0.478106,0.5,0.488808,982.0
weighted avg,0.914341,0.956212,0.934808,982.0,0.914341,0.956212,0.934808,982.0,0.914341,0.956212,0.934808,982.0
