<a href="https://colab.research.google.com/github/pkolakal/IT7103/blob/main/pkolakal_Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Step 1: Importing the Data

In [16]:
# Importing the pandas library for data manipulation
import pandas as pd

# Specify the path to the CSV file containing the stroke dataset
csv_file_path = 'stroke.csv'

# Load the stroke dataset into a pandas DataFrame
stroke_df = pd.read_csv(csv_file_path)

# Display the first few rows of the stroke dataset
stroke_df.head()


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [23]:
# Step 2: Preprocessing the Data

In [19]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define categorical and numerical features in the original dataset
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numerical_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

# Preprocessing pipeline definition
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
            ('scaler', StandardScaler())]), numerical_features),  # Standardize numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])  # One-hot encode categorical features

# Split the data into features (X) and target (y)
X = stroke_df.drop(columns=['id', 'stroke'])  # Features are all columns except 'id' and 'stroke'
y = stroke_df['stroke']  # Target variable is 'stroke'

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% test

# Fit the preprocessor on the training data and transform both training and testing data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Extract transformed feature names for easier interpretation
numerical_features_transformed = numerical_features
categorical_features_transformed = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_feature_names = list(numerical_features_transformed) + list(categorical_features_transformed)

# Convert processed data back to DataFrame for clarity and easier inspection
X_train_df = pd.DataFrame(X_train_processed, columns=all_feature_names)
X_test_df = pd.DataFrame(X_test_processed, columns=all_feature_names)

# Display the first few rows of the processed training data
print("Processed Training Data:")
print(X_train_df.head())


Processed Training Data:
        age  hypertension  heart_disease  avg_glucose_level       bmi  \
0  1.584155     -0.321981      -0.236189           0.135593 -0.058307   
1  0.829708     -0.321981      -0.236189          -0.397457  0.947173   
2 -0.989841     -0.321981      -0.236189          -1.028701  0.612013   
3 -0.546049     -0.321981      -0.236189          -0.893246  0.186618   
4 -0.546049     -0.321981      -0.236189          -1.026290 -1.166912   

   gender_Female  gender_Male  ever_married_No  ever_married_Yes  \
0            0.0          1.0              0.0               1.0   
1            1.0          0.0              0.0               1.0   
2            1.0          0.0              1.0               0.0   
3            0.0          1.0              0.0               1.0   
4            1.0          0.0              1.0               0.0   

   work_type_Govt_job  work_type_Never_worked  work_type_Private  \
0                 0.0                     0.0              

In [None]:
# Step 3: Model Training and Evaluation - Class Weight Adjustment

In [24]:
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression model
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Classifier model
from sklearn.svm import SVC  # Import Support Vector Classifier (SVC) model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score  # Import evaluation metrics

# Initialize the models with class weights
log_reg_weighted = LogisticRegression(class_weight='balanced', random_state=42)  # Logistic Regression with balanced class weights
random_forest_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)  # Random Forest Classifier with balanced class weights
svm_weighted = SVC(class_weight='balanced', probability=True, random_state=42)  # SVC with balanced class weights and probability estimation

# Train the models
log_reg_weighted.fit(X_train_processed, y_train)  # Train Logistic Regression model
random_forest_weighted.fit(X_train_processed, y_train)  # Train Random Forest Classifier model
svm_weighted.fit(X_train_processed, y_train)  # Train SVC model

# Make predictions
y_pred_log_reg_weighted = log_reg_weighted.predict(X_test_processed)  # Predict using Logistic Regression model
y_pred_random_forest_weighted = random_forest_weighted.predict(X_test_processed)  # Predict using Random Forest Classifier model
y_pred_svm_weighted = svm_weighted.predict(X_test_processed)  # Predict using SVC model

# Probabilities for ROC AUC
y_prob_log_reg_weighted = log_reg_weighted.predict_proba(X_test_processed)[:, 1]  # Predict probabilities (positive class) using Logistic Regression model
y_prob_random_forest_weighted = random_forest_weighted.predict_proba(X_test_processed)[:, 1]  # Predict probabilities (positive class) using Random Forest Classifier model
y_prob_svm_weighted = svm_weighted.predict_proba(X_test_processed)[:, 1]  # Predict probabilities (positive class) using SVC model

# Evaluate the models
def evaluate_model(y_test, y_pred, y_prob):
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    precision = precision_score(y_test, y_pred, zero_division=0)  # Calculate precision
    recall = recall_score(y_test, y_pred)  # Calculate recall
    f1 = f1_score(y_test, y_pred)  # Calculate F1 score
    roc_auc = roc_auc_score(y_test, y_prob)  # Calculate ROC AUC score
    return accuracy, precision, recall, f1, roc_auc

# Evaluate each model on the test set
results_log_reg_weighted = evaluate_model(y_test, y_pred_log_reg_weighted, y_prob_log_reg_weighted)  # Evaluate Logistic Regression model
results_random_forest_weighted = evaluate_model(y_test, y_pred_random_forest_weighted, y_prob_random_forest_weighted)  # Evaluate Random Forest Classifier model
results_svm_weighted = evaluate_model(y_test, y_pred_svm_weighted, y_prob_svm_weighted)  # Evaluate SVC model

# Display the evaluation results as a DataFrame
results_weighted = pd.DataFrame({
    'Model': ['Logistic Regression (Weighted)', 'Random Forest (Weighted)', 'SVM (Weighted)'],
    'Accuracy': [results_log_reg_weighted[0], results_random_forest_weighted[0], results_svm_weighted[0]],
    'Precision': [results_log_reg_weighted[1], results_random_forest_weighted[1], results_svm_weighted[1]],
    'Recall': [results_log_reg_weighted[2], results_random_forest_weighted[2], results_svm_weighted[2]],
    'F1 Score': [results_log_reg_weighted[3], results_random_forest_weighted[3], results_svm_weighted[3]],
    'ROC AUC': [results_log_reg_weighted[4], results_random_forest_weighted[4], results_svm_weighted[4]]
})

# Print the evaluation results
print(results_weighted)


                            Model  Accuracy  Precision    Recall  F1 Score  \
0  Logistic Regression (Weighted)  0.750489   0.168385  0.790323  0.277620   
1        Random Forest (Weighted)  0.939335   0.000000  0.000000  0.000000   
2                  SVM (Weighted)  0.770059   0.158103  0.645161  0.253968   

    ROC AUC  
0  0.852907  
1  0.803805  
2  0.792288  


In [None]:
# Step 4: Resampling Techniques : the Synthetic Minority Over-sampling Technique (SMOTE) to balance the dataset and then train the models.

In [25]:
from imblearn.over_sampling import SMOTE  # Import SMOTE for oversampling
from sklearn.pipeline import make_pipeline  # Import make_pipeline for creating pipelines

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)  # Initialize SMOTE with random state for reproducibility
X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y_train)  # Apply SMOTE to X_train_processed and y_train

# Initialize the models
log_reg_smote = LogisticRegression(random_state=42)  # Initialize Logistic Regression model
random_forest_smote = RandomForestClassifier(random_state=42)  # Initialize Random Forest Classifier model
svm_smote = SVC(probability=True, random_state=42)  # Initialize SVM model with probability estimation

# Train the models
log_reg_smote.fit(X_train_smote, y_train_smote)  # Train Logistic Regression model on SMOTE-resampled data
random_forest_smote.fit(X_train_smote, y_train_smote)  # Train Random Forest Classifier model on SMOTE-resampled data
svm_smote.fit(X_train_smote, y_train_smote)  # Train SVM model on SMOTE-resampled data

# Make predictions
y_pred_log_reg_smote = log_reg_smote.predict(X_test_processed)  # Predict using Logistic Regression model
y_pred_random_forest_smote = random_forest_smote.predict(X_test_processed)  # Predict using Random Forest Classifier model
y_pred_svm_smote = svm_smote.predict(X_test_processed)  # Predict using SVM model

# Probabilities for ROC AUC
y_prob_log_reg_smote = log_reg_smote.predict_proba(X_test_processed)[:, 1]  # Predict probabilities (positive class) using Logistic Regression model
y_prob_random_forest_smote = random_forest_smote.predict_proba(X_test_processed)[:, 1]  # Predict probabilities (positive class) using Random Forest Classifier model
y_prob_svm_smote = svm_smote.predict_proba(X_test_processed)[:, 1]  # Predict probabilities (positive class) using SVM model

# Evaluate the models
results_log_reg_smote = evaluate_model(y_test, y_pred_log_reg_smote, y_prob_log_reg_smote)  # Evaluate Logistic Regression model
results_random_forest_smote = evaluate_model(y_test, y_pred_random_forest_smote, y_prob_random_forest_smote)  # Evaluate Random Forest Classifier model
results_svm_smote = evaluate_model(y_test, y_pred_svm_smote, y_prob_svm_smote)  # Evaluate SVM model

# Display the results
results_smote = pd.DataFrame({
    'Model': ['Logistic Regression (SMOTE)', 'Random Forest (SMOTE)', 'SVM (SMOTE)'],
    'Accuracy': [results_log_reg_smote[0], results_random_forest_smote[0], results_svm_smote[0]],
    'Precision': [results_log_reg_smote[1], results_random_forest_smote[1], results_svm_smote[1]],
    'Recall': [results_log_reg_smote[2], results_random_forest_smote[2], results_svm_smote[2]],
    'F1 Score': [results_log_reg_smote[3], results_random_forest_smote[3], results_svm_smote[3]],
    'ROC AUC': [results_log_reg_smote[4], results_random_forest_smote[4], results_svm_smote[4]]
})

print(results_smote)


                         Model  Accuracy  Precision    Recall  F1 Score  \
0  Logistic Regression (SMOTE)  0.753425   0.174658  0.822581  0.288136   
1        Random Forest (SMOTE)  0.925636   0.181818  0.064516  0.095238   
2                  SVM (SMOTE)  0.804305   0.161765  0.532258  0.248120   

    ROC AUC  
0  0.850521  
1  0.779561  
2  0.765054  
