# Statistical Analysis

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy
import PIL
import requests
import plotly as ply
from sklearn.impute import KNNImputer
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu


In [3]:
# Import Dataset
# the relative path of the stroke csv dataset is saved into the variable data for ease of use
data = '/Users/marylopez/Documents/DSI/stroke-prediction/data/stroke.csv'
#/Users/marylopez/Documents/DSI/stroke-prediction/data

# The data is read and saved in a variable (stroke)
stroke = pd.read_csv(data)

stroke = pd.DataFrame(stroke)


stroke

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
# data standardization - chnaging the uppercase 'R' to lowercase 'r' 
stroke = stroke.rename(columns={"Residence_type": "residence_type"})


In [5]:
# bmi imputation with KNN
#bmi
#building the KNN regression model for imputation.
stroke_copy_bmi = stroke.copy() # df to handle bmi

In [6]:
# Imputation Using KNN = 9

# Convert categorical column 'gender' to numeric values (Male=1, Female=0, Other=2)
stroke_copy_bmi['gender'] = stroke_copy_bmi['gender'].map({'Male':1, 'Female':0, 'Other':2})

# Select only numerical columns for KNN imputation
X = stroke_copy_bmi[['gender', 'age', 'hypertension', 'heart_disease', 
                     'avg_glucose_level', 'bmi', 'stroke']]

# Save mask of NaN positions in bmi (optional, useful for checking imputations later)
nan_mask = X['bmi'].isna()

# Apply KNN imputer with n_neighbors=9
imputer = KNNImputer(n_neighbors=9, weights="uniform")
X_imputed = imputer.fit_transform(X)

# Convert back to DataFrame with same columns and index
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

# Replace the original bmi column with the imputed one
stroke_copy_bmi['bmi'] = X_imputed_df['bmi']



In [7]:
# Create a copy of the DataFrame with processed values
stroke_copy_processed = stroke_copy_bmi.copy()

# Map numeric values back to categorical labels
stroke_copy_processed['gender'] = stroke_copy_processed['gender'].map({1:'Male', 0:'Female', 2:'Other'})



In [8]:
# Create subcategories for age, bmi and glucose average:
# Define the bins (boundaries) of x-axis categories
bins = [0, 14, 25, 65, float('inf')]

# Define the labels for each bin
labels = ['Children', 'Youth', 'Adult', 'Senior']

# Create a new column for age categories
stroke_copy_processed['age_category'] = pd.cut(
    stroke_copy_processed['age'],
    bins=bins,
    labels=labels,
    right=False
)

# risk factors: 
# df copy to graph risk factors processed in patients with and without stroke: smoke, BMI and Glucose

#risk_f_df = stroke_copy_processed[['avg_glucose_level'	, 'bmi', 'smoking_status', 'stroke']].copy()

# created categories: 

#BMI categories: 
# Define the bins (boundaries) of x-axis categories
bins = [ 0, 18.5, 25, 30, float('inf')]

# Define the labels for each bin
labels = [ 'Underweight','Healthy Weight', 'Overweight', 'Obesity']

# Create a new column for BMI category type
stroke_copy_processed['bmi_category'] = pd.cut(
    stroke_copy_processed['bmi'],
      bins=bins, 
      labels=labels, 
      right=False)


# Glcuose average categories: 
# Define the bins (boundaries) of x-axis categories
bins = [0, 70, 140, 200, float('inf')]

# Define the labels for each bin
labels = ['Low', 'Healthy', 'Pre-Diabetic', 'Diabetic']

# Create a new column for glucose categories
stroke_copy_processed['glucose_category'] = pd.cut(
    stroke_copy_processed['avg_glucose_level'],
    bins=bins,
    labels=labels,
    right=False
)


In [9]:
stroke_copy_processed.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_category,bmi_category,glucose_category
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,Senior,Obesity,Diabetic
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,34.388889,never smoked,1,Adult,Obesity,Diabetic
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,Senior,Obesity,Healthy
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,Adult,Obesity,Pre-Diabetic
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,Senior,Healthy Weight,Pre-Diabetic


In [10]:
# Show all features (column names) in the dataset
print(stroke_copy_processed.columns.tolist())



['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke', 'age_category', 'bmi_category', 'glucose_category']


In [12]:
# Divide groups: Stroke and non stroke

stroke = stroke_copy_processed[stroke_copy_processed['stroke']==1]
non_stroke = stroke = stroke_copy_processed[stroke_copy_processed['stroke']==0]

In [13]:
# Variables to include: gender, age, hypertension, hear_disease , ever_married, work_type, residence_type, avg_glucose_level, bmi, smoking_status
# Numerical Variables: age (categories: 4), hypertension, hear_disease,avg_glucose_level (categories: 4), bmi (categories: 4)
# Categorical Variables: gender, ever_married, work_type, residence_type, smoking_status



In [14]:
# Logistic Regression
# ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke', 'age_category', 'bmi_category', 'glucose_category']

In [15]:
'age_category', 'bmi_category', 'glucose_category'

('age_category', 'bmi_category', 'glucose_category')

In [16]:
# Statistical analysis with Python pipeline, multivariable: 
# Categorical: Chi-square
# Continuas: t‑test/Mann‑Whitney

# Define your dataset
df = stroke_copy_processed  

# Dependent variable
target = 'stroke'

# List of variables to test
variables = ['gender', 'ever_married',
             'smoking_status', 'glucose_category', 'hypertension',
             'heart_disease', 'work_type', 'residence_type', 
             'age', 'bmi_category', 'glucose_category']

results = []

for var in variables:
    if df[var].dtype == 'object' or df[var].nunique() < 10:
        # Loop through each subcategory of the categorical variable
        for cat in df[var].unique():
            df['in_cat'] = (df[var] == cat).astype(int)
            table = pd.crosstab(df[target], df['in_cat'])
            chi2, p, dof, expected = chi2_contingency(table)
            results.append([var, cat, 'Categorical (Chi-square)', p, p < 0.05])
    else:
        # Continuous variable → t-test and Mann-Whitney
        group0 = df[df[target] == 0][var].dropna()
        group1 = df[df[target] == 1][var].dropna()
        
        # t-test
        t_stat, p_ttest = ttest_ind(group0, group1, equal_var=False)
        # Mann-Whitney
        u_stat, p_mw = mannwhitneyu(group0, group1, alternative='two-sided')
        
        results.append([var, '-', 'Continuous (t-test)', p_ttest, p_ttest < 0.05])
        results.append([var, '-', 'Continuous (Mann-Whitney)', p_mw, p_mw < 0.05])

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['Variable', 'Subcategory', 'Test Type', 'p-value', 'Significant'])

results_df


Unnamed: 0,Variable,Subcategory,Test Type,p-value,Significant
0,gender,Male,Categorical (Chi-square),0.5580285,False
1,gender,Female,Categorical (Chi-square),0.5623841,False
2,gender,Other,Categorical (Chi-square),1.0,False
3,ever_married,Yes,Categorical (Chi-square),1.638902e-14,True
4,ever_married,No,Categorical (Chi-square),1.638902e-14,True
5,smoking_status,formerly smoked,Categorical (Chi-square),5.930415e-06,True
6,smoking_status,never smoked,Categorical (Chi-square),0.8197579,False
7,smoking_status,smokes,Categorical (Chi-square),0.5829352,False
8,smoking_status,Unknown,Categorical (Chi-square),8.686386e-05,True
9,glucose_category,Diabetic,Categorical (Chi-square),1.180764e-15,True


In [17]:
# Married  and smoking adjusted per age. 
# age could be a confusing factor in this specific varaibles

# Statistical analysis with Python pipeline, multivariable: 
# Categorical: Chi-square
# Continuas: t‑test/Mann‑Whitney

import pandas as pd
import statsmodels.api as sm

df = stroke_copy_processed.copy()
df['ever_married_num'] = df['ever_married'].map({'Yes': 1, 'No': 0})

X = df[['ever_married_num', 'age']]
y = df['stroke']

# Add constant (intercept)
X = sm.add_constant(X)

logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())

#ever_married_num = -0.1604, p = 0.458 (not significant):
# After adjusting for age, having ever been married 
# is not independently associated with stroke.

#age = 0.0753, p < 0.001 (highly significant)
#Each additional year of age increases the log-odds of stroke by 0.0753.
#For each year of age, the risk of stroke increases by ~7.8%.

Optimization terminated successfully.
         Current function value: 0.158094
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                 stroke   No. Observations:                 5110
Model:                          Logit   Df Residuals:                     5107
Method:                           MLE   Df Model:                            2
Date:                Mon, 10 Nov 2025   Pseudo R-squ.:                  0.1882
Time:                        13:40:16   Log-Likelihood:                -807.86
converged:                       True   LL-Null:                       -995.19
Covariance Type:            nonrobust   LLR p-value:                 4.420e-82
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -7.1284      0.358    -19.915      0.000      -7.830      -6.427
ever_marrie

In [18]:
# smoking status: 


import pandas as pd
import statsmodels.api as sm

df = stroke_copy_processed.copy()

# Create dummy variables for smoking_status
df = pd.get_dummies(df, columns=['smoking_status'], drop_first=True)

# Select predictors: age + all smoking_status dummies EXCEPT Unknown
smoke_cols = [col for col in df.columns 
              if col.startswith('smoking_status_') and 'Unknown' not in col]

X = df[['age'] + smoke_cols].astype(float)
y = df['stroke'].astype(int)

# Add constant
X = sm.add_constant(X)

# Fit logistic regression
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())



Optimization terminated successfully.
         Current function value: 0.157832
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                 stroke   No. Observations:                 5110
Model:                          Logit   Df Residuals:                     5105
Method:                           MLE   Df Model:                            4
Date:                Mon, 10 Nov 2025   Pseudo R-squ.:                  0.1896
Time:                        13:40:16   Log-Likelihood:                -806.52
converged:                       True   LL-Null:                       -995.19
Covariance Type:            nonrobust   LLR p-value:                 2.195e-80
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                             -7.3044      0.366    -19.974     

In [19]:
target= 'stroke'
predictors selected: #[ 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke', 'age_category', 'bmi_category', 'glucose_category']

 

SyntaxError: invalid syntax (3342076706.py, line 2)