# Statistical Analysis

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import scipy
import statsmodels.api as sm
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu



In [2]:
# Import Dataset
# Relative path to the processed stroke CSV, kept in `data` for reuse.
data = '../data/processed/stroke_copy_processed.csv'

# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
stroke = pd.read_csv(data)

# Preview the loaded frame (pandas truncates long frames when displaying them).
stroke

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_category,bmi_category,glucose_category
0,9046,Male,67.0,0,1,Yes,Private,Urban,229,37,formerly smoked,1,Senior,Obesity,Diabetic
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202,34,never smoked,1,Adult,Obesity,Diabetic
2,31112,Male,80.0,0,1,Yes,Private,Rural,106,32,never smoked,1,Senior,Obesity,Healthy
3,60182,Female,49.0,0,0,Yes,Private,Urban,171,34,smokes,1,Adult,Obesity,Pre-Diabetic
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174,24,never smoked,1,Senior,Healthy Weight,Pre-Diabetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,84,27,never smoked,0,Senior,Overweight,Healthy
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125,40,never smoked,0,Senior,Obesity,Healthy
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,83,31,never smoked,0,Adult,Obesity,Healthy
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166,26,formerly smoked,0,Adult,Overweight,Pre-Diabetic


In [3]:
# List every feature (column name) available in the dataset
print(list(stroke.columns))



['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke', 'age_category', 'bmi_category', 'glucose_category']


- It was decided to remove all observations under 18 years old to reduce noise associated with the physiological differences between the pediatric and adult populations. In addition, there were only two stroke cases under 18 years old; therefore, this does not have an impact on the final outcome and could increase the accuracy of the model and analysis.

In [4]:
# Keep only adult patients (age >= 18), regardless of stroke outcome:
# both stroke-positive and stroke-negative adults remain in the data.
# The explicit .copy() makes stroke_adults an independent frame, so later
# cells can work on it without any ambiguity about views of `stroke`.
stroke_adults = stroke[stroke['age'] >= 18].copy()

# Preview the adult-only dataset
stroke_adults

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_category,bmi_category,glucose_category
0,9046,Male,67.0,0,1,Yes,Private,Urban,229,37,formerly smoked,1,Senior,Obesity,Diabetic
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202,34,never smoked,1,Adult,Obesity,Diabetic
2,31112,Male,80.0,0,1,Yes,Private,Rural,106,32,never smoked,1,Senior,Obesity,Healthy
3,60182,Female,49.0,0,0,Yes,Private,Urban,171,34,smokes,1,Adult,Obesity,Pre-Diabetic
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174,24,never smoked,1,Senior,Healthy Weight,Pre-Diabetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,84,27,never smoked,0,Senior,Overweight,Healthy
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125,40,never smoked,0,Senior,Obesity,Healthy
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,83,31,never smoked,0,Adult,Obesity,Healthy
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166,26,formerly smoked,0,Adult,Overweight,Pre-Diabetic


In [5]:
# Divide groups: Stroke and non stroke
# NOTE: `stroke` is rebound here from the full dataset to the stroke-positive adults only.
stroke = stroke_adults[stroke_adults['stroke'] == 1]
# Assigned on its own line: a chained `non_stroke = stroke = ...` would also
# overwrite `stroke` with the non-stroke rows.
non_stroke = stroke_adults[stroke_adults['stroke'] == 0]

-  Variables to include: gender, age, hypertension, heart_disease, ever_married, work_type, residence_type, avg_glucose_level, bmi, smoking_status
    - Numerical Variables: age (categories: 3), hypertension, heart_disease, avg_glucose_level (categories: 4), bmi (categories: 4)
    - Categorical Variables: gender, ever_married, work_type, residence_type, smoking_status

In [6]:
# Univariate screening of each candidate predictor against the stroke outcome:
# Categorical: Chi-square (one test per category, that category vs. all others)
# Continuous: t-test (unequal variances) and Mann-Whitney U

# Work on a copy so this cell never adds columns to stroke_adults
df = stroke_adults.copy()

# Dependent variable
target = 'stroke'

# Significance threshold applied to every p-value
alpha = 0.05

# List of variables to test (each variable listed once, so no duplicated result rows)
variables = ['gender', 'ever_married',
             'smoking_status', 'glucose_category', 'hypertension',
             'heart_disease', 'work_type', 'residence_type',
             'age', 'bmi_category']

results = []

for var in variables:
    # Text columns and low-cardinality numeric columns (fewer than 10 distinct
    # values, e.g. 0/1 flags) are treated as categorical.
    if df[var].dtype == 'object' or df[var].nunique() < 10:
        # Loop through each observed subcategory; missing values are skipped
        # because `== NaN` never matches and would yield a degenerate table.
        for cat in df[var].dropna().unique():
            # Indicator kept as a standalone Series instead of a scratch column on df
            in_cat = (df[var] == cat).astype(int)
            table = pd.crosstab(df[target], in_cat)
            chi2, p, dof, expected = chi2_contingency(table)
            results.append([var, cat, 'Categorical (Chi-square)', p, p < alpha])
    else:
        # Continuous variable → t-test and Mann-Whitney
        group0 = df[df[target] == 0][var].dropna()
        group1 = df[df[target] == 1][var].dropna()

        # t-test without assuming equal variances
        t_stat, p_ttest = ttest_ind(group0, group1, equal_var=False)
        # Mann-Whitney
        u_stat, p_mw = mannwhitneyu(group0, group1, alternative='two-sided')

        results.append([var, '-', 'Continuous (t-test)', p_ttest, p_ttest < alpha])
        results.append([var, '-', 'Continuous (Mann-Whitney)', p_mw, p_mw < alpha])

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['Variable', 'Subcategory', 'Test Type', 'p-value', 'Significant'])

results_df


Unnamed: 0,Variable,Subcategory,Test Type,p-value,Significant
0,gender,Male,Categorical (Chi-square),0.1742125,False
1,gender,Female,Categorical (Chi-square),0.1767396,False
2,gender,Other,Categorical (Chi-square),1.0,False
3,ever_married,Yes,Categorical (Chi-square),6.84228e-05,True
4,ever_married,No,Categorical (Chi-square),6.84228e-05,True
5,smoking_status,formerly smoked,Categorical (Chi-square),0.001403301,True
6,smoking_status,never smoked,Categorical (Chi-square),0.1348015,False
7,smoking_status,smokes,Categorical (Chi-square),0.6365355,False
8,smoking_status,Unknown,Categorical (Chi-square),0.4579909,False
9,glucose_category,Diabetic,Categorical (Chi-square),1.979527e-12,True


Statistical analysis: 
- Variables with p-value > 0.05 (no significant difference): gender, glucose category (low), residence type, work_type (private, govt_job, never_worked)

- Variables with p-value < 0.05 (significant difference): age, glucose category (diabetic, healthy and pre-diabetic), hypertension, heart_disease, work_type (self-employed)

- Variables requiring deeper analysis: 
    - Ever_married: significant
    - Smoking_status:
        - significant: formerly smoked
        - not significant: never smoked, smokes, unknown
    - bmi category: not significant


# Ever married and smoking status, adjusted for age

In [7]:
# Ever married, adjusted for age.
# Age could be a confounding factor for this variable, so both predictors
# enter a single logistic regression on the stroke outcome.

df = stroke_adults.copy()

# Encode marital status as a numeric indicator (Yes -> 1, No -> 0)
df['ever_married_num'] = df['ever_married'].map({'Yes': 1, 'No': 0})

# Outcome
y = df['stroke']

# Predictors, with a constant (intercept) column added
X = sm.add_constant(df[['ever_married_num', 'age']])

# Fit the model and show the full regression table
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())



Optimization terminated successfully.
         Current function value: 0.186487
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                 stroke   No. Observations:                 4254
Model:                          Logit   Df Residuals:                     4251
Method:                           MLE   Df Model:                            2
Date:                Thu, 13 Nov 2025   Pseudo R-squ.:                  0.1585
Time:                        16:16:08   Log-Likelihood:                -793.31
converged:                       True   LL-Null:                       -942.70
Covariance Type:            nonrobust   LLR p-value:                 1.321e-65
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -7.2163      0.392    -18.432      0.000      -7.984      -6.449
ever_marrie

Ever Married Analysis:
- ever_married_num = coef: -0.139, p = 0.529 (not significant):
    -  After adjusting for age, having ever been married  is not independently associated with stroke.

- age = coef: 0.0763, p < 0.001 (highly significant)
    - Each additional year of age increases the log-odds of stroke by 0.0763.
    - For each additional year of age, the odds of stroke increase by ~7.9% (odds ratio = e^0.0763 ≈ 1.079).

Conclusion: 
- It was decided not to include ever_married as a variable in the final model.

In [8]:
# Smoking status, adjusted for age

# One-hot encode smoking_status on a copy of the adult data; drop_first=True
# removes one level so the dummies are not collinear with the intercept.
df = pd.get_dummies(stroke_adults.copy(), columns=['smoking_status'], drop_first=True)

# Predictors: age plus every smoking_status dummy whose name does not contain 'Unknown'
# NOTE(review): 'Unknown' appears to sort first among the levels, so drop_first
# would already have removed it (making it the reference group and this filter
# a no-op) — confirm against the fitted summary.
smoke_cols = [
    name for name in df.columns
    if name.startswith('smoking_status_') and 'Unknown' not in name
]

# Outcome as integers
y = df['stroke'].astype(int)

# Design matrix as floats, with a constant (intercept) column added
X = sm.add_constant(df[['age'] + smoke_cols].astype(float))

# Fit logistic regression and show the full regression table
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())



Optimization terminated successfully.
         Current function value: 0.186138
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                 stroke   No. Observations:                 4254
Model:                          Logit   Df Residuals:                     4249
Method:                           MLE   Df Model:                            4
Date:                Thu, 13 Nov 2025   Pseudo R-squ.:                  0.1600
Time:                        16:16:08   Log-Likelihood:                -791.83
converged:                       True   LL-Null:                       -942.70
Covariance Type:            nonrobust   LLR p-value:                 4.553e-64
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                             -7.4330      0.393    -18.893     

Analysis:

In logistic regression:
- A positive coefficient means the variable increases the likelihood (log-odds) of the event (here: stroke).
- A negative coefficient means the variable decreases the likelihood.
- The p-value tells us whether the effect is statistically significant (usually we require p < 0.05).

Results: 
- smoking_status_formerly smoked: coef = 0.1544, p-value = 0.454
- smoking_status_never smoked: coef = -0.0516, p-value = 0.792
- smoking_status_smokes: coef = 0.2826, p-value = 0.222

Interpretation: 
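- All the p-values are greater than 0.05, therefore none of the smoking status variables are statistically significant predictors of stroke in this model. Each coefficient is relative to the reference category, which here is `Unknown` smoking status (the level without its own coefficient in the model).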
- Smoking_status_formerly smoked: Positive coefficient. This suggests that people who smoked in the past may have slightly higher odds of stroke compared to the reference group, but the effect is not statistically significant.
- Smoking_status_smokes: Positive coefficient. This suggests that current smokers may have higher odds of stroke compared to the reference group, but again the effect is not statistically significant.
- Smoking_status_never smoked: Negative coefficient. This suggests that people who never smoked may have slightly lower odds of stroke compared to the reference group, but the effect is not statistically significant.

Conclusion: 
- Because the number of stroke observations is relatively low, and the coefficients suggest that people who currently smoke or smoked in the past may have slightly higher odds of stroke, while people who never smoked may have slightly lower odds, it was decided to include smoking status as a variable in the final model, even though these effects are not statistically significant.


In [9]:
# BMI category, adjusted for age

# One-hot encode bmi_category on a copy of the adult data
# (drop_first=True removes one level to avoid collinearity with the intercept)
df = pd.get_dummies(stroke_adults.copy(), columns=['bmi_category'], drop_first=True)

# Predictors: age plus every remaining bmi_category dummy
bmi_category_cols = [name for name in df.columns if name.startswith('bmi_category_')]

# Outcome as integers
y = df['stroke'].astype(int)

# Design matrix as floats, with a constant (intercept) column added
X = sm.add_constant(df[['age'] + bmi_category_cols].astype(float))

# Fit logistic regression and show the full regression table
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.186186
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                 stroke   No. Observations:                 4254
Model:                          Logit   Df Residuals:                     4249
Method:                           MLE   Df Model:                            4
Date:                Thu, 13 Nov 2025   Pseudo R-squ.:                  0.1598
Time:                        16:16:08   Log-Likelihood:                -792.04
converged:                       True   LL-Null:                       -942.70
Covariance Type:            nonrobust   LLR p-value:                 5.584e-64
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -7.5146      0.408    -18.411      0.000      -8.315

Analysis:

In logistic regression:
- A positive coefficient means the variable increases the likelihood (log-odds) of the event (here: stroke).
- A negative coefficient means the variable decreases the likelihood.
- The p-value tells us whether the effect is statistically significant (usually we require p < 0.05).

Results:
- bmi_category_Obesity: coef = 0.2011, p-value = 0.335
- bmi_category_Overweight: coef = 0.1993, p-value = 0.358
- bmi_category_Underweight: coef = -1.0194, p-value = 0.331

Interpretation:
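- All p-values are greater than 0.05, therefore none of the BMI category variables are statistically significant predictors of stroke in this model. Each coefficient is relative to the reference category, which here is `Healthy Weight` (the level without its own coefficient in the model).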
- Obesity: Positive coefficient. This suggests that individuals with obesity may have slightly higher odds of stroke compared to the reference group, but the effect is not statistically significant.
- Overweight: Positive coefficient. This suggests that individuals who are overweight may have slightly higher odds of stroke compared to the reference group, but the effect is not statistically significant.
- Underweight: Negative coefficient. The point estimate suggests that individuals who are underweight may have lower odds of stroke compared to the reference group (odds ratio = e^-1.0194 ≈ 0.36), but the effect is not statistically significant, so the estimate is highly uncertain.

Conclusion: 
- Because the number of stroke observations is relatively low, and the coefficients suggest that individuals with obesity and overweight may have slightly higher odds of stroke, while those who are underweight may have slightly lower odds, it was decided to include BMI category as a variable in the final model, even though these effects are not statistically significant.