In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sqlalchemy import create_engine
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from scipy.stats import ttest_ind
import statsmodels.api as sm
from sklearn.decomposition import PCA
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, f1_score, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split

import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and
# chained-assignment warnings raised by later cells are hidden as well.
warnings.simplefilter('ignore')

In [15]:
# Notebook display settings: print every row and column, use a wide console,
# and never truncate the contents of a cell.
_display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 1000,
    'display.max_colwidth': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [16]:
# Read the preprocessed table and discard the 'Unnamed: 0' column
# (presumably an index column left over from an earlier export -- confirm).
df = pd.read_csv("preprocessed_data.csv").drop(columns='Unnamed: 0')

In [17]:
# Age bands are right-closed intervals: [18, 29], (29, 39], (39, 49], (49, 59], (59, inf).
# Ages outside the edges, or missing ages, end up as NaN in age_group.
age_bin_edges = [18, 29, 39, 49, 59, float('inf')]
age_bin_labels = ['18-29', '30-39', '40-49', '50-59', '60+']

df['age_group'] = pd.cut(
    df['age_years'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    right=True,
    include_lowest=True,
)

In [18]:
def fnx(x):
    """Translate a numeric total score into a severity label.

    Parameters
    ----------
    x : number or missing value
        The score to classify.

    Returns
    -------
    str or float
        ``np.nan`` when ``x`` is missing; otherwise one of
        'none-minimal' (x < 5), 'mild' (5 <= x < 10), 'moderate' (10 <= x < 15),
        'moderately severe' (15 <= x < 20) or 'severe' (x >= 20).
    """
    if pd.isna(x):
        return np.nan

    # (exclusive upper bound, label) pairs in ascending order; the first
    # bound that the score falls below decides the label.
    severity_bands = (
        (5, 'none-minimal'),
        (10, 'mild'),
        (15, 'moderate'),
        (20, 'moderately severe'),
    )
    for upper_bound, label in severity_bands:
        if x < upper_bound:
            return label
    return 'severe'

# Label every dpq_total value with its severity band; missing totals stay NaN.
df['dpq_cat'] = df['dpq_total'].apply(fnx)

In [19]:
# Number of rows whose hscrp_mg_l_raw exceeds 20 (missing values are not counted).
int((df['hscrp_mg_l_raw'] > 20).sum())

103

In [20]:
# Preview the first five rows after adding age_group and dpq_cat.
df.head(n=5)

Unnamed: 0,seqn,age_years,age_group,sex,race_eth,educ_level,pir,ever_cancer,dpq_total,dpq_cat,smoke_status,hscrp_mg_l_raw,hscrp_cat,bmx_bmi
0,130379,66.0,60+,1,3.0,5.0,5.0,1,1.0,none-minimal,former,2.03,average,33.5
1,130380,44.0,40-49,2,2.0,3.0,1.41,0,2.0,none-minimal,never,5.62,high,29.7
2,130386,34.0,30-39,1,1.0,4.0,1.33,0,1.0,none-minimal,former,1.05,average,30.2
3,130388,27.0,18-29,2,4.0,4.0,0.81,0,,,current,,,43.7
4,130390,31.0,30-39,2,3.0,3.0,2.16,0,,,never,11.2,high,46.0


In [21]:
# Report the count and percentage of missing values for each analysis column.
columns_to_check = [
    'age_years', 'age_group', 'sex', 'race_eth', 'educ_level', 'pir',
    'ever_cancer', 'dpq_total', 'dpq_cat', 'smoke_status',
    'hscrp_mg_l_raw', 'hscrp_cat', 'bmx_bmi',
]
for column in columns_to_check:
    is_missing = df[column].isna()
    n_missing = is_missing.sum()
    share_missing = is_missing.mean() * 100
    print(f"{column}: {n_missing} missing ({share_missing:.2f}%)")

age_years: 84 missing (1.08%)
age_group: 84 missing (1.08%)
sex: 0 missing (0.00%)
race_eth: 0 missing (0.00%)
educ_level: 24 missing (0.31%)
pir: 1384 missing (17.74%)
ever_cancer: 0 missing (0.00%)
dpq_total: 4200 missing (53.85%)
dpq_cat: 4200 missing (53.85%)
smoke_status: 27 missing (0.35%)
hscrp_mg_l_raw: 2213 missing (28.37%)
hscrp_cat: 2213 missing (28.37%)
bmx_bmi: 1836 missing (23.54%)


In [70]:
# Keep only rows with hscrp_mg_l_raw <= 20. The comparison is False for NaN, so
# rows with a missing hs-CRP value are dropped here too.
# `.copy()` makes the filtered frame independent of the original, so the column
# assignments below are unambiguous writes to this frame rather than writes to
# a slice (the source of SettingWithCopyWarning, which this notebook silences).
df = df[df['hscrp_mg_l_raw'] <= 20].copy()

# Binary stressor flags: 1 when the stressor is present, 0 otherwise.
# NOTE(review): comparisons against NaN evaluate to False, so a row with a
# missing dpq_total, smoke_status or pir is coded 0 ("stressor absent") rather
# than missing. np.where therefore never produces NaN and no fillna is needed;
# confirm that treating missing inputs as "absent" is intended.
df['psych_stress'] = np.where(df['dpq_total'] >= 5, 1, 0)

df['bio_stress'] = np.where(df['hscrp_mg_l_raw'] >= 3, 1, 0)

df['smoke_stress'] = np.where(df['smoke_status'] == 'current', 1, 0)

df['pir_stress'] = np.where(df['pir'] < 1.3, 1, 0)

# A BMI-based flag (bmx_bmi >= 30) was tried and is deliberately not part of
# the stressor set used below.
stress_cols = ['psych_stress', 'bio_stress', 'smoke_stress', 'pir_stress']

# Count of the two clinical stressors (depression score and hs-CRP): 0, 1 or 2.
df['clinical_stress'] = df['psych_stress'] + df['bio_stress']

# Count of all four stressors: 0 through 4.
df['all_stress'] = (
    df['psych_stress'] +
    df['bio_stress'] +
    df['smoke_stress'] +
    df['pir_stress']
)

# String category of the count with everything from 2 upward collapsed to '2+'.
df['all_stress_cat'] = np.where(
    df['all_stress'] >= 2, '2+',
    df['all_stress'].astype(str)
)

In [71]:
# High clinical stress: at least one of the two clinical stressors is present.
df['clinical_stress_bin'] = (df['clinical_stress'] >= 1).astype(int)

# High overall stress: at least two of the four stressors are present.
df['all_stress_bin'] = (df['all_stress'] >= 2).astype(int)

In [72]:
# Stratify at age 50. Rows with a missing age satisfy neither comparison and
# therefore belong to neither stratum.
young_df = df.loc[df['age_years'] < 50]
old_df = df.loc[df['age_years'] >= 50]

# Mean of ever_cancer within each (age stratum, clinical stress flag) cell.
young_low = young_df.loc[young_df['clinical_stress_bin'] == 0, 'ever_cancer'].mean()
young_high = young_df.loc[young_df['clinical_stress_bin'] == 1, 'ever_cancer'].mean()

old_low = old_df.loc[old_df['clinical_stress_bin'] == 0, 'ever_cancer'].mean()
old_high = old_df.loc[old_df['clinical_stress_bin'] == 1, 'ever_cancer'].mean()

clinical_risk_summary = pd.DataFrame(
    [[young_low, old_low],
     [young_high, old_high]],
    index=['low_stress_cancer_rate', 'high_stress_cancer_rate'],
    columns=['young', 'old'],
)

print(clinical_risk_summary)

# Ratio of the high-stress rate to the low-stress rate within each stratum.
print("Young clinical RR:", young_high / young_low)
print("Old clinical RR:",   old_high / old_low)


                            young       old
low_stress_cancer_rate   0.021719  0.225154
high_stress_cancer_rate  0.028382  0.215827
Young clinical RR: 1.3068158446793634
Old clinical RR: 0.9585747860564416


In [73]:
# Cancer rates by the multi-domain stress flag (all_stress_bin) within each
# age stratum, followed by the high/low rate ratio.
young_cancer = young_df['ever_cancer']
old_cancer = old_df['ever_cancer']

young_low2 = young_cancer[young_df['all_stress_bin'] == 0].mean()
young_high2 = young_cancer[young_df['all_stress_bin'] == 1].mean()

old_low2 = old_cancer[old_df['all_stress_bin'] == 0].mean()
old_high2 = old_cancer[old_df['all_stress_bin'] == 1].mean()

all_stress_risk_summary = pd.DataFrame(
    [[young_low2, old_low2],
     [young_high2, old_high2]],
    index=['low_stress_cancer_rate', 'high_stress_cancer_rate'],
    columns=['young', 'old'],
)

print(all_stress_risk_summary)

print("Young all stress RR:", young_high2 / young_low2)
print("Old all stress RR:",   old_high2 / old_low2)

                            young       old
low_stress_cancer_rate   0.023552  0.236943
high_stress_cancer_rate  0.029358  0.161473
Young all stress RR: 1.246516241011654
Old all stress RR: 0.6814844743596219


In [74]:
# Preview the first five rows with the stressor columns added.
df.head(n=5)

Unnamed: 0,seqn,age_years,age_group,sex,race_eth,educ_level,pir,ever_cancer,dpq_total,dpq_cat,smoke_status,hscrp_mg_l_raw,hscrp_cat,bmx_bmi,psych_stress,bio_stress,smoke_stress,pir_stress,bmi_stress,clinical_stress,all_stress,all_stress_cat,clinical_stress_bin,all_stress_bin,stress_pca_score,stress_pca_cat,stress_pca_bin
0,130379,66.0,60+,1,3.0,5.0,5.0,1,1.0,none-minimal,former,2.03,average,33.5,0,0,0,0,1,0,0,0,0,0,-0.178085,0,1
1,130380,44.0,40-49,2,2.0,3.0,1.41,0,2.0,none-minimal,never,5.62,high,29.7,0,1,0,0,0,1,1,1,1,0,-0.025963,1,1
2,130386,34.0,30-39,1,1.0,4.0,1.33,0,1.0,none-minimal,former,1.05,average,30.2,0,0,0,0,1,0,0,0,0,0,-0.178085,0,1
4,130390,31.0,30-39,2,3.0,3.0,2.16,0,,,never,11.2,high,46.0,0,1,0,0,1,1,1,1,1,0,1.146724,1,1
5,130391,33.0,30-39,2,3.0,3.0,0.93,0,24.0,severe,current,3.55,high,38.9,1,1,1,1,1,2,4,2+,1,1,3.509738,1,1


### Method 1: Use PCA over all the stressor flags to derive a stress score

In [75]:
# StandardScaler, PCA, numpy and pandas are imported in the notebook's first
# cell, so they are not re-imported here.

# The four binary stressor flags that feed the composite score.
stress_vars = ['psych_stress', 'bio_stress', 'smoke_stress', 'pir_stress']

X = df[stress_vars].values

# Standardise each flag to zero mean / unit variance before the decomposition.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=1)
# fit_transform returns an (n_samples, 1) array; keep the single column as 1-D.
stress_scores = pca.fit_transform(X_scaled)[:, 0]

# The sign of a principal component is arbitrary. Downstream cells treat a
# larger score as "more stress", so orient the component so that its loadings
# sum to a positive number, flipping the fitted loadings and the scores
# together to keep `pca.components_` consistent with the stored score.
if pca.components_[0].sum() < 0:
    pca.components_ *= -1
    stress_scores = -stress_scores

df['stress_pca_score'] = stress_scores

In [76]:
# Loadings of the first (only) principal component, one per stressor flag.
loadings = pca.components_[0]

pca_weights = (
    pd.Series(loadings, index=stress_vars, name='loading')
    .rename_axis('stressor')
    .reset_index()
)

print(pca_weights)


       stressor   loading
0  psych_stress  0.460319
1    bio_stress  0.405508
2  smoke_stress  0.560274
3    pir_stress  0.556563


In [77]:
# Median split of the PCA score: label 0 for the lower half, 1 for the upper half.
pca_score = df['stress_pca_score']
df['stress_pca_cat'] = pd.qcut(pca_score, 2, labels=[0, 1])


In [80]:
# Re-derive the age strata so that they include the PCA columns added above.
young_df = df.loc[df['age_years'] < 50]
old_df = df.loc[df['age_years'] >= 50]

# Mean of ever_cancer per PCA stress category within each stratum.
young_rates = young_df.groupby('stress_pca_cat')['ever_cancer'].mean()
old_rates = old_df.groupby('stress_pca_cat')['ever_cancer'].mean()

pca_results = pd.DataFrame({
    'young_cancer_rate': young_rates,
    'old_cancer_rate': old_rates,
})
print(pca_results)

# Rate in category 1 divided by the rate in category 0, per stratum.
for rr_label, rate_column in (
    ("Young RR:", 'young_cancer_rate'),
    ("Old RR:", 'old_cancer_rate'),
):
    print(rr_label, pca_results.loc[1, rate_column] / pca_results.loc[0, rate_column])

                young_cancer_rate  old_cancer_rate
stress_pca_cat                                    
0                        0.021409         0.241598
1                        0.029146         0.186953
Young RR: 1.361348408710218
Old RR: 0.7738198347319527


### Method 2: Predict the DPQ total from the other three stressors, then assign weights based on the paper

In [82]:
import statsmodels.api as sm

# Regress dpq_total on the three non-psychological stressor flags, using only
# rows where the outcome and all predictors are present.
predictor_cols = ['bio_stress', 'smoke_stress', 'pir_stress']
reg_df = df[['dpq_total'] + predictor_cols].dropna()

y = reg_df['dpq_total']
X = sm.add_constant(reg_df[predictor_cols])  # adds the intercept column

model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:              dpq_total   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     57.05
Date:                Sun, 30 Nov 2025   Prob (F-statistic):           6.08e-36
Time:                        00:24:30   Log-Likelihood:                -9655.8
No. Observations:                3273   AIC:                         1.932e+04
Df Residuals:                    3269   BIC:                         1.934e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            4.2427      0.108     39.258   

In [83]:
# Absolute OLS coefficients of the three stressor flags (the sign is discarded).
betas = model.params.loc[['bio_stress', 'smoke_stress', 'pir_stress']].abs()

# Rescale so that the three weights add up to 1.
weights = betas.div(betas.sum())

print(weights)

bio_stress      0.184752
smoke_stress    0.447893
pir_stress      0.367355
dtype: float64


In [84]:
# Weighted composite: psych_stress enters with weight 1, the other three flags
# with their regression-derived weights. Terms are added in the order
# psych -> bio -> smoke -> pir.
df['stress_reg_weighted'] = (
    df['psych_stress']
    .add(df['bio_stress'] * weights['bio_stress'])
    .add(df['smoke_stress'] * weights['smoke_stress'])
    .add(df['pir_stress'] * weights['pir_stress'])
)

In [88]:
# Distinct values taken by the weighted score, in order of first appearance.
pd.unique(df['stress_reg_weighted'])

array([0.        , 0.18475185, 2.        , 0.44789333, 0.36735482,
       1.        , 1.55210667, 1.18475185, 0.55210667, 1.36735482,
       0.63264518, 1.44789333, 1.63264518, 0.81524815, 1.81524815])

In [92]:
# Median split of the weighted score: 'low' when score <= median, 'high' above.
#
# This is the classification pd.qcut(q=2) produces, written out explicitly.
# qcut with two fixed labels and duplicates='drop' raises ValueError whenever
# the median coincides with the minimum or maximum: one bin edge is dropped,
# leaving two labels for a single bin. That is easy to hit with a score that
# takes only a handful of distinct values.
reg_score = df['stress_reg_weighted']
reg_median = reg_score.quantile(0.5)

reg_labels = pd.Series(
    pd.Categorical(
        np.where(reg_score > reg_median, 'high', 'low'),
        categories=['low', 'high'],
        ordered=True,
    ),
    index=df.index,
)
# A missing score gets a missing category, as qcut would give.
df['stress_reg_cat'] = reg_labels.where(reg_score.notna())

# 0 for 'low', 1 otherwise.
# NOTE(review): a missing category also maps to 1 here; the score is built from
# 0/1 flags so no missing values are expected -- confirm.
df['stress_reg_bin'] = np.where(
    df['stress_reg_cat'] == 'low',
    0,
    1
)

In [93]:
# Age strata for the regression-weighted score (missing ages fall in neither).
young = df.loc[df['age_years'] < 50]
old = df.loc[df['age_years'] >= 50]

# Mean of ever_cancer for the high (1) and low (0) stress halves of each stratum.
young_high = young.loc[young['stress_reg_bin'] == 1, 'ever_cancer'].mean()
young_low = young.loc[young['stress_reg_bin'] == 0, 'ever_cancer'].mean()

old_high = old.loc[old['stress_reg_bin'] == 1, 'ever_cancer'].mean()
old_low = old.loc[old['stress_reg_bin'] == 0, 'ever_cancer'].mean()

# High-stress rate divided by low-stress rate, per stratum.
print("Young RR:", young_high / young_low)
print("Old RR:", old_high / old_low)

Young RR: 1.361348408710218
Old RR: 0.7738198347319527
