In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sqlalchemy import create_engine
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from scipy.stats import ttest_ind
import statsmodels.api as sm
from sklearn.decomposition import PCA
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, f1_score, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [24]:
# Notebook display settings: lift the row/column/column-width display limits
# (None) and set the display width to 1000 characters.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [25]:
# Load the preprocessed dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv call - confirm).
df = pd.read_csv("preprocessed_data.csv").drop(columns='Unnamed: 0')

In [18]:
# Bucket age_years into labelled bands. Bins are right-closed (the pd.cut
# default), and include_lowest makes the first band contain 18 itself:
# [18, 29], (29, 39], (39, 49], (49, 59], (59, inf).
# Values outside the edges, and NaN, come out as NaN.
age_edges = [18, 29, 39, 49, 59, float('inf')]
age_labels = ['18-29', '30-39', '40-49', '50-59', '60+']

df['age_group'] = pd.cut(df['age_years'], bins=age_edges, labels=age_labels, include_lowest=True)

In [12]:
def fnx(x):
    """Map a numeric score to a severity label.

    Returns np.nan when x is missing; otherwise 'none-minimal' for x < 5,
    'mild' for 5 <= x < 10, 'moderate' for 10 <= x < 15,
    'moderately severe' for 15 <= x < 20, and 'severe' for anything else.
    """
    if pd.isna(x):
        return np.nan

    # (exclusive upper bound, label), checked in ascending order so the first
    # bound that exceeds x determines the band.
    severity_bands = (
        (5, 'none-minimal'),
        (10, 'mild'),
        (15, 'moderate'),
        (20, 'moderately severe'),
    )
    for upper_bound, label in severity_bands:
        if x < upper_bound:
            return label
    return 'severe'

# Label every dpq_total value with its fnx severity band (missing stays NaN).
df['dpq_cat'] = df['dpq_total'].apply(fnx)

In [27]:
# Number of rows whose hs-CRP reading exceeds 20 (NaN compares False, so missing readings are not counted).
int((df['hscrp_mg_l_raw'] > 20).sum())

103

In [20]:
# Preview the first five rows to sanity-check the columns derived so far.
df.head()

Unnamed: 0,seqn,age_years,age_group,sex,race_eth,educ_level,pir,ever_cancer,dpq_total,dpq_cat,smoke_status,hscrp_mg_l_raw,hscrp_cat,bmx_bmi
0,130379,66.0,60+,1,3.0,5.0,5.0,1,1.0,none-minimal,former,2.03,average,33.5
1,130380,44.0,40-49,2,2.0,3.0,1.41,0,2.0,none-minimal,never,5.62,high,29.7
2,130386,34.0,30-39,1,1.0,4.0,1.33,0,1.0,none-minimal,former,1.05,average,30.2
3,130388,27.0,18-29,2,4.0,4.0,0.81,0,,,current,,,43.7
4,130390,31.0,30-39,2,3.0,3.0,2.16,0,,,never,11.2,high,46.0


In [19]:
# Print the missing-value count and percentage for each analysis column.
audit_columns = [
    'age_years', 'age_group', 'sex', 'race_eth', 'educ_level', 'pir',
    'ever_cancer', 'dpq_total', 'dpq_cat', 'smoke_status',
    'hscrp_mg_l_raw', 'hscrp_cat', 'bmx_bmi',
]
for i in audit_columns:
    is_missing = df[i].isna()  # build the NaN mask once, reuse for count and share
    cnt = is_missing.sum()
    pct = is_missing.mean() * 100
    print(f"{i}: {cnt} missing ({pct:.2f}%)")

age_years: 84 missing (1.08%)
age_group: 84 missing (1.08%)
sex: 0 missing (0.00%)
race_eth: 0 missing (0.00%)
educ_level: 24 missing (0.31%)
pir: 1384 missing (17.74%)
ever_cancer: 0 missing (0.00%)
dpq_total: 4200 missing (53.85%)
dpq_cat: 4200 missing (53.85%)
smoke_status: 27 missing (0.35%)
hscrp_mg_l_raw: 2213 missing (28.37%)
hscrp_cat: 2213 missing (28.37%)
bmx_bmi: 1836 missing (23.54%)


In [None]:
# Keep only rows with an hs-CRP reading of at most 20. NaN <= 20 evaluates to
# False, so rows with a missing hs-CRP value are removed by this filter too.
# .copy() gives the filtered frame its own data, so the column assignments
# below write to a standalone DataFrame instead of a slice of the unfiltered
# one (the chained-assignment pattern pandas warns about).
df = df[df['hscrp_mg_l_raw'] <= 20].copy()

# Binary stress indicators (1 = flagged, 0 = not flagged). A comparison against
# NaN is False, so np.where already codes a missing input as 0 and the
# indicator columns never contain NaN.
df['psych_stress'] = np.where(df['dpq_total'] >= 5, 1, 0)
df['bio_stress'] = np.where(df['hscrp_mg_l_raw'] >= 3, 1, 0)
df['smoke_stress'] = np.where(df['smoke_status'] == 'current', 1, 0)
df['pir_stress'] = np.where(df['pir'] < 1.3, 1, 0)

stress_cols = ['psych_stress', 'bio_stress', 'smoke_stress', 'pir_stress']

# Clinical score: psych + bio indicators only (range 0-2).
df['clinical_stress'] = df['psych_stress'] + df['bio_stress']

# Multi-domain score: number of flagged indicators across all four (range 0-4).
df['all_stress'] = df[stress_cols].sum(axis=1)

# Categorical version of the multi-domain score: '0', '1', or '2+'.
df['all_stress_cat'] = np.where(
    df['all_stress'] >= 2, '2+',
    df['all_stress'].astype(str)
)

In [40]:
# Dichotomise the stress scores: clinical is flagged at 1 or more,
# multi-domain at 2 or more.
df['clinical_stress_bin'] = np.where(df['clinical_stress'].ge(1), 1, 0)
df['all_stress_bin'] = np.where(df['all_stress'].ge(2), 1, 0)

In [41]:
# Split at age 50. Rows with a missing age_years fail both comparisons and so
# belong to neither group.
young_df = df.loc[df['age_years'] < 50]
old_df = df.loc[df['age_years'] >= 50]

# Mean of ever_cancer (reported below as the cancer rate) for each
# age group x clinical-stress flag combination.
young_high = young_df.loc[young_df['clinical_stress_bin'] == 1, 'ever_cancer'].mean()
young_low = young_df.loc[young_df['clinical_stress_bin'] == 0, 'ever_cancer'].mean()
old_high = old_df.loc[old_df['clinical_stress_bin'] == 1, 'ever_cancer'].mean()
old_low = old_df.loc[old_df['clinical_stress_bin'] == 0, 'ever_cancer'].mean()

# Rows = stress level, columns = age group.
clinical_risk_summary = pd.DataFrame(
    [[young_low, old_low], [young_high, old_high]],
    index=['low_stress_cancer_rate', 'high_stress_cancer_rate'],
    columns=['young', 'old'],
)

print(clinical_risk_summary)

# Ratio of the high-stress rate to the low-stress rate within each age group.
print("Young clinical RR:", young_high / young_low)
print("Old clinical RR:",   old_high / old_low)


                            young       old
low_stress_cancer_rate   0.021719  0.225154
high_stress_cancer_rate  0.028382  0.215827
Young clinical RR: 1.3068158446793634
Old clinical RR: 0.9585747860564416


In [42]:
# Multi-domain stress: mean of ever_cancer (reported as the cancer rate) for
# each age group x all_stress_bin combination.
young_high2 = young_df.loc[young_df['all_stress_bin'] == 1, 'ever_cancer'].mean()
young_low2 = young_df.loc[young_df['all_stress_bin'] == 0, 'ever_cancer'].mean()
old_high2 = old_df.loc[old_df['all_stress_bin'] == 1, 'ever_cancer'].mean()
old_low2 = old_df.loc[old_df['all_stress_bin'] == 0, 'ever_cancer'].mean()

# Rows = stress level, columns = age group.
all_stress_risk_summary = pd.DataFrame(
    [[young_low2, old_low2], [young_high2, old_high2]],
    index=['low_stress_cancer_rate', 'high_stress_cancer_rate'],
    columns=['young', 'old'],
)

print(all_stress_risk_summary)

# Ratio of the high-stress rate to the low-stress rate within each age group.
print("Young all stress RR:", young_high2 / young_low2)
print("Old all stress RR:",   old_high2 / old_low2)

                            young       old
low_stress_cancer_rate   0.023552  0.236943
high_stress_cancer_rate  0.029358  0.161473
Young all stress RR: 1.246516241011654
Old all stress RR: 0.6814844743596219


In [43]:
# Three-level categorisation of each stress score using its 33rd and 66th
# percentiles as cut points. Bins are right-closed (pd.cut default):
# (-1, low_cut], (low_cut, high_cut], (high_cut, max].
# NOTE: pd.cut raises ValueError when two bin edges coincide, which can happen
# with a coarse integer score if both percentiles (or the upper percentile and
# the max) land on the same value.
tertile_labels = ['low', 'medium', 'high']

# --- clinical stress ---
clinical_q = df['clinical_stress'].quantile([0.33, 0.66])
clinical_low_cut, clinical_high_cut = clinical_q.loc[0.33], clinical_q.loc[0.66]
clinical_edges = [-1, clinical_low_cut, clinical_high_cut, df['clinical_stress'].max()]
df['clinical_stress_qcat'] = pd.cut(df['clinical_stress'], bins=clinical_edges, labels=tertile_labels)

# --- multi-domain stress ---
all_q = df['all_stress'].quantile([0.33, 0.66])
all_low_cut, all_high_cut = all_q.loc[0.33], all_q.loc[0.66]
all_edges = [-1, all_low_cut, all_high_cut, df['all_stress'].max()]
df['all_stress_qcat'] = pd.cut(df['all_stress'], bins=all_edges, labels=tertile_labels)

# Show each score next to its category for a quick visual check.
preview_columns = ['clinical_stress', 'clinical_stress_qcat', 'all_stress', 'all_stress_qcat']
df[preview_columns].head()

Unnamed: 0,clinical_stress,clinical_stress_qcat,all_stress,all_stress_qcat
0,0,low,0,low
1,1,medium,1,medium
2,0,low,0,low
4,1,medium,1,medium
5,2,high,4,high
