In [1]:
# Load clinical + subtypes

from pathlib import Path
import pandas as pd

PROJECT = Path.cwd()
PROCESSED = PROJECT / "data" / "processed"

clin = pd.read_csv(PROCESSED / "clinical_aligned.csv", index_col=0)
sub = pd.read_csv(PROCESSED / "subtype_labels_k2.csv", index_col=0)

print("clin:", clin.shape)
print("sub :", sub.shape)


clin: (95, 124)
sub : (95, 1)


In [2]:
# Join (adds subtype column)

clin_sub = clin.join(sub, how="inner")

print("clin_sub:", clin_sub.shape)
print(clin_sub["subtype"].value_counts())


clin_sub: (95, 125)
subtype
C1    54
C2    41
Name: count, dtype: int64


In [9]:
# Age by subtype

# Make a numeric version of age (non-numeric values become NaN)
clin_sub["age_num"] = pd.to_numeric(clin_sub["age"], errors="coerce")

# How many ages are usable?
print("age missing after conversion:", int(clin_sub["age_num"].isna().sum()))

# Summary by subtype
summary = clin_sub.groupby("subtype")["age_num"].agg(["count", "mean", "std", "min", "max"])
print(summary)



age missing after conversion: 1
         count       mean        std   min   max
subtype                                         
C1          53  63.773585  10.214322  38.0  86.0
C2          41  62.439024   9.260261  45.0  86.0


In [4]:
# Sex counts by subtype

if "sex" in clin_sub.columns:
    print(pd.crosstab(clin_sub["subtype"], clin_sub["sex"]))
else:
    print("No 'sex' column found")


sex      Female
subtype        
C1           54
C2           41


In [None]:
# Save labelled clinical table

clin_sub.to_csv(PROCESSED / "clinical_with_subtypes_k2.csv")
print("Saved: clinical_with_subtypes_k2.csv")


In [10]:
candidates = [c for c in clin_sub.columns if any(k in c.lower() for k in [
    "grade", "stage", "staging", "ajcc", "pt", "pn", "metastasis", "tumor_size", "histologic", "necrosis", "margin"
])]
candidates[:40], len(candidates)


(['tumor_size_cm',
  'histologic_type',
  'histologic_grade',
  'tumor_necrosis',
  'margin_status',
  'ajcc_tnm_cancer_staging_edition_used',
  'pathologic_staging_primary_tumor_pt',
  'pathologic_staging_regional_lymph_nodes_pn',
  'clinical_staging_distant_metastasis_cm',
  'pathologic_staging_distant_metastasis_pm',
  'specify_distant_metastasis_documented_sites',
  'tumor_stage_pathological',
  'alcohol_consumption',
  'additional_surgery_for_new_tumor_metastasis',
  'number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_metastasis'],
 15)

In [11]:
def tab(col):
    if col not in clin_sub.columns:
        print(f"{col} not found")
        return
    t = pd.crosstab(clin_sub["subtype"], clin_sub[col], dropna=False)
    print(col)
    print(t)
    print("\nRow %:")
    print(t.div(t.sum(axis=1), axis=0).round(3))
    print("\n" + "-"*40 + "\n")


In [12]:
tab("histologic_grade")
tab("pathologic_staging_primary_tumor_pt")
tab("pathologic_staging_regional_lymph_nodes_pn")


histologic_grade
histologic_grade  G1 Well differentiated  G2 Moderately differentiated  \
subtype                                                                  
C1                                    18                            20   
C2                                    14                            19   

histologic_grade  G3 Poorly differentiated  \
subtype                                      
C1                                      15   
C2                                       8   

histologic_grade  GX Grading is not applicable, cannot be assessed or not specified  
subtype                                                                              
C1                                                                1                  
C2                                                                0                  

Row %:
histologic_grade  G1 Well differentiated  G2 Moderately differentiated  \
subtype                                                                  
C

In [13]:
if "tumor_size_cm" in clin_sub.columns:
    clin_sub["tumor_size_cm_num"] = pd.to_numeric(clin_sub["tumor_size_cm"], errors="coerce")
    print(clin_sub.groupby("subtype")["tumor_size_cm_num"].agg(["count","mean","std","min","max"]))
else:
    print("tumor_size_cm not found")


         count      mean       std  min   max
subtype                                      
C1          54  3.788889  2.313510  0.8  13.5
C2          41  3.465854  2.073838  0.0  11.0


In [14]:
# tests
import pandas as pd
from scipy.stats import chi2_contingency

def chi2_test(col):
    t = pd.crosstab(clin_sub["subtype"], clin_sub[col])
    chi2, p, dof, _ = chi2_contingency(t)
    print(col)
    print("table shape:", t.shape, "dof:", dof)
    print("p-value:", p)
    print("-"*40)

chi2_test("histologic_grade")
chi2_test("pathologic_staging_primary_tumor_pt")
chi2_test("pathologic_staging_regional_lymph_nodes_pn")



histologic_grade
table shape: (2, 4) dof: 3
p-value: 0.5906693507862304
----------------------------------------
pathologic_staging_primary_tumor_pt
table shape: (2, 8) dof: 7
p-value: 0.41823713182340566
----------------------------------------
pathologic_staging_regional_lymph_nodes_pn
table shape: (2, 5) dof: 4
p-value: 0.5645068226151561
----------------------------------------


In [15]:
# collapse pT categories

pt = clin_sub["pathologic_staging_primary_tumor_pt"].astype(str)

pt_simple = (
    pt.str.replace(r"\s*\(.*\)", "", regex=True)   # remove "(FIGO ...)"
      .str.strip()
)

clin_sub["pt_simple"] = pt_simple
print(pd.crosstab(clin_sub["subtype"], clin_sub["pt_simple"]))



pt_simple  pT1  pT1a  pT1b  pT2  pT3a  pT3b
subtype                                    
C1           1    31    12    4     4     2
C2           0    25     5    7     3     1


In [16]:
chi2_test("pt_simple")


pt_simple
table shape: (2, 6) dof: 5
p-value: 0.5325915311130457
----------------------------------------


In [18]:
chi2_test("histologic_grade")

grade = clin_sub["histologic_grade"].astype(str)

clin_sub["grade_simple"] = grade.replace({
    "G1 Well differentiated": "G1",
    "G2 Moderately differentiated": "G2",
    "G3 Poorly differentiated": "G3",
    "GX Grading is not applicable, cannot be assessed or not specified": "GX",
})

# low vs high (drop GX later if you want)
clin_sub["grade_low_high"] = clin_sub["grade_simple"].replace({
    "G1": "Low (G1)",
    "G2": "Low (G2)",
    "G3": "High (G3)",
    "GX": "Unknown",
})

print(pd.crosstab(clin_sub["subtype"], clin_sub["grade_low_high"]))
chi2_test("grade_low_high")


histologic_grade
table shape: (2, 4) dof: 3
p-value: 0.5906693507862304
----------------------------------------
grade_low_high  High (G3)  Low (G1)  Low (G2)  Unknown
subtype                                               
C1                     15        18        20        1
C2                      8        14        19        0
grade_low_high
table shape: (2, 4) dof: 3
p-value: 0.5906693507862304
----------------------------------------


In [19]:
pn = clin_sub["pathologic_staging_regional_lymph_nodes_pn"].astype(str).str.strip()

def pn_group(x):
    if x in ["pN0"]:
        return "N0"
    if x.startswith("pN1") or x.startswith("pN2"):
        return "N+"
    if x in ["pNX", "0", "nan", "None"]:
        return "NX/Unknown"
    return "Other"

clin_sub["pn_simple"] = pn.map(pn_group)

print(pd.crosstab(clin_sub["subtype"], clin_sub["pn_simple"]))
chi2_test("pn_simple")


pn_simple  N+  N0  NX/Unknown
subtype                      
C1          5  22          27
C2          2  22          17
pn_simple
table shape: (2, 3) dof: 2
p-value: 0.4038441533539012
----------------------------------------


In [20]:
from pathlib import Path
PROCESSED = Path.cwd() / "data" / "processed"
clin_sub.to_csv(PROCESSED / "clinical_with_subtypes_k2.csv")
print("Saved: clinical_with_subtypes_k2.csv")


Saved: clinical_with_subtypes_k2.csv
