Imports and path resolution

In [1]:
import sys
from pathlib import Path

# Locate the project root (the directory that holds the `src` package) so the
# local `src.*` imports below resolve.
# The parent of the working directory is tried first, which is the layout this
# notebook was written for. If the kernel was started somewhere else (e.g. from
# the project root itself, or from a nested folder), the working directory and
# its ancestors are searched as well.
_cwd = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_cwd.parent, _cwd, *_cwd.parents)
        if (candidate / "src").is_dir()
    ),
    _cwd.parent,  # nothing found: keep the original guess so the import error stays obvious
)
if str(PROJECT_ROOT) not in sys.path:
    # Prepend so the project's own `src` package wins over any same-named installed package.
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import get_paths, set_seed
from src.io import read_table, save_dataset

import pandas as pd

Initialize paths and constants

In [2]:
# Notebook-wide configuration: project directories, random seed, raw input file.
SEED = 42  # single place to change the seed passed to set_seed

paths = get_paths()

# Called for its side effect only (set_seed lives in src.config and is not shown
# here; presumably it seeds the random number generators -- confirm there).
# The result is deliberately not bound to `_`: once user code assigns `_`,
# IPython stops updating its automatic "last output" variable.
set_seed(SEED)

DATA_FILE = paths.raw / "AtrofiaMuscularRadiomicaML.xlsx"
# Stop here with the fully resolved path if the workbook is not where it is expected.
assert DATA_FILE.exists(), f"Data file not found: {DATA_FILE.resolve()}"

DATA_FILE

WindowsPath('c:/Users/modre/Documents/masseter/data/raw/AtrofiaMuscularRadiomicaML.xlsx')

Load Excel (sheet selection)

In [3]:
# Read the first worksheet (position 0) of the raw workbook into `df`.
df = pd.read_excel(io=DATA_FILE, sheet_name=0)

# Quick look: (rows, columns) on stdout, then the first three rows as the cell output.
print(df.shape)
df.head(n=3)

(571, 328)


Unnamed: 0,identificação_imagem,estado_toxina,bloco_de_estudo,grupo_estudo,lateralidade,aumento_de_dados,shape_Elongation_pre,shapeFlatness_pre,shapeLeastAxisLength_pre,shapeMajorAxisLength_pre,...,glszmSmallAreaLowGrayLevelEmphasis_delta,glszmZoneEntropy_delta,glszmZonePercentage_delta,glszmZoneVariance_delta,ngtdmBusyness_delta,ngtdmCoarseness_delta,ngtdmComplexity_delta,ngtdmContrast_delta,ngtdmStrength_delta,pct_area_contratil
0,GC CX1 R0 (2) 16:34:23,0,1,GC,0,0,0.112424,0,0,329.273323,...,0.037473,-0.640015,-0.000106,-110276.354897,4.942555,0.000705,-1.950018,0.002474,-0.016493,0.484456
1,GC CX1 R0 (2) 16:34:28,0,1,GC,0,0,0.114866,0,0,329.053329,...,0.029948,-1.165615,-0.001097,-195378.88078,7.725856,0.000806,-4.114569,0.001974,-0.044182,0.407464
2,GC CX1 R0 (2) 16:34:33,0,1,GC,0,0,0.133911,0,0,348.50255,...,0.018554,-0.398994,-7.2e-05,-242108.606222,-20.32631,0.000743,0.181492,-0.000398,0.063477,0.36417


Define columns

In [4]:
# Radiomic predictor columns. Every name has the form "<family><Feature>_delta".
# The list is assembled family by family; the resulting order is the column
# order of every table derived from it.
RADIOMICS_COLS = [
    f"{family}{feature}_delta"
    for family, features in (
        ("shape", (
            "LeastAxisLength",
            "MajorAxisLength",
            "Maximum2DDiameterColumn",
            "Maximum2DDiameterRow",
            "Maximum2DDiameterSlice",
            "Maximum3DDiameter",
            "MeshVolume",
            "MinorAxisLength",
            "Sphericity",
            "SurfaceArea",
            "SurfaceVolumeRatio",
            "VoxelVolume",
        )),
        ("firstorder", (
            "10Percentile",
            "90Percentile",
            "Energy",
            "Entropy",
            "InterquartileRange",
            "Kurtosis",
            "Maximum",
            "MeanAbsoluteDeviation",
            "Mean",
            "Median",
            "Minimum",
            "Range",
            "RobustMeanAbsoluteDeviation",
            "RootMeanSquared",
            "Skewness",
            "TotalEnergy",
            "Uniformity",
            "Variance",
        )),
        ("glcm", (
            "Autocorrelation",
            "ClusterProminence",
            "ClusterShade",
            "ClusterTendency",
            "Contrast",
            "Correlation",
            "DifferenceAverage",
            "DifferenceEntropy",
            "DifferenceVariance",
            "Id",
            "Idm",
            "Idmn",
            "Idn",
            "Imc1",
            "Imc2",
            "InverseVariance",
            "JointAverage",
            "JointEnergy",
            "JointEntropy",
            "MCC",
            "MaximumProbability",
            "SumAverage",
            "SumEntropy",
            "SumSquares",
        )),
        ("gldm", (
            "DependenceEntropy",
            "DependenceNonUniformity",
            "DependenceNonUniformityNormalized",
            "DependenceVariance",
            "GrayLevelNonUniformity",
            "GrayLevelVariance",
            "HighGrayLevelEmphasis",
            "LargeDependenceEmphasis",
            "LargeDependenceHighGrayLevelEmphasis",
            "LargeDependenceLowGrayLevelEmphasis",
            "LowGrayLevelEmphasis",
            "SmallDependenceEmphasis",
            "SmallDependenceHighGrayLevelEmphasis",
            "SmallDependenceLowGrayLevelEmphasis",
        )),
        ("glrlm", (
            "GrayLevelNonUniformity",
            "GrayLevelNonUniformityNormalized",
            "GrayLevelVariance",
            "HighGrayLevelRunEmphasis",
            "LongRunEmphasis",
            "LongRunHighGrayLevelEmphasis",
            "LongRunLowGrayLevelEmphasis",
            "LowGrayLevelRunEmphasis",
            "RunEntropy",
            "RunLengthNonUniformity",
            "RunLengthNonUniformityNormalized",
            "RunPercentage",
            "RunVariance",
            "ShortRunEmphasis",
            "ShortRunHighGrayLevelEmphasis",
            "ShortRunLowGrayLevelEmphasis",
        )),
        ("glszm", (
            "GrayLevelNonUniformity",
            "GrayLevelNonUniformityNormalized",
            "GrayLevelVariance",
            "HighGrayLevelZoneEmphasis",
            "LargeAreaEmphasis",
            "LargeAreaHighGrayLevelEmphasis",
            "LargeAreaLowGrayLevelEmphasis",
            "LowGrayLevelZoneEmphasis",
            "SizeZoneNonUniformity",
            "SizeZoneNonUniformityNormalized",
            "SmallAreaEmphasis",
            "SmallAreaHighGrayLevelEmphasis",
            "SmallAreaLowGrayLevelEmphasis",
            "ZoneEntropy",
            "ZonePercentage",
            "ZoneVariance",
        )),
        ("ngtdm", (
            "Busyness",
            "Coarseness",
            "Complexity",
            "Contrast",
            "Strength",
        )),
    )
    for feature in features
]

# Names of the two non-radiomic columns read from the workbook.
PCT_COL = "pct_area_contratil"
GROUP_COL = "grupo_estudo"

# Numeric value assigned to each study-group label (used to derive a numeric target).
GROUP_TO_DAYS = dict(GC=0, G2=2, G5=5, G7=7, G14=14)


Validate required columns

In [5]:
# Fail fast if the requested column list is inconsistent or if the workbook
# lacks any column that the following cells select.
requested = pd.Index(RADIOMICS_COLS + [PCT_COL, GROUP_COL])
required = set(requested)

# A set silently collapses repeated names, so a duplicate would pass the
# "missing" check below while df[RADIOMICS_COLS] returned that column twice.
duplicated = sorted(requested[requested.duplicated()].unique())
assert not duplicated, f"Duplicated names in required columns: {duplicated}"

missing = sorted(required - set(df.columns))

# Only the first ten missing names are shown to keep the message readable.
assert not missing, f"Missing columns in Excel: {missing[:10]}{'...' if len(missing)>10 else ''}"

print(f"All required columns found ({len(required)}).")


All required columns found (107).


Build processed datasets

In [6]:
# Predictors
# Copies are taken so later edits to these tables cannot write back into `df`.
X_radiomics = df[RADIOMICS_COLS].copy()
X_pct_contractile = df[[PCT_COL]].copy().rename(columns={PCT_COL: "pct_contractile_area"})

# Targets
# Check for missing labels *before* casting: astype(str) turns a missing value
# into the literal string "nan", which would otherwise surface only as a
# confusing "unexpected group" failure.
n_missing_groups = int(df[GROUP_COL].isna().sum())
assert n_missing_groups == 0, f"{n_missing_groups} row(s) have no value in '{GROUP_COL}'"

y_group = df[GROUP_COL].astype(str).rename("group")

# Report only the offending labels rather than every label in the column.
unexpected_groups = sorted(set(y_group.unique()) - set(GROUP_TO_DAYS))
assert not unexpected_groups, f"Unexpected groups: {unexpected_groups}"

# Every label is a key of GROUP_TO_DAYS at this point, so the map yields no NaN
# and the integer cast is safe.
y_days = y_group.map(GROUP_TO_DAYS).astype(int).rename("days_post_btx")

# Unified base dataset (for downstream modeling)
df_model = pd.concat(
    [X_radiomics, X_pct_contractile, y_group.to_frame(), y_days.to_frame()],
    axis=1
)

# All parts come from `df`, so the column-wise concat must keep the row count
# and add exactly three columns to the radiomic predictors.
assert df_model.shape == (len(df), len(RADIOMICS_COLS) + 3), f"Unexpected df_model shape: {df_model.shape}"

print("Shapes:")
print("X_radiomics:", X_radiomics.shape)
print("X_pct_contractile:", X_pct_contractile.shape)
print("y_group:", y_group.shape)
print("y_days:", y_days.shape)
print("df_model:", df_model.shape)

df_model.head(3)


Shapes:
X_radiomics: (571, 105)
X_pct_contractile: (571, 1)
y_group: (571,)
y_days: (571,)
df_model: (571, 108)


Unnamed: 0,shapeLeastAxisLength_delta,shapeMajorAxisLength_delta,shapeMaximum2DDiameterColumn_delta,shapeMaximum2DDiameterRow_delta,shapeMaximum2DDiameterSlice_delta,shapeMaximum3DDiameter_delta,shapeMeshVolume_delta,shapeMinorAxisLength_delta,shapeSphericity_delta,shapeSurfaceArea_delta,...,glszmZonePercentage_delta,glszmZoneVariance_delta,ngtdmBusyness_delta,ngtdmCoarseness_delta,ngtdmComplexity_delta,ngtdmContrast_delta,ngtdmStrength_delta,pct_contractile_area,group,days_post_btx
0,0,-41.555109,-5.0,-22.0,0.776988,0.776988,-7774.0,-25.33351,0.019725,-15595.539105,...,-0.000106,-110276.354897,4.942555,0.000705,-1.950018,0.002474,-0.016493,0.484456,GC,0
1,0,-66.735515,-25.0,-22.0,-27.13546,-27.13546,-8689.75,-24.115956,0.022502,-17454.019425,...,-0.001097,-195378.88078,7.725856,0.000806,-4.114569,0.001974,-0.044182,0.407464,GC,0
2,0,-96.399966,-66.0,-24.0,-39.821221,-39.821221,-9482.5,-19.316921,0.019996,-19090.211412,...,-7.2e-05,-242108.606222,-20.32631,0.000743,0.181492,-0.000398,0.063477,0.36417,GC,0


Save processed datasets to data/processed

In [7]:
# Persist every processed table under paths.processed and keep what
# save_dataset returns for each one.
out_dir = paths.processed

# Ensure the target folder exists before writing. save_dataset lives in src.io
# and is not shown here; this is harmless if it already creates the folder.
out_dir.mkdir(parents=True, exist_ok=True)

# Keyed by output file stem, in the order the files are written.
# The two target Series are wrapped in one-column frames before saving.
tables_to_save = {
    "X_radiomics": X_radiomics,
    "X_pct_contractile": X_pct_contractile,
    "y_group": y_group.to_frame(),
    "y_days": y_days.to_frame(),
    "df_model": df_model,
}

saved = {
    stem: save_dataset(table, out_dir / stem, index=False)
    for stem, table in tables_to_save.items()
}

saved


{'X_radiomics': {'csv': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/X_radiomics.csv'),
  'parquet': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/X_radiomics.parquet')},
 'X_pct_contractile': {'csv': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/X_pct_contractile.csv'),
  'parquet': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/X_pct_contractile.parquet')},
 'y_group': {'csv': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/y_group.csv'),
  'parquet': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/y_group.parquet')},
 'y_days': {'csv': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/y_days.csv'),
  'parquet': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/y_days.parquet')},
 'df_model': {'csv': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/df_model.csv'),
  'parquet': WindowsPath('c:/Users/modre/Documents/masseter/data/processed/df_model.parquet')}

Sanity checks (counts by group)

In [8]:
# Number of rows per study group, one row per label, ordered by label text.
summary = (
    df_model.groupby("group")
    .size()                          # row count for each label
    .reset_index(name="n_samples")   # back to a two-column frame: group, n_samples
    .sort_values("group")
)

summary


Unnamed: 0,group,n_samples
0,G14,110
1,G2,110
2,G5,121
3,G7,110
4,GC,120
