# Updated Labels from Stulberg Classification
## Extract and match Stulberg labels with existing classification metadata

This notebook:
- Loads Stulberg classification data from second sheet
- Parses image filenames to extract patient number, view, and side
- Filters for AP (anterior-posterior) view images only
- Matches with existing classification_metadata.xlsx
- Creates updated classification_metadata2.xlsx with matched entries only

## 1. Imports and Setup

In [1]:
print("hello world")

hello world


In [2]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

# Setup paths
BASE_DIR = Path('C:/FeatureEx')
STULBERG_FILE = BASE_DIR / 'Stulberg classification.xlsx'
METADATA_FILE = BASE_DIR / 'classification_metadata.xlsx'
OUTPUT_FILE = BASE_DIR / 'classification_metadata2.xlsx'

print(f"Base directory: {BASE_DIR}")
print(f"Input files:")
print(f"  Stulberg: {STULBERG_FILE}")
print(f"  Metadata: {METADATA_FILE}")
print(f"\nOutput file:")
print(f"  {OUTPUT_FILE}")

Base directory: C:\FeatureEx
Input files:
  Stulberg: C:\FeatureEx\Stulberg classification.xlsx
  Metadata: C:\FeatureEx\classification_metadata.xlsx

Output file:
  C:\FeatureEx\classification_metadata2.xlsx


## 2. Load Stulberg Classification Data

In [3]:
# Load Stulberg classification from second sheet
print("Loading Stulberg classification data...")

# First, let's check what sheets are available
xls = pd.ExcelFile(STULBERG_FILE)
print(f"\nAvailable sheets in Stulberg classification.xlsx:")
for i, sheet in enumerate(xls.sheet_names):
    print(f"  {i}: {sheet}")

# Load the second sheet (index 1)
stulberg_df = pd.read_excel(STULBERG_FILE, sheet_name=1)

print(f"\nStulberg data shape: {stulberg_df.shape}")
print(f"\nFirst few rows:")
print(stulberg_df.head(10))

print(f"\nColumn names:")
print(stulberg_df.columns.tolist())

print(f"\nData types:")
print(stulberg_df.dtypes)

Loading Stulberg classification data...

Available sheets in Stulberg classification.xlsx:
  0: Stulberg classification
  1: AK - Healed FH
  2: Jessica - 

Stulberg data shape: (1110, 22)

First few rows:
                        IPSG Name  Alex  IPSG  Unnamed: 3  Agreement  \
0      Patient_1_AP_FinalSM_L.bmp     3     2          32          0   
1      Patient_1_AP_FinalSM_R.bmp     1     1          11          1   
2    Patient_1_Frog_FinalSM_L.bmp     3     2          32          0   
3    Patient_1_Frog_FinalSM_R.bmp     1     1          11          1   
4    Patient_100_AP_FinalSM_L.bmp     1     1          11          1   
5    Patient_100_AP_FinalSM_R.bmp     4     4          44          1   
6  Patient_100_Frog_FinalSM_L.bmp     1     1          11          1   
7  Patient_100_Frog_FinalSM_R.bmp     4     4          44          1   
8     Patient_1003_ap_final_R.bmp     2     1          21          0   
9   Patient_1003_frog_final_R.bmp     2     1          21          0   

 

## 3. Parse Image Filenames to Extract Patient, View, and Side

In [4]:
# Extract information from image filenames
print("Parsing image filenames to extract patient number, view, and side...\n")

def parse_image_filename(filename):
    """
    Parse image filename in format: Patient_X_VIEW_..._SIDE.bmp
    Example: Patient_1_AP_..._R.bmp
    
    Returns: {'patient_number': str, 'view': str, 'side': str}
    """
    if pd.isna(filename) or not isinstance(filename, str):
        return {'patient_number': None, 'view': None, 'side': None}
    
    filename = filename.strip()
    
    # Pattern: Patient_X_VIEW_..._SIDE.bmp
    # We need to extract:
    # 1. Patient number: after 'Patient_' up to the next underscore
    # 2. View: the next component (AP, LAT, PA, etc.)
    # 3. Side: the last character before .bmp extension (L or R)
    
    try:
        # Remove .bmp extension
        name_no_ext = filename.replace('.bmp', '').replace('.BMP', '')
        
        # Split by underscore
        parts = name_no_ext.split('_')
        
        if len(parts) >= 3:
            # parts[0] = 'Patient'
            # parts[1] = patient number
            # parts[2] = view (AP, LAT, PA, etc.)
            # parts[-1] = side (L or R)
            
            patient_num = parts[1]  # e.g., '1', '1008', etc.
            view = parts[2]         # e.g., 'AP', 'LAT', 'PA'
            side = parts[-1]        # e.g., 'L', 'R'
            
            return {
                'patient_number': patient_num,
                'view': view,
                'side': side
            }
    except Exception as e:
        print(f"Error parsing '{filename}': {e}")
    
    return {'patient_number': None, 'view': None, 'side': None}


# Apply parsing to first column (image filename column)
first_col = stulberg_df.iloc[:, 0]  # Get the first column
print(f"Sample filenames from first column:")
print(first_col.head(10).tolist())

# Parse all filenames
parsed_data = first_col.apply(parse_image_filename).apply(pd.Series)

print(f"\nParsed data (first 10 rows):")
print(parsed_data.head(10))

# Add parsed columns to dataframe
stulberg_df['patient_number'] = parsed_data['patient_number']
stulberg_df['view'] = parsed_data['view']
stulberg_df['side'] = parsed_data['side']

print(f"\nStulberg data with parsed columns:")
print(stulberg_df.head(10))

Parsing image filenames to extract patient number, view, and side...

Sample filenames from first column:
['Patient_1_AP_FinalSM_L.bmp', 'Patient_1_AP_FinalSM_R.bmp', 'Patient_1_Frog_FinalSM_L.bmp', 'Patient_1_Frog_FinalSM_R.bmp', 'Patient_100_AP_FinalSM_L.bmp', 'Patient_100_AP_FinalSM_R.bmp', 'Patient_100_Frog_FinalSM_L.bmp', 'Patient_100_Frog_FinalSM_R.bmp', 'Patient_1003_ap_final_R.bmp', 'Patient_1003_frog_final_R.bmp']

Parsed data (first 10 rows):
  patient_number  view side
0              1    AP    L
1              1    AP    R
2              1  Frog    L
3              1  Frog    R
4            100    AP    L
5            100    AP    R
6            100  Frog    L
7            100  Frog    R
8           1003    ap    R
9           1003  frog    R

Stulberg data with parsed columns:
                        IPSG Name  Alex  IPSG  Unnamed: 3  Agreement  \
0      Patient_1_AP_FinalSM_L.bmp     3     2          32          0   
1      Patient_1_AP_FinalSM_R.bmp     1     1          

## 4. Filter for AP View Images Only

In [8]:
# Filter for AP view images only
print("Filtering for AP view images only...\n")

print(f"View distribution in original data:")
print(stulberg_df['view'].value_counts())

# Filter for AP images
stulberg_ap_df = stulberg_df[stulberg_df['view'].str.upper() == 'AP'].copy()

print(f"\nAP images only:")
print(f"  Total rows: {len(stulberg_ap_df)}")
print(f"\nFirst few AP entries:")
print(stulberg_ap_df.head(10))

# Display relevant columns
print(f"\nRelevant columns:")
print(stulberg_ap_df[['patient_number', 'view', 'side', 'Alex']].head(15))

Filtering for AP view images only...

View distribution in original data:
view
ap      478
frog    463
AP       87
Frog     82
Name: count, dtype: int64

AP images only:
  Total rows: 565

First few AP entries:
                       IPSG Name  Alex  IPSG  Unnamed: 3  Agreement  \
0     Patient_1_AP_FinalSM_L.bmp     3     2          32          0   
1     Patient_1_AP_FinalSM_R.bmp     1     1          11          1   
4   Patient_100_AP_FinalSM_L.bmp     1     1          11          1   
5   Patient_100_AP_FinalSM_R.bmp     4     4          44          1   
8    Patient_1003_ap_final_R.bmp     2     1          21          0   
10   Patient_1007_ap_final_L.bmp     1     1          11          1   
11   Patient_1007_ap_final_R.bmp     2     2          22          1   
14    Patient_101_ap_final_L.bmp     1     1          11          1   
15    Patient_101_ap_final_R.bmp     2     1          21          0   
18   Patient_1018_ap_final_L.bmp     1     1          11          1   

    Unn

## 5. Load Classification Metadata

In [9]:
# Load existing classification metadata
print("Loading classification_metadata.xlsx...\n")

metadata_df = pd.read_excel(METADATA_FILE, sheet_name='samples')

print(f"Metadata shape: {metadata_df.shape}")
print(f"\nFirst few rows:")
print(metadata_df.head(10))

print(f"\nColumn names:")
print(metadata_df.columns.tolist())

print(f"\nData types:")
print(metadata_df.dtypes)

print(f"\nFirst column (sample_id) sample values:")
print(metadata_df.iloc[:, 0].head(15).tolist())

Loading classification_metadata.xlsx...

Metadata shape: (488, 6)

First few rows:
  sample_id      image_path  label  age gender  split
0      1.5L     1.5L.nii.gz      3   67      F  train
1      1.6L     1.6L.nii.gz      4   55      F  train
2     10.1L    10.1L.nii.gz      2   37      M  train
3     10.2L    10.2L.nii.gz      4   68      F  train
4     10.3L    10.3L.nii.gz      4   58      F  train
5   1022.1L  1022.1L.nii.gz      1   51      M  train
6   1099.1L  1099.1L.nii.gz      2   43      F  train
7     11.1L    11.1L.nii.gz      2   42      M  train
8     11.2L    11.2L.nii.gz      2   51      F  train
9     11.3L    11.3L.nii.gz      4   56      M  train

Column names:
['sample_id', 'image_path', 'label', 'age', 'gender', 'split']

Data types:
sample_id     object
image_path    object
label          int64
age            int64
gender        object
split         object
dtype: object

First column (sample_id) sample values:
['1.5L', '1.6L', '10.1L', '10.2L', '10.3L', '1022.1

## 6. Create Matching Key for Stulberg Data

In [10]:
# Create a matching key in format: 'Patient XXXX Side' or 'Patient X Side'
print("Creating matching keys for AP images...\n")

def create_patient_side_key(row):
    """
    Create matching key in format: 'Patient XXXX Side'
    Example: 'Patient 1008 Right' or 'Patient 99 Left'
    """
    patient_num = row['patient_number']
    side = row['side']
    
    if pd.isna(patient_num) or pd.isna(side):
        return None
    
    # Convert side (L/R to Left/Right)
    side_full = 'Right' if side == 'R' else 'Left' if side == 'L' else None
    
    if side_full is None:
        return None
    
    return f"Patient {patient_num} {side_full}"

# Apply to AP filtered data
stulberg_ap_df['match_key'] = stulberg_ap_df.apply(create_patient_side_key, axis=1)

print(f"Sample match keys:")
print(stulberg_ap_df[['patient_number', 'side', 'match_key']].head(15))

print(f"\nMatch key statistics:")
print(f"  Total AP entries: {len(stulberg_ap_df)}")
print(f"  Valid match keys: {stulberg_ap_df['match_key'].notna().sum()}")
print(f"  Missing match keys: {stulberg_ap_df['match_key'].isna().sum()}")

Creating matching keys for AP images...

Sample match keys:
   patient_number side           match_key
0               1    L      Patient 1 Left
1               1    R     Patient 1 Right
4             100    L    Patient 100 Left
5             100    R   Patient 100 Right
8            1003    R  Patient 1003 Right
10           1007    L   Patient 1007 Left
11           1007    R  Patient 1007 Right
14            101    L    Patient 101 Left
15            101    R   Patient 101 Right
18           1018    L   Patient 1018 Left
20           1019    R  Patient 1019 Right
22           1024    R  Patient 1024 Right
24           1026    L   Patient 1026 Left
26            103    R   Patient 103 Right
28           1034    L   Patient 1034 Left

Match key statistics:
  Total AP entries: 565
  Valid match keys: 565
  Missing match keys: 0


## 7. Parse Classification Metadata Sample IDs

In [11]:
# The first column of classification_metadata.xlsx contains sample_id
# We need to create a matching key from these
print("Extracting patient and side info from classification_metadata.xlsx...\n")

# The first column name
id_column = metadata_df.columns[0]
print(f"ID column: '{id_column}'")

# Sample values
print(f"\nSample ID values:")
print(metadata_df[id_column].unique()[:20])

# Create the match key directly from the sample_id
# It appears these are already in the format 'Patient XXXX Side' or similar
# Let's check if they match our generated keys
metadata_df['match_key'] = metadata_df[id_column].str.strip()

print(f"\nMatch keys from metadata (first 15):")
print(metadata_df['match_key'].head(15).tolist())

Extracting patient and side info from classification_metadata.xlsx...

ID column: 'sample_id'

Sample ID values:
['1.5L' '1.6L' '10.1L' '10.2L' '10.3L' '1022.1L' '1099.1L' '11.1L' '11.2L'
 '11.3L' '11.4L' '12.1R' '12.2R' '13.1L' '13.2L' '14.1R' '14.2R' '15.1R'
 '15.2R' '16.1R']

Match keys from metadata (first 15):
['1.5L', '1.6L', '10.1L', '10.2L', '10.3L', '1022.1L', '1099.1L', '11.1L', '11.2L', '11.3L', '11.4L', '12.1R', '12.2R', '13.1L', '13.2L']


## 8. Match and Merge Data

In [12]:
# Perform the merge/join
print("Matching Stulberg AP data with classification_metadata...\n")

# Create a subset of Stulberg with only relevant columns
stulberg_to_merge = stulberg_ap_df[['match_key', 'Alex']].copy()
stulberg_to_merge.columns = ['match_key', 'stulberg_label']
stulberg_to_merge = stulberg_to_merge[stulberg_to_merge['match_key'].notna()]

print(f"Stulberg AP data to merge:")
print(f"  Rows: {len(stulberg_to_merge)}")
print(stulberg_to_merge.head(10))

print(f"\nMetadata match keys:")
print(f"  Rows: {len(metadata_df)}")
print(metadata_df[['match_key']].head(10))

# Inner join to keep only matching entries
merged_df = metadata_df.merge(
    stulberg_to_merge,
    on='match_key',
    how='inner'
)

print(f"\nMerge results:")
print(f"  Original metadata rows: {len(metadata_df)}")
print(f"  Stulberg AP rows: {len(stulberg_to_merge)}")
print(f"  Matched rows: {len(merged_df)}")

print(f"\nMerged data (first 10 rows):")
print(merged_df.head(10))

Matching Stulberg AP data with classification_metadata...

Stulberg AP data to merge:
  Rows: 565
             match_key  stulberg_label
0       Patient 1 Left               3
1      Patient 1 Right               1
4     Patient 100 Left               1
5    Patient 100 Right               4
8   Patient 1003 Right               2
10   Patient 1007 Left               1
11  Patient 1007 Right               2
14    Patient 101 Left               1
15   Patient 101 Right               2
18   Patient 1018 Left               1

Metadata match keys:
  Rows: 488
  match_key
0      1.5L
1      1.6L
2     10.1L
3     10.2L
4     10.3L
5   1022.1L
6   1099.1L
7     11.1L
8     11.2L
9     11.3L

Merge results:
  Original metadata rows: 488
  Stulberg AP rows: 565
  Matched rows: 161

Merged data (first 10 rows):
            sample_id                 image_path  label  age gender  split  \
0    Patient 101 Left    Patient 101 Left.nii.gz      3   32      M  train   
1   Patient 103 Right   Patient

## 9. Create Updated Classification Metadata

In [13]:
# Prepare output dataframe in the same format as original
print("Preparing output file...\n")

# Get all original columns from metadata
original_columns = [col for col in metadata_df.columns if col != 'match_key']
print(f"Original columns: {original_columns}")

# Select output columns
output_df = merged_df[original_columns].copy()

print(f"\nOutput dataframe shape: {output_df.shape}")
print(f"\nOutput dataframe (first 15 rows):")
print(output_df.head(15))

print(f"\nOutput dataframe columns:")
print(output_df.columns.tolist())

print(f"\nOutput statistics:")
print(output_df.describe())

Preparing output file...

Original columns: ['sample_id', 'image_path', 'label', 'age', 'gender', 'split']

Output dataframe shape: (161, 6)

Output dataframe (first 15 rows):
             sample_id                 image_path  label  age gender  split
0     Patient 101 Left    Patient 101 Left.nii.gz      3   32      M  train
1    Patient 103 Right   Patient 103 Right.nii.gz      2   65      M  train
2   Patient 1034 Right  Patient 1034 Right.nii.gz      3   25      M  train
3    Patient 104 Right   Patient 104 Right.nii.gz      0   70      M  train
4    Patient 105 Right   Patient 105 Right.nii.gz      2   21      M  train
5     Patient 109 Left    Patient 109 Left.nii.gz      1   74      M  train
6     Patient 111 Left    Patient 111 Left.nii.gz      3   52      F  train
7    Patient 111 Right   Patient 111 Right.nii.gz      3   73      M  train
8    Patient 112 Right   Patient 112 Right.nii.gz      1   37      F  train
9     Patient 115 Left    Patient 115 Left.nii.gz      0   51   

## 10. Export to Excel

In [18]:
# Save to Excel
print("Exporting to classification_metadata2.xlsx...\n")

# Save with the same sheet name as original
output_df.to_excel(OUTPUT_FILE, sheet_name='samples', index=False)

print(f"Successfully saved to: {OUTPUT_FILE}")
print(f"\nFile statistics:")
print(f"  Rows: {len(output_df)}")
print(f"  Columns: {len(output_df.columns)}")

# Verify the output
verification_df = pd.read_excel(OUTPUT_FILE, sheet_name='samples')
print(f"\nVerification - Read back from file:")
print(f"  Shape: {verification_df.shape}")
print(f"\nFirst 10 rows:")
print(verification_df)#.head(100))

Exporting to classification_metadata2.xlsx...

Successfully saved to: C:\FeatureEx\classification_metadata2.xlsx

File statistics:
  Rows: 161
  Columns: 6

Verification - Read back from file:
  Shape: (161, 6)

First 10 rows:
              sample_id                 image_path  label  age gender  split
0      Patient 101 Left    Patient 101 Left.nii.gz      3   32      M  train
1     Patient 103 Right   Patient 103 Right.nii.gz      2   65      M  train
2    Patient 1034 Right  Patient 1034 Right.nii.gz      3   25      M  train
3     Patient 104 Right   Patient 104 Right.nii.gz      0   70      M  train
4     Patient 105 Right   Patient 105 Right.nii.gz      2   21      M  train
..                  ...                        ...    ...  ...    ...    ...
156   Patient 930 Right   Patient 930 Right.nii.gz      4   36      F   test
157    Patient 945 Left    Patient 945 Left.nii.gz      0   39      F   test
158   Patient 953 Right   Patient 953 Right.nii.gz      4   23      F   test
159

## 11. Summary and Statistics

In [15]:
print("\n" + "="*70)
print("LABEL UPDATE SUMMARY")
print("="*70)

print(f"\n1. INPUT DATA")
print(f"   - Stulberg classification file: {STULBERG_FILE.name}")
print(f"   - Total rows in Stulberg sheet 2: {len(stulberg_df)}")
print(f"   - AP view images extracted: {len(stulberg_ap_df)}")
print(f"   - Valid match keys created: {stulberg_ap_df['match_key'].notna().sum()}")

print(f"\n2. ORIGINAL METADATA")
print(f"   - Original file: {METADATA_FILE.name}")
print(f"   - Total entries: {len(metadata_df)}")

print(f"\n3. MATCHING RESULTS")
print(f"   - Entries found in both files: {len(output_df)}")
print(f"   - Match percentage: {len(output_df) / len(metadata_df) * 100:.1f}%")

print(f"\n4. OUTPUT FILE")
print(f"   - Filename: {OUTPUT_FILE.name}")
print(f"   - Location: {OUTPUT_FILE}")
print(f"   - Rows: {len(output_df)}")
print(f"   - Columns: {len(output_df.columns)}")

print(f"\n5. COLUMNS IN OUTPUT")
for col in output_df.columns:
    print(f"   - {col}")

print(f"\n" + "="*70)


LABEL UPDATE SUMMARY

1. INPUT DATA
   - Stulberg classification file: Stulberg classification.xlsx
   - Total rows in Stulberg sheet 2: 1110
   - AP view images extracted: 565
   - Valid match keys created: 565

2. ORIGINAL METADATA
   - Original file: classification_metadata.xlsx
   - Total entries: 488

3. MATCHING RESULTS
   - Entries found in both files: 161
   - Match percentage: 33.0%

4. OUTPUT FILE
   - Filename: classification_metadata2.xlsx
   - Location: C:\FeatureEx\classification_metadata2.xlsx
   - Rows: 161
   - Columns: 6

5. COLUMNS IN OUTPUT
   - sample_id
   - image_path
   - label
   - age
   - gender
   - split

