In [8]:
import pandas as pd
import re
from datetime import datetime

### Load in spreadsheets and create final df outline

In [9]:
# define these

template= '1Bulk_sample_template_V11.xlsx'
raw= 'PRB_LB_0325_raw.xlsx'
sheet_name= 'Clinical Data'
raw_header= 1
template_header= 1

In [10]:
# load in the data

template= pd.read_excel(template, header= template_header)
raw= pd.read_excel(raw, sheet_name= sheet_name, header= raw_header)
final= pd.DataFrame(index= raw.index, columns= template.columns)

### Clean biomarker data (includes menopausal status)

In [11]:
# menopausal status
biomarker_cols= ["Biomarker 1", "Biomarker 2", "Biomarker 3", "Biomarker 4", "Biomarker 5", "Biomarker 6"]

menopause_mapping= {
    "premenopause": ["pre menopause"],
    "perimenopause": ["peri menopause"],
    "menopause": ["menopause"],
    "postmenopause": ["post menopause"]
}

def extract_menopause_status(row):
    for col in biomarker_cols:
        val= row.get(col)
        if pd.isna(val):
            continue
        val_str= str(val).strip().lower()

        if "menopause status" in val_str:
            parts= val_str.split("=")
            if len(parts)>1:
                raw_status= parts[1].strip().lower()

                for clean_val, options in menopause_mapping.items():
                    if raw_status in [opt.lower() for opt in options]:
                        return clean_val
                return "Unknown"
    return pd.NA

final["Menopausal Status"]= raw.apply(extract_menopause_status, axis=1)

In [12]:
# biomarkers

biomarker_lookup = {
    "HER2": ["her2"],
    "ER": ["er", "estrogen receptor"],
    "PR": ["pr", "progesterone receptor"],
    "FISH": ["fish"],
    "PDL1": ["pdl1"],
    "ALK": ["alk"],
    "ROS": ["ros"],
    "EGFR": ["egfr"],
    "KRAS": ["kras"],
    "PIK3CA": ["pik3ca"],
    "ESR1": ["esr1"],
    "AR": ["ar"],
}

pos_neg_mapping = {
    "Positive": [
        "positive", "strong positive", "weak positive", "2+", "3+", "12", "8", "6", "9", "3", ">= 10", ">=10"
    ],
    "Negative": [
        "negative", "0", "none", "not detected", "less than 10%", "less than 10", "1+"
    ]
}

def interpret_her2_status(val):
    if pd.isna(val):
        return pd.NA

    val = str(val).lower()
    her2_score = None
    fish_result = None

    match_ihc = re.search(r"her2\s*=\s*([0-3]\+?)", val)
    if match_ihc:
        her2_score = match_ihc.group(1)

    if "fish" in val or "ish" in val:
        if "positive" in val:
            fish_result = "positive"
        elif "negative" in val:
            fish_result = "negative"

    if her2_score in ["0", "1+"]:
        return "negative"
    elif her2_score == "3+":
        return "positive"
    elif her2_score == "2+":
        if fish_result == "positive":
            return "positive"
        elif fish_result == "negative":
            return "negative"
        else:
            return "HER2 2+ (FISH/ISH missing)"

    return pd.NA

def extract_pos_neg_biomarker(row, biomarker_cols, biomarker_lookup, pos_neg_mapping):
    results = {}

    for col in biomarker_cols:
        cell = row.get(col)
        if pd.isna(cell):
            continue
        val = str(cell).strip().lower()

        if "=" in val:
            biomarker_name, raw_result = [s.strip() for s in val.split("=", 1)]

            for template_col, raw_variants in biomarker_lookup.items():
                if biomarker_name.lower() in [v.lower() for v in raw_variants]:
                    if template_col == "HER2":
                        mapped = interpret_her2_status(val)
                    else:
                        mapped = None
                        for label, values in pos_neg_mapping.items():
                            for v in values:
                                if v.lower() in raw_result:
                                    mapped = label
                                    break
                            if mapped:
                                break
                        if not mapped:
                            mapped = raw_result
                    results[template_col] = mapped
    return results

biomarker_cols = ["Biomarker 1", "Biomarker 2", "Biomarker 3", "Biomarker 4", "Biomarker 5", "Biomarker 6"]

biomarker_data = raw.apply(
    lambda row: extract_pos_neg_biomarker(row, biomarker_cols, biomarker_lookup, pos_neg_mapping),
    axis=1
)

for idx, result in biomarker_data.items():
    for col, val in result.items():
        final.at[idx, col] = val


### Map any columns that just need basic cleaning

In [13]:
# map columns to eachother

column_mapping= {
    "Date of Blood Draw/Cell Collection": "Date of blood collection [yyyy-mm-dd]",
    "Time of Draw": "Date of blood collection [yyyy-mm-dd]",
    "ExternalId": "Sample ID\nconsecutive",
    "Stabilizer": "Collection Tube Type",
    "Gender": "Gender",
    "Single or Double Spun": "Single or Double Spun",
    "Height": "Body height [cm]",
    "Sample Timepoint": "Blood sample timepoint",
    "TNM": "TNM",
    "Stage": "Stage",
    "Morphology Code": "Morphology code",
    "Description of Morphology Code": "Description of morphology code",
    "ExPatientId": "Patient ID\nconsecutive"  
}
    

In [14]:
# cleaning functions - one for each template column that just needs basic cleaning/mapping


def clean_date(val):
    try:
        return pd.to_datetime(val).strftime("%Y-%m-%d")
    except:
        return pd.NA

def clean_time(val):
    try: 
        return pd.to_datetime(val).strftime("%I:%M:%S %p")
    except: 
        return pd.NA

# function factory for basic mapping cleaning
def make_cleaner(mapping_dict):
    '''
    returns a cleaning function that maps raw values to the standard values
    '''
    lookup= {}
    for standard_val, raw_options in mapping_dict.items():
        for raw_val in raw_options:
            lookup[str(raw_val).strip().lower()]= standard_val
    def cleaner(val):
        if pd.isna(val):
            return pd.NA
        val_str= str(val).strip().lower()
        return lookup.get(val_str, pd.NA)
    return cleaner

# create functions from function factory
clean_stabilizer= make_cleaner({
    "Streck": ["Streck Cell-Free DNA BCT"]
})

clean_gender= make_cleaner({
    "Male": ['m', 'male', 'M', 'Male'],
    "Female": ['f', 'female', 'F', 'Female']
})

clean_single_double= make_cleaner({
    "Single": ['single', 'Single'],
    "Double": ['double', 'Double']
})

clean_sample_timepoint= make_cleaner({
    "treatment-naïve": ["Initial-0"]
})

clean_stage= make_cleaner({
    "I": ["I", "IA", "IB"],
    "II": ["II", "IIA", "IIB"],
    "III": ["III", "IIIA", "IIIB"],
    "IV": ["IV", "IVA", "IVB"]
})
    

In [15]:
# transformation registry

transformations= {
    "Date of Blood Draw/Cell Collection": clean_date,
    "Time of Draw": clean_time,
    "Stabilizer": clean_stabilizer,
    "Gender": clean_gender,
    "Single or Double Spun": clean_single_double,
    "Sample Timepoint": clean_sample_timepoint,
    "Stage": clean_stage
}

### Functions for columns that need to be calculated

In [16]:
def calculate_age(birth_val, collection_date):
    if pd.isna(birth_val) or pd.isna(collection_date):
        return pd.NA
    try:
        birth_str= str(birth_val).strip()
        if len(birth_str)== 4 and birth_str.isdigit():
            birth_date= pd.to_datetime(f"{birth_str}-01-01")
        else:
            birth_date= pd.to_datetime(birth_str, errors= "coerce")
        collection_dt= pd.to_datetime(collection_date, errors= "coerce")

        age= (collection_dt-birth_date).days // 365
        return age
    except:
        return pd.NA

def calculate_elapsed_days(row, start_col, end_col):
    start_val= row.get(start_col)
    end_val= row.get(end_col)

    if pd.isna(start_val):
        return pd.NA
    if pd.isna(end_val):
        return pd.NA
    try:
        start_date= pd.to_datetime(start_val, errors= "coerce")
        end_date= pd.to_datetime(end_val, errors= "coerce")

        if pd.isna(start_date) or pd.isna(end_date):
            return pd.NA

        delta= (end_date-start_date).days
        return delta if delta >= 0 else pd.NA
    except:
        return pd.NA

In [17]:
# Create column for duration between cancer diganosis and blood draw

final["Duration between Cancer Diagnosis and Blood Draw (days)"] = raw.apply(
    lambda row: calculate_elapsed_days(row, "Date of cancer diagnosis [yyyy-mm-dd]", "Date of blood collection [yyyy-mm-dd]"), axis=1)

# create a column for duration between tnm staging and blood draw
final["Duration between TNM Staging and Blood Draw (days)"] = raw.apply(
    lambda row: calculate_elapsed_days(row, "Date of TNM staging  [yyyy-mm-dd]", "Date of blood collection [yyyy-mm-dd]"), axis=1)

# create a column for age at the time of blood collection
final["AgeAtCollection"]= raw.apply(
    lambda row: calculate_age(row["Year of birth"], row["Date of blood collection [yyyy-mm-dd]"]), axis=1)

### Define columns that are not pulled from the raw data

In [18]:
# columns that are not pulled from raw data
fixed_values= {
    "Organism": "Human",
    "Data Transformer": "Nalika Palayoor",
    "Date of Transformation": "17-Jul-2025",
    "SpecimenType": "Plasma",
    "Condition": "cancer",
    "Diagnostic Condition": "breast cancer",
    "Hemolysis": "no hemolysis",
    "Project": "PB-2881",
    "ContainerType": "tube",
    "RNA-Sequencing Available": "No",
    "Source": "Indivumed",
    "Country": "Germany"
}

for col,val in fixed_values.items():
    if col in final.columns:
        final[col]= val


### Create final dataframe

In [19]:
for template_col in column_mapping:
    raw_col= column_mapping[template_col]
    if raw_col in raw.columns:
        if template_col in transformations:
            final[template_col]= raw[raw_col].apply(transformations[template_col])
        else:
            final[template_col]= raw[raw_col]


In [20]:
pd.set_option("display.max_columns",None)
final.head(50)

Unnamed: 0,ExternalId,Received Date,ContainerType,Volume_uL,TubeBarcode,Concentration,ConcentrationUnits,Organism,Stabilizer,Single or Double Spun,Processing Method,Processing Time(hrs),Freeze Thaw Status,Hemolysis,Project,Matched FFPE Available,Date of Blood Draw/Cell Collection,Time of Draw,Block Size,Tissue Size,Tissue Weight (mg),% Tumor,% Necrosis,Surgery Type,Tumor Tissue Type,Data Transformer,Date of Transformation,Other Sample Notes,ExSpecimenId,Collection Site,SpecimenType,Condition,Diagnostic Condition,Histology,Height,Weight,BMI,Duration between Cancer Diagnosis and Blood Draw (days),Duration between Metastatic Diagnosis and Blood Draw (days),Sample Timepoint,Sample Timepoint Description,AgeAtCollection,Detailed Anatomical Location,Grade,Tumor Size,TNM,Duration between TNM Staging and Blood Draw (days),Stage,Stage Detailed,Morphology Code,Description of Morphology Code,Metastatic Sites,Vehicle Control,Media Conditions,Additional Supplements to Media,Protocols for Harvesting Cell Lines,Blood collection date (days from birth),Number of lines of metastatic therapy at time of blood draw,Number of lines of chemotherapy at time of blood draw,Number of lines of anti-HER2 therapy at time of blood draw,Number of lines of endocrine therapy at time of blood draw,Overall Survival(months),Treatment Data,Progression Free Survival(months),Gestational Age at Collection,Fetus Sex,Menopausal Status,Blood Type,HER2 IHC,HER2,HER2 FISH,FISH Notes,HER2 Change from Previos Sample,ER,ER Notes,ER Status Change from Previous Sample,PR,PR Notes,PR Status Change from Previous Sample,AR,AR Notes,AR Status Change From Previous Sample,ROS,ALK,EGFR,EGFR Allele Information,PDL1,KRAS,PIK3CA,ESR1,BRCA1,BRCA2,FOLR1,Ki67,TROP2,HER3,MET,Tissue Factor (TF),NECTIN4,CEACAM5,LIV1,MESOTHELIN,B7H3,B7H4,FRalpha,DLL3,CDH6,PGR,PSA,KLF5,FGF/FGFR,Biomarker Notes,RNA-Sequencing Available,ExPatientId,Source,Country,Gender,Race,MedicalHistory,FamilyHistory,AlcoholHistory,SmokingHistory,Number of years smoked or smoking,Smoking Notes,Donor Notes,FISH
0,408_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2017-12-20,11:29:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,160.0,,,5,,treatment-naïve,,57,,,,cT4c cN1 cM1,5.0,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,postmenopause,,,positive,,,,Negative,,,Negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_408,Indivumed,Germany,Female,,,,,,,,,
1,409_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2019-09-17,11:04:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,160.0,,,33,,treatment-naïve,,74,,,,rT0 pN0 pM1 GX,33.0,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,postmenopause,,,positive,,,,Negative,,,Negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_409,Indivumed,Germany,Female,,,,,,,,,Positive
2,410_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2019-02-12,11:34:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,175.0,,,28,,treatment-naïve,,71,,,,rT0 cN0 pM1 G3 LX VX PnX,28.0,IV,,8520/3,"Lobular carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,premenopause,,,negative,,,,Positive,,,Positive,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_410,Indivumed,Germany,Female,,,,,,,,,
3,411_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2019-01-22,09:57:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,169.0,,,38,,treatment-naïve,,56,,,,cT2 cN0 cM1 G3,38.0,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,premenopause,,,negative,,,,Positive,,,Negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_411,Indivumed,Germany,Female,,,,,,,,,
4,412_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2023-02-28,11:16:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,166.0,,,44,,treatment-naïve,,65,,,,cT4b cNX cM1 G2,44.0,IV,,8520/3,"Lobular carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,postmenopause,,,positive,,,,Negative,,,Negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_412,Indivumed,Germany,Female,,,,,,,,,
5,413_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2019-12-16,09:08:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,175.0,,,62,,treatment-naïve,,70,,,,cT4b cNX cM1 G1 L1,62.0,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,postmenopause,,,negative,,,,Positive,,,Positive,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_413,Indivumed,Germany,Female,,,,,,,,,
6,414_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2018-03-06,10:59:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,168.0,,,19,,treatment-naïve,,62,,,,cT3 cNX cM1,19.0,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,,,,negative,,,,Positive,,,,,,,,,not mutated,not mutated,not mutated,,Negative,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_414,Indivumed,Germany,Female,,,,,,,,,
7,415_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2023-03-01,09:13:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,162.0,,,45,,treatment-naïve,,86,,,,cT2 cN0 cM1 G2,45.0,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,postmenopause,,,negative,,,,Positive,,,Positive,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_415,Indivumed,Germany,Female,,,,,,,,,
8,416_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2019-09-19,12:18:00 PM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,158.0,,,35,,treatment-naïve,,77,,,,cT4 cN1 cM1,35.0,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,perimenopause,,,positive,,,,2,,,Negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_416,Indivumed,Germany,Female,,,,,,,,,
9,417_BD T0,,tube,,,,,Human,Streck,Double,,,,no hemolysis,PB-2881,,2023-04-17,09:02:00 AM,,,,,,,,Nalika Palayoor,17-Jul-2025,,,,Plasma,cancer,breast cancer,,168.0,,,33,,treatment-naïve,,55,,,,cT2 cN1 cM1 G3 L0 V0 PnX,33.0,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",,,,,,,,,,,,,,,,postmenopause,,,positive,,,,Negative,,,Negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No,PRB_417,Indivumed,Germany,Female,,,,,,,,,
