In [1]:
import pandas as pd
import re
from datetime import datetime
from IPython.display import FileLink


### Define each of these fields each time based on the column names in the raw data

In [2]:
# DEFINE THESE EACH TIME YOU RUN THE SCRIPT

# filepath for the template and raw files
template= '1Bulk_sample_template_V11.xlsx'
raw= 'PRB_LB_0325_raw.xlsx'
shipping_manifest= 'PRB_LB_0325_shipping_manifest.xlsx'
dataset= 'PRB_LB_0325'
output_path= f"{dataset}_formatted_auto.xlsx"

# sheet name for the raw and shipping file
sheet_name_raw= 'Clinical Data'
sheet_name_shipping= 'Template'


# header row for the raw, template, and shipping files
raw_header= 1
template_header= 1
shipping_manifest_header= 10

# column names in the raw file that contain biomarker data
biomarker_cols= ["Biomarker 1", "Biomarker 2", "Biomarker 3", "Biomarker 4", "Biomarker 5", "Biomarker 6"]

# column mappings from raw to template (key: template column, value: raw column - add more columns as needed).
column_mapping= {
    "ExternalId": "Sample ID\nconsecutive",
    'TubeBarcode': 'Barcode Scan',
    "Stabilizer": "Collection Tube Type",
    "Single or Double Spun": "Single or Double Spun",
    "Hemolysis": "Hemolysis",
    "SpecimenType": "Sample   Type",
    "Date of Blood Draw/Cell Collection": "Date of blood collection [yyyy-mm-dd]",
    "Time of Draw": "Date of blood collection [yyyy-mm-dd]",
    "Gender": "Gender",
    "Height": "Body height [cm]",
    "Sample Timepoint": "Blood sample timepoint",
    "TNM": "TNM",
    "Stage": "Stage",
    "Morphology Code": "Morphology code",
    "Description of Morphology Code": "Description of morphology code",
    "ExPatientId": "Patient ID\nconsecutive",
    "ExSpecimenId": "Sample ID\nconsecutive",
    "Weight": "Maximum weight [kg] "
}

# fixed values that aren't in the raw file (key: template column, value: fixed value)
fixed_values= {
    "ContainerType": "tube",
    "Organism": "Human",
    "Project": "PRB_LB_0325",
    "Data Transformer": "Nalika Palayoor",
    "Date of Transformation": "17-Jul-2025",
    "Condition": "cancer",
    "Diagnostic Condition": "breast cancer",
    "RNA-Sequencing Available": "No",
    "Source": "Indivumed",
    "Country": "Germany"
}

# define as True if menopausal status is to be extracted from biomarker data. If its in a separate column, set to False and add the mapping in `column_mapping`.
extract_menopause_from_biomarker= True

# This is a dictionary where the key is a column that needs to be calculated and the values are the two columns from the raw data it will be calculated from
calculation_functions= {
    "Duration between Cancer Diagnosis and Blood Draw (days)": ["Date of cancer diagnosis [yyyy-mm-dd]","Date of blood collection [yyyy-mm-dd]"],
    "Duration between TNM Staging and Blood Draw (days)": ["Date of TNM staging  [yyyy-mm-dd]","Date of blood collection [yyyy-mm-dd]"]
    # Add more calculations as needed
}





### Synonym dictionaries for data cleaning (add to these as you find new synonyms in the raw data)

In [3]:
# If there are new synonyms found in the raw data, don't delete the old ones, just add to the lists below

menopause_mapping= {
    "premenopause": ["pre menopause"],
    "postmenopause": ["post menopause"],
    "perimenopause": ["peri menopause"],
    "not applicable": ["irrelevant"],
    "menopause": ["menopause"]
}

biomarker_lookup = {
    "HER2": ["her2"],
    "ER": ["er", "estrogen receptor"],
    "PR": ["pr", "progesterone receptor"],
    "HER2 FISH": ["fish"],
    "PDL1": ["pdl1"],
    "ALK": ["alk"],
    "ROS": ["ros"],
    "EGFR": ["egfr"],
    "KRAS": ["kras"],
    "PIK3CA": ["pik3ca"],
    "ESR1": ["esr1"],
    "AR": ["ar"],
    "BRCA1": ["BRCA1"],
    "BRCA2": ["BRCA2"],
    "Menopausal Status": ['menopause status']

}

pos_neg_mapping = {
    "positive": ["Positive","positive", "strong positive", "weak positive", "moderately positive", "2+", "3+", "1", "2","3","4","5","6","7","8","9","10","11","12"],
    "negative": ["Negative","negative", "0", "none", "not detected", "1+"],
    "mutated": ["mutated", "mutation detected", "mutation", "mut"],
    "not mutated": ["not mutated", "no mutation detected", "wild type", "wt", "no mutation", "no mut"]
}

# function factory for basic mapping cleaning
def make_cleaner(mapping_dict):
    """
    Given a mapping dictionary, returns a cleaning function that maps raw values to the standard values
    Parameters:
        mapping_dict (dict): dictionary where keys are standard values and values are lists of raw value options
    Returns: 
        function: takes a single value and returns the cleaned value or pd.NA if not found
    """
    lookup= {}
    for standard_val, raw_options in mapping_dict.items():
        for raw_val in raw_options:
            lookup[str(raw_val).strip().lower()]= standard_val
    def cleaner(val):
        if pd.isna(val):
            return pd.NA
        val_str= str(val).strip().lower()
        return lookup.get(val_str, pd.NA)
    return cleaner

# create functions from function factory (add to dictionaries as you find new synonyms in the raw data)
clean_stabilizer= make_cleaner({
    "Streck": ["Streck Cell-Free DNA BCT"]
})

clean_gender= make_cleaner({
    "Male": ['m', 'male', 'M', 'Male'],
    "Female": ['f', 'female', 'F', 'Female']
})

clean_single_double= make_cleaner({
    "Single": ['single', 'Single','1'],
    "Double": ['double', 'Double','2']
})

clean_sample_timepoint= make_cleaner({
    "treatment-naïve": ["Initial-0"]
})

clean_stage= make_cleaner({
    "I": ["I", "IA", "IB"],
    "II": ["II", "IIA", "IIB"],
    "III": ["III", "IIIA", "IIIB"],
    "IV": ["IV", "IVA", "IVB"]
})

clean_hemolysis= make_cleaner({
    "no hemolysis": ["No"],
    "light hemolysis": ["Light Hemolysis"],
    "hemolysis": [" Hemolysis"],
    "strong hemolysis": ["Strong Hemolysis"]
})



### Cleaning functions

In [4]:
# calls the calculation functions for elapsed days
def call_calculation_functions(final_df, raw_df, mapping_dict):
    for new_col, (col1, col2) in mapping_dict.items():
        final_df[new_col] = raw_df.apply(
            lambda row: calculate_elapsed_days(row, col1, col2), axis=1
        )
    return final_df

# functions to put the date and time in the correct format
def clean_date(val):
    """
    Cleans and formats a date string.
    Parameters:
        val (str): raw date string
    Returns:
        str or pd.NA: cleaned date string in "YYYY-MON-DD" format or pd.NA if not found
    """
    if pd.isna(val):
        return pd.NA

    try:
        return pd.to_datetime(val).strftime("%Y-%B-%d")
    except:
        return pd.NA

def clean_time(val):
    """
    Cleans and formats a time string.
    Parameters:
        val (str): raw time string
    Returns:
        str or pd.NA: cleaned time string in "HH:MM:SS AM/PM" format or pd.NA if not found
    """
    if pd.isna(val):
        return pd.NA

    try:
        return pd.to_datetime(val).strftime("%I:%M:%S %p")
    except: 
        return pd.NA

# functions for columns that need to be calculated
def calculate_age(birth_val, collection_date):
    """
    Calculates age in years given birth date (or year) and collection date. If only year is given for birth date, assumes Jan 1 of that year.
    Parameters:
        birth_val (str): raw birth date string
        collection_date (str): raw collection date string
    Returns:
        int or pd.NA: age in years or pd.NA if not calculable
    """
    if pd.isna(birth_val) or pd.isna(collection_date):
        return pd.NA
    try:
        birth_str= str(birth_val).strip()
        if len(birth_str)== 4 and birth_str.isdigit():
            birth_date= pd.to_datetime(f"{birth_str}-01-01")
        else:
            birth_date= pd.to_datetime(birth_str, errors= "coerce")
        collection_dt= pd.to_datetime(collection_date, errors= "coerce")

        age= (collection_dt-birth_date).days // 365
        return age
    except:
        return pd.NA

def calculate_elapsed_days(row, start_col, end_col):
    """
    Calculates elapsed days between two date columns in a row.
    Parameters:
        row (pd.Series): a row of the dataframe containing date information
        start_col (str): name of the column with the start date
        end_col (str): name of the column with the end date
    Returns:
        int or pd.NA: number of elapsed days or pd.NA if not calculable
    """
    start_val= row.get(start_col)
    end_val= row.get(end_col)

    if pd.isna(start_val):
        return pd.NA
    if pd.isna(end_val):
        return pd.NA
    try:
        start_date= pd.to_datetime(start_val, errors= "coerce")
        end_date= pd.to_datetime(end_val, errors= "coerce")

        if pd.isna(start_date) or pd.isna(end_date):
            return pd.NA

        delta= (end_date-start_date).days
        return delta if delta >= 0 else pd.NA
    except:
        return pd.NA
    
def calculate_bmi(weight, height):
    """
    Calculates BMI given weight in kg and height in cm.
    Parameters:
        weight (float): weight in kg
        height (float): height in cm
    Returns:
        float or pd.NA: BMI value or pd.NA if not calculable
    """
    weight = pd.to_numeric(weight, errors="coerce")
    height = pd.to_numeric(height, errors="coerce")

    if pd.isna(weight) or pd.isna(height) or height == 0:
        return 'noooo'
    try:
        height_m= height / 100
        bmi= weight / (height_m ** 2)
        return round(bmi, 2)
    except:
        return 'helpppp'
    
# function to extract biomarker data from blob text (blob is all biomarker columns concatenated)
def extract_biomarkers_from_blob(blob, biomarker_lookup, pos_neg_mapping):
    '''
    Extracts biomarker data from a concatenated blob of text.
    Parameters:
        blob (str): concatenated biomarker text
        biomarker_lookup (dict): dictionary mapping template columns to lists of raw biomarker variants
        pos_neg_mapping (dict): dictionary mapping positive/negative labels to lists of raw values
    Returns:
        dict: extracted biomarker values mapped to template columns
    '''
    results= {}
    blob= blob.replace("=", " = ")
    blob= re.sub(r'\s+', ' ', blob).strip().lower()

    biomarker_keys= sorted(
        [re.escape(v.lower()) for variants in biomarker_lookup.values() for v in variants],
        key= len,
        reverse= True
    )
    lookahead_pattern= r'(?=\s+\b(?:' + '|'.join(biomarker_keys) + r')\b\s*=|$)'

    fish_match= re.search(r'\bfish\b\s*=\s*([^\s=]+)', blob)
    fish_val= fish_match.group(1).strip() if fish_match else None

    for template_col, variants in biomarker_lookup.items():
        for variant in variants:
            variant = variant.lower()
            pattern = rf'\b{re.escape(variant)}\b\s*=\s*(.*?){lookahead_pattern}'
            match = re.search(pattern, blob)
            if match:
                raw_val = match.group(1).strip()

                if template_col == "HER2":
                    her2_match = re.search(r'(0|1\+|2\+|3\+)', raw_val)
                    if her2_match:
                        her2_score = her2_match.group(1)
                        if her2_score in ["0", "1+"]:
                            mapped = "negative"
                        elif her2_score == "3+":
                            mapped = "positive"
                        elif her2_score == "2+":
                            if fish_val in pos_neg_mapping.get("positive", []):
                                mapped = "positive"
                            elif fish_val in pos_neg_mapping.get("negative", []):
                                mapped = "negative"
                            else:
                                mapped = "HER2 2+ (FISH/ISH missing)"
                        else:
                            mapped = pd.NA

                        results["HER2 IHC"] = raw_val
                    else:
                        mapped = raw_val
                else:
                    mapped = None
                    for label, values in pos_neg_mapping.items():
                        if raw_val in values:
                            mapped = label
                            break
                    if mapped is None:
                        mapped = raw_val

                results[f"{template_col} Value"] = raw_val
                results[template_col] = mapped
                break

    return results

def extract_menopause_status(val_str):
    '''
    Extracts menopause status from a given string.
    Parameters:
        val_str (str): input string potentially containing menopause status
    Returns:
        str: mapped menopause status or pd.NA if not found
    '''
    val_str= val_str.lower()
    
    for clean_val, options in menopause_mapping.items():
        for opt in options:
            if opt in val_str:
                return clean_val
    return pd.NA



# transformation registry (key: template column, value: cleaning function)
# add a new row to the dictionary for each new function you create or use
transformations= {
    "Date of Blood Draw/Cell Collection": clean_date,
    "Time of Draw": clean_time,
    "Stabilizer": clean_stabilizer,
    "Gender": clean_gender,
    "Single or Double Spun": clean_single_double,
    "Sample Timepoint": clean_sample_timepoint,
    "Stage": clean_stage,
    "Hemolysis": clean_hemolysis
}



### Function calling + formatting df

In [5]:
# load in the data

template= pd.read_excel(template, header= template_header)
raw= pd.read_excel(raw, sheet_name= sheet_name_raw, header= raw_header)
shipping_manifest= pd.read_excel(shipping_manifest, sheet_name= sheet_name_shipping, header= shipping_manifest_header)
final= pd.DataFrame(index= raw.index, columns= template.columns)

# create the biomarker blob by concatenating the biomarker columns
raw['biomarker_blob']= raw[biomarker_cols]\
.astype(str)\
.replace('nan', '')\
.agg(' '.join, axis=1)\
.str.replace(r'\s+', ' ',regex= True)\
             .str.strip()

# merge the raw data with the shipping manifest
raw= pd.merge(raw, shipping_manifest, left_on= 'Patient ID\nconsecutive',right_on= 'Patient ID consecutive', how= "left")


# Extract biomarker data from biomarker blob
biomarker_data = raw['biomarker_blob'].apply(lambda row: extract_biomarkers_from_blob(row, biomarker_lookup, pos_neg_mapping))

for idx, result in biomarker_data.items():
    for biomarker, val in result.items():
        final.at[idx, biomarker] = val

# Runs if menopausal status is to be extracted from biomarker data
if extract_menopause_from_biomarker:
    final['Menopausal Status'] = raw['biomarker_blob'].apply(extract_menopause_status)

# Call the calculation functions to create calculated columns
call_calculation_functions(final,raw,calculation_functions)

# create a column for age at the time of blood collection
final["AgeAtCollection"]= raw.apply(
    lambda row: calculate_age(row["Year of birth"], row["Date of blood collection [yyyy-mm-dd]"]), axis=1)

# Create bmi column
final["BMI"]= raw.apply(
    lambda row: calculate_bmi(row["Maximum weight [kg] "], row["Body height [cm]"]), axis=1)


# apply transformations from transformation registry and mappings from synonym dictionaries
for col,val in fixed_values.items():
    if col in final.columns:
        final[col]= val

for template_col in column_mapping:
    raw_col= column_mapping[template_col]
    if raw_col in raw.columns:
        if template_col in transformations:
            final[template_col]= raw[raw_col].apply(transformations[template_col])
        else:
            final[template_col]= raw[raw_col]

# define all columns that are required in the final template
required_columns = [
    "Tube Barcode","Concentration Units","Single or Double Spun","Processing Method","Freeze Thaw Status","Project",
    "Matched FFPE Available","Surgery Type","Tumor Tissue Type","Other Sample Notes","Collection Site","Histology",
    "Sample Timepoint","Sample Timepoint Description","Detailed Anatomical Location","Grade","Tumor Size","TNM","Stage",
    "Stage_detailed","Morphology Code","Description of Morphology Code","Metastatic Sites","Vehicle Control","Media Conditions",
    "Additional Supplements to Media","Protocols for Harvesting Cell Lines","Menopausal Status","HER2 IHC","HER2","FISH",
    "HER2 Change from Previous Sample","ER","ER Notes","ER Status Change From Previous Sample","PR","PR Notes",
    "PR Status Change From Previous Sample","AR Notes","AR Status Change From Previous Sample","ROS","ALK","EGFR",
    "EGFR Allele Information","PDL1","KRAS","PIK3CA", "ESR1","BRCA1","BRCA2","FOLR1","Biomarker Notes","Country",
    "Gender","Race","SmokingHistory"
]


# if a column is blank and not optional, fill with Not received
for col in final.columns:
    if col in required_columns:
        final[col]= final[col].fillna("not received")

# display the dataframe
pd.set_option("display.max_columns",None)
final.head(20)


Unnamed: 0,ExternalId,Received Date,ContainerType,Volume_uL,TubeBarcode,Concentration,ConcentrationUnits,Organism,Stabilizer,Single or Double Spun,Processing Method,Processing Time(hrs),Freeze Thaw Status,Hemolysis,Project,Matched FFPE Available,Date of Blood Draw/Cell Collection,Time of Draw,Block Size,Tissue Size,Tissue Weight (mg),% Tumor,% Necrosis,Surgery Type,Tumor Tissue Type,Data Transformer,Date of Transformation,Other Sample Notes,ExSpecimenId,Collection Site,SpecimenType,Condition,Diagnostic Condition,Histology,Height,Weight,BMI,Duration between Cancer Diagnosis and Blood Draw (days),Duration between Metastatic Diagnosis and Blood Draw (days),Sample Timepoint,Sample Timepoint Description,AgeAtCollection,Detailed Anatomical Location,Grade,Tumor Size,TNM,Duration between TNM Staging and Blood Draw (days),Stage,Stage Detailed,Morphology Code,Description of Morphology Code,Metastatic Sites,Vehicle Control,Media Conditions,Additional Supplements to Media,Protocols for Harvesting Cell Lines,Blood collection date (days from birth),Number of lines of metastatic therapy at time of blood draw,Number of lines of chemotherapy at time of blood draw,Number of lines of anti-HER2 therapy at time of blood draw,Number of lines of endocrine therapy at time of blood draw,Overall Survival(months),Treatment Data,Progression Free Survival(months),Gestational Age at Collection,Fetus Sex,Menopausal Status,Blood Type,HER2 IHC,HER2,HER2 FISH,FISH Notes,HER2 Change from Previos Sample,ER,ER Notes,ER Value,ER Status Change from Previous Sample,PR,PR Notes,PR Value,PR Status Change from Previous Sample,AR,AR Notes,AR Status Change From Previous Sample,ROS,ALK,EGFR,EGFR Allele Information,PDL1,KRAS,PIK3CA,ESR1,BRCA1,BRCA2,FOLR1,Ki67,TROP2,HER3,MET,Tissue Factor (TF),NECTIN4,CEACAM5,LIV1,MESOTHELIN,B7H3,B7H4,FRalpha,DLL3,CDH6,PGR,PSA,KLF5,FGF/FGFR,Biomarker Notes,RNA-Sequencing Available,ExPatientId,Source,Country,Gender,Race,MedicalHistory,FamilyHistory,AlcoholHistory,SmokingHistory,Number of years smoked or smoking,Smoking Notes,Donor Notes,HER2 Value,Menopausal Status Value,HER2 FISH Value,BRCA1 Value,BRCA2 Value,PDL1 Value,ALK Value,ROS Value,EGFR Value
0,408_BD T0,,tube,,LV1004950744,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2017-December-20,11:29:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,408_BD T0,not received,plasma,cancer,breast cancer,not received,160.0,61.4,23.98,5,,treatment-naïve,not received,57,not received,not received,not received,cT4c cN1 cM1,5,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,postmenopause,,3+,positive,,,,negative,not received,negative,,negative,not received,negative,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_408,Indivumed,Germany,Female,not received,,,,not received,,,,3+,post menopause,,,,,,,
1,409_BD T0,,tube,,LV2001303058,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2019-September-17,11:04:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,409_BD T0,not received,plasma,cancer,breast cancer,not received,160.0,79.0,30.86,33,,treatment-naïve,not received,74,not received,not received,not received,rT0 pN0 pM1 GX,33,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,postmenopause,,2+ (positive fish/cish),positive,positive,,,negative,not received,0,,negative,not received,0,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_409,Indivumed,Germany,Female,not received,,,,not received,,,,2+ (positive fish/cish),post menopause,positive,,,,,,
2,410_BD T0,,tube,,LV2000478378,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2019-February-12,11:34:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,410_BD T0,not received,plasma,cancer,breast cancer,not received,175.0,91.0,29.71,28,,treatment-naïve,not received,71,not received,not received,not received,rT0 cN0 pM1 G3 LX VX PnX,28,IV,,8520/3,"Lobular carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,premenopause,,1+,negative,,,,> = 51 and < = 80% pos. cells,not received,> = 51 and < = 80% pos. cells,,> = 10 and < = 50% pos. cells,not received,> = 10 and < = 50% pos. cells,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not mutated,mutated,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_410,Indivumed,Germany,Female,not received,,,,not received,,,,1+,pre menopause,,not mutated,mutated,,,,
3,411_BD T0,,tube,,LV2000478505,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2019-January-22,09:57:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,411_BD T0,not received,plasma,cancer,breast cancer,not received,169.0,76.9,26.92,38,,treatment-naïve,not received,56,not received,not received,not received,cT2 cN0 cM1 G3,38,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,premenopause,,1+,negative,,,,positive,not received,12,,negative,not received,0,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not mutated,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_411,Indivumed,Germany,Female,not received,,,,not received,,,,1+,pre menopause,,not mutated,,,,,
4,412_BD T0,,tube,,LV2008151600,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2023-February-28,11:16:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,412_BD T0,not received,plasma,cancer,breast cancer,not received,166.0,70.9,25.73,44,,treatment-naïve,not received,65,not received,not received,not received,cT4b cNX cM1 G2,44,IV,,8520/3,"Lobular carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,postmenopause,,3+,positive,,,,negative,not received,0,,negative,not received,0,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_412,Indivumed,Germany,Female,not received,,,,not received,,,,3+,post menopause,,,,,,,
5,413_BD T0,,tube,,LV2001490472,,,Human,Streck,Double,not received,,not received,hemolysis,PRB_LB_0325,not received,2019-December-16,09:08:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,413_BD T0,not received,plasma,cancer,breast cancer,not received,175.0,76.2,24.88,62,,treatment-naïve,not received,70,not received,not received,not received,cT4b cNX cM1 G1 L1,62,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,postmenopause,,1+,negative,,,,more than 80% positive cells,not received,more than 80% positive cells,,more than 80% positive cells,not received,more than 80% positive cells,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not mutated,not mutated,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_413,Indivumed,Germany,Female,not received,,,,not received,,,,1+,post menopause,,not mutated,not mutated,,,,
6,414_BD T0,,tube,,LV1004946591,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2018-March-06,10:59:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,414_BD T0,not received,plasma,cancer,breast cancer,not received,168.0,57.0,20.2,19,,treatment-naïve,not received,62,not received,not received,not received,cT3 cNX cM1,19,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,not received,,0,negative,,,,positive,not received,12,,not received,not received,,,,not received,not received,not mutated,not mutated,not mutated,not received,negative,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_414,Indivumed,Germany,Female,not received,,,,not received,,,,0,,,,,negative,not mutated,not mutated,not mutated
7,415_BD T0,,tube,,LV2008120191,,,Human,Streck,Double,not received,,not received,hemolysis,PRB_LB_0325,not received,2023-March-01,09:13:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,415_BD T0,not received,plasma,cancer,breast cancer,not received,162.0,82.0,31.25,45,,treatment-naïve,not received,86,not received,not received,not received,cT2 cN0 cM1 G2,45,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,postmenopause,,1+,negative,,,,positive,not received,12,,positive,not received,12,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_415,Indivumed,Germany,Female,not received,,,,not received,,,,1+,post menopause,,,,,,,
8,416_BD T0,,tube,,LV2001322423,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2019-September-19,12:18:00 PM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,416_BD T0,not received,plasma,cancer,breast cancer,not received,158.0,66.0,26.44,35,,treatment-naïve,not received,77,not received,not received,not received,cT4 cN1 cM1,35,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,premenopause,,3+,positive,,,,negative,not received,0,,negative,not received,0,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_416,Indivumed,Germany,Female,not received,,,,not received,,,,3+,peri menopause,,,,,,,
9,417_BD T0,,tube,,LV2008115344,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2023-April-17,09:02:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,417_BD T0,not received,plasma,cancer,breast cancer,not received,168.0,90.0,31.89,33,,treatment-naïve,not received,55,not received,not received,not received,cT2 cN1 cM1 G3 L0 V0 PnX,33,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,postmenopause,,3+,positive,,,,negative,not received,0,,negative,not received,0,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_417,Indivumed,Germany,Female,not received,,,,not received,,,,3+,post menopause,,,,,,,


### Run this to download the dataframe as a excel workbook

In [6]:
final.to_excel(output_path, index=False)
FileLink(output_path)


In [7]:


import re
import pandas as pd


def extract_biomarkers_from_blob(blob, biomarker_lookup, pos_neg_mapping):
    results = {}
    blob = blob.replace("=", " = ")
    blob = re.sub(r'\s+', ' ', blob).strip().lower()

    # Build list of all biomarker terms used for lookahead
    biomarker_keys = sorted(
        [re.escape(v.lower()) for variants in biomarker_lookup.values() for v in variants],
        key=len,
        reverse=True
    )
    lookahead_pattern = r'(?=\s+\b(?:' + '|'.join(biomarker_keys) + r')\b\s*=|$)'

    # Pull out FISH for HER2 scoring
    fish_match = re.search(r'\bfish\b\s*=\s*([^\s=]+)', blob)
    fish_val = fish_match.group(1).strip() if fish_match else None

    for template_col, variants in biomarker_lookup.items():
        for variant in variants:
            variant = variant.lower()
            pattern = rf'\b{re.escape(variant)}\b\s*=\s*(.*?){lookahead_pattern}'
            match = re.search(pattern, blob)
            if match:
                raw_val = match.group(1).strip()

                if template_col == "HER2":
                    her2_match = re.search(r'(0|1\+|2\+|3\+)', raw_val)
                    if her2_match:
                        her2_score = her2_match.group(1)
                        if her2_score in ["0", "1+"]:
                            mapped = "negative"
                        elif her2_score == "3+":
                            mapped = "positive"
                        elif her2_score == "2+":
                            if fish_val in pos_neg_mapping.get("positive", []):
                                mapped = "positive"
                            elif fish_val in pos_neg_mapping.get("negative", []):
                                mapped = "negative"
                            else:
                                mapped = "HER2 2+ (FISH/ISH missing)"
                        else:
                            mapped = pd.NA

                        # Save full HER2 IHC value
                        results["HER2 IHC"] = raw_val
                    else:
                        mapped = raw_val
                else:
                    mapped = None
                    for label, values in pos_neg_mapping.items():
                        if raw_val in values:
                            mapped = label
                            break
                    if mapped is None:
                        mapped = raw_val

                results[template_col] = mapped
                break

    return results




def extract_menopause_status(val_str):
    val_str= val_str.lower()
    
    for clean_val, options in menopause_mapping.items():
        for opt in options:
            if opt in val_str:
                return clean_val
    return pd.NA


row= "HER2 = 1+	ER = 12	menopause status = irrelevant	PR = 12"

extract_menopause_status(row)

'not applicable'