In [1]:
import pandas as pd
import re
from datetime import datetime
from IPython.display import FileLink


### Define each of these fields each time based on the column names in the raw data

In [2]:
# DEFINE THESE EACH TIME YOU RUN THE SCRIPT

# filepath for the template and raw files
template= '1Bulk_sample_template_V11.xlsx'
raw= 'PRB_LB_0325_raw.xlsx'
shipping_manifest= 'PRB_LB_0325_shipping_manifest.xlsx'
dataset= 'PRB_LB_0325'
output_path= f"{dataset}_formatted_auto.xlsx"

# sheet name for the raw file
sheet_name_raw= 'Clinical Data'
sheet_name_shipping= 'Template'


# header row for the raw and template files
raw_header= 1
template_header= 1
shipping_manifest_header= 10

# columns in the raw file that contain biomarker data
biomarker_cols= ["Biomarker 1", "Biomarker 2", "Biomarker 3", "Biomarker 4", "Biomarker 5", "Biomarker 6"]

# column mappings from raw to template (key: template column, value: raw column - add more columns as needed). If blank, default value will be Not received
column_mapping= {
    "ExternalId": "Sample ID\nconsecutive",
    'TubeBarcode': 'Barcode Scan',
    "Stabilizer": "Collection Tube Type",
    "Single or Double Spun": "Single or Double Spun",
    "Hemolysis": "Hemolysis",
    "SpecimenType": "Sample   Type",
    "Date of Blood Draw/Cell Collection": "Date of blood collection [yyyy-mm-dd]",
    "Time of Draw": "Date of blood collection [yyyy-mm-dd]",
    "Gender": "Gender",
    "Height": "Body height [cm]",
    "Sample Timepoint": "Blood sample timepoint",
    "TNM": "TNM",
    "Stage": "Stage",
    "Morphology Code": "Morphology code",
    "Description of Morphology Code": "Description of morphology code",
    "ExPatientId": "Patient ID\nconsecutive",
    "ExSpecimenId": "Sample ID\nconsecutive",
    "Weight": "Maximum weight [kg] "
}

# fixed values that aren't in the raw file (key: template column, value: fixed value)
fixed_values= {
    "ContainerType": "tube",
    "Organism": "Human",
    "Project": "PRB_LB_0325",
    "Data Transformer": "Nalika Palayoor",
    "Date of Transformation": "17-Jul-2025",
    "Condition": "cancer",
    "Diagnostic Condition": "breast cancer",
    "RNA-Sequencing Available": "No",
    "Source": "Indivumed",
    "Country": "Germany"
}

# define as True if menopausal status is to be extracted from biomarker data
extract_menopause_from_biomarker= True

# This is a dictionary where the key is a column that needs to be calculated and the values are the two columns from the raw data it will be calculated from
calculation_functions= {
    "Duration between Cancer Diagnosis and Blood Draw (days)": ["Date of cancer diagnosis [yyyy-mm-dd]","Date of blood collection [yyyy-mm-dd]"],
    "Duration between TNM Staging and Blood Draw (days)": ["Date of TNM staging  [yyyy-mm-dd]","Date of blood collection [yyyy-mm-dd]"],
}





### Cleaning functions

In [3]:
# function factory for basic mapping cleaning
def make_cleaner(mapping_dict):
    """
    Given a mapping dictionary, returns a cleaning function that maps raw values to the standard values
    Parameters:
        mapping_dict (dict): dictionary where keys are standard values and values are lists of raw value options
    Returns: 
        function: takes a single value and returns the cleaned value or pd.NA if not found
    """
    lookup= {}
    for standard_val, raw_options in mapping_dict.items():
        for raw_val in raw_options:
            lookup[str(raw_val).strip().lower()]= standard_val
    def cleaner(val):
        if pd.isna(val):
            return pd.NA
        val_str= str(val).strip().lower()
        return lookup.get(val_str, pd.NA)
    return cleaner

# calls the calculation functions for elapsed days
def call_calculation_functions(final_df, raw_df, mapping_dict):
    for new_col, (col1, col2) in mapping_dict.items():
        final_df[new_col] = raw_df.apply(
            lambda row: calculate_elapsed_days(row, col1, col2), axis=1
        )
    return final_df

# function to extract menopausal status from biomarker columns (logic set so it only runs if specified)
def extract_menopause_status(row):
    """
    Given a row of data, extracts menopausal status from biomarker columns if present
    Parameters:
        row (pd.Series): a row of the dataframe that may contain menopause status
    Returns: 
        str or pd.NA: cleaned menopausal status or pd.NA if not found
    """

    for col in biomarker_cols:
        val= row.get(col)
        if pd.isna(val):
            continue
        val_str= str(val).strip().lower()

        if "menopause status" in val_str:
            parts= val_str.split("=")
            if len(parts)>1:
                raw_status= parts[1].strip().lower()

                for clean_val, options in menopause_mapping.items():
                    if raw_status in [opt.lower() for opt in options]:
                        return clean_val
                return "Unknown"
    return pd.NA

# function to interpret HER2 status from complex strings
def interpret_her2_status(val):
    """
    Given a raw HER2 status string, returns the cleaned HER2 status
    Parameters:
        val (str): raw HER2 status string
    Returns: 
        str or pd.NA: cleaned HER2 status ("positive", "negative", "HER2 2+ (FISH/ISH missing)") or pd.NA if not found
    """
    if pd.isna(val):
        return pd.NA

    val = str(val).lower()
    her2_score = None
    fish_result = None

    match_ihc = re.search(r"her2\s*=\s*([0-3]\+?)", val)
    if match_ihc:
        her2_score = match_ihc.group(1)

    if "fish" in val or "ish" in val:
        if "positive" in val:
            fish_result = "positive"
        elif "negative" in val:
            fish_result = "negative"

    if her2_score in ["0", "1+"]:
        return "negative"
    elif her2_score == "3+":
        return "positive"
    elif her2_score == "2+":
        if fish_result == "positive":
            return "positive"
        elif fish_result == "negative":
            return "negative"
        else:
            return "HER2 2+ (FISH/ISH missing)"

    return pd.NA

# function to extract and clean biomarker data from biomarker columns
def extract_pos_neg_biomarker(row, biomarker_lookup, pos_neg_mapping):
    """
    Extracts positive and negative biomarker results from a row of data.
    Parameters:
        row (pd.Series): a row of the dataframe containing biomarker information
        biomarker_cols (list): list of columns to check for biomarker data
        biomarker_lookup (dict): mapping of biomarker names to standardized names
        pos_neg_mapping (dict): mapping of raw results to positive/negative labels
    Returns:
        results: a dictionary of extracted biomarker results
    """
    results = {}

    val= str(row['biomarker_blob']).strip().lower()



    if "=" in val:
        biomarker_name, raw_result= [s.strip() for s in val.split("=", 1)]

        for template_col, raw_variants in biomarker_lookup.items():
            if biomarker_name.lower() in [v.lower() for v in raw_variants]:
                if template_col== "HER2":
                    mapped = interpret_her2_status(val)
                else:
                    mapped = None
                    for label, values in pos_neg_mapping.items():
                        for v in values:
                            if v.lower() in raw_result:
                                mapped= label
                                break
                        if mapped:
                            break
                    if not mapped:
                        mapped= raw_result
                    try:
                        num= int(raw_result)
                        if biomarker_name.lower()== 'er':
                            results['ER Value']= num
                        if biomarker_name.lower()== 'pr':
                            results['PR Value']= num
                    except:
                        pass

                results[template_col]= mapped
                
    return results

# functions to put the date and time in the correct format
def clean_date(val):
    """
    Cleans and formats a date string.
    Parameters:
        val (str): raw date string
    Returns:
        str or pd.NA: cleaned date string in "YYYY-MM-DD" format or pd.NA if not found
    """
    if pd.isna(val):
        return pd.NA

    try:
        return pd.to_datetime(val).strftime("%Y-%m-%d")
    except:
        return pd.NA

def clean_time(val):
    """
    Cleans and formats a time string.
    Parameters:
        val (str): raw time string
    Returns:
        str or pd.NA: cleaned time string in "HH:MM:SS AM/PM" format or pd.NA if not found
    """
    if pd.isna(val):
        return pd.NA

    try:
        return pd.to_datetime(val).strftime("%I:%M:%S %p")
    except: 
        return pd.NA

# functions for columns that need to be calculated
def calculate_age(birth_val, collection_date):
    """
    Calculates age in years given birth date (or year) and collection date. If only year is given for birth date, assumes Jan 1 of that year.
    Parameters:
        birth_val (str): raw birth date string
        collection_date (str): raw collection date string
    Returns:
        int or pd.NA: age in years or pd.NA if not calculable
    """
    if pd.isna(birth_val) or pd.isna(collection_date):
        return pd.NA
    try:
        birth_str= str(birth_val).strip()
        if len(birth_str)== 4 and birth_str.isdigit():
            birth_date= pd.to_datetime(f"{birth_str}-01-01")
        else:
            birth_date= pd.to_datetime(birth_str, errors= "coerce")
        collection_dt= pd.to_datetime(collection_date, errors= "coerce")

        age= (collection_dt-birth_date).days // 365
        return age
    except:
        return pd.NA

def calculate_elapsed_days(row, start_col, end_col):
    """
    Calculates elapsed days between two date columns in a row.
    Parameters:
        row (pd.Series): a row of the dataframe containing date information
        start_col (str): name of the column with the start date
        end_col (str): name of the column with the end date
    Returns:
        int or pd.NA: number of elapsed days or pd.NA if not calculable
    """
    start_val= row.get(start_col)
    end_val= row.get(end_col)

    if pd.isna(start_val):
        return pd.NA
    if pd.isna(end_val):
        return pd.NA
    try:
        start_date= pd.to_datetime(start_val, errors= "coerce")
        end_date= pd.to_datetime(end_val, errors= "coerce")

        if pd.isna(start_date) or pd.isna(end_date):
            return pd.NA

        delta= (end_date-start_date).days
        return delta if delta >= 0 else pd.NA
    except:
        return pd.NA
    
def calculate_bmi(weight, height):
    """
    Calculates BMI given weight in kg and height in cm.
    Parameters:
        weight (float): weight in kg
        height (float): height in cm
    Returns:
        float or pd.NA: BMI value or pd.NA if not calculable
    """
    print(f"weight: {weight} | height: {height}")

    weight = pd.to_numeric(weight, errors="coerce")
    height = pd.to_numeric(height, errors="coerce")

    if pd.isna(weight) or pd.isna(height) or height == 0:
        return 'noooo'
    try:
        height_m= height / 100
        bmi= weight / (height_m ** 2)
        return round(bmi, 2)
    except:
        return 'lol'



### Synonym dictionaries for data cleaning (add to these as you find new synonyms in the raw data)

In [4]:
# If there are new synonyms found in the raw data, don't delete the old ones, just add to the lists below

menopause_mapping= {
    "premenopause": ["pre menopause"],
    "perimenopause": ["peri menopause"],
    "menopause": ["menopause"],
    "postmenopause": ["post menopause"]
}

biomarker_lookup = {
    "HER2": ["her2"],
    "ER": ["er", "estrogen receptor"],
    "PR": ["pr", "progesterone receptor"],
    "FISH": ["fish"],
    "PDL1": ["pdl1"],
    "ALK": ["alk"],
    "ROS": ["ros"],
    "EGFR": ["egfr"],
    "KRAS": ["kras"],
    "PIK3CA": ["pik3ca"],
    "ESR1": ["esr1"],
    "AR": ["ar"],
}

pos_neg_mapping = {
    "positive": ["Positive","positive", "strong positive", "weak positive", "2+", "3+", "1", "2","3","4","5","6","7","8","9","10","11","12","12",">= 10", ">=10"],
    "negative": ["Negative","negative", "0", "none", "not detected", "less than 10%", "less than 10", "1+"]
}

# create functions from function factory (add to dictionaries as you find new synonyms in the raw data)
clean_stabilizer= make_cleaner({
    "Streck": ["Streck Cell-Free DNA BCT"]
})

clean_gender= make_cleaner({
    "Male": ['m', 'male', 'M', 'Male'],
    "Female": ['f', 'female', 'F', 'Female']
})

clean_single_double= make_cleaner({
    "Single": ['single', 'Single','1'],
    "Double": ['double', 'Double','2']
})

clean_sample_timepoint= make_cleaner({
    "treatment-naïve": ["Initial-0"]
})

clean_stage= make_cleaner({
    "I": ["I", "IA", "IB"],
    "II": ["II", "IIA", "IIB"],
    "III": ["III", "IIIA", "IIIB"],
    "IV": ["IV", "IVA", "IVB"]
})

clean_hemolysis= make_cleaner({
    "no hemolysis": ["No"],
    "light hemolysis": ["Light Hemolysis"],
    "hemolysis": [" Hemolysis"],
    "strong hemolysis": ["Strong Hemolysis"]
})

# transformation registry (key: template column, value: cleaning function)
# add a new row to the dictionary for each new function you create/use
transformations= {
    "Date of Blood Draw/Cell Collection": clean_date,
    "Time of Draw": clean_time,
    "Stabilizer": clean_stabilizer,
    "Gender": clean_gender,
    "Single or Double Spun": clean_single_double,
    "Sample Timepoint": clean_sample_timepoint,
    "Stage": clean_stage,
    "Hemolysis": clean_hemolysis
}

### Function calling + formatting df

In [5]:
# load in the data

template= pd.read_excel(template, header= template_header)
raw= pd.read_excel(raw, sheet_name= sheet_name_raw, header= raw_header)
shipping_manifest= pd.read_excel(shipping_manifest, sheet_name= sheet_name_shipping, header= shipping_manifest_header)
final= pd.DataFrame(index= raw.index, columns= template.columns)

raw['biomarker_blob']= raw[biomarker_cols]\
.astype(str)\
.replace('nan', '')\
.agg(' '.join, axis=1)\
.str.replace(r'\s+', ' ',regex= True)\
             .str.strip()

# merge the raw data with the shipping manifest to get tube barcodes
raw= pd.merge(raw, shipping_manifest, left_on= 'Patient ID\nconsecutive',right_on= 'Patient ID consecutive', how= "left")

# this occurs if menopause status is in biomarker columns
if extract_menopause_from_biomarker:
    final["Menopausal Status"]= raw['biomarker_blob']

# adding biomarker columns to the column mapping
biomarker_data = raw.apply(
    lambda row: extract_pos_neg_biomarker(
        row,
        biomarker_lookup,
        pos_neg_mapping
    ),
    axis=1
)


for idx, result in biomarker_data.items():
    for col, val in result.items():
        final.at[idx, col] = val

# call the calculation functions to create calculated columns
call_calculation_functions(final,raw,calculation_functions)

# create a column for age at the time of blood collection
final["AgeAtCollection"]= raw.apply(
    lambda row: calculate_age(row["Year of birth"], row["Date of blood collection [yyyy-mm-dd]"]), axis=1)

# Create bmi column

final["BMI"]= raw.apply(
    lambda row: calculate_bmi(row["Maximum weight [kg] "], row["Body height [cm]"]), axis=1)


# apply transformations and mappings

for col,val in fixed_values.items():
    if col in final.columns:
        final[col]= val

for template_col in column_mapping:
    raw_col= column_mapping[template_col]
    if raw_col in raw.columns:
        if template_col in transformations:
            final[template_col]= raw[raw_col].apply(transformations[template_col])
        else:
            final[template_col]= raw[raw_col]

# define columns that can be left blank if no data is present
required_columns = [
    "Tube Barcode","Concentration Units","Single or Double Spun","Processing Method","Freeze Thaw Status","Project",
    "Matched FFPE Available","Surgery Type","Tumor Tissue Type","Other Sample Notes","Collection Site","Histology",
    "Sample Timepoint","Sample Timepoint Description","Detailed Anatomical Location","Grade","Tumor Size","TNM","Stage",
    "Stage_detailed","Morphology Code","Description of Morphology Code","Metastatic Sites","Vehicle Control","Media Conditions",
    "Additional Supplements to Media","Protocols for Harvesting Cell Lines","Menopausal Status","HER2 IHC","HER2","FISH",
    "HER2 Change from Previous Sample","ER","ER Notes","ER Status Change From Previous Sample","PR","PR Notes",
    "PR Status Change From Previous Sample","AR Notes","AR Status Change From Previous Sample","ROS","ALK","EGFR",
    "EGFR Allele Information","PDL1","KRAS","PIK3CA", "ESR1","BRCA1","BRCA2","FOLR1","Biomarker Notes","Country",
    "Gender","Race","SmokingHistory"
]



# if a column is blank and not optional, fill with Not received
for col in final.columns:
    if col in required_columns:
        final[col]= final[col].fillna("not received")

# display the dataframe
pd.set_option("display.max_columns",None)
final.head(6)


weight: 61.4 | height: 160.0
weight: 79.0 | height: 160.0
weight: 91.0 | height: 175.0
weight: 76.9 | height: 169.0
weight: 70.9 | height: 166.0
weight: 76.2 | height: 175.0
weight: 57.0 | height: 168.0
weight: 82.0 | height: 162.0
weight: 66.0 | height: 158.0
weight: 90.0 | height: 168.0
weight: 57.0 | height: 158.0
weight: 79.0 | height: 156.0
weight: 68.0 | height: 178.0
weight: 80.0 | height: 157.0
weight: 70.0 | height: 161.0
weight: 53.5 | height: 152.0
weight: 72.0 | height: 160.0
weight: 49.5 | height: 155.0
weight: 85.5 | height: 155.0
weight: 82.0 | height: 163.0
weight: 101.0 | height: 159.0
weight: 65.0 | height: 150.0
weight: 86.0 | height: 182.0
weight: 57.5 | height: 157.0
weight: 60.0 | height: 160.0
weight: 56.1 | height: 166.0
weight: 82.0 | height: 172.0
weight: 57.0 | height: 175.0
weight: 96.0 | height: 160.0
weight: nan | height: nan
weight: 112.0 | height: 166.0
weight: nan | height: nan
weight: 110.0 | height: 159.0
weight: 70.0 | height: 167.0
weight: 85.0 | he

Unnamed: 0,ExternalId,Received Date,ContainerType,Volume_uL,TubeBarcode,Concentration,ConcentrationUnits,Organism,Stabilizer,Single or Double Spun,Processing Method,Processing Time(hrs),Freeze Thaw Status,Hemolysis,Project,Matched FFPE Available,Date of Blood Draw/Cell Collection,Time of Draw,Block Size,Tissue Size,Tissue Weight (mg),% Tumor,% Necrosis,Surgery Type,Tumor Tissue Type,Data Transformer,Date of Transformation,Other Sample Notes,ExSpecimenId,Collection Site,SpecimenType,Condition,Diagnostic Condition,Histology,Height,Weight,BMI,Duration between Cancer Diagnosis and Blood Draw (days),Duration between Metastatic Diagnosis and Blood Draw (days),Sample Timepoint,Sample Timepoint Description,AgeAtCollection,Detailed Anatomical Location,Grade,Tumor Size,TNM,Duration between TNM Staging and Blood Draw (days),Stage,Stage Detailed,Morphology Code,Description of Morphology Code,Metastatic Sites,Vehicle Control,Media Conditions,Additional Supplements to Media,Protocols for Harvesting Cell Lines,Blood collection date (days from birth),Number of lines of metastatic therapy at time of blood draw,Number of lines of chemotherapy at time of blood draw,Number of lines of anti-HER2 therapy at time of blood draw,Number of lines of endocrine therapy at time of blood draw,Overall Survival(months),Treatment Data,Progression Free Survival(months),Gestational Age at Collection,Fetus Sex,Menopausal Status,Blood Type,HER2 IHC,HER2,HER2 FISH,FISH Notes,HER2 Change from Previos Sample,ER,ER Notes,ER Value,ER Status Change from Previous Sample,PR,PR Notes,PR Value,PR Status Change from Previous Sample,AR,AR Notes,AR Status Change From Previous Sample,ROS,ALK,EGFR,EGFR Allele Information,PDL1,KRAS,PIK3CA,ESR1,BRCA1,BRCA2,FOLR1,Ki67,TROP2,HER3,MET,Tissue Factor (TF),NECTIN4,CEACAM5,LIV1,MESOTHELIN,B7H3,B7H4,FRalpha,DLL3,CDH6,PGR,PSA,KLF5,FGF/FGFR,Biomarker Notes,RNA-Sequencing Available,ExPatientId,Source,Country,Gender,Race,MedicalHistory,FamilyHistory,AlcoholHistory,SmokingHistory,Number of years smoked or smoking,Smoking Notes,Donor Notes
0,408_BD T0,,tube,,LV1004950744,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2017-12-20,11:29:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,408_BD T0,not received,plasma,cancer,breast cancer,not received,160.0,61.4,23.98,5,,treatment-naïve,not received,57,not received,not received,not received,cT4c cN1 cM1,5,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,menopause status = post menopause ER = negativ...,,not received,not received,,,,not received,not received,,,not received,not received,,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_408,Indivumed,Germany,Female,not received,,,,not received,,,
1,409_BD T0,,tube,,LV2001303058,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2019-09-17,11:04:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,409_BD T0,not received,plasma,cancer,breast cancer,not received,160.0,79.0,30.86,33,,treatment-naïve,not received,74,not received,not received,not received,rT0 pN0 pM1 GX,33,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,ER = 0 FISH = positive HER2 = 2+ (positive FIS...,,not received,not received,,,,positive,not received,,,not received,not received,,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_409,Indivumed,Germany,Female,not received,,,,not received,,,
2,410_BD T0,,tube,,LV2000478378,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2019-02-12,11:34:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,410_BD T0,not received,plasma,cancer,breast cancer,not received,175.0,91.0,29.71,28,,treatment-naïve,not received,71,not received,not received,not received,rT0 cN0 pM1 G3 LX VX PnX,28,IV,,8520/3,"Lobular carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,BRCA1 = not mutated BRCA2 = mutated ER = >=51 ...,,not received,not received,,,,not received,not received,,,not received,not received,,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_410,Indivumed,Germany,Female,not received,,,,not received,,,
3,411_BD T0,,tube,,LV2000478505,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2019-01-22,09:57:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,411_BD T0,not received,plasma,cancer,breast cancer,not received,169.0,76.9,26.92,38,,treatment-naïve,not received,56,not received,not received,not received,cT2 cN0 cM1 G3,38,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,ER = 12 HER2 = 1+ menopause status = pre menop...,,not received,not received,,,,positive,not received,,,not received,not received,,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_411,Indivumed,Germany,Female,not received,,,,not received,,,
4,412_BD T0,,tube,,LV2008151600,,,Human,Streck,Double,not received,,not received,no hemolysis,PRB_LB_0325,not received,2023-02-28,11:16:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,412_BD T0,not received,plasma,cancer,breast cancer,not received,166.0,70.9,25.73,44,,treatment-naïve,not received,65,not received,not received,not received,cT4b cNX cM1 G2,44,IV,,8520/3,"Lobular carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,ER = 0 HER2 = 3+ menopause status = post menop...,,not received,not received,,,,positive,not received,,,not received,not received,,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_412,Indivumed,Germany,Female,not received,,,,not received,,,
5,413_BD T0,,tube,,LV2001490472,,,Human,Streck,Double,not received,,not received,hemolysis,PRB_LB_0325,not received,2019-12-16,09:08:00 AM,,,,,,not received,not received,Nalika Palayoor,17-Jul-2025,not received,413_BD T0,not received,plasma,cancer,breast cancer,not received,175.0,76.2,24.88,62,,treatment-naïve,not received,70,not received,not received,not received,cT4b cNX cM1 G1 L1,62,IV,,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",not received,not received,not received,not received,not received,,,,,,,,,,,BRCA1 = not mutated BRCA2 = not mutated ER = m...,,not received,not received,,,,not received,not received,,,not received,not received,,,,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,not received,,,,,,,,,,,,,,,,,,,not received,No,PRB_413,Indivumed,Germany,Female,not received,,,,not received,,,


### Run this to download the dataframe as a excel workbook

In [6]:
final.to_excel(output_path, index=False)
FileLink(output_path)


In [7]:


print(raw.columns.tolist())


['Patient ID', 'Sample ID', 'Patient ID\nconsecutive', 'Sample ID\nconsecutive', 'Blood sample timepoint', 'Date of blood collection [yyyy-mm-dd]', 'Amount of aliquots', 'Sample or data', 'Gender', 'Year of birth', 'Body height [cm]', 'Minimum weight [kg]', 'Maximum weight [kg] ', 'Date of cancer diagnosis [yyyy-mm-dd]', 'ICD code', 'Description of ICD code', 'Organ of primary tumor', 'NSCLC or SCLC', 'Date of metastatic disease [yyyy-mm-dd]', 'Metastatic sites at index (ICD-O-3)', 'Date of TNM staging  [yyyy-mm-dd]', 'TNM', 'Stage', 'Dignity', 'Morphology code', 'Description of morphology code', 'Biomarker 1', 'Biomarker 2', 'Biomarker 3', 'Biomarker 4', 'Biomarker 5', 'Biomarker 6', 'Surgery Date [yyyy-mm-dd]', 'Surgery Doc', 'Treatment classification', 'Therapeutic agent #1', 'Dosage therapeutic agent #1', 'Dosage unit therapeutic agent #1', 'Therapeutic agent #2', 'Dosage therapeutic agent #2', 'Dosage unit therapeutic agent #2', 'Therapeutic agent #3', 'Dosage therapeutic agent #3

In [8]:
print(final.columns.tolist())

['ExternalId', 'Received Date', 'ContainerType', 'Volume_uL', 'TubeBarcode', 'Concentration', 'ConcentrationUnits', 'Organism', 'Stabilizer', 'Single or Double Spun', 'Processing Method', 'Processing Time(hrs)', 'Freeze Thaw Status', 'Hemolysis', 'Project', 'Matched FFPE Available', 'Date of Blood Draw/Cell Collection', 'Time of Draw', 'Block Size', 'Tissue Size', 'Tissue Weight (mg)', '% Tumor', '% Necrosis', 'Surgery Type', 'Tumor Tissue Type', 'Data Transformer', 'Date of Transformation', 'Other Sample Notes', 'ExSpecimenId', 'Collection Site', 'SpecimenType', 'Condition', 'Diagnostic Condition', 'Histology', 'Height', 'Weight', 'BMI', 'Duration between Cancer Diagnosis and Blood Draw (days)', 'Duration between Metastatic Diagnosis and Blood Draw (days)', 'Sample Timepoint', 'Sample Timepoint Description', 'AgeAtCollection', 'Detailed Anatomical Location', 'Grade', 'Tumor Size', 'TNM', 'Duration between TNM Staging and Blood Draw (days)', 'Stage', 'Stage Detailed', 'Morphology Code'

In [9]:
raw['biomarker_blob'][2]

'BRCA1 = not mutated BRCA2 = mutated ER = >=51 and <=80% pos. cells HER2 = 1+ menopause status = pre menopause PR = >= 10 and <=50% pos. cells'

In [10]:
# show all the data in the raw file
raw.head()


Unnamed: 0,Patient ID,Sample ID,Patient ID\nconsecutive,Sample ID\nconsecutive,Blood sample timepoint,Date of blood collection [yyyy-mm-dd],Amount of aliquots,Sample or data,Gender,Year of birth,Body height [cm],Minimum weight [kg],Maximum weight [kg],Date of cancer diagnosis [yyyy-mm-dd],ICD code,Description of ICD code,Organ of primary tumor,NSCLC or SCLC,Date of metastatic disease [yyyy-mm-dd],Metastatic sites at index (ICD-O-3),Date of TNM staging [yyyy-mm-dd],TNM,Stage,Dignity,Morphology code,Description of morphology code,Biomarker 1,Biomarker 2,Biomarker 3,Biomarker 4,Biomarker 5,Biomarker 6,Surgery Date [yyyy-mm-dd],Surgery Doc,Treatment classification,Therapeutic agent #1,Dosage therapeutic agent #1,Dosage unit therapeutic agent #1,Therapeutic agent #2,Dosage therapeutic agent #2,Dosage unit therapeutic agent #2,Therapeutic agent #3,Dosage therapeutic agent #3,Dosage unit therapeutic agent #3,Therapeutic agent #4,Dosage therapeutic agent #4,Dosage unit therapeutic agent #4,Therapeutic agent #5,Dosage therapeutic agent #5,Dosage unit therapeutic agent #5,Therapeutic agent #6,Dosage therapeutic agent #6,Dosage unit therapeutic agent #6,Line of treatment,Treatment intention,Tumor status at timepoint,Clinical status at timepoint,Karnofsky score (quality of life),Therapy start date [yyyy-mm-dd],Therapy end date [yyyy-mm-dd],Therapy ended according to plan,Cancel reason,Clinical status at therapy end,Tumor status at therapy end,Date of last tumor status [yyyy-mm-dd],Last tumor status,Death,Date of death [yyyy-mm-dd],Last date alive [yyyy-mm-dd],Collection Tube Type,Single or Double Spun,biomarker_blob,#,Patient ID \npre Data,Label \npreData,Patient ID consecutive,Label consecutive,LV-Number,Entity,Sample Type,Quantity,Quantity Unit,Box,Position,Barcode Scan,Matches Barcode,Sample Damage?,Hemolysis,Label Readable Issue?,Label Information Issue?,"Location (freezer 2, shelf x, rack x, position x)"
0,Pat_002,002_BD T0,PRB_408,408_BD T0,Initial-0,2017-12-20 11:29:00,,,f,1961,160.0,45.2,61.4,2017-12-15,C50.9,Malignant neoplasm of breast of unspecified site,,,NaT,,2017-12-15,cT4c cN1 cM1,IV,malign,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",menopause status = post menopause,ER = negative,HER2 = 3+,PR = negative,ER = negative,HER2 = 3+,,,Targeted therapy,Bevacizumab,10.0,mg/kg,Bevacizumab,10.0,mg/kg,Paclitaxel,80.0,mg/m²,,,,,,,,,,1.0,palliative,,not evaluable,,2017-12-20,2018-03-14,according to plan,,clinically stable,,2024-03-14,not evaluable,,NaT,2024-07-18,Streck Cell-Free DNA BCT,double,menopause status = post menopause ER = negativ...,1,Pat_002,002_BD T0-1,PRB_408,408_BD T0,LV1004950744,breast,plasma,1.0,ml,Box 1,A1,LV1004950744,MATCH,No,No,No,No,"Freezer 2, shelf 2, rack 4, box 9, position 39"
1,Pat_005,005_BD T0,PRB_409,409_BD T0,Initial-0,2019-09-17 11:04:00,,,f,1945,160.0,79.0,79.0,2019-08-15,C50.9,Malignant neoplasm of breast of unspecified site,,,2019-08-15,C38,2019-08-15,rT0 pN0 pM1 GX,IV,malign,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",ER = 0,FISH = positive,HER2 = 2+ (positive FISH/CISH),menopause status = post menopause,PR = 0,,,,Targeted therapy,Ribociclib,200.0,mg absolute,,,,,,,,,,,,,,,,1.0,palliative,,clinically stable,,2019-09-18,NaT,,,,,NaT,,,NaT,2022-07-12,Streck Cell-Free DNA BCT,double,ER = 0 FISH = positive HER2 = 2+ (positive FIS...,2,Pat_005,005_BD T0-1,PRB_409,409_BD T0,LV2001303058,breast,plasma,1.0,ml,Box 1,A2,LV2001303058,MATCH,No,No,No,No,"Freezer 2, shelf 2, rack 4, box 9, position 40"
2,Pat_006,006_BD T0,PRB_410,410_BD T0,Initial-0,2019-02-12 11:34:00,,,f,1948,175.0,75.2,91.0,2019-01-15,C50.9,Malignant neoplasm of breast of unspecified site,,,2019-01-15,"C22,C40,C79.8",2019-01-15,rT0 cN0 pM1 G3 LX VX PnX,IV,malign,8520/3,"Lobular carcinoma, NOS (C50._)",BRCA1 = not mutated,BRCA2 = mutated,ER = >=51 and <=80% pos. cells,HER2 = 1+,menopause status = pre menopause,PR = >= 10 and <=50% pos. cells,,,Monoclonal Ab,Pertuzumab,840.0,mg absolute,Trastuzumab,6.0,mg/kg,,,,,,,,,,,,,1.0,palliative,,therapy start with tumor,,2019-02-18,NaT,,,,,2021-08-15,not evaluable,yes,2021-08-15,NaT,Streck Cell-Free DNA BCT,double,BRCA1 = not mutated BRCA2 = mutated ER = >=51 ...,3,Pat_006,006_BD T0-1,PRB_410,410_BD T0,LV2000478378,breast,plasma,1.0,ml,Box 1,A3,LV2000478378,MATCH,No,No,No,No,"Freezer 2, shelf 2, rack 4, box 9, position 41"
3,Pat_007,007_BD T0,PRB_411,411_BD T0,Initial-0,2019-01-22 09:57:00,,,f,1963,169.0,72.0,76.9,2018-12-15,C50.5,Malignant neoplasm of lower-outer quadrant of ...,,,NaT,,2018-12-15,cT2 cN0 cM1 G3,IV,malign,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",ER = 12,HER2 = 1+,menopause status = pre menopause,PR = 0,BRCA1 = not mutated,,,,Targeted therapy,Bevacizumab,15.0,mg/kg,Bevacizumab,15.0,mg/kg,Paclitaxel,80.0,mg/m²,,,,,,,,,,1.0,palliative,,therapy start with tumor,,2019-01-22,2019-05-21,according to plan,,clinically stable,,2020-03-12,PD,yes,2020-12-15,NaT,Streck Cell-Free DNA BCT,double,ER = 12 HER2 = 1+ menopause status = pre menop...,4,Pat_007,007_BD T0-1,PRB_411,411_BD T0,LV2000478505,breast,plasma,1.0,ml,Box 1,A4,LV2000478505,MATCH,No,No,No,No,"Freezer 2, shelf 2, rack 4, box 9, position 42"
4,Pat_008,008_BD T0,PRB_412,412_BD T0,Initial-0,2023-02-28 11:16:00,,,f,1958,166.0,70.4,70.9,2023-01-15,C50.9,Malignant neoplasm of breast of unspecified site,,,NaT,,2023-01-15,cT4b cNX cM1 G2,IV,malign,8520/3,"Lobular carcinoma, NOS (C50._)",ER = 0,HER2 = 3+,menopause status = post menopause,PR = 0,,,,,Targeted therapy,Letrozole,2.5,mg absolute,Ribociclib,600.0,mg absolute,,,,,,,,,,,,,1.0,palliative,,,,2023-03-02,2023-06-14,canceled,patient wish,,not evaluable,2024-05-14,CR,,NaT,2024-08-29,Streck Cell-Free DNA BCT,double,ER = 0 HER2 = 3+ menopause status = post menop...,5,Pat_008,008_BD T0-1,PRB_412,412_BD T0,LV2008151600,breast,plasma,1.0,ml,Box 1,A5,LV2008151600,MATCH,No,No,No,No,"Freezer 2, shelf 2, rack 4, box 9, position 43"


In [11]:
raw['biomarker_blob']

0     menopause status = post menopause ER = negativ...
1     ER = 0 FISH = positive HER2 = 2+ (positive FIS...
2     BRCA1 = not mutated BRCA2 = mutated ER = >=51 ...
3     ER = 12 HER2 = 1+ menopause status = pre menop...
4     ER = 0 HER2 = 3+ menopause status = post menop...
                            ...                        
57    HER2 = 1+ ER = 12 menopause status = post meno...
58      HER2 = 3+ HER2 = 3+ ER = negative PR = negative
59    ER = 12 HER2 = 0 menopause status = post menop...
60                      HER2 = 0 ER = 9 PR = 4 HER2 = 0
61    PR = 9 HER2 = 0 ER = negative menopause status...
Name: biomarker_blob, Length: 62, dtype: object