In [1]:
### Preprocess of MD Anderson patient records #######################

# Import libraries and read clinical files
import pandas as pd
import numpy as np
# Location of the MD Anderson (MDA series) clinical spreadsheet.
# NOTE(review): an earlier comment here mentioned extracting the CÓDIGO CNIO, DIAGNÓSTICO and
# GRADE OMS columns; confirm whether that note still applies to this file.
mda_path='/home/vant/TFM/clinical data/MDAnderson_MDA_series/20241104_Actual Base Datos Clínicos solicitados RVB_20200907_MDABB_IDCNIO.xlsx'
# Load the sheet with header=None so that the header and legend lines are kept as ordinary rows
mda = pd.read_excel(mda_path, header=None)
# Report the number of rows and preview the first 45 of them to inspect the layout
print("The length/number of rows of the MDA dataframe mda is:", mda.shape[0]) #57
print(mda.head(45))
# Rows with index labels 2, 42 and 43 must be eliminated (done in a later cell)

The length/number of rows of the MDA dataframe mda is: 57
                 0                             1        2         3        4   \
0   NAME (INITIALS)  ID (Clinical History Number)  ID CNIO     Noray  NHC MDA   
1               NaN                           NaN      NaN       NaN      NaN   
2               NaN                           NaN      NaN       NaN      NaN   
3               NaN                           NaN     MDA1     M0568    15018   
4               NaN                           NaN     MDA2     M0667    16091   
5               NaN                           NaN     MDA3    M06148    19289   
6               NaN        Información preferente     MDA4    M06201    21376   
7               NaN                           NaN     MDA5   C201889    23409   
8               NaN                           NaN     MDA6    M07402    26352   
9               NaN                           NaN     MDA7    M08509    32372   
10              NaN                           NaN  

In [2]:
# Discard the two leading columns (labels 0 and 1) and renumber the remaining ones from 0.
# Note: this cell reassigns `mda`, so running it twice removes two further columns.
mda = mda.drop(columns=[0, 1])
mda.columns = pd.RangeIndex(len(mda.columns))
# Preview the result
print(mda.head())

        0      1        2                             3                    4   \
0  ID CNIO  Noray  NHC MDA                DATE OF BIRTH     DATE OF DIAGNOSIS   
1      NaN    NaN      NaN  ( DD-MM-YYYY; NA=DON´T KNOW)           DD-MM-YYYY   
2      NaN    NaN      NaN                           NaN                  NaN   
3     MDA1  M0568    15018                    XX-XX-1925  2005-11-14 00:00:00   
4     MDA2  M0667    16091                    XX-XX-1961           01/01/2006   

                                          5   \
0                           AGE AT DIAGNOSIS   
1  (2 digit age at diagnosis; NA=don´t know)   
2                                        NaN   
3                                         80   
4                                         45   

                                                  6   \
0                                        SERIES TYPE   
1  1=SPORADIC unselected for familial cancer hist...   
2                                                NaN   


In [3]:
# Remove the rows that are not needed, then promote the first remaining row to column headers.

# Rows labelled 2, 42 and 43 are dropped; the remaining rows are renumbered from 0
mda_cleaned = mda.drop(index=[2, 42, 43]).reset_index(drop=True)

# Preview the first 10 rows and report the row count
print(mda_cleaned.head(10))
print("The length/number of rows of the MDA dataframe cleaned is:", len(mda_cleaned))  # Expected output: 54

# Row 0 holds the column names: use it as the header ...
mda_cleaned.columns = mda_cleaned.iloc[0]
# ... and keep only the rows below it
mda_cleaned = mda_cleaned.iloc[1:]

# State before renumbering the rows (the index still starts at 1 here)
print('Before resetting the index:\n', mda_cleaned.head(6))
print('Length before resetting the index:', len(mda_cleaned))  # Expected output: 53

# Renumber the rows so the index is continuous and starts at 0
mda_cleaned = mda_cleaned.reset_index(drop=True)

# State after renumbering
print('After resetting the index:\n', mda_cleaned.head())
print('Length after resetting the index:', len(mda_cleaned))  # Expected output: 53

# Final row count
print("The length/number of rows of the final MDA dataframe is:", len(mda_cleaned))  # Expected output: 53

        0        1        2                             3   \
0  ID CNIO    Noray  NHC MDA                DATE OF BIRTH    
1      NaN      NaN      NaN  ( DD-MM-YYYY; NA=DON´T KNOW)   
2     MDA1    M0568    15018                    XX-XX-1925   
3     MDA2    M0667    16091                    XX-XX-1961   
4     MDA3   M06148    19289                    XX-XX-1949   
5     MDA4   M06201    21376           1974-04-11 00:00:00   
6     MDA5  C201889    23409           1954-12-10 00:00:00   
7     MDA6   M07402    26352           1949-10-22 00:00:00   
8     MDA7   M08509    32372           1923-06-07 00:00:00   
9     MDA8   M08566    30989           1958-02-13 00:00:00   

                    4                                          5   \
0    DATE OF DIAGNOSIS                           AGE AT DIAGNOSIS   
1           DD-MM-YYYY  (2 digit age at diagnosis; NA=don´t know)   
2  2005-11-14 00:00:00                                         80   
3           01/01/2006                   

In [4]:
# Summarise columns, non-null counts and dtypes.
# DataFrame.info() writes the summary to stdout itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the summary.
mda_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 79 columns):
 #   Column                                                                         Non-Null Count  Dtype  
---  ------                                                                         --------------  -----  
 0   ID CNIO                                                                        52 non-null     object 
 1   Noray                                                                          52 non-null     object 
 2   NHC MDA                                                                        52 non-null     object 
 3   DATE OF BIRTH                                                                  53 non-null     object 
 4   DATE OF DIAGNOSIS                                                              53 non-null     object 
 5   AGE AT DIAGNOSIS                                                               53 non-null     object 
 6   SERIES TYPE                 

In [5]:
# Look for repeated column names after trimming surrounding whitespace from the headers
mda_cleaned.columns = mda_cleaned.columns.str.strip()
print("Column names before removing duplicates:\n", mda_cleaned.columns)
cols = mda_cleaned.columns
# True for every occurrence of a name after its first one
is_repeat = cols.duplicated()
duplicate_cols = cols[is_repeat].tolist()
print("Columnas duplicadas:", duplicate_cols)
print("Number of duplicate column names:", is_repeat.sum())
# The repeated names are not simply dropped keep-first here
# (i.e. no `mda_cleaned.loc[:, ~mda_cleaned.columns.duplicated()]`), because for
# 'DATE OF DIAGNOSIS' it is the second occurrence that must be kept.
# Selecting the repeated name returns every column carrying it, so both can be compared.
# The first one contains some errors, so the second one is kept.
print(mda_cleaned['DATE OF DIAGNOSIS'])

Column names before removing duplicates:
 Index([                                                                      'ID CNIO',
                                                                               'Noray',
                                                                             'NHC MDA',
                                                                       'DATE OF BIRTH',
                                                                   'DATE OF DIAGNOSIS',
                                                                    'AGE AT DIAGNOSIS',
                                                                         'SERIES TYPE',
                                                 'Tissue ID (Pathology Report Number)',
                                                                           'HISTOLOGY',
                                                                   'PATHOLOGIC REVIEW',
                                   'LOCATION (ie. Right ovary, left ovary, bil

In [6]:
# Make every header a plain string so the name comparisons below behave uniformly
mda_cleaned.columns = mda_cleaned.columns.astype(str)

# Give each repeated header a unique name by appending its column position;
# headers that occur only once are left untouched
repeated = mda_cleaned.columns.duplicated(keep=False)
mda_cleaned.columns = [
    f"{col}_{i}" if is_repeated else col
    for i, (col, is_repeated) in enumerate(zip(mda_cleaned.columns, repeated))
]

# Show the headers after renaming
print("Column names after renaming:")
print(mda_cleaned.columns)

# The first header containing 'DATE OF DIAGNOSIS' is the earlier of the two repeated columns
diagnosis_like = [col for col in mda_cleaned.columns if 'DATE OF DIAGNOSIS' in col]
first_duplicate_name = diagnosis_like[0]
print(f"Name of the first duplicated column: {first_duplicate_name}")

# Remove that earlier column; the later one is kept
mda_cleaned = mda_cleaned.drop(columns=[first_duplicate_name])

# Show the headers after the removal
print("Column names after removal:")
print(mda_cleaned.columns)

Column names after renaming:
Index(['ID CNIO', 'Noray', 'NHC MDA', 'DATE OF BIRTH', 'DATE OF DIAGNOSIS_4',
       'AGE AT DIAGNOSIS', 'SERIES TYPE',
       'Tissue ID (Pathology Report Number)', 'HISTOLOGY', 'PATHOLOGIC REVIEW',
       'LOCATION (ie. Right ovary, left ovary, bilateral)', 'NORMAL TISSUE',
       'NORMAL TISSUE source', 'FIGO STAGE',
       'TUMOR GRADE ( two tier system, Malpica et al.  ONLY SEROUS SUBTYPE)',
       'TUMOR GRADE',
       'PERSONAL ANTECEDENTS OF CANCER OTHER THAN OVARIAN CANCER',
       'PERSONAL ANTECEDENTS OF CANCER OTHER THAN OVARIAN CANCER. SPECIFY:',
       'PERSONAL ANTECEDENTS OF ENDOMETRIOSIS or diagnosed in the pathological report',
       'FAMILIAL ANTECEDENTS OF  COLON, ENDOMETRIAL AND/OR OVARIAN CANCER',
       'FAMILIAL ANTECEDENTS OF  CANCER',
       'FAMILIAL ANTECEDENTS OF  CANCER. ESPECIFY', 'DATE OF DIAGNOSIS_22',
       'DATE OF SURGERY', 'PROPER SURGERY for initial stages',
       'TYPE OF FIRST SURGERY FOR ADVANCED STAGES',
       '

In [7]:
# Give the surviving diagnosis-date column ('DATE OF DIAGNOSIS_22') its plain name back.
# The '_22' suffix is the column position assigned when the repeated headers were made unique.
header_fix = {'DATE OF DIAGNOSIS_22': 'DATE OF DIAGNOSIS'}
mda_cleaned = mda_cleaned.rename(columns=header_fix)
# Confirm the new header list
print(mda_cleaned.columns)

Index(['ID CNIO', 'Noray', 'NHC MDA', 'DATE OF BIRTH', 'AGE AT DIAGNOSIS',
       'SERIES TYPE', 'Tissue ID (Pathology Report Number)', 'HISTOLOGY',
       'PATHOLOGIC REVIEW',
       'LOCATION (ie. Right ovary, left ovary, bilateral)', 'NORMAL TISSUE',
       'NORMAL TISSUE source', 'FIGO STAGE',
       'TUMOR GRADE ( two tier system, Malpica et al.  ONLY SEROUS SUBTYPE)',
       'TUMOR GRADE',
       'PERSONAL ANTECEDENTS OF CANCER OTHER THAN OVARIAN CANCER',
       'PERSONAL ANTECEDENTS OF CANCER OTHER THAN OVARIAN CANCER. SPECIFY:',
       'PERSONAL ANTECEDENTS OF ENDOMETRIOSIS or diagnosed in the pathological report',
       'FAMILIAL ANTECEDENTS OF  COLON, ENDOMETRIAL AND/OR OVARIAN CANCER',
       'FAMILIAL ANTECEDENTS OF  CANCER',
       'FAMILIAL ANTECEDENTS OF  CANCER. ESPECIFY', 'DATE OF DIAGNOSIS',
       'DATE OF SURGERY', 'PROPER SURGERY for initial stages',
       'TYPE OF FIRST SURGERY FOR ADVANCED STAGES',
       'CA125 level pre-treatment (AT DIAGNOSIS)',
       'CA12

In [8]:
# Partial dates are not recognized as such, so we convert them with these functions in the relevant columns.
# Dates in the format XX/XX/year will be converted to 01/06/year, and dates in the format XX/month/year 
# will be converted to 01/month/year. New columns are created as a tag to mark partial dates

def detect_partial(date_str):
    """Return 'YES' when the value is a string containing the 'XX' placeholder, else 'NO'.

    Non-string values (e.g. already-parsed dates or missing values) always give 'NO'.
    """
    is_partial = isinstance(date_str, str) and 'XX' in date_str
    return 'YES' if is_partial else 'NO'

def patch_date(date_str):
    """Convert a raw date cell to a pandas Timestamp, imputing partial dates.

    String values are read as day/month/year. Both '/' and '-' are accepted as
    separators, and whitespace around the value or its fields is ignored, so
    '01/01/2006', '01-01-2006' and 'XX-XX-1925' are all handled.

    Partial dates (unknown parts written as 'XX') are imputed as follows:
      * XX/XX/year    -> 01/06/year   (1 June of that year)
      * XX/month/year -> 01/month/year

    Any other string (including one whose month alone is 'XX', or free text
    such as a legend) cannot be parsed and yields NaT instead of raising.

    Parameters
    ----------
    date_str : object
        A date string, or a non-string value, which is handed unchanged to
        ``pd.to_datetime``.

    Returns
    -------
    The result of ``pd.to_datetime`` with ``errors='coerce'``: a Timestamp when the
    value can be parsed with format '%d/%m/%Y', otherwise a missing value (NaT).
    """
    if isinstance(date_str, str):
        # Normalise the separator so dash-separated dates are not silently lost as NaT
        fields = [field.strip() for field in date_str.replace('-', '/').split('/')]
        if len(fields) == 3:
            day, month, year = fields
            if day == 'XX' and month == 'XX':
                # Only the year is known: use 1 June
                day, month = '01', '06'
            elif day == 'XX':
                # Month and year are known: use the first day of the month
                day = '01'
            # errors='coerce' keeps a malformed value from aborting a whole .apply()
            return pd.to_datetime(f'{day}/{month}/{year}', format='%d/%m/%Y', errors='coerce')
    # Non-string values, and strings that do not split into three fields
    return pd.to_datetime(date_str, format='%d/%m/%Y', errors='coerce')

# For each date column: first record whether the raw value was a partial date in a
# YES/NO flag column, then replace the raw values with parsed dates.
# The flag must be computed from the raw strings; once the column has been converted,
# detect_partial only sees non-string values and reports 'NO' for every row.
for date_col, flag_col in (
    ('DATE OF DEATH OR LAST VISIT', 'PARTIAL DATE DEATH_LASTv'),
    ('DATE OF DIAGNOSIS', 'PARTIAL DATE DIAGNOSIS'),
):
    raw_dates = mda_cleaned[date_col]
    mda_cleaned[flag_col] = raw_dates.apply(detect_partial)
    mda_cleaned[date_col] = raw_dates.apply(patch_date)


In [9]:
# Count the dates that are missing after the conversion
death_or_last_visit = mda_cleaned['DATE OF DEATH OR LAST VISIT']
diagnosis = mda_cleaned['DATE OF DIAGNOSIS']
print("Null values in DATE OF DEATH OR LAST VISIT column:", death_or_last_visit.isna().sum())
print("Null values in DATE OF DIAGNOSIS column:", diagnosis.isna().sum())
# OS_CNIO: whole days elapsed between diagnosis and death / last visit
# (missing whenever either of the two dates is missing)
elapsed = death_or_last_visit - diagnosis
mda_cleaned['OS_CNIO'] = elapsed.dt.days
print(mda_cleaned[['DATE OF DEATH OR LAST VISIT','DATE OF DIAGNOSIS','OS_CNIO','PARTIAL DATE DEATH_LASTv','PARTIAL DATE DIAGNOSIS']])

Null values in DATE OF DEATH OR LAST VISIT column: 2
Null values in DATE OF DIAGNOSIS column: 1
   DATE OF DEATH OR LAST VISIT DATE OF DIAGNOSIS  OS_CNIO  \
0                          NaT               NaT      NaN   
1                          NaT        2005-11-14      NaN   
2                   2014-10-08        2006-01-01   3202.0   
3                   2007-05-12        2006-06-26    320.0   
4                   2024-08-19        2006-04-01   6715.0   
5                   2020-09-25        2007-05-28   4869.0   
6                   2020-08-28        2007-09-06   4740.0   
7                   2021-05-06        2008-02-28   4816.0   
8                   2008-10-27        2008-04-24    186.0   
9                   2013-07-01        2007-06-01   2222.0   
10                  2009-02-22        2008-06-01    266.0   
11                  2017-11-17        2009-05-14   3109.0   
12                  2019-06-20        2009-07-06   3636.0   
13                  2014-06-16        2008-06-01  

In [10]:
# 'DATE USED FOR OS' records which date the survival time is counted from:
# 'D' (for Diagnosis) when 'DATE OF DIAGNOSIS' is present, 'NA' otherwise.
# NOTE(review): a row with a diagnosis date but no death/last-visit date still gets 'D'
# although its OS_CNIO is missing — confirm that this is intended.
has_diagnosis_date = mda_cleaned['DATE OF DIAGNOSIS'].notna()
mda_cleaned['DATE USED FOR OS'] = has_diagnosis_date.map({True: 'D', False: 'NA'})

# Row 0 still holds the sheet's legend texts; keep an independent copy of it
leyendas_actuales = mda_cleaned.iloc[0].copy()

# Show the legends
print("Legends are:\n", leyendas_actuales)

Legends are:
 ID CNIO                                                           NaN
Noray                                                             NaN
NHC MDA                                                           NaN
DATE OF BIRTH                            ( DD-MM-YYYY; NA=DON´T KNOW)
AGE AT DIAGNOSIS            (2 digit age at diagnosis; NA=don´t know)
                                              ...                    
nan_78                                                            NaN
PARTIAL DATE DEATH_LASTv                                           NO
PARTIAL DATE DIAGNOSIS                                             NO
OS_CNIO                                                           NaN
DATE USED FOR OS                                                   NA
Name: 0, Length: 82, dtype: object


In [11]:
# Derive two numeric codings from the text column 'FIGO STAGE' (Roman numerals plus letters):
#   'FIGOi' - fine-grained stage code (temporary name; it takes over the name 'FIGO STAGE' later)
#   'FIGOa' - stage group 1-4
# Stage texts that are not in the maps (including the legend text in row 0) become NaN.

# Fine-grained code per stage, following the coding
# 1=IA, 2=IB, 3=IC, 4=I(NOS), 5=IIA, 6=IIB, 7=IIC, 8=II(NOS), 9=IIIA, 10=IIIB, 11=IIIC,
# 12=III(NOS), 13=IV.
# Sub-stages (IC1-IC3, IIIA1/IIIA2, IVA/IVB) share the code of their parent stage.
# 'IIC', 'IC3', 'IIIA1' and 'IIIA2' were previously missing, which silently turned
# such patients into NaN.
figo_stage_map = {
    'IA': 1, 'IB': 2, 'IC': 3, 'IC1': 3, 'IC2': 3, 'IC3': 3, 'I': 4,
    'IIA': 5, 'IIB': 6, 'IIC': 7, 'II': 8,
    'IIIA': 9, 'IIIA1': 9, 'IIIA2': 9, 'IIIB': 10, 'IIIC': 11, 'III': 12,
    'IV': 13, 'IVA': 13, 'IVB': 13
}

# Lookup key: the stage text without surrounding whitespace and in upper case, so that
# values such as 'IIIC ' or 'iiic' are still recognised. The original column is left untouched.
figo_stage_key = mda_cleaned['FIGO STAGE'].map(
    lambda stage: stage.strip().upper() if isinstance(stage, str) else stage
)

# Fine-grained code
mda_cleaned['FIGOi'] = figo_stage_key.map(figo_stage_map)

# Stage group: 1 = stage I, 2 = stage II, 3 = stage III, 4 = stage IV
figoa_map = {
    'IA': 1, 'IB': 1, 'IC': 1, 'IC1': 1, 'IC2': 1, 'IC3': 1, 'I': 1,
    'IIA': 2, 'IIB': 2, 'IIC': 2, 'II': 2,
    'IIIA': 3, 'IIIA1': 3, 'IIIA2': 3, 'IIIB': 3, 'IIIC': 3, 'III': 3,
    'IV': 4, 'IVA': 4, 'IVB': 4
}
# Grouped code
mda_cleaned['FIGOa'] = figo_stage_key.map(figoa_map)

# Now, define a function to convert FIGOa values into FIGOL (0 = localized, 1 = advanced, NA = unknown)
def map_figoa_to_figol(value):
    """Collapse a FIGOa group into FIGOL: 1 or 2 -> 0 (localized), 3 or 4 -> 1 (advanced).

    Any other value (e.g. a missing value) is returned unchanged.
    """
    if value in (3, 4):
        return 1  # Advanced stage
    if value in (1, 2):
        return 0  # Localized stage
    return value  # Anything else is passed through as-is

# FIGOL: localized (0) / advanced (1), derived from the grouped FIGOa code
mda_cleaned['FIGOL'] = mda_cleaned['FIGOa'].map(map_figoa_to_figol)

# Compare the original stage text with the derived codes
figo_columns = ['FIGO STAGE', 'FIGOi', 'FIGOL', 'FIGOa']
print(mda_cleaned[figo_columns])

# Swap the two names: from here on 'FIGO STAGE' holds the numeric code and
# 'FIGOi' holds the original stage text
mda_cleaned = mda_cleaned.rename(columns={'FIGO STAGE': 'FIGOi', 'FIGOi': 'FIGO STAGE'})

# Same four columns after the swap
print(mda_cleaned[figo_columns])

                                           FIGO STAGE  FIGOi  FIGOL  FIGOa
0   1=IA, 2=IB, 3=IC, 4=I(NOS), 5=IIA, 6=IIB, 7=II...    NaN    NaN    NaN
1                                                 IIB    6.0    0.0    2.0
2                                                 IIB    6.0    0.0    2.0
3                                                 IIB    6.0    0.0    2.0
4                                                 IC2    3.0    0.0    1.0
5                                                  IA    1.0    0.0    1.0
6                                                  IA    1.0    0.0    1.0
7                                                IIIC   11.0    1.0    3.0
8                                                IIIA    9.0    1.0    3.0
9                                                  IA    1.0    0.0    1.0
10                                                IVA   13.0    1.0    4.0
11                                                 IA    1.0    0.0    1.0
12                       

In [12]:
# List the current column names of mda_cleaned
print(mda_cleaned.columns)

Index(['ID CNIO', 'Noray', 'NHC MDA', 'DATE OF BIRTH', 'AGE AT DIAGNOSIS',
       'SERIES TYPE', 'Tissue ID (Pathology Report Number)', 'HISTOLOGY',
       'PATHOLOGIC REVIEW',
       'LOCATION (ie. Right ovary, left ovary, bilateral)', 'NORMAL TISSUE',
       'NORMAL TISSUE source', 'FIGOi',
       'TUMOR GRADE ( two tier system, Malpica et al.  ONLY SEROUS SUBTYPE)',
       'TUMOR GRADE',
       'PERSONAL ANTECEDENTS OF CANCER OTHER THAN OVARIAN CANCER',
       'PERSONAL ANTECEDENTS OF CANCER OTHER THAN OVARIAN CANCER. SPECIFY:',
       'PERSONAL ANTECEDENTS OF ENDOMETRIOSIS or diagnosed in the pathological report',
       'FAMILIAL ANTECEDENTS OF  COLON, ENDOMETRIAL AND/OR OVARIAN CANCER',
       'FAMILIAL ANTECEDENTS OF  CANCER',
       'FAMILIAL ANTECEDENTS OF  CANCER. ESPECIFY', 'DATE OF DIAGNOSIS',
       'DATE OF SURGERY', 'PROPER SURGERY for initial stages',
       'TYPE OF FIRST SURGERY FOR ADVANCED STAGES',
       'CA125 level pre-treatment (AT DIAGNOSIS)',
       'CA125 lev

In [13]:
# Inspect the values of the 'Residual Disease AFTER SURGERY' column. 'NA' entries were
# replaced by 3 in the original file so that they can be told apart from empty (NaN) cells.
print(mda_cleaned['Residual Disease AFTER SURGERY'])

0     0=no macroscopic disease, 1=macroscopic diseas...
1                                                     3
2                                                     3
3                                                     3
4                                                     0
5                                                     0
6                                                     0
7                                                     3
8                                                     3
9                                                     3
10                                                  NaN
11                                                    3
12                                                    0
13                                                  NaN
14                                                    0
15                                                  NaN
16                                                  NaN
17                                              

In [14]:
# 'RESIDUALsD' is a yes/no summary of 'Residual Disease AFTER SURGERY':
#   0 -> 0  no residual disease
#   1 -> 1  residual disease
#   2 -> 1  residual disease
#   3 -> 1  residual disease of unknown size ('NA-unknown size', stored as 3 in the sheet)
# Values that are not keys of the dictionary (empty cells, the legend text in row 0) become NaN.
value_residuals = {0: 0, 1: 1, 2: 1, 3: 1}

residual_raw = mda_cleaned['Residual Disease AFTER SURGERY']
mda_cleaned['RESIDUALsD'] = residual_raw.map(value_residuals)

# Show the source column next to the derived one to check the conversion
print(mda_cleaned[['Residual Disease AFTER SURGERY', 'RESIDUALsD']])

                       Residual Disease AFTER SURGERY  RESIDUALsD
0   0=no macroscopic disease, 1=macroscopic diseas...         NaN
1                                                   3         1.0
2                                                   3         1.0
3                                                   3         1.0
4                                                   0         0.0
5                                                   0         0.0
6                                                   0         0.0
7                                                   3         1.0
8                                                   3         1.0
9                                                   3         1.0
10                                                NaN         NaN
11                                                  3         1.0
12                                                  0         0.0
13                                                NaN         NaN
14        

In [15]:
# Legend text for every exported column, keyed by column name.
# The key order also defines which columns are exported and in which order.
# NOTE(review): the 'DATE USED FOR OS' legend spells out 'Diagnosis'/'Surgery', while the column
# is filled with 'D'/'NA' earlier in the notebook — confirm which wording is intended.
column_legends = {
    'ID CNIO': 'ID CNIO',
    'HISTOLOGY': '1=serous, 2=mucinous, 3=endometrioid, 4=clear cell, 5=mixed cell, 6=other specified epithelial ovarian cancer (e.g. Brenner), 7=undifferentiated epithelial, NA=don’t know',
    'AGE AT DIAGNOSIS': 'years',
    'TUMOR GRADE': '1=well differentiated, 2=moderately differentiated, 3=poorly differentiated, NA=unknown',
    'FIGO STAGE': '1=IA, 2=IB, 3=IC, 4=I(NOS), 5=IIA, 6=IIB, 7=IIC, 8=II(NOS), 9=IIIA, 10=IIIB, 11=IIIC, 12=III(NOS), 13=IV, NA=don’t know',
    'FIGOa': '1= IA, IB, IC, I(NOS); 2=IIA, IIB, II (NOS); 3=IIIA, IIIB, IIIC, III(NOS); 4= IV; NA=unknown',
    'FIGOL': '0=localized,1=advanced; NA=unknown',
    'Residual Disease AFTER SURGERY': '0=no macroscopic disease, 1=macroscopic disease <1 cm; 2=macroscopic disease >1;  3=macroscopic disease, size unknown',
    'RESIDUALsD': '0=No residual disease, 1=Yes residual disease',
    'FIRST LINE or adjuvant TREATMENT': 'CHEMOTHERAPY NO=0, YES=1 ,ND=No available',
    'First line regimen': '0=Carbo-paclitaxel;1=Cis-paclitaxel iv;2=Cis-pacli IP;3=Carbo monotherapy;4=Carbo-Taxol-Beva;5=other, ND=No available',
    'MMR GERMLINE STATUS': '0=UNKNOWN, 1=MUTATED, 2= UNMUTATED',
    'BRCA STATUS': '0=UNKNOWN, 1=MUTATED, 2= UNMUTATED',
    'VITAL STATUS': '0 = alive, 1 = dead, PD= No available',
    'OS_CNIO': 'Overall survival in days',
    'DATE USED FOR OS': 'Diagnosis=diagnosis date used for OS, Surgery=surgery date used for OS, NA=unknown',
    'PARTIAL DATE DEATH_LASTv': 'YES= partial date xx/xx/y or xx/m/y, NO= Complete date or NA',
    'PARTIAL DATE DIAGNOSIS': 'YES= partial date xx/xx/y or xx/m/y, NO= Complete date or NA',
    'FAMILIAL ANTECEDENTS OF  COLON, ENDOMETRIAL AND/OR OVARIAN CANCER': '0=NO;1=YES;ND=No available',
}

# Columns of interest, in export order
headers_seleccionados = list(column_legends)

# One-row frame with the legends; it becomes the first row under the headers
header_row = pd.DataFrame([column_legends])

# Keep only the selected columns and discard row 0 (the sheet's own descriptive legend row)
patient_rows = mda_cleaned[headers_seleccionados].drop(0).reset_index(drop=True)

# Put the new legend row on top of the patient rows
df_final_mda = pd.concat([header_row, patient_rows], ignore_index=True)
print(df_final_mda.head(6))

# Write the result to an Excel file (without the index column)
output_path = '/home/vant/TFM/MDA_bbdd_filtered1.xlsx'
df_final_mda.to_excel(output_path, index=False)

print(f"The DataFrame has been successfully exported to {output_path}.")

   ID CNIO                                          HISTOLOGY  \
0  ID CNIO  1=serous, 2=mucinous, 3=endometrioid, 4=clear ...   
1     MDA1                                                  4   
2     MDA2                                                  3   
3     MDA3                                                  4   
4     MDA4                                                  3   
5     MDA5                                                  4   

  AGE AT DIAGNOSIS                                        TUMOR GRADE  \
0            years  1=well differentiated, 2=moderately differenti...   
1               80                                                  3   
2               45                                                  3   
3               57                                                  2   
4               32                                                  1   
5               53                                                  1   

                                