In [2]:
### Unify the notation for the different clinical variables across all centers and hospitals in RVB samples ############

# I import the modules to work
import pandas as pd
import numpy as np
import re
import os
# Load the pre-filtered clinical spreadsheet of the RVB cohort.
# Per the preview printed below, row 0 of the sheet carries the coding legend
# ("sub-headers") rather than a sample.
rvb_path = '/home/vant/TFM/RVB_bbdd_filered1.xlsx'
rvb = pd.read_excel(io=rvb_path)
# Quick visual check of the first rows and of the total row count.
print(rvb.head(n=10))
print("Length of RVB dataframe:", len(rvb.index))

  ID CNIO_MJG                                HISTOLOGY_coded MJG  \
0     ID CNIO  1=serous, 2=mucinous, 3=endometrioid, 4=clear ...   
1        RVB1                                                NaN   
2        RVB2                                                  3   
3        RVB3                                                  5   
4        RVB4                                                  1   
5        RVB5                                                  4   
6        RVB6                                                  3   
7        RVB7                                                  3   
8        RVB8                                                NaN   
9        RVB9                                                NaN   

  Age at diagnosis                                        TUMOR GRADE  \
0            years  1=well differentiated, 2=moderately differenti...   
1               46                                                NaN   
2               47              

In [3]:
# Remove the legend row (index label 0) so that only sample rows remain.
rvb_no_subheaders = rvb.drop(index=[0])
# Column overview of what is left; the recorded output reports 9 entries.
rvb_no_subheaders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 1 to 9
Data columns (total 16 columns):
 #   Column                                                             Non-Null Count  Dtype 
---  ------                                                             --------------  ----- 
 0   ID CNIO_MJG                                                        9 non-null      object
 1   HISTOLOGY_coded MJG                                                6 non-null      object
 2   Age at diagnosis                                                   9 non-null      object
 3   TUMOR GRADE                                                        4 non-null      object
 4   FIGO STAGE_MJG                                                     5 non-null      object
 5   FIGOa                                                              5 non-null      object
 6   FIGOL                                                              5 non-null      object
 7   FIRST LINE or adjuvant TREATMENT_MJG   

In [4]:
# Step 1: shorten the spreadsheet headers to the unified column names.
# Each pair is (header as written in the spreadsheet, unified name).
new_column_names = dict([
    ('ID CNIO_MJG', 'ID_CNIO'),
    ('HISTOLOGY_coded MJG', 'HISTOLOGY'),
    ('Age at diagnosis', 'AGE'),
    ('TUMOR GRADE', 'GRADE'),
    ('FIGO STAGE_MJG', 'FIGO'),
    ('FIRST LINE or adjuvant TREATMENT_MJG', 'ADJUVANT_TREATMENT'),
    ('First line regimen_MJG', 'TYPE_ADJUVANT'),
    ('MMR GERMLINE STATUS', 'MMR STATUS'),
    ('DATE USED FOR OS', 'DATE_OS'),
    ('FAMILIAL ANTECEDENTS OF  COLON, ENDOMETRIAL AND/OR OVARIAN CANCER', 'FAMILIAL'),
])
# Columns that are not listed in the mapping keep their current name.
rvb_no_subheaders = rvb_no_subheaders.rename(columns=new_column_names)
rvb_no_subheaders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 1 to 9
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ID_CNIO                   9 non-null      object
 1   HISTOLOGY                 6 non-null      object
 2   AGE                       9 non-null      object
 3   GRADE                     4 non-null      object
 4   FIGO                      5 non-null      object
 5   FIGOa                     5 non-null      object
 6   FIGOL                     5 non-null      object
 7   ADJUVANT_TREATMENT        5 non-null      object
 8   TYPE_ADJUVANT             5 non-null      object
 9   MMR STATUS                9 non-null      object
 10  VITAL STATUS              9 non-null      object
 11  OS_CNIO                   9 non-null      object
 12  DATE_OS                   9 non-null      object
 13  PARTIAL DATE DEATH_LASTv  9 non-null      object
 14  PARTIAL DATE DIAGNOSIS    9 no

In [5]:
# Transform the coding in the HISTOLOGY column to the final coding
# Current coding: 1=serous, 2=mucinous, 3=endometrioid, 4=clear cell,
# 5=mixed cell, 6=other specified epithelial ovarian cancer (e.g. Brenner), 7=undifferentiated epithelial, NA=don't know
# New coding: 0=endometrioid; 1=clear cells; 2=mixed; 3=others; NA=unknown
def convert_histology(data1):
    """Recode one source HISTOLOGY value into the unified histology coding.

    Source coding: 1=serous, 2=mucinous, 3=endometrioid, 4=clear cell,
    5=mixed cell, 6=other specified epithelial, 7=undifferentiated epithelial.
    Unified coding: 0=endometrioid, 1=clear cells, 2=mixed, 3=others,
    'NA'=unknown.

    Parameters
    ----------
    data1 : scalar
        A single cell value: a number, a numeric string such as '3', or a
        missing value (None / NaN / pd.NA).

    Returns
    -------
    int or str
        The unified code, or the string 'NA' when the value is missing or is
        not one of the known source codes.
    """
    # Serous, mucinous, other-specified and undifferentiated tumours all fall
    # into the unified "others" category (3).
    recode = {3: 0, 4: 1, 5: 2, 1: 3, 2: 3, 6: 3, 7: 3}

    # Test for missing values first: comparing pd.NA with == is ambiguous.
    if pd.isna(data1):
        return 'NA'

    # The column has dtype object, so a code may arrive as text ('3', ' 3 ').
    if isinstance(data1, str):
        try:
            data1 = float(data1.strip())
        except ValueError:
            return 'NA'

    # Unknown codes map to 'NA' explicitly instead of silently returning None.
    return recode.get(data1, 'NA')
# Show the column before recoding, recode it cell by cell, then show it again.
histology_source = rvb_no_subheaders['HISTOLOGY']
print(histology_source.head(15))
rvb_no_subheaders['HISTOLOGY'] = histology_source.apply(convert_histology)
print(rvb_no_subheaders['HISTOLOGY'].head(15))

1    NaN
2      3
3      5
4      1
5      4
6      3
7      3
8    NaN
9    NaN
Name: HISTOLOGY, dtype: object
1    NA
2     0
3     2
4     3
5     1
6     0
7     0
8    NA
9    NA
Name: HISTOLOGY, dtype: object


In [6]:
# Adjust TYPE_ADJUVANT values to the final coding
# Current coding: 0=Carbo-paclitaxel; 1=Cis-paclitaxel iv; 2=Cis-paclitaxel IP;
# 3=Carbo monotherapy; 4=Carbo-Taxol-Beva; 5=other; 6=chemotherapy, but regimen unknown; 7=no chemotherapy; NA=no information
# Final coding: 0=Carbo-paclitaxel; 1=Cis-paclitaxel (iv and IP); 2=Carbo monotherapy; 3=Carbo-Taxol-Beva; 4=Others; NA=unknown
def adjust_type_adjuvant(data2):
    """Recode one source first-line regimen value into the unified coding.

    Source coding: 0=Carbo-paclitaxel, 1=Cis-paclitaxel iv, 2=Cis-paclitaxel IP,
    3=Carbo monotherapy, 4=Carbo-Taxol-Beva, 5=other,
    6=chemotherapy with unknown regimen, 7=no chemotherapy.
    Unified coding: 0=Carbo-paclitaxel, 1=Cis-paclitaxel (iv and IP),
    2=Carbo monotherapy, 3=Carbo-Taxol-Beva, 4=others, 'NA'=unknown.

    Parameters
    ----------
    data2 : scalar
        A single cell value: a number, a numeric string such as '4', or a
        missing value (None / NaN / pd.NA).

    Returns
    -------
    int or str
        The unified code, or the string 'NA' for source codes 6 and 7, for
        missing values and for any value that is not a known source code.
    """
    # Only the codes that name a regimen are listed; 6, 7 and anything else
    # end up in the 'NA' default below.
    recode = {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 4}

    # Test for missing values first: comparing pd.NA with == is ambiguous.
    if pd.isna(data2):
        return 'NA'

    # The column has dtype object, so a code may arrive as text ('4', ' 4 ').
    if isinstance(data2, str):
        try:
            data2 = float(data2.strip())
        except ValueError:
            return 'NA'

    # Unknown codes map to 'NA' explicitly instead of silently returning None.
    return recode.get(data2, 'NA')

# Show the regimen column before recoding.
regimen_source = rvb_no_subheaders['TYPE_ADJUVANT']
print(regimen_source.head(11))
# Recode every cell with adjust_type_adjuvant and show the result.
rvb_no_subheaders['TYPE_ADJUVANT'] = regimen_source.apply(adjust_type_adjuvant)
print(rvb_no_subheaders['TYPE_ADJUVANT'].head(11))

1      6
2      6
3    NaN
4      7
5      4
6      0
7    NaN
8    NaN
9    NaN
Name: TYPE_ADJUVANT, dtype: object
1    NA
2    NA
3    NA
4    NA
5     3
6     0
7    NA
8    NA
9    NA
Name: TYPE_ADJUVANT, dtype: object


In [7]:
# Change the coding in the MMR STATUS column
# In this column the current coding is: 0=unknown; 1=mutated; 2=unmutated;
# and the new MMR STATUS/BRCA STATUS coding is: 0=not studied, 1=studied, without mutation, 2=mutation in MMR genes/BRCA1/2, 3=other genes mutated, NA=no data
# Taking into account that NA can be LYNCH/BRCA in BRCA STATUS/MMR STATUS
def calculate_mmr_status(data4):
    """Recode one source MMR status value into the unified coding.

    Source 0 (or '0') stays 0, source 1 (or '1') becomes 2 and source 2
    (or '2') becomes 1. A missing value is handed back unchanged. Any other
    value yields None.
    """
    # Each guard accepts the code both as a number and as its one-character text.
    if data4 in (0, '0'):
        return 0
    if data4 in (1, '1'):
        return 2
    if data4 in (2, '2'):
        return 1
    # Missing values pass through as they are; everything else becomes None.
    return data4 if pd.isna(data4) else None
# Show the MMR column, recode it cell by cell, then show the recoded column.
mmr_source = rvb_no_subheaders['MMR STATUS']
print("Before:\n", mmr_source.head(10))
rvb_no_subheaders['MMR STATUS'] = mmr_source.apply(calculate_mmr_status)
print("After conversion:\n", rvb_no_subheaders['MMR STATUS'].head(10))

Before:
 1    1
2    1
3    1
4    1
5    1
6    2
7    1
8    1
9    1
Name: MMR STATUS, dtype: object
After conversion:
 1    2
2    2
3    2
4    2
5    2
6    1
7    2
8    2
9    2
Name: MMR STATUS, dtype: int64


In [8]:
# Print the column names, non-null counts and dtypes of the frame as it stands now.
rvb_no_subheaders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 1 to 9
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ID_CNIO                   9 non-null      object
 1   HISTOLOGY                 9 non-null      object
 2   AGE                       9 non-null      object
 3   GRADE                     4 non-null      object
 4   FIGO                      5 non-null      object
 5   FIGOa                     5 non-null      object
 6   FIGOL                     5 non-null      object
 7   ADJUVANT_TREATMENT        5 non-null      object
 8   TYPE_ADJUVANT             9 non-null      object
 9   MMR STATUS                9 non-null      int64 
 10  VITAL STATUS              9 non-null      object
 11  OS_CNIO                   9 non-null      object
 12  DATE_OS                   9 non-null      object
 13  PARTIAL DATE DEATH_LASTv  9 non-null      object
 14  PARTIAL DATE DIAGNOSIS    9 no

In [9]:
# Put the columns in the definitive order.
# Selecting with this list also drops any column that is not named in it.
new_order = [
    'ID_CNIO', 'HISTOLOGY', 'AGE', 'GRADE', 'FIGO',
    'FIGOL', 'FIGOa', 'ADJUVANT_TREATMENT', 'TYPE_ADJUVANT', 'OS_CNIO', 'VITAL STATUS', 'MMR STATUS','FAMILIAL', 
    'DATE_OS','PARTIAL DATE DEATH_LASTv','PARTIAL DATE DIAGNOSIS'
]

# Reorder (raises KeyError if one of the listed columns is absent).
rvb_no_subheaders = rvb_no_subheaders[new_order]
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
rvb_no_subheaders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 1 to 9
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ID_CNIO                   9 non-null      object
 1   HISTOLOGY                 9 non-null      object
 2   AGE                       9 non-null      object
 3   GRADE                     4 non-null      object
 4   FIGO                      5 non-null      object
 5   FIGOL                     5 non-null      object
 6   FIGOa                     5 non-null      object
 7   ADJUVANT_TREATMENT        5 non-null      object
 8   TYPE_ADJUVANT             9 non-null      object
 9   OS_CNIO                   9 non-null      object
 10  VITAL STATUS              9 non-null      object
 11  MMR STATUS                9 non-null      int64 
 12  FAMILIAL                  2 non-null      object
 13  DATE_OS                   9 non-null      object
 14  PARTIAL DATE DEATH_LASTv  9 no

In [10]:
# Build the legend ("sub-header") row that documents the coding of each column.
# The legend is keyed by column name, so it cannot drift out of alignment
# when the column order changes.
# NOTE(review): the FIGO legend jumps from 6=IIB to 8=II(NOS); confirm whether
# code 7 is intentionally absent.
subheader_by_column = {
    'ID_CNIO': 'LP: ID for samples from La Paz Hospital; OVE: ID for samples from Virgen del Rocio Hospital; MDA: ID for samples from MD Anderson Cancer Center  Hospital ; RVB: ID for samples from Red Valenciana de Biobancos',
    'HISTOLOGY': '0=endometroid; 1=clear cells; 2=mixed; 3=others; NA=unknown',
    'AGE': '2 digits; unit is YEARS',
    'GRADE': '1=well differentiated; 2=moderately differentiated; 3=poorly differentiated; NA=unknown',
    'FIGO': '1=IA; 2=IB; 3=IC; 4=I(NOS); 5=IIA; 6=IIB; 8=II(NOS); 9=IIIA; 10=IIIB; 11=IIIC; 12=III(NOS); 13=IV; NA=unknown',
    'FIGOL': '0=localized (I,II); 1=advanced(III,IV); NA=unknown',
    'FIGOa': '1= IA, IB, IC, I(NOS); 2=IIA, IIB, II (NOS); 3=IIIA, IIIB, IIIC, III(NOS); 4= IV; NA=unknown',
    'ADJUVANT_TREATMENT': '0=NO; 1=YES; NA=unknown',
    'TYPE_ADJUVANT': '0=Carbo-paclitaxel;1=Cis-paclitaxel;2=Carbo-monotherapy;3=Carbo-Taxol-Beva;4=other; NA=unknown',
    'OS_CNIO': 'Unit is DAYS',
    'VITAL STATUS': '0=alive; 1=dead; LF=lost to follow-up; NA=unknown',
    'MMR STATUS': '0=not studied; 1=studied,without mutation; 2=mutation in MMR genes; 3=other genes mutated; NA=unknown',
    'FAMILIAL': '0=NO; 1=YES; NA=unknown',
    'DATE_OS': 'D=diagnosis date used for OS; S=surgery date used for OS; LD:Lacking diagnosis/surgery/death_lastv date; NA=unknown',
    'PARTIAL DATE DEATH_LASTv': 'YES= partial date xx/xx/y or xx/m/y; NO= Complete date or NA',
    'PARTIAL DATE DIAGNOSIS': 'YES= partial date xx/xx/y or xx/m/y; NO= Complete date or NA',
}

# Fail loudly if a column has no legend, rather than mislabelling columns.
columns_without_legend = [
    column for column in rvb_no_subheaders.columns if column not in subheader_by_column
]
if columns_without_legend:
    raise KeyError(f"No subheader defined for columns: {columns_without_legend}")

# Legend texts listed in the current column order of the data frame.
subheaders = [subheader_by_column[column] for column in rvb_no_subheaders.columns]

# One-row frame (index 0) that holds the legend under the same column names.
df_subheaders = pd.DataFrame([subheaders], columns=rvb_no_subheaders.columns)
print("df_subheaders is:\n",df_subheaders)

# Stack the legend row on top of the sample rows and renumber the index from 0.
df_with_subheaders = pd.concat([df_subheaders, rvb_no_subheaders], ignore_index=True)
print("\nDataFrame con subheaders añadidos:")
print("df_with_subheaders is:\n",df_with_subheaders)

df_subheaders is:
                                              ID_CNIO  \
0  LP: ID for samples from La Paz Hospital; OVE: ...   

                                           HISTOLOGY                      AGE  \
0  0=endometroid; 1=clear cells; 2=mixed; 3=other...  2 digits; unit is YEARS   

                                               GRADE  \
0  1=well differentiated; 2=moderately differenti...   

                                                FIGO  \
0  1=IA; 2=IB; 3=IC; 4=I(NOS); 5=IIA; 6=IIB; 8=II...   

                                               FIGOL  \
0  0=localized (I,II); 1=advanced(III,IV); NA=unk...   

                                               FIGOa       ADJUVANT_TREATMENT  \
0  1= IA, IB, IC, I(NOS); 2=IIA, IIB, II (NOS); 3...  0=NO; 1=YES; NA=unknown   

                                       TYPE_ADJUVANT       OS_CNIO  \
0  0=Carbo-paclitaxel;1=Cis-paclitaxel;2=Carbo-mo...  Unit is DAYS   

                                        VITAL STATUS  \
0  0=a

In [11]:
# Free-text description of every column, keyed by column name.
# It becomes the very first row, directly below the headers.
descriptions = {
    'ID_CNIO': 'Unique CNIO tumor identifier',
    'HISTOLOGY': 'Tumor histology/Tumor type',
    'AGE': 'Age at diagnosis. Calculated as (date of diagnosis – date of birth). When not provided, date of surgery was used instead of date of diagnosis (OVE series); a tag column was added to indicate this: “DATE_AGE”.',
    'GRADE': 'Tumor differentiation grade',
    'FIGO': 'FIGO tumor stage',
    'FIGOL': 'FIGO tumor stage according to cancer spread (localized or advance)',
    'FIGOa': 'Aggregated tumor stage',
    'ADJUVANT_TREATMENT': 'Information about whether the patient received chemotherapy',
    'TYPE_ADJUVANT': 'Type of adjuvant treatment received. Carboplatino+taxol is the standard',
    'OS_CNIO': 'Overall survival calculated by CNIO from original records as (date of death-date of diagnosis) or (date last known to be alive-date of diagnosis). When not provided, date of surgery was used instead of date of diagnosis. In this last case it is indicated in the tag column DATE USED FOR OS',
    'VITAL STATUS': 'Vital status at last followup',
    'MMR STATUS': 'Information about the GERMILINE mutations in genes related to mismatch repair pathway',
    'FAMILIAL': 'Familial antecedents of colon,endometrial and/or ovarian cancer. In OVE series this information refers to possible hereditary conditions according to the clinician',
    'DATE_OS': 'Tag column to indicate how overall survival was calculated, using date of diagnosis or date of surgery (OVE series)',
    'PARTIAL DATE DEATH_LASTv': 'Tag column to indicate how overall survival was calculated, using a partial or a complete date of last visit. If a partial date is used this suggests that the estimation of OS is not exact. A partial date is coded as XX/XX/year or XX/month/year and to make it possible the calculation these are transformed into 01/06/year or 01/month/year respectively',
    'PARTIAL DATE DIAGNOSIS': 'Tag column to indicate how overall survival was calculated, using a partial or a complete date of diagnosis. If a partial date is used this suggests that the estimation of OS is not exact. A partial date is coded as XX/XX/year or XX/month/year and to make it possible the calculation these are transformed into 01/06/year or 01/month/year respectively',
}

# A list holding the single mapping yields a one-row frame with index 0.
desc_df = pd.DataFrame([descriptions])

# Description row first, then the legend row and the samples; index renumbered from 0.
df_final = pd.concat([desc_df, df_with_subheaders], ignore_index=True)

# Show
print(df_final)

                                              ID_CNIO  \
0                        Unique CNIO tumor identifier   
1   LP: ID for samples from La Paz Hospital; OVE: ...   
2                                                RVB1   
3                                                RVB2   
4                                                RVB3   
5                                                RVB4   
6                                                RVB5   
7                                                RVB6   
8                                                RVB7   
9                                                RVB8   
10                                               RVB9   

                                            HISTOLOGY  \
0                          Tumor histology/Tumor type   
1   0=endometroid; 1=clear cells; 2=mixed; 3=other...   
2                                                  NA   
3                                                   0   
4                             

In [12]:
# Order the samples by the number that follows "RVB" in their identifier.

# The first two rows are not samples (description row and coding legend row);
# they are set aside and put back on top afterwards.
header_rows = df_final.iloc[:2]

# Everything from row 2 onwards is sample data.
data_rows = df_final.iloc[2:]

# expand=False makes str.extract return a Series (one value per row) rather
# than a one-column DataFrame. to_numeric then gives a numeric sort key, and
# identifiers that do not match the pattern become NaN.
id_number = pd.to_numeric(
    data_rows['ID_CNIO'].str.extract(r'RVB(\d+)', expand=False),
    errors='coerce',
)

# Rows without an RVB number are filtered out, as before.
has_number = id_number.notna()

# Sort on a temporary key column and remove it again. The stable sort keeps
# the existing relative order of rows that share the same number.
data_rows = (
    data_rows[has_number]
    .assign(ID_number=id_number[has_number])
    .sort_values(by='ID_number', kind='stable')
    .drop(columns=['ID_number'])
    .reset_index(drop=True)
)

# Recombine the two header rows with the sorted sample rows.
df_final_sorted = pd.concat([header_rows, data_rows], ignore_index=True)

In [13]:
# Load the spreadsheet with the original hospital/biobank identifiers
# (first sheet of the workbook provided by Maria).
identificadores_df = pd.read_excel(
    "/home/vant/TFM/Identificadores OVE_LP_MDA_RVB_serie completa.xlsx",
    sheet_name=0,
)
print(identificadores_df.head())
# The raw header names are printed before and after trimming; the ID CNIO
# header carries extra spaces in the file.
print(identificadores_df.columns)
trimmed_headers = identificadores_df.columns.str.strip()
identificadores_df.columns = trimmed_headers
print(identificadores_df.columns)

                 ID CNIO                                      ORIGINAL ID_AP  \
0  unique identifier CNIO  unique identifier from hospital, "Anatomía Pat...   
1                   07T25                                         8175-04/A4   
2                   08T94                          99B5119-8 (F.H. ALCORCÓN)   
3                   08T96                         04B0008135 (F.H. ALCORCÓN)   
4                  09T134                      B781787  (H,Gregorio Marañon)   

                               ORIGINAL ID_NHC_BBANK  \
0  unique identifier from Hospital (LP, OVE, MDA ...   
1                                       BTCNIO06/111   
2                                                NaN   
3                                                NaN   
4                                            B781787   

                   SUBTIPO HISTOLÓGICO  \
0  CC=clear cell ovarian carcinoma; E=   
1                                   CC   
2                                    E   
3             

In [14]:
# Attach the original identifiers to the sorted cohort table.
print(identificadores_df.head())

# Give the key column the same name on both sides: 'ID CNIO' -> 'ID_CNIO'.
identificadores_df = identificadores_df.rename(columns={'ID CNIO': 'ID_CNIO'})

# Only the key and the two original-identifier columns take part in the merge.
identifier_columns = identificadores_df[['ID_CNIO', 'ORIGINAL ID_AP', 'ORIGINAL ID_NHC_BBANK']]

# Left join: every row of the cohort table is kept, and rows with no match in
# the identifiers file get missing values in the two added columns.
df_final2 = pd.merge(df_final_sorted, identifier_columns, how='left', on='ID_CNIO')

# Check
print(df_final2.head())

                  ID CNIO                                     ORIGINAL ID_AP  \
0  unique identifier CNIO  unique identifier from hospital, "Anatomía Pat...   
1                   07T25                                         8175-04/A4   
2                   08T94                          99B5119-8 (F.H. ALCORCÓN)   
3                   08T96                         04B0008135 (F.H. ALCORCÓN)   
4                  09T134                      B781787  (H,Gregorio Marañon)   

                               ORIGINAL ID_NHC_BBANK  \
0  unique identifier from Hospital (LP, OVE, MDA ...   
1                                       BTCNIO06/111   
2                                                NaN   
3                                                NaN   
4                                            B781787   

                   SUBTIPO HISTOLÓGICO  \
0  CC=clear cell ovarian carcinoma; E=   
1                                   CC   
2                                    E   
3             

In [15]:
# Rename the two identifier columns in one call:
# 'ORIGINAL ID_AP' -> 'ID_ORIGINAL' and 'ORIGINAL ID_NHC_BBANK' -> 'ID_ORIGINAL_NHC_BBANK'.
df_final2 = df_final2.rename(columns={
    'ORIGINAL ID_AP': 'ID_ORIGINAL',
    'ORIGINAL ID_NHC_BBANK': 'ID_ORIGINAL_NHC_BBANK',
})

# Rows 0 and 1 are the description row and the coding row; fill them in for
# the two new columns.
df_final2.loc[0, 'ID_ORIGINAL'] = 'Unique identifier from the hospital to the patient/sample'
df_final2.loc[0, 'ID_ORIGINAL_NHC_BBANK'] = 'Unique identifier from Hospital (LP, OVE, MDA series; NHC= number of clinical history) or Biobank (RVB series) to the patient/sample'
df_final2.loc[1, 'ID_ORIGINAL'] = 'Pathology code'
df_final2.loc[1, 'ID_ORIGINAL_NHC_BBANK'] = 'Unique alphanumeric code from the hospital or biobank.'

In [16]:
# Write the finished table to Excel without the row index, then confirm.
df_final2.to_excel(excel_writer="/home/vant/TFM/RVB_final.xlsx", index=False)
print("File succesfully created")

File succesfully created
