In [10]:
### Unify the notation for the different clinical variables across all centers and hospitals in La Paz samples ############
# I import the modules I need
import pandas as pd
import numpy as np
import re
# I read the prefiltered file for LP cohort
lp_path='/home/vant/TFM/LP_bbdd_filtered1.xlsx'
lp=pd.read_excel(lp_path)
print("The length of LP dataframe is:",len(lp))
print(lp.head)

The length of LP dataframe is: 120
<bound method NDFrame.head of     ID_CNIO                       SUBTYPE   AGEY     DIFERENTIATION  \
0       NaN  0=endometrioid, 1=clear cell  YEARS  1= G1; 2=G2; 3=G3   
1       LP6                             0  61.59                  3   
2      LP13                             0  72.91                  2   
3      LP19                             0  67.76                  2   
4      LP22                             0  37.84                  2   
..      ...                           ...    ...                ...   
115   LP115                             1  46.84                NaN   
116   LP116                             1  48.59                NaN   
117   LP117                             1  40.34                NaN   
118   LP119                             1  81.61                NaN   
119   LP120                             1  48.84                NaN   

                                                  FIGO  \
0    1=IA, 2=IB, 3=IC, 4

In [11]:
print(lp.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID_CNIO           119 non-null    object
 1   SUBTYPE           120 non-null    object
 2   AGEY              120 non-null    object
 3   DIFERENTIATION    59 non-null     object
 4   FIGO              120 non-null    object
 5   FIGOL             120 non-null    object
 6   TYPEs             118 non-null    object
 7   TYPEc             89 non-null     object
 8   RESIDUALs         111 non-null    object
 9   CHEMO             116 non-null    object
 10  GENETIC           119 non-null    object
 11  OS                111 non-null    object
 12  DEATH             113 non-null    object
 13  AFS               108 non-null    object
 14  OS_CNIO           111 non-null    object
 15  DATE USED FOR OS  120 non-null    object
 16  FIGOa             120 non-null    object
 17  RESIDUALsD      

In [12]:
# Rename columns 
new_column_names = {
    'SUBTYPE': 'HISTOLOGY',
    'AGEY': 'AGE',
    'DIFERENTIATION': 'GRADE',
    'TYPEs':'NEOADJUVANT_TREATMENT',
    'CHEMO':'ADJUVANT_TREATMENT',
    'TYPEc':'TYPE_ADJUVANT',
    'DEATH':'VITAL STATUS',
    'RESIDUALs':'RESIDUAL',
    'RESIDUALsD':'RESIDUALa',
    'AFS':'FAMILIAL',
    'DATE USED FOR OS':'DATE_OS'
}
lp.rename(columns=new_column_names,inplace=True)
lp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ID_CNIO                119 non-null    object
 1   HISTOLOGY              120 non-null    object
 2   AGE                    120 non-null    object
 3   GRADE                  59 non-null     object
 4   FIGO                   120 non-null    object
 5   FIGOL                  120 non-null    object
 6   NEOADJUVANT_TREATMENT  118 non-null    object
 7   TYPE_ADJUVANT          89 non-null     object
 8   RESIDUAL               111 non-null    object
 9   ADJUVANT_TREATMENT     116 non-null    object
 10  GENETIC                119 non-null    object
 11  OS                     111 non-null    object
 12  VITAL STATUS           113 non-null    object
 13  FAMILIAL               108 non-null    object
 14  OS_CNIO                111 non-null    object
 15  DATE_OS                

In [13]:
# Show the dataframe
print(lp.head())

  ID_CNIO                     HISTOLOGY    AGE              GRADE  \
0     NaN  0=endometrioid, 1=clear cell  YEARS  1= G1; 2=G2; 3=G3   
1     LP6                             0  61.59                  3   
2    LP13                             0  72.91                  2   
3    LP19                             0  67.76                  2   
4    LP22                             0  37.84                  2   

                                                FIGO  \
0  1=IA, 2=IB, 3=IC, 4=I(NOS), 5=IIA, 6=IIB,  8=I...   
1                                                 11   
2                                                 13   
3                                                  9   
4                                                 11   

                         FIGOL  \
0   0=localizado vs 1=avanzado   
1                            1   
2                            1   
3                            1   
4                            1   

                               NEOADJUVANT_

In [14]:
# I eliminate the row 0 to work only with the samples information. 
lp_no_subheaders = lp.drop(0)
print("\nDataFrame without subheaders):")
print(lp_no_subheaders.info()) # Only we have 119 samples (the 2 first rows are headers and subheaders)


DataFrame without subheaders):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 1 to 119
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ID_CNIO                119 non-null    object
 1   HISTOLOGY              119 non-null    object
 2   AGE                    119 non-null    object
 3   GRADE                  58 non-null     object
 4   FIGO                   119 non-null    object
 5   FIGOL                  119 non-null    object
 6   NEOADJUVANT_TREATMENT  117 non-null    object
 7   TYPE_ADJUVANT          88 non-null     object
 8   RESIDUAL               110 non-null    object
 9   ADJUVANT_TREATMENT     115 non-null    object
 10  GENETIC                118 non-null    object
 11  OS                     110 non-null    object
 12  VITAL STATUS           112 non-null    object
 13  FAMILIAL               107 non-null    object
 14  OS_CNIO                110 non-null    obj

In [15]:
# Transform coding in TYPE_ADJUVANT
# Actual: 0=Carbo-paclitaxel 1=Cis-paclitaxel iv,2=Cis-pacli IP,3=Carbo monoterapia 4=Carbo-Taxol-Beva ,5=otro.
# New: 0=Carbo-paclitaxel 1=Cis-paclitaxel, 2=Carbo-monotherapy, 3=Carbo-Taxol-Beva ;4=other. 
# I create a dictionary to specify the changes
value_changes={
    1:1,2:1,3:2,4:3,5:4
}
print("Original TYPE_ADJUVANT column values:",lp_no_subheaders['TYPE_ADJUVANT'].head(35))
lp_no_subheaders['TYPE_ADJUVANT']=lp_no_subheaders['TYPE_ADJUVANT'].replace(value_changes) # Uso replace instead of map because I don´t want
# to change the 0 values (keep the coding in that case. If I use map I have to include 0:0 in the dictionary
print("Final TIPE_ADJUVANT column values:",lp_no_subheaders['TYPE_ADJUVANT'].head(35))

Original TYPE_ADJUVANT column values: 1       0
2       0
3     NaN
4       0
5       0
6     NaN
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      5
22      5
23      0
24      4
25      0
26      0
27    NaN
28      0
29      4
30    NaN
31      2
32      0
33    NaN
34      2
35      0
Name: TYPE_ADJUVANT, dtype: object
Final TIPE_ADJUVANT column values: 1     0.0
2     0.0
3     NaN
4     0.0
5     0.0
6     NaN
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    4.0
22    4.0
23    0.0
24    3.0
25    0.0
26    0.0
27    NaN
28    0.0
29    3.0
30    NaN
31    1.0
32    0.0
33    NaN
34    1.0
35    0.0
Name: TYPE_ADJUVANT, dtype: float64


In [16]:
# I round the age values. Firstly to avoid errors I transform the data to numeric 
lp_no_subheaders['AGE'] = pd.to_numeric(lp_no_subheaders['AGE'], errors='coerce')
print("\nDataFrame con 'AGE' convertido a numérico:")
print(lp_no_subheaders['AGE'].head(45))

lp_no_subheaders['AGE']=lp_no_subheaders['AGE'].round()
print(lp_no_subheaders['AGE'].head(45))


DataFrame con 'AGE' convertido a numérico:
1     61.59
2     72.91
3     67.76
4     37.84
5     38.69
6     51.51
7     75.67
8     67.14
9     40.59
10    33.42
11    38.17
12    75.76
13    63.51
14    58.59
15    41.50
16    68.35
17    56.01
18    55.08
19    57.17
20    69.93
21    56.68
22    86.33
23    50.59
24    69.84
25    59.59
26    50.76
27    77.01
28    48.34
29    54.51
30    35.01
31    60.09
32    47.92
33    84.92
34    37.34
35    74.96
36    49.59
37    42.00
38    68.42
39    50.34
40    81.93
41    45.18
42    48.09
43    50.84
44    55.50
45    34.50
Name: AGE, dtype: float64
1     62.0
2     73.0
3     68.0
4     38.0
5     39.0
6     52.0
7     76.0
8     67.0
9     41.0
10    33.0
11    38.0
12    76.0
13    64.0
14    59.0
15    42.0
16    68.0
17    56.0
18    55.0
19    57.0
20    70.0
21    57.0
22    86.0
23    51.0
24    70.0
25    60.0
26    51.0
27    77.0
28    48.0
29    55.0
30    35.0
31    60.0
32    48.0
33    85.0
34    37.0
35    75.0
36   

In [17]:
# Transform GENETIC column to obtain 2 columns: MMR STATUS and BRCA STATUS.
# In GENETIC column the coding is:  1=not studied; 2=LYNCH; 3= BRCA1/2 mut; 4= studied, without mutation; 5= mut other genes
# and now MMR STATUS/BRCA STATUS: 0=not studied,1=studied,without mutation,2=mutation in MMR genes/BRCA1/2,3=other genes mutated, NA= no data
# Having into account that in MMR STATUS 3 (other genes mutated) can refter to BRCA mutated or other genes mutated and the same for
# BRCA status
def calculate_mmr_status(genetic):
    if genetic == 1:
        return 0
    elif genetic == 4:
        return 1
    elif genetic == 2:
        return 2
    elif genetic==5 or genetic==3:
        return 3
    elif pd.isna(genetic):
        return genetic
def calculate_brca_status(genetic):
    if genetic==1:
        return 0
    elif genetic==4:
        return 1
    elif genetic==3:
        return 2
    elif genetic==5 or genetic==2:
        return 3
    elif pd.isna(genetic):
        return genetic
lp_no_subheaders['MMR STATUS']=lp_no_subheaders['GENETIC'].apply(calculate_mmr_status)
lp_no_subheaders['BRCA STATUS']=lp_no_subheaders['GENETIC'].apply(calculate_brca_status)
columns_to_print=['ID_CNIO','GENETIC','MMR STATUS','BRCA STATUS']
print(lp_no_subheaders[columns_to_print].head(50))

   ID_CNIO GENETIC  MMR STATUS  BRCA STATUS
1      LP6       1         0.0          0.0
2     LP13       4         1.0          1.0
3     LP19       1         0.0          0.0
4     LP22       1         0.0          0.0
5     LP25       1         0.0          0.0
6     LP34       1         0.0          0.0
7     LP41       1         0.0          0.0
8     LP43       1         0.0          0.0
9     LP46       1         0.0          0.0
10    LP47       4         1.0          1.0
11    LP48       1         0.0          0.0
12    LP61       1         0.0          0.0
13    LP65       4         1.0          1.0
14    LP70       1         0.0          0.0
15    LP74       1         0.0          0.0
16    LP78       1         0.0          0.0
17    LP80       1         0.0          0.0
18    LP84       1         0.0          0.0
19    LP87       1         0.0          0.0
20    LP90       1         0.0          0.0
21    LP91       1         0.0          0.0
22    LP93       1         0.0  

In [18]:
# I see that in the column 'DATE FOR OS' some data are lacking for OS, but in this column Diagnosis is the value
# I change this for the value LD that refers to 'Lacking diagnosis/death_lastv date'
lp_no_subheaders['DATE_OS'] = lp_no_subheaders.apply(
    lambda row: 'LD' if pd.isna(row['OS_CNIO']) else row['DATE_OS'],
    axis=1
)

print("\nDataFrame updated:")
print(lp_no_subheaders[['OS_CNIO','DATE_OS']])


DataFrame updated:
    OS_CNIO DATE_OS
1      2585       D
2      2501       D
3       NaN      LD
4      7579       D
5      4065       D
..      ...     ...
115    5589       D
116    3526       D
117    5061       D
118     309       D
119     504       D

[119 rows x 2 columns]


In [19]:
# Transform coding of NEOADJUVANT_TREATMENT column. Now it is coded as 0=NO SURGERY OR PALIATIVE SURGERY, 
# 1=PRIMARY DEBULKING, 2= INTERVAL SURGERY. We have to convert to: 0=NO;1=YES;NA=unknown
def calculate_neo_adj(neoadjuvant):
    if neoadjuvant == 0 or neoadjuvant==1:
        return 0
    elif neoadjuvant == 2:
        return 1
    elif pd.isna(neoadjuvant):
        return neoadjuvant
lp_no_subheaders['NEOADJUVANT_TREATMENT']=lp_no_subheaders['NEOADJUVANT_TREATMENT'].apply(calculate_neo_adj)
print(lp_no_subheaders['NEOADJUVANT_TREATMENT'])

1      0.0
2      1.0
3      NaN
4      0.0
5      0.0
      ... 
115    0.0
116    0.0
117    0.0
118    0.0
119    0.0
Name: NEOADJUVANT_TREATMENT, Length: 119, dtype: float64


In [20]:
print(lp_no_subheaders.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 1 to 119
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID_CNIO                119 non-null    object 
 1   HISTOLOGY              119 non-null    object 
 2   AGE                    119 non-null    float64
 3   GRADE                  58 non-null     object 
 4   FIGO                   119 non-null    object 
 5   FIGOL                  119 non-null    object 
 6   NEOADJUVANT_TREATMENT  117 non-null    float64
 7   TYPE_ADJUVANT          88 non-null     float64
 8   RESIDUAL               110 non-null    object 
 9   ADJUVANT_TREATMENT     115 non-null    object 
 10  GENETIC                118 non-null    object 
 11  OS                     110 non-null    object 
 12  VITAL STATUS           112 non-null    object 
 13  FAMILIAL               107 non-null    object 
 14  OS_CNIO                110 non-null    object 
 15  DATE_O

In [21]:
lp_final=lp_no_subheaders.drop(columns=['GENETIC','OS'])
print(lp_final.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 1 to 119
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID_CNIO                119 non-null    object 
 1   HISTOLOGY              119 non-null    object 
 2   AGE                    119 non-null    float64
 3   GRADE                  58 non-null     object 
 4   FIGO                   119 non-null    object 
 5   FIGOL                  119 non-null    object 
 6   NEOADJUVANT_TREATMENT  117 non-null    float64
 7   TYPE_ADJUVANT          88 non-null     float64
 8   RESIDUAL               110 non-null    object 
 9   ADJUVANT_TREATMENT     115 non-null    object 
 10  VITAL STATUS           112 non-null    object 
 11  FAMILIAL               107 non-null    object 
 12  OS_CNIO                110 non-null    object 
 13  DATE_OS                119 non-null    object 
 14  FIGOa                  119 non-null    object 
 15  RESIDU

In [22]:
# Order the columns in the definitive order
# New order
new_order = [
    'ID_CNIO', 'HISTOLOGY', 'AGE', 'GRADE', 'FIGO',
    'FIGOL', 'FIGOa', 'NEOADJUVANT_TREATMENT', 'ADJUVANT_TREATMENT', 'TYPE_ADJUVANT',
    'RESIDUAL', 'RESIDUALa', 'OS_CNIO', 'VITAL STATUS', 'MMR STATUS', 'BRCA STATUS','FAMILIAL',
    'DATE_OS'
]

# Reorder
lp_no_subheaders = lp_no_subheaders[new_order]
print(lp_no_subheaders)

    ID_CNIO HISTOLOGY   AGE GRADE FIGO FIGOL FIGOa  NEOADJUVANT_TREATMENT  \
1       LP6         0  62.0     3   11     1     3                    0.0   
2      LP13         0  73.0     2   13     1     4                    1.0   
3      LP19         0  68.0     2    9     1     3                    NaN   
4      LP22         0  38.0     2   11     1     3                    0.0   
5      LP25         0  39.0     1   12     1     3                    0.0   
..      ...       ...   ...   ...  ...   ...   ...                    ...   
115   LP115         1  47.0   NaN    3     0     1                    0.0   
116   LP116         1  49.0   NaN    3     0     1                    0.0   
117   LP117         1  40.0   NaN    1     0     1                    0.0   
118   LP119         1  82.0   NaN    3     0     1                    0.0   
119   LP120         1  49.0   NaN    3     0     1                    0.0   

    ADJUVANT_TREATMENT  TYPE_ADJUVANT RESIDUAL RESIDUALa OS_CNIO VITAL STAT

In [23]:
lp_no_subheaders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 1 to 119
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID_CNIO                119 non-null    object 
 1   HISTOLOGY              119 non-null    object 
 2   AGE                    119 non-null    float64
 3   GRADE                  58 non-null     object 
 4   FIGO                   119 non-null    object 
 5   FIGOL                  119 non-null    object 
 6   FIGOa                  119 non-null    object 
 7   NEOADJUVANT_TREATMENT  117 non-null    float64
 8   ADJUVANT_TREATMENT     115 non-null    object 
 9   TYPE_ADJUVANT          88 non-null     float64
 10  RESIDUAL               110 non-null    object 
 11  RESIDUALa              110 non-null    object 
 12  OS_CNIO                110 non-null    object 
 13  VITAL STATUS           112 non-null    object 
 14  MMR STATUS             118 non-null    float64
 15  BRCA S

In [24]:
# Create row of subheaders/coding
subheaders = ['LP: ID for samples from La Paz Hospital; OVE: ID for samples from Virgen del Rocio Hospital; MDA: ID for samples from MD Anderson Cancer Center  Hospital ; RVB: ID for samples from Red Valenciana de Biobancos', 
              '0=endometroid; 1=clear cells; 2=mixed; 3=others; NA=unknown', '2 digits; unit is YEARS',
             '1=well differentiated; 2=moderately differentiated; 3=poorly differentiated; NA=unknown',
             '1=IA; 2=IB; 3=IC; 4=I(NOS); 5=IIA; 6=IIB; 8=II(NOS); 9=IIIA; 10=IIIB; 11=IIIC; 12=III(NOS); 13=IV; NA=unknown',
             '0=localized (I,II); 1=advanced(III,IV); NA=unknown','1= IA, IB, IC, I(NOS); 2=IIA, IIB, II (NOS); 3=IIIA, IIIB, IIIC, III(NOS); 4= IV; NA=unknown',
             '0=NO; 1=YES; NA=unknown','0=NO; 1=YES; NA=unknown',
             '0=Carbo-paclitaxel; 1=Cis-paclitaxel; 2=Carbo-monotherapy; 3=Carbo-Taxol-Beva; 4=other; NA=unknown',
             '0=no macroscopic disease; 1=macroscopic disease<=1; 2=macroscopic disease>1; 3=macroscopic disease,unknown size; NA=unknown',
             '0=No residual disease; 1=Yes residual disease; NA=unknown','Unit is DAYS',
             '0=alive; 1=dead; LF=lost to follow-up; NA=unknown','0=not studied; 1=studied,without mutation; 2=mutation in MMR genes; 3=other genes mutated; NA=unknown',
             '0=not studied; 1=studied,without mutation; 2=mutation in BRCA1/2; 3=other genes mutated; NA=unknown','0=NO; 1=YES; NA=unknown',
             'D=diagnosis date used for OS; S=surgery date used for OS; LD=Lacking diagnosis/surgery/death_lastv date; NA=unknown'
             ]
df_subheaders = pd.DataFrame([subheaders], columns=lp_no_subheaders.columns)

# Concatenate the row of subheaders below the original DataFrame
df_with_subheaders = pd.concat([df_subheaders, lp_no_subheaders], ignore_index=True)
print("\nDataFrame con subheaders añadidos:")
print(df_with_subheaders)


DataFrame con subheaders añadidos:
                                               ID_CNIO  \
0    LP: ID for samples from La Paz Hospital; OVE: ...   
1                                                  LP6   
2                                                 LP13   
3                                                 LP19   
4                                                 LP22   
..                                                 ...   
115                                              LP115   
116                                              LP116   
117                                              LP117   
118                                              LP119   
119                                              LP120   

                                             HISTOLOGY  \
0    0=endometroid; 1=clear cells; 2=mixed; 3=other...   
1                                                    0   
2                                                    0   
3                                  

In [25]:
# Row with descriptions. I want it in the first row below the headers 
descriptions = {
    'ID_CNIO': 'Unique CNIO tumor identifier',
    'HISTOLOGY': 'Tumor histology/Tumor type',
    'AGE': 'Age at diagnosis. Calculated as (date of diagnosis – date of birth). When not provided, date of surgery was used instead of date of diagnosis (OVE series); a tag column was added to indicate this: “DATE_AGE”',
    'GRADE':'Tumor differentiation grade',
    'FIGO':'FIGO tumor stage',
    'FIGOL':'FIGO tumor stage according to cancer spread (localized or advance)',
    'FIGOa':'Aggregated tumor stage',
    'NEOADJUVANT_TREATMENT':'Information about whether the patient received neoadjuvant treatment',
    'ADJUVANT_TREATMENT':'Information about whether the patient received chemotherapy',
    'TYPE_ADJUVANT':'Type of adjuvant treatment received. Carboplatino+taxol is the standard',
    'RESIDUAL':'Residual disease after surgery',
    'RESIDUALa':'Aggregated categories of residual disease after surgery',
    'OS_CNIO':'Overall survival calculated by CNIO from original records as (date of death-date of diagnosis) or (date last known to be alive-date of diagnosis). When not provided, date of surgery was used instead of date of diagnosis. In this last case it is indicated in the tag column DATE USED FOR OS',
    'VITAL STATUS':'Vital status at last followup',
    'MMR STATUS':'Information about the GERMILINE mutations in genes related to mismatch repair pathway',
    'BRCA STATUS':'Information about GERMILINE mutation in BRCA1/2 genes',
    'FAMILIAL':'Familial antecedents of colon,endometrial and/or ovarian cancer. In OVE series this information refers to possible hereditary conditions according to the clinician',
    'DATE_OS':'Tag column to indicate how overall survival was calculated, using date of diagnosis or date of surgery (OVE series)'
}

# Convert the dictionary to a dataframe
desc_df = pd.DataFrame(descriptions, index=[0])

# Concatenate the descriptions row with the original dataframe l
df_final = pd.concat([desc_df, df_with_subheaders]).reset_index(drop=True)

# Show the final dataframe
print(df_final)  

                                               ID_CNIO  \
0                         Unique CNIO tumor identifier   
1    LP: ID for samples from La Paz Hospital; OVE: ...   
2                                                  LP6   
3                                                 LP13   
4                                                 LP19   
..                                                 ...   
116                                              LP115   
117                                              LP116   
118                                              LP117   
119                                              LP119   
120                                              LP120   

                                             HISTOLOGY  \
0                           Tumor histology/Tumor type   
1    0=endometroid; 1=clear cells; 2=mixed; 3=other...   
2                                                    0   
3                                                    0   
4            

In [26]:
# Separate the first two rows (subheaders and coding information)
header_rows = df_final.iloc[:2]

# Select the remaining rows (starting from row 2) as the data portion
data_rows = df_final.iloc[2:].copy()  # Use .copy() to avoid the SettingWithCopyWarning

# Extract the number after "LP" only from the data rows in the 'ID_CNIO' column
data_rows.loc[:, 'ID_number'] = data_rows['ID_CNIO'].str.extract(r'LP(\d+)')

# Convert the extracted values to numeric (integer), ignoring errors for invalid entries
data_rows.loc[:, 'ID_number'] = pd.to_numeric(data_rows['ID_number'], errors='coerce')

# Filter rows where 'ID_number' is not NaN (i.e., only keep rows with valid extracted numbers)
data_rows = data_rows[data_rows['ID_number'].notna()]

# Sort the DataFrame based on the extracted number in ascending order
data_rows = data_rows.sort_values(by='ID_number').reset_index(drop=True)

# Drop the auxiliary column 'ID_number' if it's no longer needed
data_rows = data_rows.drop(columns=['ID_number'])

# Recombine the subheader rows with the sorted data rows
df_final_sorted = pd.concat([header_rows, data_rows], ignore_index=True)

In [27]:
# Add original identifiers columns from the file passed by Maria
# Read identifiers file
identificadores_df = pd.read_excel("/home/vant/TFM/Identificadores OVE_LP_MDA_RVB_serie completa.xlsx",sheet_name=0)
print(identificadores_df.head())
print(identificadores_df.columns) # I realise there are additional spaces in ID CNIO header
identificadores_df.columns = identificadores_df.columns.str.strip()
print(identificadores_df.columns)

                 ID CNIO                                      ORIGINAL ID_AP  \
0  unique identifier CNIO  unique identifier from hospital, "Anatomía Pat...   
1                   07T25                                         8175-04/A4   
2                   08T94                          99B5119-8 (F.H. ALCORCÓN)   
3                   08T96                         04B0008135 (F.H. ALCORCÓN)   
4                  09T134                      B781787  (H,Gregorio Marañon)   

                               ORIGINAL ID_NHC_BBANK  \
0  unique identifier from Hospital (LP, OVE, MDA ...   
1                                       BTCNIO06/111   
2                                                NaN   
3                                                NaN   
4                                            B781787   

                   SUBTIPO HISTOLÓGICO  \
0  CC=clear cell ovarian carcinoma; E=   
1                                   CC   
2                                    E   
3             

In [28]:
# Add original identifiers columns from the file passed by Maria

print(identificadores_df.head())
# Rename 'ID CNIO' to match with 'ID_CNIO' in the DataFrame
identificadores_df.rename(columns={'ID CNIO': 'ID_CNIO'}, inplace=True)

# Merge using common column
df_final2 = df_final_sorted.merge(identificadores_df[['ID_CNIO', 'ORIGINAL ID_AP', 'ORIGINAL ID_NHC_BBANK']], 
                    on='ID_CNIO', 
                    how='left')

# Check
print(df_final2.head())


                  ID CNIO                                     ORIGINAL ID_AP  \
0  unique identifier CNIO  unique identifier from hospital, "Anatomía Pat...   
1                   07T25                                         8175-04/A4   
2                   08T94                          99B5119-8 (F.H. ALCORCÓN)   
3                   08T96                         04B0008135 (F.H. ALCORCÓN)   
4                  09T134                      B781787  (H,Gregorio Marañon)   

                               ORIGINAL ID_NHC_BBANK  \
0  unique identifier from Hospital (LP, OVE, MDA ...   
1                                       BTCNIO06/111   
2                                                NaN   
3                                                NaN   
4                                            B781787   

                   SUBTIPO HISTOLÓGICO  \
0  CC=clear cell ovarian carcinoma; E=   
1                                   CC   
2                                    E   
3             

In [29]:
# Add description and coding in rows with index 0 and 1
# Rename'ORIGINAL ID_AP' to 'ID_ORIGINAL' and ORIGINAL ID_NHC_BBANK to 'ID_ORIGINAL_NHC_BBANK'
df_final2.rename(columns={'ORIGINAL ID_AP': 'ID_ORIGINAL'}, inplace=True)
df_final2.rename(columns={'ORIGINAL ID_NHC_BBANK': 'ID_ORIGINAL_NHC_BBANK'}, inplace=True)
# Assign values to rows 0 and 1
df_final2.loc[0, 'ID_ORIGINAL'] = 'Unique identifier from the hospital to the patient/sample'
df_final2.loc[1, 'ID_ORIGINAL'] = 'Pathology code'
# Assign values to rows 0 and 1
df_final2.loc[0, 'ID_ORIGINAL_NHC_BBANK'] = 'Unique identifier from Hospital (LP, OVE, MDA series; NHC= number of clinical history) or Biobank (RVB series) to the patient/sample'
df_final2.loc[1, 'ID_ORIGINAL_NHC_BBANK'] = 'Unique alphanumeric code from the hospital or biobank.'

In [30]:
df_final2.to_excel('/home/vant/TFM/LP_final.xlsx', index=False)
print("Archivo creado con éxito.")

Archivo creado con éxito.
