In [31]:
### Unify the notation for the different clinical variables across all centers and hospitals in OVE samples ############

# I import the modules to work
import pandas as pd
import numpy as np
import re
import os
# I read the prefiltered file for OVE cohort
ove_path='/home/vant/TFM/OVE_bbdd_filtered1.xlsx'
ove=pd.read_excel(ove_path)
print(ove.head(7))
print(ove.info())
print("The length of OVE dataframe prefiltered is:",len(ove)) # There are 129 sampes, the row with index 0 that contains the subheaders
# is also counted so 130

  CÓDIGO CNIO                                        DIAGNÓSTICO  \
0     ID CNIO  CA ENDOMETRIOIDE; CA CELS CLARAS; CA CELS CLAR...   
1      OVE105                                          CA SEROSO   
2      OVE127                            TUMOR SEROSO BORDERLINE   
3      OVE128             TUMOR MUCINOSO BORDERLINE ENDOCERVICAL   
4      OVE107                                          CA SEROSO   
5      OVE108                                   CA ENDOMETRIOIDE   
6      OVE129             TUMOR MUCINOSO BORDERLINE ENDOCERVICAL   

                                           HISTOLOGY  \
0  0=endometroid,1=clear cells,2=mixed,3=others,N...   
1                                                  3   
2                                                  3   
3                                                  3   
4                                                  3   
5                                                  0   
6                                                  3   

      

In [32]:
ove_no_subheaders=ove.drop([0])
print(ove_no_subheaders.info())
print("Length of OVE without subheaders is:",len(ove_no_subheaders))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 1 to 129
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   CÓDIGO CNIO                  129 non-null    object        
 1   DIAGNÓSTICO                  129 non-null    object        
 2   HISTOLOGY                    129 non-null    object        
 3   HISTOLOGY_DETAIL             129 non-null    object        
 4   EDAD                         129 non-null    object        
 5   DATE_AGE                     114 non-null    object        
 6   GRADO OMS                    129 non-null    object        
 7   FIGO STAGE                   100 non-null    object        
 8   FIGOa                        100 non-null    object        
 9   FIGOL                        101 non-null    object        
 10  TRATAMIENTO NEOADYUVANTE     116 non-null    object        
 11  VOLUMEN TUMOR RESIDUAL (cm)  70 non-null     

In [33]:
# I eliminate column DIAGNÓSTICO,OS_CNIO and OS_CNIO_months because they have redundant information
ove_no_subheaders=ove_no_subheaders.drop(columns=['DIAGNÓSTICO','OS_CNIO','AGE'])
print(ove_no_subheaders.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 1 to 129
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   CÓDIGO CNIO                  129 non-null    object        
 1   HISTOLOGY                    129 non-null    object        
 2   HISTOLOGY_DETAIL             129 non-null    object        
 3   EDAD                         129 non-null    object        
 4   DATE_AGE                     114 non-null    object        
 5   GRADO OMS                    129 non-null    object        
 6   FIGO STAGE                   100 non-null    object        
 7   FIGOa                        100 non-null    object        
 8   FIGOL                        101 non-null    object        
 9   TRATAMIENTO NEOADYUVANTE     116 non-null    object        
 10  VOLUMEN TUMOR RESIDUAL (cm)  70 non-null     object        
 11  RESIDUALsD                   70 non-null     

In [34]:
# Firstly I rename the columns 
# Rename columns. I decide to use the OS column provided by the hospital because it is the most complete one, but it is coded 
# in  months so we have to convert the values to days
new_column_names = {
    'CÓDIGO CNIO': 'ID_CNIO',
    'EDAD':'AGE', #BE CAREFUL HERE BECAUSE IS AGE AT SURGERY. WE HAVE COLUMN DATE USED FOR AGE TO TAG THIS
    'GRADO OMS':'GRADE',
    'FIGO STAGE':'FIGO',
    'VOLUMEN TUMOR RESIDUAL (cm)': 'RESIDUAL',
    'RESIDUALsD':'RESIDUALa',
    'TRATAMIENTO NEOADYUVANTE':'NEOADJUVANT_TREATMENT',
    'TRATAMIENTO ADYUVANTE': 'ADJUVANT_TREATMENT',
    'First line regimen':'TYPE_ADJUVANT',
    'OS':'OS_CNIO',
    'OTROS.1':'OTHER_ADJ_TREAT',
    'DATE USED FOR OS':'DATE_OS',
    'POSIBLE HEREDITARIO':'FAMILIAL'
}
ove_no_subheaders.rename(columns=new_column_names,inplace=True)
ove_no_subheaders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 1 to 129
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ID_CNIO                129 non-null    object        
 1   HISTOLOGY              129 non-null    object        
 2   HISTOLOGY_DETAIL       129 non-null    object        
 3   AGE                    129 non-null    object        
 4   DATE_AGE               114 non-null    object        
 5   GRADE                  129 non-null    object        
 6   FIGO                   100 non-null    object        
 7   FIGOa                  100 non-null    object        
 8   FIGOL                  101 non-null    object        
 9   NEOADJUVANT_TREATMENT  116 non-null    object        
 10  RESIDUAL               70 non-null     object        
 11  RESIDUALa              70 non-null     object        
 12  ADJUVANT_TREATMENT     116 non-null    object        
 13  CARBO

In [35]:
# As I now all the data used for age comes from the Surgery date I set up all values to Surgery
ove_no_subheaders['DATE_AGE'] = 'S'
# In column TUMOR GRADE the mixed cancers have 2 values separated by a space, for example I II. For the moment we keep the values in 
# that way

In [36]:
# I change the RESIDUALs values nan to 'NA'
print(ove_no_subheaders['RESIDUAL'].head(20))
ove_no_subheaders['RESIDUAL'] = ove_no_subheaders['RESIDUAL'].fillna('NA')
print(ove_no_subheaders['RESIDUAL'].head(20))

1     NaN
2     NaN
3       0
4       0
5     NaN
6       1
7     NaN
8       0
9       0
10    NaN
11      0
12      2
13      0
14      0
15      0
16      0
17    NaN
18      0
19      0
20    NaN
Name: RESIDUAL, dtype: object
1     NA
2     NA
3      0
4      0
5     NA
6      1
7     NA
8      0
9      0
10    NA
11     0
12     2
13     0
14     0
15     0
16     0
17    NA
18     0
19     0
20    NA
Name: RESIDUAL, dtype: object


In [37]:
# Change GRADE column to latin numbers. They are in roman numbers
roman_to_int = {
    'I': '1',
    'II': '2',
    'III': '3'
}
print(ove_no_subheaders['GRADE'].head(10))
# Function to transform values in column GRADE
def transform_grade(value):
    if pd.isna(value):  # Keep NA
        return value
    # Remove spaces and split
    parts = value.split()
    # Convert each part using the dictionary
    transformed_parts = [roman_to_int.get(part, part) for part in parts]
    # Unify the components using ';'
    return ';'.join(transformed_parts)

# Apply 
ove_no_subheaders['GRADE'] = ove_no_subheaders['GRADE'].apply(transform_grade)

# Show
print(ove_no_subheaders['GRADE'].head(10))   

1     II
2      I
3      I
4     II
5      I
6      I
7     II
8      I
9      I
10     I
Name: GRADE, dtype: object
1     2
2     1
3     1
4     2
5     1
6     1
7     2
8     1
9     1
10    1
Name: GRADE, dtype: object


In [38]:
# Transform coding in RESIDUALs column
# Actual:0=no macroscopic disease, 1=1cm;2=2cm >1; <2=between 1 and 2 cm;  NA=unknown. (there are values like <1,<2,>2.5,>4,etc
# Now we mantain 0 and NA like that. We use a function that evaluates if the number after > or < is greater than 1. If it is <1 or =1 
# assigns 1 while if it is >1 or > number (being this number greater than 1) assigns 2. 
# New: 0=no macroscopic disease, 1=macroscopic disease <=1 cm; 2=macroscopic disease >1; 3=macroscopic disease, size unknown; NA=unknown
# value[1:] takes the second character in the string after > or < simbols and if it is a digit evaluates if it is >1 or <=1 in order to
# assign the proper category

print(ove_no_subheaders['RESIDUAL'].head(45))

import pandas as pd

def simplify_residuals(value):
    # Convert to string
    if not isinstance(value, str):
        value = str(value)
    
    # Manage special values
    if value == 'NA':
        return 'NA'
    elif value == '0':
        return 0
    elif value == '1':
        return 1
    elif value=='2':
        return 2
    elif value.startswith('<'):
        # Extract number after '<'
        number_part = value[1:].strip()
        try:
            number = float(number_part)
            if number <= 1:
                return 1
            else:
                return 2
        except ValueError:
            return 'NA'
    elif value.startswith('>'):
        # Extract number after '>'
        number_part = value[1:].strip()
        try:
            number = float(number_part)
            if number > 1:
                return 2
            else:
                return 1
        except ValueError:
            return 'NA'
    # NA if not matches with the other situations
    return 'NA'

# Apply
ove_no_subheaders['RESIDUAL'] = ove_no_subheaders['RESIDUAL'].apply(simplify_residuals)

# Show
print(ove_no_subheaders['RESIDUAL'].head(45))

1     NA
2     NA
3      0
4      0
5     NA
6      1
7     NA
8      0
9      0
10    NA
11     0
12     2
13     0
14     0
15     0
16     0
17    NA
18     0
19     0
20    NA
21     0
22     0
23     0
24    <2
25     0
26    NA
27    NA
28    NA
29     0
30     2
31     2
32    NA
33    <2
34     0
35    NA
36    NA
37     0
38     0
39     0
40     2
41     0
42     2
43    <2
44     0
45     0
Name: RESIDUAL, dtype: object
1     NA
2     NA
3      0
4      0
5     NA
6      1
7     NA
8      0
9      0
10    NA
11     0
12     2
13     0
14     0
15     0
16     0
17    NA
18     0
19     0
20    NA
21     0
22     0
23     0
24     2
25     0
26    NA
27    NA
28    NA
29     0
30     2
31     2
32    NA
33     2
34     0
35    NA
36    NA
37     0
38     0
39     0
40     2
41     0
42     2
43     2
44     0
45     0
Name: RESIDUAL, dtype: object


In [39]:
# Normalize OTHER_ADJ_TREAT for Carboplatino using 'carboplat' term for all of them
# Firstly I convert all nan data to 'NA' to avoid problems and make it easier the recognition in the functions
ove_no_subheaders['OTHER_ADJ_TREAT'] = ove_no_subheaders['OTHER_ADJ_TREAT'].fillna('NA')
print(ove_no_subheaders['OTHER_ADJ_TREAT'].head(40))
def normalize_other_adj_treat(value):
    if pd.isna(value):
        return 'NA'
    # Use regular expressions to find variant of 'carboplatino'term
    value = re.sub(r'\bcarb(oplat|p|oploatino|oplt|oplatino)\b', 'carboplat', value, flags=re.IGNORECASE)
    return value
ove_no_subheaders['OTHER_ADJ_TREAT'] = ove_no_subheaders['OTHER_ADJ_TREAT'].apply(normalize_other_adj_treat)
print(ove_no_subheaders['OTHER_ADJ_TREAT'].head(40))

1                                  NA
2                                  NA
3                                  NA
4                                  NA
5                                  NA
6                 docetaxel-carboplat
7                                  NA
8                                  NA
9                                  NA
10                                 NA
11                                 NA
12                                 NA
13                                 NA
14              gencitabina-carboplat
15                                 NA
16                                 NA
17                                 NA
18                                 NA
19                                 NA
20                                 NA
21                                 NA
22                                 NA
23                                 NA
24    carbp-caelyx, carbp-gencitabina
25     gencitabina y caelyx-carboplat
26                                 NA
27          

In [40]:
# Function to determine new coding values for TYPE_ADJUVANT
# Current coding: 
#   - 0 = NO
#   - 1 = YES for CARBOPLATINO.1, CARBOPLATINO+TAXOL.1, CISPLATINO+TAXOL.1
# New unified coding:
#   - 0 = Carbo-paclitaxel (CARBOPLATINO+TAXOL.1)
#   - 1 = Cis-paclitaxel (CISPLATINO+TAXOL.1)
#   - 2 = Carbo-monotherapy (CARBOPLATINO.1 only)
#   - 3 = Carbo-Taxol-Beva (not defined explicitly in this logic)
#   - 4 = Other treatments (based on OTHER_ADJ_TREAT column)
#   - NA = Unknown treatment
#
# In some cases:
# - If CARBOPLATINO.1 = 1, CARBOPLATINO+TAXOL.1 = 0, and OTHER_ADJ_TREAT is empty, the function returns 2 (Carbo-monotherapy).
# - If CARBOPLATINO+TAXOL.1 = 1 but there is data in OTHER_ADJ_TREAT, it returns 1.
# - If CARBOPLATINO+TAXOL.1 = 0 but there is data in OTHER_ADJ_TREAT, it returns 4.
# - For cases with no treatment and OTHER_ADJ_TREAT = "NA", it returns NA.

# Firstly I convert to NA all the empty cells of OTHER_ADJ_TREAT, if not issues appear
def determine_type_adjuvant(row):
    if row['CARBOPLATINO+TAXOL.1'] == 1:
        return 0
    elif row['CISPLATINO+TAXOL.1'] == 1:
        return 1
    elif row['CARBOPLATINO.1'] == 1 and row['CARBOPLATINO+TAXOL.1'] == 0:
        if not row['OTHER_ADJ_TREAT']:
            return 2
        elif row['OTHER_ADJ_TREAT'] != 'NA':
            return 4
    elif row['OTHER_ADJ_TREAT'] != 'NA' and row['CARBOPLATINO+TAXOL.1'] == 0:
        return 4
    elif (row['CARBOPLATINO.1'] == 0 and 
          row['CARBOPLATINO+TAXOL.1'] == 0 and 
          row['CISPLATINO+TAXOL.1'] == 0 and 
          row['OTHER_ADJ_TREAT'] == 'NA'):
        return 'NA'
    else:
        return ''

# Function to add -carboplat to the OTHER_ADJ_TREAT when CARBOPLATINO.1=1 and the rest are 0 and there are not empty data in OTHER_ADJ_TREAT
# but the chain carboplat is not present.
def modify_other_adj_treat(row):
    if row['CARBOPLATINO.1'] == 1 and row['CARBOPLATINO+TAXOL.1'] == 0 and row['CISPLATINO+TAXOL.1']==0 and row['OTHER_ADJ_TREAT']!='NA':
        if 'carboplat' not in row['OTHER_ADJ_TREAT']:
        # Modify OTHER_ADJ_TREAT to include '-carboplat'
            return row['OTHER_ADJ_TREAT'] + '-carboplat'
    return row['OTHER_ADJ_TREAT']
        
print("Before\n:",ove_no_subheaders[['CARBOPLATINO.1','CARBOPLATINO+TAXOL.1','CISPLATINO+TAXOL.1','OTHER_ADJ_TREAT']].head(40))
# Apply
ove_no_subheaders['TYPE_ADJUVANT'] = ove_no_subheaders.apply(determine_type_adjuvant, axis=1)
ove_no_subheaders['OTHER_ADJ_TREAT'] = ove_no_subheaders.apply(modify_other_adj_treat, axis=1)
# Show
print("Later\n:",ove_no_subheaders[['CARBOPLATINO.1','CARBOPLATINO+TAXOL.1','CISPLATINO+TAXOL.1','OTHER_ADJ_TREAT','TYPE_ADJUVANT']].head(40))

Before
:    CARBOPLATINO.1 CARBOPLATINO+TAXOL.1 CISPLATINO+TAXOL.1  \
1             NaN                  NaN                NaN   
2               0                    1                  0   
3               0                    0                  0   
4               0                    0                  0   
5               0                    1                  0   
6               0                    1                  0   
7               1                    1                  0   
8               0                    1                  0   
9               0                    1                  0   
10              0                    0                  0   
11              0                    0                  0   
12              0                    0                  0   
13              0                    1                  0   
14              0                    0                  0   
15              0                    1                  0   
16             

In [41]:
ove_no_subheaders.to_excel("Prueba1.xlsx",index=False)

In [42]:
# Once we have obtained the values of TYPE_ADJUVANT' column we can eliminate the other columns:
ove_no_subheaders=ove_no_subheaders.drop(columns=['CARBOPLATINO.1','CARBOPLATINO+TAXOL.1','CISPLATINO+TAXOL.1'])
print(ove_no_subheaders.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 1 to 129
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ID_CNIO                129 non-null    object        
 1   HISTOLOGY              129 non-null    object        
 2   HISTOLOGY_DETAIL       129 non-null    object        
 3   AGE                    129 non-null    object        
 4   DATE_AGE               129 non-null    object        
 5   GRADE                  129 non-null    object        
 6   FIGO                   100 non-null    object        
 7   FIGOa                  100 non-null    object        
 8   FIGOL                  101 non-null    object        
 9   NEOADJUVANT_TREATMENT  116 non-null    object        
 10  RESIDUAL               129 non-null    object        
 11  RESIDUALa              70 non-null     object        
 12  ADJUVANT_TREATMENT     116 non-null    object        
 13  OTHER

In [43]:
# Now I have to transform the data in OS_CNIO to days (it is in months)
ove_no_subheaders['OS_CNIO'] = pd.to_numeric(ove_no_subheaders['OS_CNIO'], errors='coerce')
# I prove to do the same but with the column calculated by us (decimals) in order to compare the results
ove_no_subheaders['OS_CNIO_months'] = pd.to_numeric(ove_no_subheaders['OS_CNIO_months'], errors='coerce')
# 30.44 days by month
ove_no_subheaders['OS_CNIO_DAYS'] = ove_no_subheaders['OS_CNIO'] * 30.44
ove_no_subheaders['OS_CNIO_DAYS'] = ove_no_subheaders['OS_CNIO_DAYS'].round()
ove_no_subheaders['OS_CNIO_DAYS_1'] = ove_no_subheaders['OS_CNIO_months'] * 30.44
ove_no_subheaders['OS_CNIO_DAYS_1'] = ove_no_subheaders['OS_CNIO_DAYS_1'].round()
# Show both columns
print(ove_no_subheaders[['OS_CNIO','OS_CNIO_DAYS','OS_CNIO_DAYS_1']].head(50))

    OS_CNIO  OS_CNIO_DAYS  OS_CNIO_DAYS_1
1       NaN           NaN             NaN
2      26.0         791.0           797.0
3      19.0         578.0           581.0
4       1.0          30.0            31.0
5      21.0         639.0           623.0
6      14.0         426.0           404.0
7      37.0        1126.0          1121.0
8      36.0        1096.0          1089.0
9      32.0         974.0           973.0
10     30.0         913.0           896.0
11     33.0        1005.0           977.0
12      1.0          30.0            20.0
13     18.0         548.0           552.0
14     15.0         457.0             NaN
15     27.0         822.0           826.0
16     27.0         822.0           837.0
17     24.0         731.0           715.0
18     45.0        1370.0          1380.0
19     22.0         670.0           677.0
20     39.0        1187.0          1205.0
21     31.0         944.0           987.0
22     47.0        1431.0          1462.0
23     47.0        1431.0         

In [44]:
# Update'OS_CNIO_DAYS_1' with 'OS_CNIO_DAYS' values when 'OS_CNIO_DAYS_1' is NaN
ove_no_subheaders['OS_CNIO_DAYS_1'] = ove_no_subheaders.apply(
    lambda row: row['OS_CNIO_DAYS'] if pd.notna(row['OS_CNIO_DAYS']) and pd.isna(row['OS_CNIO_DAYS_1']) else row['OS_CNIO_DAYS_1'],
    axis=1
)
print(ove_no_subheaders[['OS_CNIO','OS_CNIO_DAYS','OS_CNIO_DAYS_1']].head(50))

    OS_CNIO  OS_CNIO_DAYS  OS_CNIO_DAYS_1
1       NaN           NaN             NaN
2      26.0         791.0           797.0
3      19.0         578.0           581.0
4       1.0          30.0            31.0
5      21.0         639.0           623.0
6      14.0         426.0           404.0
7      37.0        1126.0          1121.0
8      36.0        1096.0          1089.0
9      32.0         974.0           973.0
10     30.0         913.0           896.0
11     33.0        1005.0           977.0
12      1.0          30.0            20.0
13     18.0         548.0           552.0
14     15.0         457.0           457.0
15     27.0         822.0           826.0
16     27.0         822.0           837.0
17     24.0         731.0           715.0
18     45.0        1370.0          1380.0
19     22.0         670.0           677.0
20     39.0        1187.0          1205.0
21     31.0         944.0           987.0
22     47.0        1431.0          1462.0
23     47.0        1431.0         

In [45]:
ove_no_subheaders=ove_no_subheaders.drop(columns=['OS_CNIO','OS_CNIO_DAYS','OS_CNIO_months'])
ove_no_subheaders.rename(columns={'OS_CNIO_DAYS_1': 'OS_CNIO'}, inplace=True)
print(ove_no_subheaders.info())
print(ove_no_subheaders['OS_CNIO'].head(40))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 1 to 129
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ID_CNIO                129 non-null    object        
 1   HISTOLOGY              129 non-null    object        
 2   HISTOLOGY_DETAIL       129 non-null    object        
 3   AGE                    129 non-null    object        
 4   DATE_AGE               129 non-null    object        
 5   GRADE                  129 non-null    object        
 6   FIGO                   100 non-null    object        
 7   FIGOa                  100 non-null    object        
 8   FIGOL                  101 non-null    object        
 9   NEOADJUVANT_TREATMENT  116 non-null    object        
 10  RESIDUAL               129 non-null    object        
 11  RESIDUALa              70 non-null     object        
 12  ADJUVANT_TREATMENT     116 non-null    object        
 13  OTHER

In [46]:
# I see that in the column 'DATE FOR OS' some data are lacking for OS, but in this column Diagnosis is the value
# I change this for the value 'Lacking diagnosis/death_lastv date'
ove_no_subheaders['DATE_OS'] = ove_no_subheaders.apply(
    lambda row: 'LD' if pd.isna(row['OS_CNIO']) else row['DATE_OS'],
    axis=1
)

print("\nDataFrame updated:")
print(ove_no_subheaders[['OS_CNIO','DATE_OS']].head(50))


DataFrame updated:
    OS_CNIO DATE_OS
1       NaN      LD
2     797.0       S
3     581.0       S
4      31.0       S
5     623.0       S
6     404.0       S
7    1121.0       S
8    1089.0       S
9     973.0       S
10    896.0       S
11    977.0       S
12     20.0       S
13    552.0       S
14    457.0     NaN
15    826.0       S
16    837.0       S
17    715.0       S
18   1380.0       S
19    677.0       S
20   1205.0       S
21    987.0       S
22   1462.0       S
23   1450.0       S
24   1456.0       S
25   1168.0       S
26    784.0       S
27    749.0       S
28    166.0       S
29   1315.0       S
30    686.0       S
31    481.0       S
32      4.0       S
33   1267.0       S
34   1301.0       S
35   1112.0       S
36   1225.0       S
37   1232.0       S
38   1788.0       S
39   1851.0       S
40      NaN      LD
41   1667.0       S
42    153.0       S
43   1217.0       S
44   1750.0       S
45    518.0       S
46    946.0       S
47   1624.0       S
48   1419.0       S


In [47]:
# Eliminate spaces before and after the term PS. Before that I convert all values to string type. If I don´t do that 0 and 1 will be
# empty cells
ove_no_subheaders['VITAL STATUS'] = ove_no_subheaders['VITAL STATUS'].astype(str).str.strip()
print (ove_no_subheaders['VITAL STATUS'])

# Ensure recognition of na values
ove_no_subheaders['VITAL STATUS'] = ove_no_subheaders['VITAL STATUS'].replace('nan', np.nan)

# Fill na with 'NA'
ove_no_subheaders['VITAL STATUS'] = ove_no_subheaders['VITAL STATUS'].fillna('NA')

# Verify
print(ove_no_subheaders['VITAL STATUS'])


1      nan
2        0
3        0
4        0
5        0
      ... 
125    nan
126    nan
127    nan
128    nan
129    nan
Name: VITAL STATUS, Length: 129, dtype: object
1      NA
2       0
3       0
4       0
5       0
       ..
125    NA
126    NA
127    NA
128    NA
129    NA
Name: VITAL STATUS, Length: 129, dtype: object


In [48]:
print(ove_no_subheaders.info()) # In OVE there are not partial dates

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 1 to 129
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ID_CNIO                129 non-null    object        
 1   HISTOLOGY              129 non-null    object        
 2   HISTOLOGY_DETAIL       129 non-null    object        
 3   AGE                    129 non-null    object        
 4   DATE_AGE               129 non-null    object        
 5   GRADE                  129 non-null    object        
 6   FIGO                   100 non-null    object        
 7   FIGOa                  100 non-null    object        
 8   FIGOL                  101 non-null    object        
 9   NEOADJUVANT_TREATMENT  116 non-null    object        
 10  RESIDUAL               129 non-null    object        
 11  RESIDUALa              70 non-null     object        
 12  ADJUVANT_TREATMENT     116 non-null    object        
 13  OTHER

In [49]:
# Order the columns in the definitive order
# New order
new_order = [
    'ID_CNIO', 'HISTOLOGY', 'HISTOLOGY_DETAIL','AGE', 'DATE_AGE','GRADE', 'FIGO',
    'FIGOL', 'FIGOa', 'NEOADJUVANT_TREATMENT','ADJUVANT_TREATMENT', 'TYPE_ADJUVANT','OTHER_ADJ_TREAT',
    'RESIDUAL', 'RESIDUALa', 'OS_CNIO', 'VITAL STATUS', 'FAMILIAL','DATE_OS'
]

# Reorder
ove_no_subheaders = ove_no_subheaders[new_order]
print(ove_no_subheaders.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 1 to 129
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID_CNIO                129 non-null    object 
 1   HISTOLOGY              129 non-null    object 
 2   HISTOLOGY_DETAIL       129 non-null    object 
 3   AGE                    129 non-null    object 
 4   DATE_AGE               129 non-null    object 
 5   GRADE                  129 non-null    object 
 6   FIGO                   100 non-null    object 
 7   FIGOL                  101 non-null    object 
 8   FIGOa                  100 non-null    object 
 9   NEOADJUVANT_TREATMENT  116 non-null    object 
 10  ADJUVANT_TREATMENT     116 non-null    object 
 11  TYPE_ADJUVANT          128 non-null    object 
 12  OTHER_ADJ_TREAT        129 non-null    object 
 13  RESIDUAL               129 non-null    object 
 14  RESIDUALa              70 non-null     object 
 15  OS_CNI

In [50]:
# Create row of subheaders.
# Note that AGE AT DIAGNOSIS in this case is like age at surgery although the subheader indicates other thing
subheaders = ['LP: ID for samples from La Paz Hospital; OVE: ID for samples from Virgen del Rocio Hospital; MDA: ID for samples from MD Anderson Cancer Center  Hospital ; RVB: ID for samples from Red Valenciana de Biobancos', 
            '0=endometroid; 1=clear cells;2=mixed; 3=others; NA=unknown', 'type of mixed cancer; no mixed; NA=unknown',
             '2 digits; unit is YEARS','D=diagnosis date used for calculating age; S=surgery date used for calculating age; NA=unknown','1=well differentiated; 2=moderately differentiated; 3=poorly differentiated; NA=unknown',
             '1=IA; 2=IB; 3=IC; 4=I(NOS); 5=IIA; 6=IIB; 8=II(NOS); 9=IIIA; 10=IIIB, 11=IIIC; 12=III(NOS); 13=IV; NA=unknown',
             '0=localized (I,II); 1=advanced(III,IV); NA=unknown','1= IA, IB, IC, I(NOS); 2=IIA, IIB, II (NOS); 3=IIIA, IIIB, IIIC, III(NOS); 4= IV; NA=unknown',
             '0=NO; 1=YES; NA=unknown','0=NO; 1=YES; NA=unknown',
             '0=Carbo-paclitaxel; 1=Cis-paclitaxel; 2=Carbo-monotherapy; 3=Carbo-Taxol-Beva; 4=other; NA=unknown','Drugs used as adjuvant treatment',
             '0=no macroscopic disease; 1=macroscopic disease<=1; 2=macroscopic disease>1; 3=macroscopic disease,unknown size; NA=unknown',
             '0=No residual disease; 1=Yes residual disease; NA=unknown','Unit is DAYS',
             '0=alive; 1=dead; LF=lost to follow-up; NA=unknown','0=NO; 1=YES; NA=unknown',
             'D=diagnosis date used for OS; S=surgery date used for OS; LD:Lacking diagnosis/surgery/death_lastv date; NA=unknown'
             ]
df_subheaders = pd.DataFrame([subheaders], columns=ove_no_subheaders.columns)

# Concatenate the row of subheaders below the original DataFrame
df_with_subheaders = pd.concat([df_subheaders, ove_no_subheaders], ignore_index=True)
print("\nDataFrame con subheaders añadidos:")
print(df_with_subheaders)


DataFrame con subheaders añadidos:
                                               ID_CNIO  \
0    LP: ID for samples from La Paz Hospital; OVE: ...   
1                                               OVE105   
2                                               OVE127   
3                                               OVE128   
4                                               OVE107   
..                                                 ...   
125                                             OVE106   
126                                              OVE95   
127                                              OVE22   
128                                               OVE2   
129                                              OVE11   

                                             HISTOLOGY  \
0    0=endometroid; 1=clear cells;2=mixed; 3=others...   
1                                                    3   
2                                                    3   
3                                  

In [51]:
# Row with descriptions- I want it in the first row below the headers (lista o diccionario)
descriptions = {
    'ID_CNIO': 'Unique CNIO tumor identifier',
    'HISTOLOGY': 'Tumor histology/Tumor type',
    'HISTOLOGY_DETAIL':'Tumor type based on the present cells: mixed (type of mixed in that case), no mixed',
    'AGE': 'Age at diagnosis calculated as (date of diagnosis – date of birth). When not provided, date of surgery was used instead of date of diagnosis (case of OVE series)',
    'DATE_AGE':'Tag column to indicate if the date used for calculating age at diagnosis was the diagnosis date or the surgery date',
    'GRADE':'Tumor differentiation grade',
    'FIGO':'FIGO tumor stage',
    'FIGOL':'FIGO tumor stage according to cancer spread (localized or advanced)',
    'FIGOa':'Aggregated tumor stage',
    'NEOADJUVANT_TREATMENT':'Information about whether the patient received neoadjuvant treatment',
    'ADJUVANT_TREATMENT':'Information about whether the patient received chemotherapy',
    'TYPE_ADJUVANT':'Type of adjuvant treatment received. Carboplatino+taxol is the standard',
    'OTHER_ADJ_TREAT':'Other type of adjuvant treatment received by the patient different from the standard, carboplatino+taxol',
    'RESIDUAL':'Residual disease after surgery',
    'RESIDUALa':'Aggregated categories of residual disease after surgery',
    'OS_CNIO':'Overall survival calculated by CNIO from original records as (date of death-date of diagnosis) or (date last known to be alive-date of diagnosis). When not provided, date of surgery was used instead of date of diagnosis. In this last case it is indicated in the tag column DATE USED FOR OS',
    'VITAL STATUS':'Vital status at last followup',
    'FAMILIAL':' Familial antecedents of colon,endometrial and/or ovarian cancer. In OVE series this information refers to possible hereditary conditions according to the clinician',
    'DATE_OS':'Tag column to indicate how overall survival was calculated, using date of diagnosis or date of surgery (OVE series)'
}

# Convert the dictionary into a dataframe
desc_df = pd.DataFrame(descriptions, index=[0])

# Concatenate
df_final = pd.concat([desc_df, df_with_subheaders]).reset_index(drop=True)

# Show
print(df_final)  

                                               ID_CNIO  \
0                         Unique CNIO tumor identifier   
1    LP: ID for samples from La Paz Hospital; OVE: ...   
2                                               OVE105   
3                                               OVE127   
4                                               OVE128   
..                                                 ...   
126                                             OVE106   
127                                              OVE95   
128                                              OVE22   
129                                               OVE2   
130                                              OVE11   

                                             HISTOLOGY  \
0                           Tumor histology/Tumor type   
1    0=endometroid; 1=clear cells;2=mixed; 3=others...   
2                                                    3   
3                                                    3   
4            

In [52]:
# Order the samples
import re

# Separate the first two rows (subheaders and coding)
header_rows = df_final.iloc[:2]

# The rest of the DataFrame (starting from row 2)
data_rows = df_final.iloc[2:].copy()  # Use .copy() to avoid SettingWithCopyWarning

# Extract the number after "OVE" only for the data rows. In the original database OVE19 appears as 0VE19 
# and it is not recognized. I will modify this in the original files.
data_rows.loc[:, 'ID_number'] = data_rows['ID_CNIO'].str.extract(r'OVE(\d+)')

# Convert the extracted values to numbers, ignoring errors
data_rows.loc[:, 'ID_number'] = pd.to_numeric(data_rows['ID_number'], errors='coerce')

# Filter rows where 'ID_number' is not NaN
data_rows = data_rows[data_rows['ID_number'].notna()]

# Sort the DataFrame by the extracted number
data_rows = data_rows.sort_values(by='ID_number').reset_index(drop=True)

# Drop the auxiliary column if it's no longer needed
data_rows = data_rows.drop(columns=['ID_number'])

# Recombine the subheader rows with the sorted data rows
df_final_sorted = pd.concat([header_rows, data_rows], ignore_index=True)


In [53]:
# Add original identifiers columns from the file passed by Maria
# I identify a problem: in the summary sheet some OVE samples are not, but they are in the specific OVE sheet (sheet 3), but
# the columns have other names so I have to read that sheet, recover that columns and then reassign the same name of the columns
# Read identifiers file
identificadores_df = pd.read_excel("/home/vant/TFM/Identificadores OVE_LP_MDA_RVB_serie completa.xlsx", sheet_name='identificadores OVE')
print(identificadores_df.info())
# As I can see there are 2 columns, but I have to use the second one because contain all complete cases

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 6 columns):
 #   Column                                                                                                                     Non-Null Count  Dtype  
---  ------                                                                                                                     --------------  -----  
 0   CÓDIGO CNIO                                                                                                                130 non-null    object 
 1   CÓDIGO CNIO.1                                                                                                              130 non-null    object 
 2   Nº HISTORIA                                                                                                                129 non-null    float64
 3   CASO (AP)                                                                                                                  129 non-

In [54]:
# Add original identifiers columns from the file passed by Maria
# I identify a problem: in the summary sheet some OVE samples are not, but they are in the specific OVE sheet (sheet 3), but
# the columns have other names so I have to read that sheet, recover that columns and then reassign the same name of the columns

# Read identifiers file
identificadores_df = pd.read_excel("/home/vant/TFM/Identificadores OVE_LP_MDA_RVB_serie completa.xlsx", sheet_name='identificadores OVE')

# Rename 'CÓDIGO CNIO' column to match with 'ID_CNIO' of df_final_sorted
identificadores_df.rename(columns={'CÓDIGO CNIO.1': 'ID_CNIO'}, inplace=True)

# Merge
df_final_sorted = df_final_sorted.merge(identificadores_df[['ID_CNIO', 'Nº HISTORIA', 'CASO (AP)']], 
                                        on='ID_CNIO', 
                                        how='left')

# Check
print(df_final_sorted.head())

                                             ID_CNIO  \
0                       Unique CNIO tumor identifier   
1  LP: ID for samples from La Paz Hospital; OVE: ...   
2                                               OVE1   
3                                               OVE2   
4                                      08T402 / OVE3   

                                           HISTOLOGY  \
0                         Tumor histology/Tumor type   
1  0=endometroid; 1=clear cells;2=mixed; 3=others...   
2                                                  3   
3                                                  1   
4                                                  3   

                                    HISTOLOGY_DETAIL  \
0  Tumor type based on the present cells: mixed (...   
1         type of mixed cancer; no mixed; NA=unknown   
2                                           No mixed   
3                                           No mixed   
4                                           No

In [55]:
# Add description and coding in rows with index 0 and 1
# Rename'ORIGINAL ID_AP' to 'ID_ORIGINAL' and ORIGINAL ID_NHC_BBANK to 'ID_ORIGINAL_NHC_BBANK'

df_final_sorted.rename(columns={'CASO (AP)': 'ID_ORIGINAL'}, inplace=True)
df_final_sorted.rename(columns={'Nº HISTORIA': 'ID_ORIGINAL_NHC_BBANK'}, inplace=True)
df_final_sorted['ID_ORIGINAL_NHC_BBANK'] = df_final_sorted['ID_ORIGINAL_NHC_BBANK'].astype('object')
# Assign values to rows 0 and 1
df_final_sorted.loc[0, 'ID_ORIGINAL'] = 'Unique identifier from the hospital to the patient/sample'
df_final_sorted.loc[1, 'ID_ORIGINAL'] = 'Pathology code'
# Assign values to rows 0 and 1
df_final_sorted.loc[0, 'ID_ORIGINAL_NHC_BBANK'] = 'Unique identifier from Hospital (LP, OVE, MDA series; NHC= number of clinical history) or Biobank (RVB series) to the patient/sample'
df_final_sorted.loc[1, 'ID_ORIGINAL_NHC_BBANK'] = 'Unique alphanumeric code from the hospital or biobank'

In [56]:
print(df_final_sorted.head())

                                             ID_CNIO  \
0                       Unique CNIO tumor identifier   
1  LP: ID for samples from La Paz Hospital; OVE: ...   
2                                               OVE1   
3                                               OVE2   
4                                      08T402 / OVE3   

                                           HISTOLOGY  \
0                         Tumor histology/Tumor type   
1  0=endometroid; 1=clear cells;2=mixed; 3=others...   
2                                                  3   
3                                                  1   
4                                                  3   

                                    HISTOLOGY_DETAIL  \
0  Tumor type based on the present cells: mixed (...   
1         type of mixed cancer; no mixed; NA=unknown   
2                                           No mixed   
3                                           No mixed   
4                                           No

In [57]:
# Generate the final excel file
df_final_sorted.to_excel("/home/vant/TFM/OVE_final.xlsx",index=False)
print("File succesfully generated")

File succesfully generated
