In [2]:
### COMBINE ALL TYPE OF DATA IN A FINAL DATABASE AND OTHER DATABASES WITH SPECIFIC INFORMATION ############

## Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
### Read and combine the final clinical databases
# Clinical workbooks to combine
files = ['LP_final.xlsx', 'MDA_final.xlsx', 'OVE_final.xlsx', 'RVB_final.xlsx']
# The rows labelled 0 and 1 hold the description and coding sub-headers. They are not the same in
# every workbook, so they are dropped here and are meant to be created again once all columns are
# in their final order in the definitive file.
dataframes = [pd.read_excel(file).drop([0, 1]) for file in files]

# Stack all workbooks into one table with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)
# Flag every row that comes from a clinical workbook
combined_df['Clinical_info'] = 'YES'
# DataFrame.info() prints its own report and returns None, so it is called directly;
# wrapping it in print() only appends a stray "None" line.
combined_df.info()  # 309 samples

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ID_CNIO                   309 non-null    object
 1   HISTOLOGY                 306 non-null    object
 2   AGE                       309 non-null    object
 3   GRADE                     243 non-null    object
 4   FIGO                      275 non-null    object
 5   FIGOL                     276 non-null    object
 6   FIGOa                     275 non-null    object
 7   NEOADJUVANT_TREATMENT     233 non-null    object
 8   ADJUVANT_TREATMENT        279 non-null    object
 9   TYPE_ADJUVANT             197 non-null    object
 10  RESIDUAL                  224 non-null    object
 11  RESIDUALa                 224 non-null    object
 12  OS_CNIO                   284 non-null    object
 13  VITAL STATUS              289 non-null    object
 14  MMR STATUS                

In [3]:
# Add columns with genomic/immunogenic/IHC/PCR data from Genomic_tils_final_database.xlsx
# 'TILs counts' column: the YES values tell us how many samples have TILs data, and
# 'FINAL SAMPLES (Diego)' tells us how many samples have genomic information that passes the QC filters
# Step 1: Read the data
df = pd.read_excel('/home/vant/TFM/Final_db/Genomic_tils_final_database.xlsx')
# Step 2: Inspect it. info() prints by itself and returns None, so it is not wrapped in print().
df.info()  # 235 samples

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID_CNIO                          235 non-null    object 
 1   AP number                        204 non-null    object 
 2   HIST                             205 non-null    object 
 3   IHC                              232 non-null    object 
 4   IHC pattern                      232 non-null    object 
 5   IHC_MMR_class                    232 non-null    object 
 6   MSI (PCR)                        198 non-null    object 
 7   Evidence score for MMRd          31 non-null     float64
 8   IHC (Diego)                      171 non-null    object 
 9   PCR (Diego)                      172 non-null    object 
 10  PASS QC > 2M reads   (Diego)     205 non-null    object 
 11  FINAL SAMPLES (Diego)            235 non-null    object 
 12  CLASS ACCORDING IHC/PC

In [4]:
# Remove surrounding whitespace from the column names of both tables
df.columns = df.columns.str.strip()
combined_df.columns = combined_df.columns.str.strip()

# Step 1: ID_CNIO values shared by both DataFrames (inner join on the ID column)
common_ids = combined_df[['ID_CNIO']].merge(df[['ID_CNIO']], how='inner', on='ID_CNIO')['ID_CNIO']

# Step 2: ID_CNIO values found in combined_df (clinical database) but not in df
# (genomic + gold standard + TILs database)
in_genomic = combined_df['ID_CNIO'].isin(df['ID_CNIO'])
only_in_combined_df = combined_df.loc[~in_genomic, 'ID_CNIO']

# Step 3: ID_CNIO values found in df (genomic + GS + TILs database) but not in combined_df
# (clinical database)
in_clinical = df['ID_CNIO'].isin(combined_df['ID_CNIO'])
only_in_df = df.loc[~in_clinical, 'ID_CNIO']

# Step 4: Report the three groups
print(f"Number of samples with same ID_CNIO values in both DataFrames: {len(common_ids)}")
print(f"Samples in both DataFrames:\n{common_ids.to_list()}")

print(f"Number of samples only in combined_df: {len(only_in_combined_df)}")
print(f"Samples only in combined_df-clinical data:\n{only_in_combined_df.to_list()}")

print(f"Number of samples only in df: {len(only_in_df)}")
print(f"Samples only in df-genomic/tils/IHC/PCR:\n{only_in_df.to_list()}")

Number of samples with same ID_CNIO values in both DataFrames: 209
Samples in both DataFrames:
['LP1', 'LP2', 'LP3', 'LP4', 'LP5', 'LP6', 'LP7', 'LP8', 'LP9', 'LP10', 'LP11', 'LP12', 'LP13', 'LP14', 'LP15', 'LP16', 'LP17', 'LP18', 'LP19', 'LP20', 'LP21', 'LP22', 'LP24', 'LP25', 'LP26', 'LP27', 'LP28', 'LP29', 'LP30', 'LP31', 'LP32', 'LP33', 'LP34', 'LP35', 'LP36', 'LP37', 'LP38', 'LP39', 'LP40', 'LP41', 'LP42', 'LP43', 'LP44', 'LP45', 'LP46', 'LP47', 'LP48', 'LP49', 'LP50', 'LP51', 'LP53', 'LP55', 'LP56', 'LP57', 'LP58', 'LP59', 'LP60', 'LP61', 'LP62', 'LP63', 'LP64', 'LP65', 'LP66', 'LP67', 'LP68', 'LP69', 'LP70', 'LP71', 'LP72', 'LP73', 'LP74', 'LP75', 'LP76', 'LP77', 'LP78', 'LP79', 'LP80', 'LP81', 'LP82', 'LP83', 'LP84', 'LP85', 'LP86', 'LP87', 'LP88', 'LP89', 'LP90', 'LP91', 'LP92', 'LP93', 'LP94', 'LP95', 'LP96', 'LP97', 'LP98', 'LP99', 'LP100', 'LP102', 'LP103', 'LP104', 'LP105', 'LP106', 'LP109', 'LP110', 'LP114', 'LP115', 'LP116', 'LP117', 'LP119', 'LP121', 'MDA1', 'MDA2', 'MD

In [5]:
# The previous comparison shows that some ID_CNIO values in combined_df carry a prefix, for example
# '08T399 / OVE24', and therefore do not match 'OVE24' in df.
# The prefix has the form <digits>T<digits> followed by a '/' with optional spaces around it. It is
# removed from the start of the ID so that only the suffix is kept. Surrounding whitespace is
# stripped too, exactly as for df below, so that stray blanks cannot prevent a match.
# The vectorised .str methods leave a missing ID as NaN instead of raising on it.
combined_df['ID_CNIO_corrected'] = (
    combined_df['ID_CNIO']
    .str.strip()
    .str.replace(r'^\d+T\d+\s*/\s*', '', regex=True)
)
# The IDs in df are already correct; they only need surrounding whitespace removed.
df['ID_CNIO_corrected'] = df['ID_CNIO'].str.strip()
# Step 1: Find the corrected IDs present in both DataFrames
common_ids = pd.merge(combined_df[['ID_CNIO_corrected']], df[['ID_CNIO_corrected']], on='ID_CNIO_corrected', how='inner')['ID_CNIO_corrected']

# Step 2: Find the corrected IDs in combined_df but not in df
only_in_combined_df = combined_df[~combined_df['ID_CNIO_corrected'].isin(df['ID_CNIO_corrected'])]['ID_CNIO_corrected']

# Step 3: Find the corrected IDs in df but not in combined_df
only_in_df = df[~df['ID_CNIO_corrected'].isin(combined_df['ID_CNIO_corrected'])]['ID_CNIO_corrected']

# Step 4: Display the results
print(f"Total samples with the same ID_CNIO in both DataFrames: {len(common_ids)}")
print(f"Samples in both DataFrames:\n{common_ids.to_list()}")

print(f"Total samples only in combined_df: {len(only_in_combined_df)}")
print(f"Samples only in combined_df-clinical data:\n{only_in_combined_df.to_list()}")

print(f"Total samples only in df: {len(only_in_df)}")
print(f"Samples only in df-genomic/tils/IHC/PCR:\n{only_in_df.to_list()}")


Total samples with the same ID_CNIO in both DataFrames: 214
Samples in both DataFrames:
['LP1', 'LP2', 'LP3', 'LP4', 'LP5', 'LP6', 'LP7', 'LP8', 'LP9', 'LP10', 'LP11', 'LP12', 'LP13', 'LP14', 'LP15', 'LP16', 'LP17', 'LP18', 'LP19', 'LP20', 'LP21', 'LP22', 'LP24', 'LP25', 'LP26', 'LP27', 'LP28', 'LP29', 'LP30', 'LP31', 'LP32', 'LP33', 'LP34', 'LP35', 'LP36', 'LP37', 'LP38', 'LP39', 'LP40', 'LP41', 'LP42', 'LP43', 'LP44', 'LP45', 'LP46', 'LP47', 'LP48', 'LP49', 'LP50', 'LP51', 'LP53', 'LP55', 'LP56', 'LP57', 'LP58', 'LP59', 'LP60', 'LP61', 'LP62', 'LP63', 'LP64', 'LP65', 'LP66', 'LP67', 'LP68', 'LP69', 'LP70', 'LP71', 'LP72', 'LP73', 'LP74', 'LP75', 'LP76', 'LP77', 'LP78', 'LP79', 'LP80', 'LP81', 'LP82', 'LP83', 'LP84', 'LP85', 'LP86', 'LP87', 'LP88', 'LP89', 'LP90', 'LP91', 'LP92', 'LP93', 'LP94', 'LP95', 'LP96', 'LP97', 'LP98', 'LP99', 'LP100', 'LP102', 'LP103', 'LP104', 'LP105', 'LP106', 'LP109', 'LP110', 'LP114', 'LP115', 'LP116', 'LP117', 'LP119', 'LP121', 'MDA1', 'MDA2', 'MDA3', 'M

In [6]:
# Inspect both tables. DataFrame.info() prints its report and returns None, so wrapping it in
# print() would only add a stray "None" line after each report.
combined_df.info()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ID_CNIO                   309 non-null    object
 1   HISTOLOGY                 306 non-null    object
 2   AGE                       309 non-null    object
 3   GRADE                     243 non-null    object
 4   FIGO                      275 non-null    object
 5   FIGOL                     276 non-null    object
 6   FIGOa                     275 non-null    object
 7   NEOADJUVANT_TREATMENT     233 non-null    object
 8   ADJUVANT_TREATMENT        279 non-null    object
 9   TYPE_ADJUVANT             197 non-null    object
 10  RESIDUAL                  224 non-null    object
 11  RESIDUALa                 224 non-null    object
 12  OS_CNIO                   284 non-null    object
 13  VITAL STATUS              289 non-null    object
 14  MMR STATUS                

In [7]:
# Strip surrounding whitespace from the column names. Some samples in df have no clinical
# information (combined_df) and vice versa, hence the outer join below.
df.columns = df.columns.str.strip()
combined_df.columns = combined_df.columns.str.strip()
# Columns of df to carry over into the combined dataframe
columns_to_merge = [
    'IHC', 'IHC pattern','IHC_MMR_class','MSI (PCR)', 'Evidence score for MMRd','IHC (Diego)', 'PCR (Diego)', 'PASS QC > 2M reads   (Diego)', 
    'FINAL SAMPLES (Diego)', 'CLASS ACCORDING IHC/PCR RESULTS', 'CLASS ACCORDING TFM  (Diego)', 
    'MSI by MSISensor2 (Diego)', 'TMB', 'ID2', 'ID7', 'SUM ID2+ID7', 'CNV', 
    '% genome altered (Diego)', 'Pol mutation (Diego)', 'MMRd mutation (Diego)', 
    'Non-MMRd mutation (Diego)', 'TILs counts', 'TILs_CNIO_info','TILs_ep_CNIO', 'TILs_ep_LP', 
    'TILs_ep_OTTA', 'TILs_tu_CNIO', 'TILs_tu_LP','TILs_raw_ep','TILs_raw_tu'
]
# A duplicated key on either side makes the merge emit one row per matching pair, which would
# silently inflate the sample count. Report such keys instead of letting them pass unnoticed.
for table_name, table in (('combined_df', combined_df), ('df', df)):
    repeated_keys = table.loc[table['ID_CNIO_corrected'].duplicated(keep=False), 'ID_CNIO_corrected']
    if not repeated_keys.empty:
        print(f"WARNING: duplicated ID_CNIO_corrected values in {table_name}: {repeated_keys.unique().tolist()}")

# Combine the DataFrames keeping all rows
final_df = pd.merge(
    combined_df,
    df[['ID_CNIO_corrected'] + columns_to_merge],  # Select only the necessary columns in df
    on='ID_CNIO_corrected',
    how='outer'  # 'outer' keeps every row of combined_df and of df
)

# Show the final DataFrame
print(final_df.head())
print("Length of the complete df is:",len(final_df))
final_df.to_excel("/home/vant/TFM/Final_db/Combined_df_clinical_genomic_tils.xlsx",index=False)

  ID_CNIO HISTOLOGY AGE GRADE FIGO FIGOL FIGOa NEOADJUVANT_TREATMENT  \
0     LP1         0  50     2    3     0     1                     0   
1     LP2         0  42     2    3     0     1                     0   
2     LP3         0  68     2    5     0     2                     0   
3     LP4         0  50     1    6     0     2                     0   
4     LP5         0  82     1    1     0     1                     0   

  ADJUVANT_TREATMENT TYPE_ADJUVANT  ... Non-MMRd mutation (Diego) TILs counts  \
0                  1             0  ...                        NO         YES   
1                  1             0  ...                        NO         YES   
2                  1             0  ...                        NO         YES   
3                  1             0  ...                        NO         YES   
4                  0           NaN  ...                        NO         YES   

  TILs_CNIO_info TILs_ep_CNIO TILs_ep_LP TILs_ep_OTTA TILs_tu_CNIO TILs_tu_LP  \

In [8]:
# info() prints its report and returns None; calling it directly avoids a trailing "None" line.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 57 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID_CNIO                          309 non-null    object 
 1   HISTOLOGY                        306 non-null    object 
 2   AGE                              309 non-null    object 
 3   GRADE                            243 non-null    object 
 4   FIGO                             275 non-null    object 
 5   FIGOL                            276 non-null    object 
 6   FIGOa                            275 non-null    object 
 7   NEOADJUVANT_TREATMENT            233 non-null    object 
 8   ADJUVANT_TREATMENT               279 non-null    object 
 9   TYPE_ADJUVANT                    197 non-null    object 
 10  RESIDUAL                         224 non-null    object 
 11  RESIDUALa                        224 non-null    object 
 12  OS_CNIO               

In [9]:
# The original ID_CNIO is redundant once ID_CNIO_corrected exists: drop it and let the corrected
# column take its name. The other entries give the genomic/IHC/PCR/TILs columns their final names.
column_renames = {
    'ID_CNIO_corrected': 'ID_CNIO',
    'MSI (PCR)': 'PCR',
    'IHC': 'IHC_defective_MMR_protein',
    'IHC (Diego)': 'IHCd',
    'PCR (Diego)': 'PCRd',
    'PASS QC > 2M reads   (Diego)': 'QC_filters',
    # 173. Remember that Diego gave us data for 4 extra samples that didn't appear in his previous
    # database version
    'FINAL SAMPLES (Diego)': 'Genomic_info',
    'CLASS ACCORDING IHC/PCR RESULTS': 'MMR_final_status',
    'CLASS ACCORDING TFM  (Diego)': 'CLASSd',
    'MSI by MSISensor2 (Diego)': 'MSI_sensor2',
    '% genome altered (Diego)': '%genome_altered',
    'Pol mutation (Diego)': 'Pol_mutation',
    'MMRd mutation (Diego)': 'MMRd_mutation',
    'Non-MMRd mutation (Diego)': 'Non-MMRd_mutation',
    'TILs counts': 'TILs_score_info',
}
# Only act while 'ID_CNIO_corrected' is still present. After the first run the corrected column is
# itself called 'ID_CNIO', so an unguarded second run would drop the only ID column.
if 'ID_CNIO_corrected' in final_df.columns:
    final_df = final_df.drop(columns=['ID_CNIO']).rename(columns=column_renames)

# Check the changes (info() prints by itself and returns None, so no print() around it)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   HISTOLOGY                  306 non-null    object 
 1   AGE                        309 non-null    object 
 2   GRADE                      243 non-null    object 
 3   FIGO                       275 non-null    object 
 4   FIGOL                      276 non-null    object 
 5   FIGOa                      275 non-null    object 
 6   NEOADJUVANT_TREATMENT      233 non-null    object 
 7   ADJUVANT_TREATMENT         279 non-null    object 
 8   TYPE_ADJUVANT              197 non-null    object 
 9   RESIDUAL                   224 non-null    object 
 10  RESIDUALa                  224 non-null    object 
 11  OS_CNIO                    284 non-null    object 
 12  VITAL STATUS               289 non-null    object 
 13  MMR STATUS                 179 non-null    object 

In [10]:
# New order of the columns
new_order = [
    'ID_CNIO', 'ID_ORIGINAL','ID_ORIGINAL_NHC_BBANK','HISTOLOGY', 'HISTOLOGY_DETAIL','AGE', 'GRADE', 'FIGO',
    'FIGOL', 'FIGOa', 'NEOADJUVANT_TREATMENT', 'ADJUVANT_TREATMENT', 'TYPE_ADJUVANT', 'OTHER_ADJ_TREAT',
    'RESIDUAL', 'RESIDUALa', 'OS_CNIO', 'VITAL STATUS', 'MMR STATUS', 'BRCA STATUS','FAMILIAL','DATE_AGE',
    'DATE_OS','PARTIAL DATE DIAGNOSIS','PARTIAL DATE DEATH_LASTv','Clinical_info','IHC_defective_MMR_protein', 'IHC pattern','IHC_MMR_class',
    'PCR','Evidence score for MMRd','IHCd','PCRd','QC_filters','Genomic_info','MMR_final_status','CLASSd','MSI_sensor2','TMB','ID2','ID7','SUM ID2+ID7','CNV','%genome_altered','Pol_mutation','MMRd_mutation',
    'Non-MMRd_mutation','TILs_score_info', 'TILs_CNIO_info','TILs_ep_CNIO','TILs_ep_LP','TILs_ep_OTTA','TILs_tu_CNIO','TILs_tu_LP','TILs_raw_ep',
    'TILs_raw_tu'
]
# Selecting by list keeps only the listed columns, so anything missing from new_order would
# disappear without notice. Report such columns before reordering.
unlisted_columns = [column for column in final_df.columns if column not in new_order]
if unlisted_columns:
    print(f"WARNING: columns not listed in new_order will be dropped: {unlisted_columns}")
final_df=final_df[new_order]
# info() prints its report and returns None, so it is called without print()
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_CNIO                    330 non-null    object 
 1   ID_ORIGINAL                299 non-null    object 
 2   ID_ORIGINAL_NHC_BBANK      283 non-null    object 
 3   HISTOLOGY                  306 non-null    object 
 4   HISTOLOGY_DETAIL           129 non-null    object 
 5   AGE                        309 non-null    object 
 6   GRADE                      243 non-null    object 
 7   FIGO                       275 non-null    object 
 8   FIGOL                      276 non-null    object 
 9   FIGOa                      275 non-null    object 
 10  NEOADJUVANT_TREATMENT      233 non-null    object 
 11  ADJUVANT_TREATMENT         279 non-null    object 
 12  TYPE_ADJUVANT              197 non-null    object 
 13  OTHER_ADJ_TREAT            14 non-null     object 

In [11]:
####### RECOVER MISSING DATA WHERE POSSIBLE #################
# Some samples lack data that can be recovered from the identifiers file
identificadores_df = pd.read_excel("Identificadores OVE_LP_MDA_RVB_serie completa.xlsx", sheet_name='identificadores serie completa ')
# info() prints its report and returns None, so it is called directly: print(info()) only adds a
# stray "None" line. One report before and one after the clean-up is enough to see the change.
identificadores_df.info()
# Strip surrounding whitespace from the column names, then give the ID column the name used by
# the other tables
identificadores_df.columns = identificadores_df.columns.str.strip()
identificadores_df = identificadores_df.rename(columns={'ID CNIO': 'ID_CNIO'})
identificadores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ID CNIO                235 non-null    object
 1   ORIGINAL ID_AP         226 non-null    object
 2   ORIGINAL ID_NHC_BBANK  204 non-null    object
 3   SUBTIPO HISTOLÓGICO    235 non-null    object
 4   SERIE                  235 non-null    object
dtypes: object(5)
memory usage: 9.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ID CNIO                235 non-null    object
 1   ORIGINAL ID_AP         226 non-null    object
 2   ORIGINAL ID_NHC_BBANK  204 non-null    object
 3   SUBTIPO HISTOLÓGICO    235 non-null    object
 4   SERIE                  235 non-null    object
dtypes: object(5)
memory usage: 9

In [12]:
# Keep the rows of 'final_df' in which both 'ID_ORIGINAL' and 'ID_ORIGINAL_NHC_BBANK' are NaN
no_original_id = final_df['ID_ORIGINAL'].isna()
no_bbank_id = final_df['ID_ORIGINAL_NHC_BBANK'].isna()
df_missing = final_df[no_original_id & no_bbank_id]
print(df_missing)

       ID_CNIO ID_ORIGINAL ID_ORIGINAL_NHC_BBANK HISTOLOGY HISTOLOGY_DETAIL  \
22        LP23         NaN                   NaN         0              NaN   
309       LP52         NaN                   NaN       NaN              NaN   
310       LP54         NaN                   NaN       NaN              NaN   
311      LP124         NaN                   NaN       NaN              NaN   
312   03B07829         NaN                   NaN       NaN              NaN   
313   07B23784         NaN                   NaN       NaN              NaN   
314   10B36563         NaN                   NaN       NaN              NaN   
315   11B23809         NaN                   NaN       NaN              NaN   
316    11B5978         NaN                   NaN       NaN              NaN   
317      07T25         NaN                   NaN       NaN              NaN   
318      08T96         NaN                   NaN       NaN              NaN   
319     09T134         NaN                   NaN    

In [13]:
# Look up the original identifiers of the incomplete samples in the identifiers dataframe.
# A left join keeps one row per incomplete sample, in the order of df_missing.
identifier_columns = ['ID_CNIO', 'ORIGINAL ID_AP', 'ORIGINAL ID_NHC_BBANK']
merged_df = pd.merge(
    df_missing[['ID_CNIO']],
    identificadores_df[identifier_columns],
    how='left',
    on='ID_CNIO',
)
# Verify
print(merged_df)

      ID_CNIO                    ORIGINAL ID_AP ORIGINAL ID_NHC_BBANK
0        LP23                               NaN                   NaN
1        LP52                        06B0015868               2142532
2        LP54                        07B0030433               2258343
3       LP124                        02B0013829                   NaN
4    03B07829                               NaN                   NaN
5    07B23784                               NaN                   NaN
6    10B36563                               NaN                   NaN
7    11B23809                               NaN                   NaN
8     11B5978                               NaN                   NaN
9       07T25                        8175-04/A4          BTCNIO06/111
10      08T96        04B0008135 (F.H. ALCORCÓN)                   NaN
11     09T134     B781787  (H,Gregorio Marañon)               B781787
12      07T29                               NaN                   NaN
13     08T107       

In [14]:
# Make sure merged_df lines up row by row with df_missing before copying values by position.
# The two frames have different index labels (see the printout), so the values are written
# positionally; that is only valid when both hold the same IDs in the same order.
print("Índexes of df_missing:", df_missing.index)
print("Índexes of merged_df:", merged_df.index)
if merged_df['ID_CNIO'].tolist() != df_missing['ID_CNIO'].tolist():
    raise ValueError(
        "merged_df is not aligned row by row with df_missing "
        "(is an ID_CNIO duplicated in the identifiers file?)"
    )
# Update values in final_df
final_df.loc[df_missing.index, 'ID_ORIGINAL'] = merged_df['ORIGINAL ID_AP'].values
final_df.loc[df_missing.index, 'ID_ORIGINAL_NHC_BBANK'] = merged_df['ORIGINAL ID_NHC_BBANK'].values
# No rename is needed: the recovered values were written straight into the existing 'ID_ORIGINAL'
# and 'ID_ORIGINAL_NHC_BBANK' columns, and final_df has no 'ORIGINAL ID_AP' /
# 'ORIGINAL ID_NHC_BBANK' columns to rename.
# Check the final dataFrame
print(final_df.head())

Índexes of df_missing: Index([ 22, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321,
       322, 323, 324, 325, 326, 327, 328, 329],
      dtype='int64')
Índexes of merged_df: RangeIndex(start=0, stop=22, step=1)
  ID_CNIO ID_ORIGINAL ID_ORIGINAL_NHC_BBANK HISTOLOGY HISTOLOGY_DETAIL AGE  \
0     LP1    02B28662                   NaN         0              NaN  50   
1     LP2    08B18876                   NaN         0              NaN  42   
2     LP3     08B260A                   NaN         0              NaN  68   
3     LP4    11B18858                   NaN         0              NaN  50   
4     LP5     99B6752                   NaN         0              NaN  82   

  GRADE FIGO FIGOL FIGOa  ... Non-MMRd_mutation TILs_score_info  \
0     2    3     0     1  ...                NO             YES   
1     2    3     0     1  ...                NO             YES   
2     2    5     0     2  ...                NO             YES   
3     1    6     0     2  ...      

In [15]:
# The df dataframe read earlier is needed again, this time to complete the HISTOLOGY column for
# some samples. The plan: find the rows with an empty HISTOLOGY and look their value up in the
# HIST column of Noelia complete database.ods (dataframe df). 'E' is coded as 0 and 'CC' as 1;
# a string containing 'M' or 'mixto' is coded as 2, any other string as 3, and an empty value
# is left as it is.
df.columns = df.columns.str.strip()
# Rows of 'final_df' whose 'HISTOLOGY' value is NaN
histology_is_missing = final_df['HISTOLOGY'].isna()
final_missing = final_df[histology_is_missing]
print(final_missing[['ID_CNIO', 'HISTOLOGY']])

       ID_CNIO HISTOLOGY
300       RVB1       NaN
307       RVB8       NaN
308       RVB9       NaN
309       LP52       NaN
310       LP54       NaN
311      LP124       NaN
312   03B07829       NaN
313   07B23784       NaN
314   10B36563       NaN
315   11B23809       NaN
316    11B5978       NaN
317      07T25       NaN
318      08T96       NaN
319     09T134       NaN
320      07T29       NaN
321     08T107       NaN
322    09T-114       NaN
323  09T-117-T       NaN
324    09T-119       NaN
325      09T45       NaN
326      08T94       NaN
327     09T237       NaN
328      11T82       NaN
329     16T131       NaN


In [16]:
# Look up the missing HISTOLOGY values in the df dataframe (it contains genomic+TILs data).
# final_df['ID_CNIO'] holds the corrected IDs, so the lookup must be keyed on
# df['ID_CNIO_corrected'] rather than on the raw df['ID_CNIO'], which may still carry
# surrounding whitespace and would then fail to match.
hist_lookup = df[['ID_CNIO_corrected', 'HIST']].rename(columns={'ID_CNIO_corrected': 'ID_CNIO'})
merged_df2 = final_missing[['ID_CNIO']].merge(hist_lookup, on='ID_CNIO', how='left')
# Verify
print(merged_df2)
print("Índexes of final_missing:", final_missing.index)
print("Índexes of merged_df2:", merged_df2.index)

      ID_CNIO         HIST
0        RVB1  NA (Colon?)
1        RVB8          NaN
2        RVB9          NaN
3        LP52            E
4        LP54            E
5       LP124           CC
6    03B07829          NaN
7    07B23784          NaN
8    10B36563          NaN
9    11B23809          NaN
10    11B5978          NaN
11      07T25           CC
12      08T96           CC
13     09T134           CC
14      07T29          NaN
15     08T107          NaN
16    09T-114          NaN
17  09T-117-T          NaN
18    09T-119          NaN
19      09T45          NaN
20      08T94            E
21     09T237            E
22      11T82            E
23     16T131            E
Índexes of df_missing: Index([300, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
       320, 321, 322, 323, 324, 325, 326, 327, 328, 329],
      dtype='int64')
Índexes of merged_df: RangeIndex(start=0, stop=24, step=1)


In [17]:
# Update values in final_df. merged_df2 has its own 0..n-1 index, so the values are copied by
# position; check first that both frames list the same IDs in the same order.
if merged_df2['ID_CNIO'].tolist() != final_missing['ID_CNIO'].tolist():
    raise ValueError(
        "merged_df2 is not aligned row by row with final_missing "
        "(is an ID_CNIO duplicated in df?)"
    )
final_df.loc[final_missing.index, 'HISTOLOGY'] = merged_df2['HIST'].values
print(final_df.head())

  ID_CNIO ID_ORIGINAL ID_ORIGINAL_NHC_BBANK HISTOLOGY HISTOLOGY_DETAIL AGE  \
0     LP1    02B28662                   NaN         0              NaN  50   
1     LP2    08B18876                   NaN         0              NaN  42   
2     LP3     08B260A                   NaN         0              NaN  68   
3     LP4    11B18858                   NaN         0              NaN  50   
4     LP5     99B6752                   NaN         0              NaN  82   

  GRADE FIGO FIGOL FIGOa  ... Non-MMRd_mutation TILs_score_info  \
0     2    3     0     1  ...                NO             YES   
1     2    3     0     1  ...                NO             YES   
2     2    5     0     2  ...                NO             YES   
3     1    6     0     2  ...                NO             YES   
4     1    1     0     1  ...                NO             YES   

  TILs_CNIO_info TILs_ep_CNIO TILs_ep_LP TILs_ep_OTTA TILs_tu_CNIO TILs_tu_LP  \
0   YES_complete          1.0        9.0       

In [18]:
# No rename is needed here: final_df has no 'HIST' column (the HIST values were written into the
# existing 'HISTOLOGY' column), so renaming 'HIST' would do nothing.
# Check
print(final_df.head())

  ID_CNIO ID_ORIGINAL ID_ORIGINAL_NHC_BBANK HISTOLOGY HISTOLOGY_DETAIL AGE  \
0     LP1    02B28662                   NaN         0              NaN  50   
1     LP2    08B18876                   NaN         0              NaN  42   
2     LP3     08B260A                   NaN         0              NaN  68   
3     LP4    11B18858                   NaN         0              NaN  50   
4     LP5     99B6752                   NaN         0              NaN  82   

  GRADE FIGO FIGOL FIGOa  ... Non-MMRd_mutation TILs_score_info  \
0     2    3     0     1  ...                NO             YES   
1     2    3     0     1  ...                NO             YES   
2     2    5     0     2  ...                NO             YES   
3     1    6     0     2  ...                NO             YES   
4     1    1     0     1  ...                NO             YES   

  TILs_CNIO_info TILs_ep_CNIO TILs_ep_LP TILs_ep_OTTA TILs_tu_CNIO TILs_tu_LP  \
0   YES_complete          1.0        9.0       

In [19]:
# Check which dtype the 'HISTOLOGY' column currently has
histology_column = final_df['HISTOLOGY']
print(histology_column.dtype)

object


In [20]:
# Now I have to change the values with the new coding without modifiying the rows that already have the right codification.
# Function to transform the values in the 'HISTOLOGY'column according to the definitive coding.

def transform_histology(value):
    """Recode one 'HISTOLOGY' cell into the definitive numeric coding.

    Parameters
    ----------
    value : object
        Cell content: a missing value, a code that is already numeric, or a text label.

    Returns
    -------
    object
        * missing values and the numbers 0, 1, 2, 3 are returned unchanged;
        * text that spells one of those codes ('1', '2.0', ...) is returned as the int code,
          so rows that already carry the right coding are not modified;
        * blank text is treated as missing and returned as NaN;
        * 'E' -> 0, 'CC' -> 1;
        * text containing a capital 'M', or 'mixto' / 'mixed' in any case -> 2;
        * any other text -> 3;
        * any other non-text value is returned unchanged.
    """
    if pd.isna(value) or value in [0, 1, 2, 3]:
        return value
    if isinstance(value, str):
        label = value.strip()  # ignore surrounding whitespace
        if not label:
            # Nothing was recorded: keep the cell missing instead of coding it as 3
            return np.nan
        # A code stored as text is already in the final coding; return it as a number
        try:
            numeric_code = float(label)
        except ValueError:
            numeric_code = None
        if numeric_code in (0, 1, 2, 3):
            return int(numeric_code)
        if label == 'E':
            return 0
        elif label == 'CC':
            return 1
        # NOTE(review): any capital 'M' marks the label as mixed, as stated in the coding rules
        # of this notebook; confirm that no non-mixed label contains a capital 'M'.
        elif 'M' in label or 'mixto' in label.lower() or 'mixed' in label.lower():
            return 2
        else:
            return 3
    return value  # other numeric values are returned as they are
    
# Recode every cell of the 'HISTOLOGY' column with transform_histology
recoded_histology = final_df['HISTOLOGY'].apply(transform_histology)
final_df['HISTOLOGY'] = recoded_histology

# Display the resulting dataframe
print(final_df)

    ID_CNIO                       ID_ORIGINAL ID_ORIGINAL_NHC_BBANK  \
0       LP1                          02B28662                   NaN   
1       LP2                          08B18876                   NaN   
2       LP3                           08B260A                   NaN   
3       LP4                          11B18858                   NaN   
4       LP5                           99B6752                   NaN   
..      ...                               ...                   ...   
325   09T45                               NaN                   NaN   
326   08T94         99B5119-8 (F.H. ALCORCÓN)                   NaN   
327  09T237                        08B1440-A4                   NaN   
328   11T82        074040B5(B5;B6) (ALBACETE)                   NaN   
329  16T131  13B9434-2-7 H. U. Sant Joan Reus             NHC:51092   

     HISTOLOGY HISTOLOGY_DETAIL  AGE GRADE FIGO FIGOL FIGOa  ...  \
0          0.0              NaN   50     2    3     0     1  ...   
1          

In [21]:
# Update the 'HISTOLOGY_DETAIL' column
# The row with ID_CNIO == 'RVB3' gets the detail 'Mixed E+CC'
is_rvb3 = final_df['ID_CNIO'] == 'RVB3'
final_df.loc[is_rvb3, 'HISTOLOGY_DETAIL'] = 'Mixed E+CC'

# Update 'HISTOLOGY_DETAIL' values according to the 'HISTOLOGY' values
def update_histology_detail(row):
    """Return the 'HISTOLOGY_DETAIL' value that belongs to one row.

    Rows whose 'HISTOLOGY' is 0, 1 or 3 get 'No mixed'. Rows with a missing 'HISTOLOGY' or with
    any other value keep the 'HISTOLOGY_DETAIL' they already have.
    """
    histology = row['HISTOLOGY']
    # The missing-value test comes first so that the membership test never sees a missing value
    if pd.isna(histology) or histology not in [0, 1, 3]:
        return row['HISTOLOGY_DETAIL']
    return 'No mixed'

# Compute the detail row by row and store it back in the column
histology_detail = final_df.apply(update_histology_detail, axis='columns')
final_df['HISTOLOGY_DETAIL'] = histology_detail

# Display the resulting dataframe
print(final_df)

    ID_CNIO                       ID_ORIGINAL ID_ORIGINAL_NHC_BBANK  \
0       LP1                          02B28662                   NaN   
1       LP2                          08B18876                   NaN   
2       LP3                           08B260A                   NaN   
3       LP4                          11B18858                   NaN   
4       LP5                           99B6752                   NaN   
..      ...                               ...                   ...   
325   09T45                               NaN                   NaN   
326   08T94         99B5119-8 (F.H. ALCORCÓN)                   NaN   
327  09T237                        08B1440-A4                   NaN   
328   11T82        074040B5(B5;B6) (ALBACETE)                   NaN   
329  16T131  13B9434-2-7 H. U. Sant Joan Reus             NHC:51092   

     HISTOLOGY HISTOLOGY_DETAIL  AGE GRADE FIGO FIGOL FIGOa  ...  \
0          0.0         No mixed   50     2    3     0     1  ...   
1          

In [22]:
# A few more changes to keep the final coding
# GRADE column: the RVB4 value 'ovario:3; ENDOMETRIO: 1' is replaced by '3;1' to stay within the coding.
# All values are then turned into an integer, or into a list of integers when they are separated
# by ';', because the types differ between cohorts (float, string).
is_rvb4 = final_df['ID_CNIO'] == 'RVB4'
final_df.loc[is_rvb4, 'GRADE'] = '3;1'
# Function to convert the data. Previously I tranform all data to string type to avoid errors when using the function.
#final_df['GRADE'] = final_df['GRADE'].astype(str)
def _grade_part_to_int(part):
    """Return `part` as an int, accepting whole numbers written as floats ('2.0').

    Raises ValueError when `part` is not a whole number.
    """
    text = str(part).strip()
    try:
        return int(text)
    except ValueError:
        # int('2.0') fails, so fall back to float for values read from Excel as floats
        number = float(text)  # raises ValueError for non-numeric text
        if not number.is_integer():  # 2.5, nan and inf are not valid integer codes
            raise ValueError(f"not an integer: {text!r}")
        return int(number)


def convert_value(value):
    """Convert a cell into an integer or a list of integers.

    Parameters
    ----------
    value : object
        Raw cell content: a number, a numeric string, several numbers separated
        by ';' (e.g. '3;1'), an already converted list/tuple, or a missing value.

    Returns
    -------
    int, list of int, or float
        An int for a single value, a list of ints for ';'-separated values (or for
        list/tuple input), and np.nan for missing, empty or non-numeric input.
    """
    if isinstance(value, (list, tuple)):
        # Already converted (e.g. the cell was run twice). pd.notna() returns an array
        # for sequences, which cannot be used in the boolean test below.
        parts = list(value)
        as_list = True
    elif pd.notna(value) and value != '':
        text = str(value)
        parts = text.split(';')
        as_list = ';' in text
    else:
        return np.nan  # keep missing values as NaN

    try:
        numbers = [_grade_part_to_int(part) for part in parts]
    except ValueError:
        return np.nan  # any non-numeric piece invalidates the whole cell
    return numbers if as_list else numbers[0]
    
# Normalise every GRADE cell with convert_value and show the converted column
grade_converted = final_df['GRADE'].apply(convert_value)
final_df['GRADE'] = grade_converted
print(final_df['GRADE'])

0        2
1        2
2        2
3        1
4        1
      ... 
325    NaN
326    NaN
327    NaN
328    NaN
329    NaN
Name: GRADE, Length: 330, dtype: object


In [23]:
# Replace the missing values of the three availability flags with 'NO'
for info_column in ('Clinical_info', 'Genomic_info', 'TILs_score_info'):
    final_df[info_column] = final_df[info_column].fillna('NO')
# Optional export, currently disabled:
# final_df.to_excel("Samples_all_info.xlsx", index=False)

In [24]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly: wrapping it in print() only appended a stray "None" line to the output.
final_df.info()
print(final_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_CNIO                    330 non-null    object 
 1   ID_ORIGINAL                309 non-null    object 
 2   ID_ORIGINAL_NHC_BBANK      288 non-null    object 
 3   HISTOLOGY                  317 non-null    float64
 4   HISTOLOGY_DETAIL           317 non-null    object 
 5   AGE                        309 non-null    object 
 6   GRADE                      243 non-null    object 
 7   FIGO                       275 non-null    object 
 8   FIGOL                      276 non-null    object 
 9   FIGOa                      275 non-null    object 
 10  NEOADJUVANT_TREATMENT      233 non-null    object 
 11  ADJUVANT_TREATMENT         279 non-null    object 
 12  TYPE_ADJUVANT              197 non-null    object 
 13  OTHER_ADJ_TREAT            14 non-null     object 

In [25]:
# Normalise 'VITAL STATUS': cast every value to text and trim the surrounding whitespace
vital_status_text = final_df['VITAL STATUS'].astype(str)
final_df['VITAL STATUS'] = vital_status_text.str.strip()

# Check the distinct values (missing entries appear as the string 'nan' at this point)
print(final_df['VITAL STATUS'].unique())

# Recode the stringified missing values ('nan') as 'NA'
final_df['VITAL STATUS'] = final_df['VITAL STATUS'].replace({'nan': 'NA'})
print(final_df['VITAL STATUS'].unique())

['0' '1' 'nan']
['0' '1' 'NA']


In [26]:
# Count how many samples have every type of data.
# Keep only the rows in which 'Genomic_info' is 'YES'
genomic_yes = final_df[final_df['Genomic_info'] == 'YES']

# Among those rows, filter the ones that also have clinical and TILs information
genomic_has_clinical = genomic_yes['Clinical_info'] == 'YES'
# A missing TILs_CNIO_info must not count as "has TILs data": NaN != 'NO' evaluates to
# True in pandas, so missing values are treated as 'NO' before the comparison.
tils_cnio_status = genomic_yes['TILs_CNIO_info'].fillna('NO')

yes_count = genomic_yes[genomic_has_clinical & (genomic_yes['TILs_score_info'] == 'YES')]
yes_count_cnio = genomic_yes[genomic_has_clinical & (tils_cnio_status != 'NO')]
yes_count_cnio_complete = genomic_yes[genomic_has_clinical & (tils_cnio_status == 'YES_complete')]

# Number of rows in each selection
count_yes_yes_yes = yes_count.shape[0]
count_yes_cnio = yes_count_cnio.shape[0]
count_yes_cnio_complete = yes_count_cnio_complete.shape[0]

# Independent copy of the samples that have the three types of data
new_df = yes_count.copy()

# Show the results
print(f"Number of rows in which 'Genomic_info', 'Clinical_info', and 'TILs_score_info' is 'YES': {count_yes_yes_yes}") #167
print(f"Number of rows in which 'Genomic_info', 'Clinical_info', and 'TILs_CNIO_info' is YES (incomplete or complete for TILs): {count_yes_cnio}")
print(f"Number of rows in which 'Genomic_info', 'Clinical_info', and 'TILs_CNIO_info' is YES (complete for TILs): {count_yes_cnio_complete}")

Number of rows in which 'Genomic_info', 'Clinical_info', and 'TILs_score_info' is 'YES': 167
Number of rows in which 'Genomic_info', 'Clinical_info', and 'TILs_CNIO_info' is YES (incomplete or complete for TILs): 158
Number of rows in which 'Genomic_info', 'Clinical_info', and 'TILs_CNIO_info' is YES (complete for TILs): 157


In [27]:
# Pairwise availability of the three data types, regardless of the third one.
# One boolean mask per flag, True where the flag is 'YES'
has_clinical = final_df['Clinical_info'] == 'YES'
has_genomic = final_df['Genomic_info'] == 'YES'
has_tils = final_df['TILs_score_info'] == 'YES'

# Clinical + TILs
clinical_tils_yes = final_df[has_clinical & has_tils]
count_clinical_tils_yes = len(clinical_tils_yes)

# Clinical + genomic
clinical_genomic_yes = final_df[has_clinical & has_genomic]
count_clinical_genomic_yes = len(clinical_genomic_yes)

# Genomic + TILs
genomic_tils_yes = final_df[has_genomic & has_tils]
count_genomic_tils_yes = len(genomic_tils_yes)

# Report the three counts
print(f"Number of rows with 'YES' in 'Clinical_info' and 'TILs_info': {count_clinical_tils_yes}")
print(f"Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info': {count_clinical_genomic_yes}")
print(f"Number of rows with 'YES' in 'Genomic_info' and 'TILs_info': {count_genomic_tils_yes}")

Number of rows with 'YES' in 'Clinical_info' and 'TILs_info': 212
Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info': 169
Number of rows with 'YES' in 'Genomic_info' and 'TILs_info': 170


In [28]:
# Subsets of samples by data availability: each pair of data types and all three together
clinical_mask = final_df['Clinical_info'] == 'YES'
genomic_mask = final_df['Genomic_info'] == 'YES'
tils_mask = final_df['TILs_score_info'] == 'YES'

clinical_tils_yes = final_df[clinical_mask & tils_mask]
clinical_genomic_yes = final_df[clinical_mask & genomic_mask]
genomic_tils_yes = final_df[genomic_mask & tils_mask]
all_yes = final_df[clinical_mask & genomic_mask & tils_mask]


def _count_mmr_status(frame, status):
    """Number of rows of `frame` whose 'MMR_final_status' equals `status`."""
    return int((frame['MMR_final_status'] == status).sum())


# MMRd / MMRp counts in each subset
clinical_tils_MMRd = _count_mmr_status(clinical_tils_yes, 'MMRd')
clinical_tils_MMRp = _count_mmr_status(clinical_tils_yes, 'MMRp')

clinical_genomic_MMRd = _count_mmr_status(clinical_genomic_yes, 'MMRd')
clinical_genomic_MMRp = _count_mmr_status(clinical_genomic_yes, 'MMRp')

genomic_tils_MMRd = _count_mmr_status(genomic_tils_yes, 'MMRd')
genomic_tils_MMRp = _count_mmr_status(genomic_tils_yes, 'MMRp')

all_yes_MMRd = _count_mmr_status(all_yes, 'MMRd')
all_yes_MMRp = _count_mmr_status(all_yes, 'MMRp')

# Report the counts
print(f"Number of rows with 'YES' in 'Clinical_info' and 'TILs_info' that are MMRd: {clinical_tils_MMRd}")
print(f"Number of rows with 'YES' in 'Clinical_info' and 'TILs_info' that are MMRp: {clinical_tils_MMRp}")

print(f"Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info' that are MMRd: {clinical_genomic_MMRd}")
print(f"Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info' that are MMRp: {clinical_genomic_MMRp}")

print(f"Number of rows with 'YES' in 'Genomic_info' and 'TILs_info' that are MMRd: {genomic_tils_MMRd}")
print(f"Number of rows with 'YES' in 'Genomic_info' and 'TILs_info' that are MMRp: {genomic_tils_MMRp}")

print(f"Number of rows with 'YES' in 'Genomic_info', 'Clinical_info', and 'TILs_info' that are MMRd: {all_yes_MMRd}")
print(f"Number of rows with 'YES' in 'Genomic_info', 'Clinical_info', and 'TILs_info' that are MMRp: {all_yes_MMRp}")


Number of rows with 'YES' in 'Clinical_info' and 'TILs_info' that are MMRd: 24
Number of rows with 'YES' in 'Clinical_info' and 'TILs_info' that are MMRp: 169
Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info' that are MMRd: 21
Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info' that are MMRp: 148
Number of rows with 'YES' in 'Genomic_info' and 'TILs_info' that are MMRd: 20
Number of rows with 'YES' in 'Genomic_info' and 'TILs_info' that are MMRp: 150
Number of rows with 'YES' in 'Genomic_info', 'Clinical_info', and 'TILs_info' that are MMRd: 20
Number of rows with 'YES' in 'Genomic_info', 'Clinical_info', and 'TILs_info' that are MMRp: 147


In [29]:
# Number of endometrioid (E, HISTOLOGY == 0) and clear cell (CC, HISTOLOGY == 1) samples
# in each availability subset
def _count_histology(frame, code):
    """Number of rows of `frame` whose 'HISTOLOGY' equals `code`."""
    return int((frame['HISTOLOGY'] == code).sum())


# Clinical + TILs
clinical_tils_E = _count_histology(clinical_tils_yes, 0)
clinical_tils_CC = _count_histology(clinical_tils_yes, 1)

# Clinical + genomic
clinical_genomic_E = _count_histology(clinical_genomic_yes, 0)
clinical_genomic_CC = _count_histology(clinical_genomic_yes, 1)

# Genomic + TILs
genomic_tils_E = _count_histology(genomic_tils_yes, 0)
genomic_tils_CC = _count_histology(genomic_tils_yes, 1)

# Genomic + clinical + TILs
all_yes_E = _count_histology(all_yes, 0)
all_yes_CC = _count_histology(all_yes, 1)

# Report the counts
print(f"Number of rows with 'YES' in 'Clinical_info' and 'TILs_info' that are E: {clinical_tils_E}")
print(f"Number of rows with 'YES' in 'Clinical_info' and 'TILs_info' that are CC: {clinical_tils_CC}")

print(f"Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info' that are E: {clinical_genomic_E}")
print(f"Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info' that are CC: {clinical_genomic_CC}")

print(f"Number of rows with 'YES' in 'Genomic_info' and 'TILs_info' that are E: {genomic_tils_E}")
print(f"Number of rows with 'YES' in 'Genomic_info' and 'TILs_info' that are CC: {genomic_tils_CC}")

print(f"Number of rows with 'YES' in 'Genomic_info', 'Clinical_info', and 'TILs_info' that are E: {all_yes_E}")
print(f"Number of rows with 'YES' in 'Genomic_info', 'Clinical_info', and 'TILs_info' that are CC: {all_yes_CC}")

Number of rows with 'YES' in 'Clinical_info' and 'TILs_info' that are E: 103
Number of rows with 'YES' in 'Clinical_info' and 'TILs_info' that are CC: 100
Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info' that are E: 90
Number of rows with 'YES' in 'Clinical_info' and 'Genomic_info' that are CC: 78
Number of rows with 'YES' in 'Genomic_info' and 'TILs_info' that are E: 91
Number of rows with 'YES' in 'Genomic_info' and 'TILs_info' that are CC: 78
Number of rows with 'YES' in 'Genomic_info', 'Clinical_info', and 'TILs_info' that are E: 89
Number of rows with 'YES' in 'Genomic_info', 'Clinical_info', and 'TILs_info' that are CC: 77


In [30]:
# DataFrame.info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(new_df.info()) added to the output.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167 entries, 0 to 305
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_CNIO                    167 non-null    object 
 1   ID_ORIGINAL                163 non-null    object 
 2   ID_ORIGINAL_NHC_BBANK      147 non-null    object 
 3   HISTOLOGY                  167 non-null    float64
 4   HISTOLOGY_DETAIL           167 non-null    object 
 5   AGE                        167 non-null    object 
 6   GRADE                      132 non-null    object 
 7   FIGO                       161 non-null    object 
 8   FIGOL                      161 non-null    object 
 9   FIGOa                      161 non-null    object 
 10  NEOADJUVANT_TREATMENT      111 non-null    object 
 11  ADJUVANT_TREATMENT         152 non-null    object 
 12  TYPE_ADJUVANT              121 non-null    object 
 13  OTHER_ADJ_TREAT            7 non-null      object 
 14 

In [31]:
# Re-create the two annotation rows of the final file: descriptions (index 0) and
# coding (index 1). Entries are listed in the column order shown by info().
# Step 1: description of every column, keyed by column name. Each value is the text
# written under that column in the description row.
descriptions = {
    'ID_CNIO': 'Unique CNIO tumor identifier',
    'ID_ORIGINAL':'Unique identifier from the hospital to the patient/sample',
    'ID_ORIGINAL_NHC_BBANK':'Unique identifier from Hospital (LP, OVE, MDA series; NHC= number of clinical history) or Biobank (RVB series) to the patient/sample',
    'HISTOLOGY': 'Tumor histology/Tumor type',
    'HISTOLOGY_DETAIL':'Tumor type based on the present cells: mixed (type of mixed in that case), no mixed',
    'AGE': 'Age at diagnosis. Calculated as (date of diagnosis – date of birth). When not provided, date of surgery was used instead of date of diagnosis (OVE series); a tag column was added to indicate this: “DATE_AGE”',
    'GRADE':'Tumor differentiation grade',
    'FIGO':'FIGO tumor stage',
    'FIGOL':'FIGO tumor stage according to cancer spread (localized or advance)',
    'FIGOa':'Aggregated tumor stage',
    'NEOADJUVANT_TREATMENT':'Information about whether the patient received neoadjuvant treatment',
    'ADJUVANT_TREATMENT':'Information about whether the patient received chemotherapy',
    'TYPE_ADJUVANT':'Type of adjuvant treatment received. Carboplatino+taxol is the standard',
    'OTHER_ADJ_TREAT':'Other type of adjuvant treatment received by the patient different from the standard, carboplatino+taxol',
    'RESIDUAL':'Residual disease after surgery',
    'RESIDUALa':'Aggregated categories of residual disease after surgery',
    'OS_CNIO':'Overall survival,  TIME FROM DIAGNOSIS TO DEATH or LAST KNOWN TO BE ALIVE. Calculated by CNIO from original records as (date of death MINUS date of diagnosis)  or (date last known to be alive MINUS date of diagnosis). When not provided, date of surgery was used instead of date of diagnosis. In this last case it is indicated in the tag column DATE USED FOR OS',
    'VITAL STATUS':'Vital status at last followup',
    'MMR STATUS':'Information about the GERMILINE mutations in genes related to mismatch repair pathway',
    'BRCA STATUS':'Information about GERMILINE mutatioS in BRCA1/2 genes',
    'FAMILIAL':'Familial antecedents of colon,endometrial and/or ovarian cancer. In OVE series this information refers to possible hereditary conditions according to the clinician',
    'DATE_AGE':'Tag column to indicate if the date used for calculating age at diagnosis was the diagnosis date or the surgery date',
    'DATE_OS':'Tag column to indicate how overall survival was calculated, using date of diagnosis or date of surgery (OVE series)',
    'PARTIAL DATE DIAGNOSIS':'Tag column to indicate how overall survival was calculated, using a partial or a complete date of diagnosis. If a partial date is used this suggests that the estimation of OS is not exact. A partial date is coded as XX/XX/year or XX/month/year and to make it possible the calculation these are transformed into 01/06/year or 01/month/year respectively',
    'PARTIAL DATE DEATH_LASTv':'Tag column to indicate how overall survival was calculated, using a partial or a complete date of last visit or death. If a partial date is used this suggests that the estimation of OS is not exact. A partial date is coded as XX/XX/year or XX/month/year and to make it possible the calculation these are transformed into 01/06/year or 01/month/year respectively',
    'Clinical_info':'Indicates whether there is clinical information available for this sample',
    'IHC_defective_MMR_protein':'IHC defective MMR protein according the results of immunohistochemistry tests. IHC tests the expression of MLH1, MSH2, MSH6 and PMS2 in tumour tissue sections. MMRp tumours express nuclear positivity of these 4 proteins, whereas MMRd tumours generally exhibit loss of expression of some of these proteins. Defective protein according to staining pattern of MLH1, PMS2, MSH2 & MSH6',
    'IHC pattern':'It refers to Abnormal or Standard IHC pattern for the missing proteins in the IHC',
    'IHC_MMR_class':'MMR classification based on IHC results of immunohistochemistry tests. IHC tests the expression of MLH1, MSH2, MSH6 and PMS2 in tumour tissue sections. MMRp tumours show nuclear positivity of these 4 proteins, whereas MMRd tumours generally exhibit loss of expression of some of these proteins',
    'PCR':'Results of PCR tests. Classic PCR test for MSI detection examines five MS regions in the genes NR21, NR22, NR24, BAT25 and BAT26. MSS tumours show no degree of instability across these 5 markers, whereas instability at just one marker is defined as MSI-Low. Instability at two or more of these markers confers a MSI-High diagnosis',
    'Evidence score for MMRd':'Evidence score for MMR deficiency based on data availability and agreement of standard techniques',
    'IHCd':'Results of immunohistochemistry tests used in Diego’s work (refers to ‘IHC_lab’ data)',
    'PCRd':'PCR results used in Diego’s work (refers to ‘PCR_lab’ data)',
    'QC_filters':'Indicates if this sample has passed NGS QC filters: more than 2 million reads, certain coverage and not artefacts in variant calling',
    'Genomic_info':'Indicates whether there is genomic information available for ALL of SOME of the features tested in this sample',
    'MMR_final_status':'MMR classification according to gold standard techniques (IHC and PCR)',
    'CLASSd':'Classification of the MMR system for this sample according to Diego’s work',
    'MSI_sensor2':'Microsatellite instability quantified as a numeric score using MSIsensor2 algorithm. A higher score indicates a higher microsatellite instability in that tumour.The MSI score refers to the number of MSI sites/all valid sites',
    'TMB': 'Total number of non synonymous coding mutations per megabase (Mb) of a tumour genome',
    'ID2':'Indel (Insertion-deletion) mutational signature. Biomarker linked to MMRd. It is predominantly composed of deletions of thymine at long (≥5) thymine mononucleotide repeats',
    'ID7':'Indel (Insertion-deletion) mutational signature. Biomarker linked to MMRd. Mutational signature indicative of large number of indels',
    'SUM ID2+ID7':'Sum of the indel (Insertion-deletion) mutational signatures ID2 and ID7',
    'CNV': 'Total number of copy number alterations (events) per tumor sample',
    '%genome_altered':'Percentage of the genome mutated/altered from the total',
    'Pol_mutation':'Indicates whether there are mutations in some of polymerase genes. Here we refers to the genes POLE or POLD1. POLE encodes the catalytic subunit of DNA polymerase epsilon while POLD1 encodes the catalytic and proofreading subunit of DNA polymerase-delta',
    'MMRd_mutation':'Indicates whether there are mutations in some of the genes involved in MMR pathway. Here we refers to the genes MLH1, MSH2, MSH6 and PMS2',
    'Non-MMRd_mutation':'Indicates whether there are mutations in some of the genes involved in MMR pathway. Here we refers to the genes  PMS1, MLH3 and MSH3',
    'TILs_score_info':'Indicates whether there is immunogenic information available for this sample. It refers to scores information. ‘TILs’ term refers to cytotoxic CD8+ tumour-infiltrating lymphocytes',
    'TILs_CNIO_info':'Indicates whether there is immunogenic information (CD8+ tumour-infiltrating lymphocytes) available for this sample coming from CNIO and if this information is complete or incomplete and in what sense',
    'TILs_ep_CNIO':'Intraephitelial count (OTTA guidelines) of TILs made by Eduardo Caleiras at CNIO',
    'TILs_ep_LP':'Intraepithelial count (OTTA guidelines) of TILs made by Berjón at La Paz Hospital',
    'TILs_ep_OTTA':'Intraepithelial count (OTTA guidelines) of TILs made by OTTA in Canada',
    'TILs_tu_CNIO':'Intratumoral count (all TILs in tumor section) made at CNIO',
    'TILs_tu_LP':'Intratumoral count (all TILs in tumor section) made by Berjón at La Paz Hospital',
    'TILs_raw_ep':'Intraepithelial raw counts (OTTA guidelines) of TILs made by Eduardo Caleiras at CNIO.',
    'TILs_raw_tu':'Intratumoral raw counts (OTTA guidelines) of TILs made by Eduardo Caleiras at CNIO'
}
# Coding row: one legend string per column, explaining how its values are coded.
# The list is matched to the dataframe columns purely by POSITION (see
# pd.DataFrame([subheaders], columns=...) below), so it needs exactly one entry per
# column, in column order. The entries appear to follow the same order as the keys of
# `descriptions`; the section markers below rely on that - TODO confirm.
subheaders = ['LP: ID for samples from La Paz Hospital; OVE: ID for samples from Virgen del Rocio Hospital; MDA:ID for samples from MD Anderson Cancer Center  Hospital ; RVB: ID for samples from Red Valenciana de Biobancos', 
              'Pathology code','Unique alphanumeric code from the hospital or biobank','0=endometroid; 1=clear cells; 2=mixed; 3=others; NA=unknown', 
             'type of mixed cancer; no mixed; NA=unknown','2 digits; unit is YEARS',
             '1=well differentiated; 2=moderately differentiated; 3=poorly differentiated; NA=unknown',
             '1=IA; 2=IB; 3=IC; 4=I(NOS); 5=IIA; 6=IIB; 8=II(NOS); 9=IIIA; 10=IIIB; 11=IIIC; 12=III(NOS); 13=IV; NA=unknown',
             '0=localized (I,II); 1=advanced(III,IV); NA=unknown','1=IA,IB,IC,I(NOS); 2=IIA,IIB,II(NOS); 3=IIIA,IIIB,IIIC,III(NOS); 4=IV; NA=unknown',
             '0=NO; 1=YES; NA=unknown','0=NO; 1=YES; NA=unknown',
             '0=Carbo-paclitaxel; 1=Cis-paclitaxel; 2=Carbo-monotherapy; 3=Carbo-Taxol-Beva; 4=other; NA=unknown',
             'Drugs used as adjuvant treatment',
             '0=no macroscopic disease; 1=macroscopic disease<=1; 2=macroscopic disease>1; 3=macroscopic disease,unknown size; NA=unknown',
             '0=No residual disease; 1=Yes residual disease; NA=unknown','Unit is DAYS',
             '0=alive; 1=dead; NA=unknown','0=not studied; 1=studied,without mutation; 2=mutation in MMR genes; 3=other genes mutated; NA=unknown',
             '0=not studied; 1=studied,without mutation; 2=mutation in BRCA1/2; 3=other genes mutated; NA=unknown','0=NO; 1=YES; NA=unknown',
             'D=diagnosis date used for calculating age; S=surgery date used for calculating age; NA=unknown',
             'D=diagnosis date used for OS; S=surgery date used for OS; LD=Lacking diagnosis/surgery/death_lastv date; NA=unknown',
             'YES=partial date xx/xx/y or xx/m/y; NO=Complete date or NA','YES=partial date xx/xx/y or xx/m/y; NO=Complete date or NA',
             'YES=There is clinical information for this sample; NO=There isn’t clinical information for this sample',
             # --- presumably IHC / PCR / genomic columns from here (IHC_defective_MMR_protein onwards) ---
             'Defective protein according to staining pattern of MLH1, PMS2, MSH2 & MSH6: MLH1=MLH1 & PMS2 negative staining; MSH2=MSH2 & MSH6 negative staining; PMS2=PMS2 negat staining;  MSH6=MSH6 negative staining; NO=All positive; NA=Not Available',
             'Standard= see column "IHC_defective_MMR_protein"; Abnormal=negat staining of MLH1,PMS2,MSH6 or MSH2 in a combination which is not the standard; NO=All positive; NA=Not Available',
             'MMR proficient=All four MLH1, PMS2, MSH2 & MSH6 proteins showed positive staining; MMRd, MMR deficient=One/two of the MLH1, PMS2, MSH2 & MSH6 showed negative staining;  NA=Not Available',
             'MSS=Microsatellite stability-no degree of instability across the 5 tested markers; MSI-Low=Low microsatellite instability-instability at one tested marker; MSI-High=High microsatellite instability-instability at two or more of the tested markers',
             '1=IHC&PCR indicates MMRd unambiguously; 2= One MMRd unambiguous, the other MMRd ambiguous or NA;  3= One MMRd unambiguous, the other MMRp or NA; 4= One MMRd ambiguous',
             'MLH1, MSH2, MSH6 and PMS2: indicates the loss of expression of that specific protein; UNKNOWN: tested, but unknown',
             'MSS= Microsatellite stability-no degree of instability across the 5 tested markers; MSI-Low=Low microsatellite instability-instability at one tested marker; MSI-High=High microsatellite instability-instability at two or more of the tested markers',
             'YES=Sample passes QC filters; NO=Sample doesn’t pass QC filters',
             'YES=There is genomic information for this sample; NO=There isn’t genomic information for this sample',
             'MMRp=MMR proficient sample, MMRd=MMR deficient sample. Only samples with MMRd evidence score equal or greater than 3 were classified as MMRd',
             'MMRp:mismatch repair proficient sample; MMRd:mismatch repair deficient sample',
             'Percentage','Number of non synonymous coding mutations/Mb tumour genome',
             'Percentage of observed mutations in the tumour sample that can be attributed to ID2 mutational signature',
             'Percentage of observed mutations in the tumour sample that can be attributed to ID7 mutational signature',
             'Percentage of observed mutations in the tumour sample that can be attributed to ID2+ID7 mutational signatures',
             'Number of copy number alterations/events.One event is considered when a DNA segment has more or fewer than 2 copies (the normal value)',
             'Percentage','NO=No mutation in polymerase genes; POLE:POLE gene is mutated (in one copy of the genome); POLD1:POLD1 gene is mutated (in one copy of the genome); POLE&POLE:Both POLE copies in the genome are mutated',
             'NO=No mutation in MMR genes; MLH1,MSH2,MSH6 and PMS2:indicates a mutation in that specific MMR gene.Sometimes there are more than one gene mutated or there are two copies of the same gene mutated (&)',
             'NO=No mutation in  PMS1, MLH3 and MSH3 genes; PMS1, MLH3 and MSH3:indicates a mutation in that specific gene',
             # --- presumably TILs columns from here (TILs_score_info onwards) ---
             'YES=There is immunogenic information for this sample; NO=There isn’t immunogenic information for this sample',
             'YES_complete=There is complete immunogenic information for this sample, intraephitelial and intratumoral data; YES_only_ep= There is only information about the number of intraepithelial lymphocytes ; YES_only_tu: There is only information about the number of intratumoral lymphocytes; NO=There isn’t immunogenic information for this sample',
             '0:no TILs; 1:1-2 TILs; 2:3-19 TILS; 3:>20 TILs; 9:not evaluable','0:no TILs; 1:1-2 TILs; 2:3-19 TILS; 3:>20 TILs; 9:not evaluable',
             '0:no TILs; 1:1-2 TILs; 2:3-19 TILS; 3:>20 TILs; 9:not evaluable','0:no TILs; 1:1-2 TILs; 2:3-19 TILS; 3:>20 TILs; 9:not evaluable',
             '0:no TILs; 1:1-2 TILs; 2:3-19 TILS; 3:>20 TILs; 9:not evaluable',
              'Number of TILs in the intraephitelial selected section of TMA','Number of TILs in the intratumoral selected section of TMA',
             ]

# Step 2: turn `descriptions` and `subheaders` into one-row dataframes.
# `descriptions` is matched to the columns by name; `subheaders` is matched by position.
def _annotation_rows(columns):
    """Return the (description row, coding row) pair laid out on `columns`."""
    description_row = pd.DataFrame([descriptions], columns=columns)
    legend_row = pd.DataFrame([subheaders], columns=columns)
    return description_row, legend_row


# Step 3: put both annotation rows on top of the full dataframe
descriptions_row, coding_row = _annotation_rows(final_df.columns)
new_df_with_annotations = pd.concat(
    [descriptions_row, coding_row, final_df],
    ignore_index=True,
)

# Step 4: preview the first 5 rows and write the annotated file.
# 330 samples: some have only TILs, genomic or clinical data, others have 2 or all 3 types
print(new_df_with_annotations.head(5))
new_df_with_annotations.to_excel('/home/vant/TFM/Final_db/Combined_data_all_cohorts_annotated.xlsx', index=False)

# Same procedure for the samples that have the 3 types of data at the same time
descriptions_row1, coding_row1 = _annotation_rows(new_df.columns)
new_df_with_annotations1 = pd.concat(
    [descriptions_row1, coding_row1, new_df],
    ignore_index=True,
)

print(new_df_with_annotations1.head(5))
new_df_with_annotations1.to_excel("/home/vant/TFM/Final_db/Samples_alltypedata_annotated.xlsx",index=False)

                                             ID_CNIO  \
0                       Unique CNIO tumor identifier   
1  LP: ID for samples from La Paz Hospital; OVE: ...   
2                                                LP1   
3                                                LP2   
4                                                LP3   

                                         ID_ORIGINAL  \
0  Unique identifier from the hospital to the pat...   
1                                     Pathology code   
2                                           02B28662   
3                                           08B18876   
4                                            08B260A   

                               ID_ORIGINAL_NHC_BBANK  \
0  Unique identifier from Hospital (LP, OVE, MDA ...   
1  Unique alphanumeric code from the hospital or ...   
2                                                NaN   
3                                                NaN   
4                                             

In [32]:
# Clinical database: the two annotation rows (indexes 0 and 1) followed by every sample
# flagged with clinical information, restricted to the leading columns.
annotation_rows = new_df_with_annotations.loc[[0, 1]]
clinical_samples = new_df_with_annotations[new_df_with_annotations['Clinical_info'] == 'YES']
filtered_clinical = pd.concat([annotation_rows, clinical_samples])

# NOTE(review): iloc[:, :25] keeps positions 0-24 only. The original comment said
# "columns from index 0 to 25"; if position 25 is meant to be included, use :26 - confirm.
new_df_clinical = filtered_clinical.iloc[:, :25]

# Preview the first rows, then write the file (309 samples)
print(new_df_clinical.head())
new_df_clinical.to_excel("/home/vant/TFM/Final_db/Definitive_clinical_database.xlsx", index=False)

                                             ID_CNIO  \
0                       Unique CNIO tumor identifier   
1  LP: ID for samples from La Paz Hospital; OVE: ...   
2                                                LP1   
3                                                LP2   
4                                                LP3   

                                         ID_ORIGINAL  \
0  Unique identifier from the hospital to the pat...   
1                                     Pathology code   
2                                           02B28662   
3                                           08B18876   
4                                            08B260A   

                               ID_ORIGINAL_NHC_BBANK  \
0  Unique identifier from Hospital (LP, OVE, MDA ...   
1  Unique alphanumeric code from the hospital or ...   
2                                                NaN   
3                                                NaN   
4                                             

In [33]:
### Genomic and TILs databases
# Columns exported to the genomic database
columns_to_include = [
    'ID_CNIO', 'ID_ORIGINAL', 'ID_ORIGINAL_NHC_BBANK', 'HISTOLOGY',
    'IHC_defective_MMR_protein', 'IHC pattern', 'IHC_MMR_class', 'PCR',
    'Evidence score for MMRd', 'IHCd', 'PCRd', 'QC_filters', 'Genomic_info',
    'MMR_final_status', 'CLASSd', 'MSI_sensor2', 'TMB', 'ID2', 'ID7', 'SUM ID2+ID7',
    'CNV', '%genome_altered', 'Pol_mutation', 'MMRd_mutation', 'Non-MMRd_mutation',
]
# Columns exported to the TILs database
columns_to_include2 = [
    'ID_CNIO', 'ID_ORIGINAL', 'ID_ORIGINAL_NHC_BBANK', 'HISTOLOGY',
    'TILs_score_info', 'TILs_CNIO_info', 'TILs_ep_CNIO', 'TILs_ep_LP',
    'TILs_ep_OTTA', 'TILs_tu_CNIO', 'TILs_tu_LP', 'TILs_raw_ep', 'TILs_raw_tu',
]

# Description and coding rows (indexes 0 and 1) that head both exports
annotation_rows = new_df_with_annotations.loc[[0, 1]]

# Genomic database: annotation rows + samples flagged with genomic information
genomic_samples = new_df_with_annotations[new_df_with_annotations['Genomic_info'] == 'YES']
filtered_genomic = pd.concat([annotation_rows, genomic_samples])
new_df_genomic = filtered_genomic[columns_to_include]

# TILs database: annotation rows + samples flagged with TILs score information
tils_samples = new_df_with_annotations[new_df_with_annotations['TILs_score_info'] == 'YES']
filtered_tils = pd.concat([annotation_rows, tils_samples])
new_df_tils = filtered_tils[columns_to_include2]

# Preview both dataframes, then write them
print(new_df_genomic.head())
print(new_df_tils.head())
# 173 samples
new_df_genomic.to_excel("/home/vant/TFM/Final_db/Definitive_genomic_database.xlsx", index=False)
# 232 samples / 195 if only the CNIO counts are selected
new_df_tils.to_excel("/home/vant/TFM/Final_db/Definitive_TILs_database.xlsx", index=False)

                                             ID_CNIO  \
0                       Unique CNIO tumor identifier   
1  LP: ID for samples from La Paz Hospital; OVE: ...   
2                                                LP1   
3                                                LP2   
4                                                LP3   

                                         ID_ORIGINAL  \
0  Unique identifier from the hospital to the pat...   
1                                     Pathology code   
2                                           02B28662   
3                                           08B18876   
4                                            08B260A   

                               ID_ORIGINAL_NHC_BBANK  \
0  Unique identifier from Hospital (LP, OVE, MDA ...   
1  Unique alphanumeric code from the hospital or ...   
2                                                NaN   
3                                                NaN   
4                                             

In [38]:
# Full-width export (all columns) limited to the samples that have genomic data.
# Rows labelled 0 and 1 are placed first, followed by every row whose
# 'Genomic_info' equals 'YES'.
genomic_yes = pd.concat([
    new_df_with_annotations.loc[[0, 1]],
    new_df_with_annotations[new_df_with_annotations['Genomic_info'].eq('YES')],
])
# The row count includes the two leading rows: 173+2 (description + coding rows)
count_genomic = len(genomic_yes)
print(f"Number of samples with genomic info are:{count_genomic}")
genomic_yes.to_excel("/home/vant/TFM/Final_db/Samples_Genomic_definitive.xlsx", index=False)

Number of samples with genomic info are:175


In [39]:
# Full-width export (all columns) limited to the samples that have clinical data.
# Rows labelled 0 and 1 are placed first, followed by every row whose
# 'Clinical_info' equals 'YES'.
clinical_yes = pd.concat([
    new_df_with_annotations.loc[[0, 1]],
    new_df_with_annotations[new_df_with_annotations['Clinical_info'].eq('YES')],
])
# The printed row count includes the two leading rows (labels 0 and 1)
count_clinical = len(clinical_yes)
print(f"Number of samples with clinical info are:{count_clinical}")
clinical_yes.to_excel("/home/vant/TFM/Final_db/Samples_Clinical_definitive.xlsx", index=False)

Number of samples with clinical info are:311


In [40]:
# Full-width export (all columns) limited to the samples that have TILs data.
# Rows labelled 0 and 1 are placed first, followed by every row whose
# 'TILs_score_info' equals 'YES'.
tils_yes = pd.concat([
    new_df_with_annotations.loc[[0, 1]],
    new_df_with_annotations[new_df_with_annotations['TILs_score_info'].eq('YES')],
])
# The printed row count includes the two leading rows (labels 0 and 1)
count_tils = len(tils_yes)
print(f"Number of samples with TILs info are:{count_tils}")
# Author's note: 233 in Alice's original files, 232 here
tils_yes.to_excel("/home/vant/TFM/Final_db/Samples_TILs_definitive.xlsx", index=False)

Number of samples with TILs info are:234


In [37]:
# Preview the first 25 rows of final_df as plain text.
# NOTE(review): this cell carries execution count 37 but sits after cells 38-40,
# so it was run out of order; final_df is presumably built in an earlier cell
# (not visible here) — confirm it still exists after Restart & Run All.
print(final_df.head(25))

   ID_CNIO  ID_ORIGINAL ID_ORIGINAL_NHC_BBANK  HISTOLOGY HISTOLOGY_DETAIL AGE  \
0      LP1     02B28662                   NaN        0.0         No mixed  50   
1      LP2     08B18876                   NaN        0.0         No mixed  42   
2      LP3      08B260A                   NaN        0.0         No mixed  68   
3      LP4     11B18858                   NaN        0.0         No mixed  50   
4      LP5      99B6752                   NaN        0.0         No mixed  82   
5      LP6      00B9877                   NaN        0.0         No mixed  62   
6      LP7     12B25893                   NaN        0.0         No mixed  45   
7      LP8     10B25690                   NaN        0.0         No mixed  48   
8      LP9     08B10682                   NaN        0.0         No mixed  51   
9     LP10      00B6886                   NaN        0.0         No mixed  56   
10    LP11     03B2020A                   NaN        0.0         No mixed  34   
11    LP12     08B27334     