In [1]:
from pathlib import Path
import pandas as pd
import ast
from tqdm import tqdm

### Data debugging

In [38]:
# Alternative cohort (DACHS): uncomment these three lines and comment out the FOXTROT ones below to switch.
# feature_dir = Path('/lustre/groups/peng/datasets/histology_data/DACHS/features/Raw/Dachs_HistAuGAN_CTransPath/')
# clini_table =  Path('/lustre/groups/peng/datasets/histology_data/clini_tables/DACHS-CRC-DX_CLINI.xlsx')
# slide_csv = Path('/lustre/groups/peng/datasets/histology_data/clini_tables/DACHS-CRC-DX_SLIDE.csv')
# Active cohort (FOXTROT): feature directory, clinical table and slide table.
feature_dir = Path('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath/')
clini_table =  Path('/lustre/groups/peng/datasets/histology_data/clini_tables/FOXTROT-CRC-DX_CLINI.xlsx')
slide_csv = Path('/lustre/groups/peng/datasets/histology_data/clini_tables/FOXTROT-CRC-DX_resection_tumor_slide.csv')
# Clinical columns treated as targets; rows are kept later only if every target value is in `categories`.
target_labels = ['isMSIH']
# Extra clinical columns to keep next to the targets (only its keys are used by the column filter); empty here.
clini_info= {}
# Label values accepted as valid for the target columns.
categories = ['Not mut.', 'Mutat.', 'nonMSIH', 'MSIH', 'WT', 'MUT', 'wt', 'MT']



In [39]:
# Read the clinical table (CSV or Excel, decided by the file suffix) and the slide table, all values as strings.
if Path(clini_table).suffix == '.csv':
    clini_df = pd.read_csv(clini_table, dtype=str)
else:
    clini_df = pd.read_excel(clini_table, dtype=str)
slide_df = pd.read_csv(slide_csv, dtype=str)

# Cohort tables spell the same clinical column differently (case-sensitive);
# map every known spelling to one canonical column name.
column_aliases = {
    'isMSIH': ['MSI'],
    'braf': ['BRAF', 'BRAF_mutation', 'braf_status'],
    'kras': ['KRAS', 'kras_status', 'KRAS_mutation'],
    'nras': ['NRAS', 'NRAS_mutation'],
    'AGE': ['Age'],
}
rename_map = {
    alias: canonical
    for canonical, aliases in column_aliases.items()
    for alias in aliases
}
# One row per (patient, slide) pair: inner join on the shared PATIENT column.
df = clini_df.merge(slide_df, on='PATIENT').rename(columns=rename_map)

In [None]:
clini_df

In [None]:
slide_df

In [42]:
slide_df.PATIENT.nunique()

869

In [43]:
# Keep only the identifier columns, the target labels and any extra clinical columns.
# Selecting with a boolean column mask replaces the loop that dropped columns one by one
# with inplace=True while iterating `df.columns`: that loop raised KeyError as soon as an
# unwanted column name occurred twice (the first drop removes both, the second finds none),
# and it rebuilt the keep-list on every iteration.
keep_columns = [*target_labels, 'PATIENT', 'SLIDE', 'FILENAME', *clini_info.keys()]
df = df.loc[:, df.columns.isin(keep_columns)]
# Remove rows/slides whose value for any target is not one of the accepted categories.
for target in target_labels:
    df = df[df[target].isin(categories)]

In [49]:
# Collect every .h5 feature file below the configured feature directory (recursive search).
# Uses `feature_dir` instead of a second hard-coded copy of the same path, so changing the
# cohort in the configuration cell is enough.
h5s = set(feature_dir.glob('**/*.h5'))


In [None]:
h5s

### Create slide CSV for the MCO cohort from the extracted feature files

In [51]:
# Re-point `feature_dir` at the MCO-CRC-DX feature directory; the cells below list its .h5 files.
feature_dir = Path('/lustre/groups/peng/datasets/histology_data/MCO-CRC-DX/features/Macenko/CTransPath')

In [101]:
# All .h5 files anywhere below `feature_dir`, de-duplicated into a set.
h5s = set(feature_dir.rglob('*.h5'))


In [102]:
# File name without directory and without the .h5 suffix, one entry per feature file.
filenames = [feature_file.stem for feature_file in h5s]

In [103]:
# Single-column frame of feature-file stems; the column is named 'ParentSpecimen' so it can be merged with the mapping table.
names = pd.DataFrame({'ParentSpecimen': filenames})

In [60]:
# Export the feature-file stems; the default index=True also writes the row index as an unnamed first column.
names.to_csv('filenames.csv')

In [None]:
names

In [96]:
# Mapping table for MCO; the following cells use its 'ParentSpecimen', 'PATIENT' and 'Unnamed: 2' columns.
mappings = pd.read_csv('MCO_slide_large.csv')

  mappings = pd.read_csv('MCO_slide_large.csv')


In [97]:
# Discard the stray unnamed third column of the mapping file.
mappings = mappings.drop(columns=['Unnamed: 2'])

In [99]:
# Keep only rows in which every column has a value.
mappings = mappings.dropna(axis=0, how='any')

In [None]:
mappings

In [105]:
# Inner join: keep only mapping rows whose 'ParentSpecimen' has a feature file.
df = pd.merge(mappings, names, how='inner', on='ParentSpecimen')

In [106]:
# Remove fully identical rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [None]:
df

In [108]:
# Store patient IDs as integers.
df['PATIENT'] = df['PATIENT'].astype(int)

In [109]:
# Renumber rows from 0; the old index is kept as an 'index' column (removed in the next cell).
df = df.reset_index(drop=False)

In [110]:
# Remove the 'index' column left behind by reset_index().
df = df.drop(columns='index')

In [None]:
df

In [112]:
# Save the slide table without the row index, like the other slide CSVs written in this notebook;
# with the default index=True the file gains a meaningless unnamed first column.
df.to_csv('MCO-CRC-DX_slide.csv', index=False)

### Create slide CSV for MCO from the `slide_path` column of the clini table

In [23]:
# Full MCO clinical table; later cells use its 'PATIENT' and 'slide_path' columns.
clini = pd.read_excel('MCO-CRC-DX_clini_full.xlsx')

In [None]:
clini

In [44]:
# Independent copy of the two columns needed for the slide table. Later cells overwrite
# `slide_path`; without .copy() those writes target a slice of `clini`, which triggers
# SettingWithCopyWarning and is not guaranteed to take effect.
slide = clini[['PATIENT', 'slide_path']].copy()

In [None]:
slide

In [None]:
def _parse_slide_paths(cell):
    """Convert one stringified list of paths (e.g. "[Path('a'), Path('b')]") into a list of Path objects."""
    # Strip the 'Path(' ... ')' wrappers so what remains is a plain list-of-strings literal.
    # NOTE(review): this removes every ')' in the cell, including any inside a file name — confirm none occur.
    literal = cell.replace('Path(', '').replace(')', '')
    return [Path(path_str) for path_str in ast.literal_eval(literal)]


# Build the parsed column in one pass and attach it as a whole. The previous per-row
# `slide.slide_path[i] = ...` was chained assignment on a slice of `clini`: it raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
parsed_paths = pd.Series(
    [_parse_slide_paths(cell) for cell in tqdm(slide['slide_path'])],
    index=slide.index,
    dtype=object,
)
slide = slide.assign(slide_path=parsed_paths)


In [47]:
# One row per slide: every list element in 'slide_path' becomes its own row (PATIENT is repeated).
slide = slide.explode(column='slide_path')

In [48]:
# explode() repeats index labels; renumber rows from 0 and keep the old labels as an 'index' column for now.
slide = slide.reset_index(drop=False)

In [None]:
slide.slide_path[0]

In [None]:
# Reduce every path to its stem (file name without directory and final suffix).
# The column is replaced as a whole instead of via `slide.slide_path[i] = ...`, which is
# chained assignment: it can raise SettingWithCopyWarning and does nothing under
# pandas Copy-on-Write.
slide['slide_path'] = [slide_file.stem for slide_file in tqdm(slide['slide_path'])]

In [None]:
slide

In [52]:
# Remove the 'index' column left behind by reset_index().
slide = slide.drop(columns='index')

In [53]:
# The slide table names this column 'FILENAME'.
slide = slide.rename(columns={'slide_path': 'FILENAME'})

In [None]:
slide.keys()

In [55]:
# Save the PATIENT/FILENAME table without the row index.
# NOTE: this is the same file name the feature-file based slide table was written to earlier in this notebook, so that file is overwritten.
slide.to_csv('MCO-CRC-DX_slide.csv', index=False)

In [56]:
# Read the CSV back in to inspect what was written.
test = pd.read_csv('MCO-CRC-DX_slide.csv')

In [None]:
test

### Add clinical values such as BRAF and KRAS

In [None]:
# Load the experimental results and name the patient column 'PATIENT' so it can be merged with the clini table.
exp = (
    pd.read_excel('JJ_MCO_Experimental_Full_Raw_2021.xlsx')
    .rename(columns={'Patient_ID': 'PATIENT'})
)

In [3]:
# Keep only the patient ID, the analyte name and its result.
# A boolean column mask replaces the loop that dropped columns with inplace=True while
# iterating `exp.columns`; it yields a new frame and leaves earlier displays of `exp` valid.
exp = exp.loc[:, exp.columns.isin(['PATIENT', 'Analyte', 'Result'])]

In [17]:
# Rows of the experimental table whose analyte is BRAF.
braf = exp[exp['Analyte'].eq('BRAF')]

In [None]:
braf

In [None]:
braf.Result.unique()

In [20]:
# Recode the raw result strings to short labels; note 'NaN' here is a literal string, not a missing value.
braf_result_codes = {
    'Mutated c.1799T>A (V600E)': 'MUT',
    'Wildtype': 'WT',
    'Not available': 'NaN',
}
braf = braf.rename(columns={'Result': 'BRAF'}).replace(braf_result_codes)


In [21]:
# Drop the now-constant analyte column, collapse identical rows and renumber from 0.
braf = (
    braf
    .drop(columns='Analyte')
    .drop_duplicates()
    .reset_index(drop=True)
)


In [None]:
braf

In [25]:
# Inner join: only patients present in both the clini table and the BRAF results remain.
clini_braf = pd.merge(clini, braf, how='inner', on='PATIENT')

In [None]:
clini_braf

In [28]:
# Rows of the experimental table whose analyte is KRAS.
kras = exp[exp['Analyte'].eq('KRAS')]


In [None]:
kras


In [None]:
# Distinct raw result strings for KRAS. At this point the column is still called 'Result'
# (it is renamed to 'KRAS' only in the next cell), so `kras.KRAS` would raise AttributeError
# when the notebook is run top to bottom.
kras.Result.unique()


In [36]:
# Every mutation spelling listed here collapses to 'MUT' (trailing spaces in some entries are intentional: they match the raw data strings exactly).
kras_mutated_spellings = [
    'Mutated (Codon 12/13)',
    'Mutated c.34G>T (G12C)',
    'Mutated c.38G>A (G13D)',
    'Mutated c.35G>C (G12A)',
    'Mutated c.35G>A (G12D)',
    'Mutated c.35G>T (G12V)|Mutated c.38G>A (G13D) ',
    'Mutated c.35G>T (G12V)',
    'Mutated c.34G>A (G12S)',
    'Mutated c.34G>C (G12R)',
    'Mutated c.37G>T (G13C)',
    'Mutated c.183A>C (Q61H) ',
    'Mutated',
    'Mutated c.38G>T (G13V)',
    'Mutated c.34G>T (G12C) & Mutated c.35G>C (G12A) & Mutated c.38G>T (G13V)',
]
kras_result_codes = dict.fromkeys(kras_mutated_spellings, 'MUT')
# 'NaN' is a literal string here, not a missing value.
kras_result_codes.update({'Wildtype': 'WT', 'Not available': 'NaN'})
kras = kras.rename(columns={'Result': 'KRAS'}).replace(kras_result_codes)


In [38]:

# Drop the now-constant analyte column, collapse identical rows and renumber from 0.
kras = (
    kras
    .drop(columns='Analyte')
    .drop_duplicates()
    .reset_index(drop=True)
)


In [None]:
kras


In [40]:
# Inner join: add the KRAS label for patients that also have a KRAS result.
clini_braf_kras = pd.merge(clini_braf, kras, how='inner', on='PATIENT')


In [42]:
# Persist the merged table. The default index=True also writes the row index, which comes back as an 'Unnamed: 0' column when the file is read again.
clini_braf_kras.to_excel('MCO-CRC-DX_clini_full_braf_kras.xlsx')

### Create clini and slide tables for MAINZ BIOPSIES

In [2]:
# Slide list for this cohort; the cells below use its 'Patient_ID' and 'MSS/MSI' columns.
table = pd.read_csv('Liste_Slides_Sophia.csv')

In [4]:
# Unique patient IDs in order of first appearance; the ID is everything before '_Slide' in 'Patient_ID'.
# dict.fromkeys de-duplicates in a single pass while preserving insertion order, replacing a
# list-membership test that rescanned the growing list for every row.
patient = list(dict.fromkeys(p.split('_Slide')[0] for p in table['Patient_ID']))

In [6]:
# One row per patient, single column 'PATIENT'.
clini = pd.DataFrame({'PATIENT': patient})

In [7]:
# Placeholder label column, filled per patient in the next cell. Note this is the literal string 'NaN', not a missing value.
clini['isMSIH'] = 'NaN'

In [25]:
# Label each patient with the 'MSS/MSI' value recorded for its first slide ('<PATIENT>_Slide_1').
slide_1_ids = clini['PATIENT'] + '_Slide_1'
# First row per Patient_ID — the same row the former `.values[0]` lookup picked.
slide_labels = table.drop_duplicates('Patient_ID').set_index('Patient_ID')['MSS/MSI']
# Fail loudly (as the per-row lookup did) when a patient has no '_Slide_1' entry.
unlabelled = clini.loc[~slide_1_ids.isin(slide_labels.index), 'PATIENT']
if len(unlabelled):
    raise KeyError(f"no '_Slide_1' row in table for patients: {unlabelled.tolist()}")
# Whole-column assignment instead of `clini.isMSIH[i] = ...`: that form is chained assignment
# (SettingWithCopyWarning, no effect under pandas Copy-on-Write) and rescanned `table` per patient.
clini['isMSIH'] = slide_1_ids.map(slide_labels)

In [27]:
# Save the clini table without the row index, consistent with the slide CSV written below;
# with the default index=True the file gains a column that is read back as 'Unnamed: 0'.
clini.to_excel('MAINZ-BIOPSIES-CRC-DX_clini.xlsx', index=False)

In [35]:
# One row per slide; the 'Patient_ID' entries of the slide list serve as file names.
slide = pd.DataFrame({'FILENAME': table['Patient_ID'].values})

In [37]:
# Placeholder patient column, filled in the next cell. Note this is the literal string 'NaN', not a missing value.
slide['PATIENT'] = 'NaN'

In [38]:
# Patient ID = the part of the file name before '_Slide'.
# The column is assigned as a whole instead of via `slide['PATIENT'][i] = ...`, which is
# chained assignment: it can raise SettingWithCopyWarning and does nothing under
# pandas Copy-on-Write.
slide['PATIENT'] = [filename.split('_Slide')[0] for filename in slide['FILENAME']]

In [41]:
# Save the FILENAME/PATIENT slide table without the row index.
slide.to_csv('MAINZ-BIOPSIES-CRC-DX_slide.csv', index=False)

In [2]:
# Path of the MAINZ-BIOPSIES clini table inside the shared clini_tables directory.
clini_path = Path('/lustre/groups/peng/datasets/histology_data/clini_tables/MAINZ-BIOPSIES-CRC-DX_clini.xlsx')

In [3]:
# Load the clini table from `clini_path`.
clini = pd.read_excel(clini_path)

In [7]:
# Write the table back without the row index; with the default index=True every
# read/write round trip adds another 'Unnamed: 0' column to the file.
clini.to_excel(clini_path, index=False)

In [10]:
# Path of the MCO clini table (with BRAF/KRAS columns, per its file name) inside the shared clini_tables directory.
clini_path = Path('/lustre/groups/peng/datasets/histology_data/clini_tables/MCO-CRC-DX_clini_full_braf_kras.xlsx')

In [11]:
# Load the clini table from `clini_path`; the following cells rename and recode its 'MSI' column and drop helper columns.
clini = pd.read_excel(clini_path)

In [15]:
# Use the target column name 'isMSIH' for the MSI status.
clini = clini.rename(columns={'MSI': 'isMSIH'})

In [16]:
# 'Stable' becomes the category label 'nonMSIH'.
clini['isMSIH'] = clini['isMSIH'].replace('Stable', 'nonMSIH')

In [17]:
# 'Unstable' becomes the category label 'MSIH'.
clini['isMSIH'] = clini['isMSIH'].replace('Unstable', 'MSIH')

In [5]:
# The slide paths are not kept in the clini table.
clini = clini.drop(columns='slide_path')

In [7]:
# Remove the leftover index columns picked up from earlier saves and merges.
clini = clini.drop(columns=['Unnamed: 0', 'Unnamed: 0_y'])

In [18]:
# Write the cleaned table back without the row index. The default index=True would put
# back an 'Unnamed: 0' column on the next read — the very column removed above.
clini.to_excel(clini_path, index=False)

In [None]:
clini

In [23]:
# Checkpoint file to inspect (fold 0 of the debug transformer run, per its file name).
ckpt_path = Path('/lustre/groups/peng/workspace/sophia.wagner/logs/debug_transformer_MCO_macenko_isMSIH/models/best_model_debug_transformer_MCO_macenko_isMSIH_fold0.ckpt')

In [20]:
import torch

In [24]:
# Load the checkpoint for inspection. map_location='cpu' places all tensors on the CPU, so the
# cell also works on a kernel without a GPU even if the checkpoint was saved from one.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
ckpt = torch.load(ckpt_path, map_location='cpu')

In [26]:
Path('/lustre/groups/peng/workspace/sophia.wagner/logs/idkidc/debug_transformer_MCO_macenko_isMSIH/results_debug_transformer_MCO_macenko_isMSIH.csv').is_file()

True