In [1]:
from pathlib import Path
import pandas as pd
import ast
from tqdm import tqdm

### Data debugging

In [38]:
# feature_dir = Path('/lustre/groups/peng/datasets/histology_data/DACHS/features/Raw/Dachs_HistAuGAN_CTransPath/')
# clini_table =  Path('/lustre/groups/peng/datasets/histology_data/clini_tables/DACHS-CRC-DX_CLINI.xlsx')
# slide_csv = Path('/lustre/groups/peng/datasets/histology_data/clini_tables/DACHS-CRC-DX_SLIDE.csv')
feature_dir = Path('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath/')
clini_table =  Path('/lustre/groups/peng/datasets/histology_data/clini_tables/FOXTROT-CRC-DX_CLINI.xlsx')
slide_csv = Path('/lustre/groups/peng/datasets/histology_data/clini_tables/FOXTROT-CRC-DX_resection_tumor_slide.csv')
target_labels = ['isMSIH']
clini_info= {}
categories = ['Not mut.', 'Mutat.', 'nonMSIH', 'MSIH', 'WT', 'MUT', 'wt', 'MT']



In [39]:
clini_df = pd.read_csv(clini_table, dtype=str) if Path(clini_table).suffix == '.csv' else pd.read_excel(
    clini_table, dtype=str)
slide_df = pd.read_csv(slide_csv, dtype=str)
df = clini_df.merge(slide_df, on='PATIENT')
# adapt dataframe to case sensitive clini tables
df = df.rename({
    'MSI': 'isMSIH',
    'BRAF': 'braf', 'BRAF_mutation': 'braf', 'braf_status': 'braf', 
    'KRAS': 'kras', 'kras_status': 'kras', 'KRAS_mutation': 'kras',
    'NRAS': 'nras', 'NRAS_mutation': 'nras',
    'Age': 'AGE'
}, axis=1)

In [40]:
clini_df

Unnamed: 0,Case,PATIENT,MMRStatus,MLH_1,PMS_2,MSH_2,MSH_6,isMSIH,x_TCD_tumour1_,x_TCD_tumour2_
0,FOXTROT_1001,1001,pMMR,1,1,1,1,nonMSIH,41.9672,
1,FOXTROT_1002,1002,pMMR,1,1,1,1,nonMSIH,28.1139,
2,FOXTROT_1003,1003,pMMR,1,1,1,1,nonMSIH,18.5668,
3,FOXTROT_1004,1004,pMMR,1,1,1,1,nonMSIH,19.1638,
4,FOXTROT_1005,1005,pMMR,1,1,1,1,nonMSIH,8.4746,
...,...,...,...,...,...,...,...,...,...,...
1048,FOXTROT_2049,2049,pMMR,1,1,1,1,nonMSIH,35.461,
1049,FOXTROT_2050,2050,UNKNOWN,,,,,,50,
1050,FOXTROT_2051,2051,dMMR,0,0,1,1,MSIH,21.7391,
1051,FOXTROT_2052,2052,pMMR,1,1,1,1,nonMSIH,16.0714,


In [41]:
slide_df

Unnamed: 0,FILENAME,PATIENT,CLASSIFICATION
0,FOXROT_100103,1053,resection tumor
1,FOXROT_100104,1053,resection tumor
2,FOXROT_100107,1053,resection tumor
3,FOXROT_100110,1053,resection tumor
4,FOXROT_100114,1053,resection tumor
...,...,...,...
6006,FOXROT_99834,1027,resection tumor
6007,FOXROT_99835,1054,resection tumor
6008,FOXROT_99845,1028,resection tumor
6009,FOXROT_99856,1028,resection tumor


In [42]:
slide_df.PATIENT.nunique()

869

In [43]:
# remove columns not in target_labels
for key in df.columns:
    if key not in target_labels + ['PATIENT', 'SLIDE', 'FILENAME', *list(clini_info.keys())]:
        df.drop(key, axis=1, inplace=True)
# remove rows/slides with non-valid labels
for target in target_labels:
    df = df[df[target].isin(categories)]

In [49]:
# h5s = set(feature_dir.glob('**/*.h5'))
h5s = set(Path('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath').glob('**/*.h5'))


In [50]:
h5s

{PosixPath('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath/TRANSFER_FOXTROT_1300_to_1499/1481_wMacro/FOXROT_311488.h5'),
 PosixPath('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath/TRANSFER_FOXTROT_1500_to_1699/1638_wMacro/FOXROT_282675.h5'),
 PosixPath('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath/TRANSFER_FOXTROT_1200_to_1299/1207_wMacro/FOXROT_148804.h5'),
 PosixPath('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath/TRANSFER_FOXTROT_1500_to_1699/1531_wMacro/FOXROT_240018.h5'),
 PosixPath('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath/TRANSFER_FOXTROT_1300_to_1499/1431/FOXROT_210564.h5'),
 PosixPath('/lustre/groups/peng/datasets/histology_data/FOXTROT/features/Macenko/FOXTROT-CRC-DX_macenko_ctranspath/TRANSFE

### Create slide-csv for cohort

In [51]:
feature_dir = Path('/lustre/groups/peng/datasets/histology_data/MCO-CRC-DX/features/Macenko/CTransPath')

In [101]:
h5s = set(feature_dir.glob('**/*.h5'))


In [102]:
filenames = [h5.stem for h5 in h5s]

In [103]:
names = pd.DataFrame(filenames, columns=['ParentSpecimen'])

In [60]:
names.to_csv('filenames.csv')

In [104]:
names

Unnamed: 0,ParentSpecimen
0,MCO5488
1,MCO5424
2,MCO4095
3,MCO2273
4,MCO0821
...,...
1610,MCO5627
1611,MCO2291
1612,MCO5562
1613,MCO2996


In [96]:
mappings = pd.read_csv('MCO_slide_large.csv')

  mappings = pd.read_csv('MCO_slide_large.csv')


In [97]:
mappings = mappings.drop('Unnamed: 2', axis=1)

In [99]:
mappings = mappings.dropna()

In [100]:
mappings

Unnamed: 0,PATIENT,ParentSpecimen
0,4809.0,MCO100003
1,4809.0,MCO100003
2,4809.0,MCO100003
3,4809.0,MCO100003
4,4809.0,MCO100003
...,...,...
19164,303.0,MCO9995
19165,303.0,MCO9995
19166,303.0,MCO9995
19167,303.0,MCO9995


In [105]:
df = mappings.merge(names, on='ParentSpecimen')

In [106]:
df = df.drop_duplicates()

In [107]:
df

Unnamed: 0,PATIENT,ParentSpecimen
0,520.0,MCO1156
7,1672.0,MCO1164
14,523.0,MCO1181
21,1956.0,MCO1193
28,1963.0,MCO1209
...,...,...
4584,5119.0,MCO6226
4589,5125.0,MCO6232
4594,5126.0,MCO6233
4599,5128.0,MCO6235


In [108]:
df.PATIENT = df.PATIENT.astype(int)

In [109]:
df = df.reset_index()

In [110]:
df = df.drop('index', axis=1)

In [111]:
df

Unnamed: 0,PATIENT,ParentSpecimen
0,520,MCO1156
1,1672,MCO1164
2,523,MCO1181
3,1956,MCO1193
4,1963,MCO1209
...,...,...
794,5119,MCO6226
795,5125,MCO6232
796,5126,MCO6233
797,5128,MCO6235


In [112]:
df.to_csv('MCO-CRC-DX_slide.csv')

### Create slide csv for MCO

In [23]:
clini = pd.read_excel('MCO-CRC-DX_clini_full.xlsx')

In [43]:
clini

Unnamed: 0,PATIENT,5yOS_E,5yOS,5yDSS_E,FILENAME,OS,OS_E,slide_path,SCORE,Lifetime Vital Status,...,Radiotherapy Treatment Type,Therapeutic Radiotherapy Dose Given,Surgery Target Site (ICD-O-3),Surgical Procedure for Cancer,Unnamed: 0_y,MSI,adjTreat_new,Lymph Nodes(Tested)_new,Lymph Nodes(Positive)_new,N012
0,8,0,59,0,MCO3966,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,-3.255808e-02,Alive,...,,Unknown,C18.6 Descending colon,,0.0,Stable,1.0,14,8,N2
1,12,1,44,1,MCO3968,44,1,[Path('/home/xjiang/crc_surv_path/features/Nor...,-5.048414e-02,Dead,...,,Unknown,C18.6 Descending colon,,1.0,Stable,1.0,13,3,N1
2,13,1,22,0,MCO3969,22,1,[Path('/home/xjiang/crc_surv_path/features/Nor...,-1.474705e+07,Dead,...,,Not Applicable,C18.0 Cecum,,2.0,Stable,0.0,2,0,N0
3,14,1,9,1,MCO3970,9,1,[Path('/home/xjiang/crc_surv_path/features/Nor...,-4.854023e-01,Dead,...,,Unknown,C18.4 Transverse colon,,3.0,Unstable,1.0,17,6,N2
4,17,0,59,0,MCO3973,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,3.359147e-01,Alive,...,,Unknown,C18.7 Sigmoid colon,,4.0,Stable,1.0,16,6,N2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,5125,0,59,0,MCO6232,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,8.634378e-01,Alive,...,,Unknown,"C20.9 Rectum, NOS",Low Anterior resection,1468.0,Stable,1.0,14,0,N0
1391,5126,0,59,0,MCO6233,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,2.331220e-02,Alive,...,,Not Applicable,C18.7 Sigmoid colon,,1469.0,Stable,1.0,11,1,N1
1392,5128,0,59,0,MCO6235,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,-1.875084e-01,Alive,...,,Not Applicable,C18.4 Transverse colon,,1470.0,Stable,0.0,14,0,N0
1393,5129,0,59,0,MCO6236,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,-4.870297e-01,Alive,...,,Yes,"C20.9 Rectum, NOS",Low Anterior resection,1471.0,Stable,0.0,27,0,N0


In [44]:
slide = clini[['PATIENT', 'slide_path']]

In [45]:
slide

Unnamed: 0,PATIENT,slide_path
0,8,[Path('/home/xjiang/crc_surv_path/features/Nor...
1,12,[Path('/home/xjiang/crc_surv_path/features/Nor...
2,13,[Path('/home/xjiang/crc_surv_path/features/Nor...
3,14,[Path('/home/xjiang/crc_surv_path/features/Nor...
4,17,[Path('/home/xjiang/crc_surv_path/features/Nor...
...,...,...
1390,5125,[Path('/home/xjiang/crc_surv_path/features/Nor...
1391,5126,[Path('/home/xjiang/crc_surv_path/features/Nor...
1392,5128,[Path('/home/xjiang/crc_surv_path/features/Nor...
1393,5129,[Path('/home/xjiang/crc_surv_path/features/Nor...


In [46]:
for i in tqdm(range(len(slide))):
    # slide.slide_path[i] = 
    s = slide.slide_path[i]
    s = s.replace('Path(', '').replace(')', '')  # remove the 'Path()' function calls
    path_strs = ast.literal_eval(s)  # evaluate the resulting string

    # convert strings back to Path() objects
    path_list = [Path(x) for x in path_strs]
    slide.slide_path[i] = path_list


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slide.slide_path[i] = path_list
100%|██████████| 1395/1395 [00:12<00:00, 114.56it/s]


In [47]:
slide = slide.explode('slide_path')

In [48]:
slide = slide.reset_index()

In [49]:
slide.slide_path[0]

PosixPath('/home/xjiang/crc_surv_path/features/Norm/MCO-CRC-DX/RetCCL/MCO3966.h5')

In [50]:
for i in tqdm(range(len(slide))):
    slide.slide_path[i] = slide.slide_path[i].stem

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slide.slide_path[i] = slide.slide_path[i].stem
100%|██████████| 1476/1476 [00:06<00:00, 227.33it/s]


In [51]:
slide

Unnamed: 0,index,PATIENT,slide_path
0,0,8,MCO3966
1,1,12,MCO3968
2,2,13,MCO3969
3,3,14,MCO3970
4,4,17,MCO3973
...,...,...,...
1471,1390,5125,MCO6232
1472,1391,5126,MCO6233
1473,1392,5128,MCO6235
1474,1393,5129,MCO6236


In [52]:
slide = slide.drop('index', axis=1)

In [53]:
slide = slide.rename({'slide_path': 'FILENAME'}, axis=1)

In [54]:
slide.keys()

Index(['PATIENT', 'FILENAME'], dtype='object')

In [55]:
slide.to_csv('MCO-CRC-DX_slide.csv', index=False)

In [56]:
test = pd.read_csv('MCO-CRC-DX_slide.csv')

In [57]:
test

Unnamed: 0,PATIENT,FILENAME
0,8,MCO3966
1,12,MCO3968
2,13,MCO3969
3,14,MCO3970
4,17,MCO3973
...,...,...
1471,5125,MCO6232
1472,5126,MCO6233
1473,5128,MCO6235
1474,5129,MCO6236


### Add clinical values such as BRAF and KRAS

In [2]:
exp = pd.read_excel('JJ_MCO_Experimental_Full_Raw_2021.xlsx')
exp = exp.rename({'Patient_ID': 'PATIENT'}, axis=1)

  warn(msg)


In [3]:
for key in exp.columns:
    if key not in ['PATIENT', 'Analyte', 'Result']:
        exp.drop(key, axis=1, inplace=True)

In [17]:
braf = exp.loc[exp['Analyte']=='BRAF']

In [18]:
braf

Unnamed: 0,PATIENT,Analyte,Result
3,8,BRAF,Wildtype
4,8,BRAF,Wildtype
32,12,BRAF,Wildtype
33,12,BRAF,Wildtype
61,13,BRAF,Wildtype
...,...,...,...
46196,5128,BRAF,Wildtype
46220,5129,BRAF,Wildtype
46221,5129,BRAF,Wildtype
46245,5131,BRAF,Wildtype


In [19]:
braf.Result.unique()

array(['Wildtype', 'Mutated c.1799T>A (V600E)', 'Not available'],
      dtype=object)

In [20]:
braf = braf.rename({'Result': 'BRAF'}, axis=1)
braf = braf.replace({'Mutated c.1799T>A (V600E)': 'MUT'})
braf = braf.replace({'Wildtype': 'WT'})
braf = braf.replace({'Not available': 'NaN'})


In [21]:
braf = braf.drop('Analyte', axis=1)
braf = braf.drop_duplicates()
braf = braf.reset_index(drop=True)


In [22]:
braf

Unnamed: 0,PATIENT,BRAF
0,8,WT
1,12,WT
2,13,WT
3,14,MUT
4,17,WT
...,...,...
1511,5125,WT
1512,5126,WT
1513,5128,WT
1514,5129,WT


In [25]:
clini_braf = clini.merge(braf, on='PATIENT')

In [27]:
clini_braf

Unnamed: 0,PATIENT,5yOS_E,5yOS,5yDSS_E,FILENAME,OS,OS_E,slide_path,SCORE,Lifetime Vital Status,...,Therapeutic Radiotherapy Dose Given,Surgery Target Site (ICD-O-3),Surgical Procedure for Cancer,Unnamed: 0_y,MSI,adjTreat_new,Lymph Nodes(Tested)_new,Lymph Nodes(Positive)_new,N012,BRAF
0,8,0,59,0,MCO3966,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,-3.255808e-02,Alive,...,Unknown,C18.6 Descending colon,,0.0,Stable,1.0,14,8,N2,WT
1,12,1,44,1,MCO3968,44,1,[Path('/home/xjiang/crc_surv_path/features/Nor...,-5.048414e-02,Dead,...,Unknown,C18.6 Descending colon,,1.0,Stable,1.0,13,3,N1,WT
2,13,1,22,0,MCO3969,22,1,[Path('/home/xjiang/crc_surv_path/features/Nor...,-1.474705e+07,Dead,...,Not Applicable,C18.0 Cecum,,2.0,Stable,0.0,2,0,N0,WT
3,14,1,9,1,MCO3970,9,1,[Path('/home/xjiang/crc_surv_path/features/Nor...,-4.854023e-01,Dead,...,Unknown,C18.4 Transverse colon,,3.0,Unstable,1.0,17,6,N2,MUT
4,17,0,59,0,MCO3973,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,3.359147e-01,Alive,...,Unknown,C18.7 Sigmoid colon,,4.0,Stable,1.0,16,6,N2,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1431,5125,0,59,0,MCO6232,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,8.634378e-01,Alive,...,Unknown,"C20.9 Rectum, NOS",Low Anterior resection,1468.0,Stable,1.0,14,0,N0,WT
1432,5126,0,59,0,MCO6233,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,2.331220e-02,Alive,...,Not Applicable,C18.7 Sigmoid colon,,1469.0,Stable,1.0,11,1,N1,WT
1433,5128,0,59,0,MCO6235,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,-1.875084e-01,Alive,...,Not Applicable,C18.4 Transverse colon,,1470.0,Stable,0.0,14,0,N0,WT
1434,5129,0,59,0,MCO6236,59,0,[Path('/home/xjiang/crc_surv_path/features/Nor...,-4.870297e-01,Alive,...,Yes,"C20.9 Rectum, NOS",Low Anterior resection,1471.0,Stable,0.0,27,0,N0,WT


In [28]:
kras = exp.loc[exp['Analyte']=='KRAS']


In [29]:
kras


Unnamed: 0,PATIENT,Analyte,Result
6,8,KRAS,Mutated (Codon 12/13)
7,8,KRAS,Mutated c.34G>T (G12C)
35,12,KRAS,Wildtype
36,12,KRAS,Wildtype
63,13,KRAS,Mutated (Codon 12/13)
...,...,...,...
46200,5128,KRAS,Wildtype
46224,5129,KRAS,Wildtype
46225,5129,KRAS,Wildtype
46249,5131,KRAS,Mutated (Codon 12/13)


In [37]:
kras.KRAS.unique()


array(['MUT', 'WT', 'NaN'], dtype=object)

In [36]:
kras = kras.rename({'Result': 'KRAS'}, axis=1)
kras = kras.replace({'Mutated (Codon 12/13)': 'MUT'})
kras = kras.replace({'Mutated c.34G>T (G12C)': 'MUT'})
kras = kras.replace({'Mutated c.38G>A (G13D)': 'MUT'})
kras = kras.replace({'Mutated c.35G>C (G12A)': 'MUT'})
kras = kras.replace({'Mutated c.35G>A (G12D)': 'MUT'})
kras = kras.replace({'Mutated c.35G>T (G12V)|Mutated c.38G>A (G13D) ': 'MUT'})
kras = kras.replace({'Mutated c.35G>T (G12V)': 'MUT'})
kras = kras.replace({'Mutated c.34G>A (G12S)': 'MUT'})
kras = kras.replace({'Mutated c.34G>C (G12R)': 'MUT'})
kras = kras.replace({'Mutated c.37G>T (G13C)': 'MUT'})
kras = kras.replace({'Mutated c.183A>C (Q61H) ': 'MUT'})
kras = kras.replace({'Mutated': 'MUT'})
kras = kras.replace({'Mutated c.38G>T (G13V)': 'MUT'})
kras = kras.replace({'Mutated c.34G>T (G12C) & Mutated c.35G>C (G12A) & Mutated c.38G>T (G13V)': 'MUT'})
kras = kras.replace({'Wildtype': 'WT'})
kras = kras.replace({'Not available': 'NaN'})


In [38]:

kras = kras.drop('Analyte', axis=1)
kras = kras.drop_duplicates()
kras = kras.reset_index(drop=True)


In [39]:
kras


Unnamed: 0,PATIENT,KRAS
0,8,MUT
1,12,WT
2,13,MUT
3,14,WT
4,17,MUT
...,...,...
1516,5125,MUT
1517,5126,WT
1518,5128,WT
1519,5129,WT


In [40]:
clini_braf_kras = clini_braf.merge(kras, on='PATIENT')


In [42]:
clini_braf_kras.to_excel('MCO-CRC-DX_clini_full_braf_kras.xlsx')

### create csv table for MAINZ BIOPSIES

In [2]:
table = pd.read_csv('Liste_Slides_Sophia.csv')

In [4]:
patient = []
for p in table['Patient_ID']:
    patient_id = p.split('_Slide')[0]
    if patient_id not in patient:
        patient.append(patient_id)

In [6]:
clini = pd.DataFrame(patient, columns=['PATIENT'])

In [7]:
clini['isMSIH'] = 'NaN'

In [25]:
for i in range(len(clini)):
    clini.isMSIH[i] = table.loc[table['Patient_ID'] == clini.PATIENT[i] + '_Slide_1', ['MSS/MSI']].values[0][0]

In [27]:
clini.to_excel('MAINZ-BIOPSIES-CRC-DX_clini.xlsx')

In [35]:
slide = pd.DataFrame(table['Patient_ID'].values, columns=['FILENAME'])

In [37]:
slide['PATIENT'] = 'NaN'

In [38]:
for i in range(len(slide)):
    slide['PATIENT'][i] = slide['FILENAME'][i].split('_Slide')[0]

In [41]:
slide.to_csv('MAINZ-BIOPSIES-CRC-DX_slide.csv', index=False)

In [2]:
clini_path = Path('/lustre/groups/peng/datasets/histology_data/clini_tables/MAINZ-BIOPSIES-CRC-DX_clini.xlsx')

In [3]:
clini = pd.read_excel(clini_path)

In [4]:
clini.isMSIH = clini.isMSIH.replace({'MSS': 'nonMSIH'})

In [5]:
clini.isMSIH = clini.isMSIH.replace({'MSI': 'MSIH'})

In [7]:
clini.to_excel(clini_path)