In [17]:
import os
import pandas as pd
from pathlib import Path

In [2]:
# Load the tab-separated sample sheet and display it as the cell output.
samples_df = pd.read_table('er_status_samples.txt')
samples_df

Unnamed: 0,sample,Histology_Annotation,PAM50,Tumor_or_normal,er_status_by_ihc,pr_status_by_ihc,HER2_status,Triple_negative_status,id,filename,md5,size
0,TCGA-EW-A1PH-01,Invasive ductal carcinoma,Basal,Tumor,Negative,Negative,Indeterminate,No,eed0b09f-9b9f-4da3-b24b-8d916be3fa3e,TCGA-EW-A1PH-01Z-00-DX1.77e0f907-fe59-4dc2-b58...,e5f640dc7262aad8abdf7c5ef6f7e08d,2764279457
1,TCGA-E2-A1L7-01,Invasive ductal carcinoma,LumB,Tumor,Negative,Negative,Negative,Yes,966c7061-a5e0-4340-8e51-2d5dd509c663,TCGA-E2-A1L7-01Z-00-DX1.BE796CD2-2E81-44E8-8CA...,c2e94fca66e21076d27816d0e5682bf9,354195597
2,TCGA-C8-A12Q-01,Invasive ductal carcinoma,HER2E,Tumor,Negative,Negative,Positive,No,b411c802-4c8d-4b65-8eee-72a538f14f65,TCGA-C8-A12Q-01Z-00-DX1.CE74E5B7-FD30-4CBE-871...,69e6ca0e1bd272bef529229b65bcc265,857550913
3,TCGA-AR-A0TU-01,Invasive ductal carcinoma,Basal,Tumor,Negative,Negative,Negative,Yes,ca1f5e9c-2943-4cfd-829b-ad7713f49fa0,TCGA-AR-A0TU-01Z-00-DX1.2CBBDDAB-C1DD-4205-A55...,3434b8fc245f2942d08f75c451afd4dc,1690981359
4,TCGA-AO-A12D-01,Invasive ductal carcinoma,HER2E,Tumor,Negative,Negative,Positive,No,391143f8-da62-48fb-863f-ac505a6b0eaa,TCGA-AO-A12D-01Z-00-DX1.BA006BAD-5C6E-4099-BC9...,85143e70321ee7e721665abb7e2fbae9,1052702068
5,TCGA-S3-AA15-01,Invasive ductal carcinoma,Basal,Tumor,Negative,Negative,Negative,Yes,0018cc22-498a-45b8-bfd4-b0fe0c3a2f0a,TCGA-S3-AA15-01Z-00-DX1.A2456A4A-E6E8-4429-8F0...,268b85a5d4db9405c646f7f7b148b921,1168025971
6,TCGA-A8-A09D-01,Invasive ductal carcinoma,LumA,Tumor,Positive,Positive,Negative,No,6ed1420e-7d02-4f4e-86f7-81f1afe99e40,TCGA-A8-A09D-01Z-00-DX1.66312A8A-88BA-4B58-96D...,682d609ac65ceb7e816a31a4d6231fd6,639495708
7,TCGA-A8-A08O-01,Invasive ductal carcinoma,LumA,Tumor,Positive,Positive,Negative,No,5f71ff26-3b04-432b-8bfc-1dc439263d79,TCGA-A8-A08O-01Z-00-DX1.BC87C01D-F081-41CA-939...,470b15b54e76abdd0a0cdee1fc76ac0a,289580616
8,TCGA-E2-A10E-01,Invasive ductal carcinoma,LumA,Tumor,Positive,Positive,Negative,No,b3f4c01e-4068-45bc-8a7f-e03683f12d42,TCGA-E2-A10E-01Z-00-DX1.C45030A9-CC1A-4BA7-8F6...,4ce4a5509952c214fc50e877d58d2827,1605224769
9,TCGA-OK-A5Q2-01,Invasive lobular carcinoma,LumA,Tumor,Positive,Positive,Negative,No,89390bcb-3498-4763-8403-0a9012e66aa5,TCGA-OK-A5Q2-01Z-00-DX1.0D169898-37C6-44CA-AC8...,9b9b51881903f858c68520b9e5d6d87b,1470823965


In [49]:
def create_ludwig_inputs(samples_df):
    '''
    Create a dataframe with the image paths and labels for Ludwig.

    For every row of ``samples_df`` the tile directory is derived from the
    row's ``filename``: with ``<stem>`` being the filename minus its final
    extension, tiles are read from ``<stem>/<stem>_tiles`` (relative to the
    current working directory). Each directory entry becomes one output row.

    Parameters
    ----------
    samples_df : pandas.DataFrame
        Must provide the columns ``filename``, ``er_status_by_ihc`` and
        ``sample``.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_path``, ``er_status_by_ihc`` and ``sample``; one row
        per tile, in input row order and, within a sample, in sorted tile
        name order. Empty (but with the same columns) when there are no tiles.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when a sample's tile directory is
        missing or unreadable.
    '''
    columns = ['image_path', 'er_status_by_ihc', 'sample']
    # Collect plain records and build the frame once at the end; appending to
    # a DataFrame row by row copies data on every insert.
    records = []
    for row in samples_df.itertuples():
        sample_dir = Path(row.filename).stem
        tiles_dir = os.path.join(sample_dir, sample_dir + '_tiles')
        # os.listdir returns entries in arbitrary order; sort so the
        # generated CSVs are reproducible across runs and filesystems.
        for tile in sorted(os.listdir(tiles_dir)):
            records.append({
                'image_path': os.path.join(tiles_dir, tile),
                'er_status_by_ihc': row.er_status_by_ihc,
                'sample': row.sample,
            })

    return pd.DataFrame(records, columns=columns)

In [47]:
# Hold out two hand-picked samples for testing; every other sample is used for training.
test_samples = ['TCGA-EW-A1PH-01', 'TCGA-A8-A09D-01']
is_test_sample = samples_df['sample'].isin(test_samples)
train_samples = samples_df.loc[~is_test_sample, 'sample']

# Training split: build the tile table and save it.
ludwig_train = create_ludwig_inputs(samples_df[~is_test_sample])
ludwig_train.to_csv('er_status_training_data.csv', index=False)

# Test split: build the tile table and save it.
ludwig_test = create_ludwig_inputs(samples_df[is_test_sample])
ludwig_test.to_csv('er_status_test_data.csv', index=False)

In [50]:
# Build the tile table for every sample (no train/test split) and save it.
ludwig_all = create_ludwig_inputs(samples_df=samples_df)
ludwig_all.to_csv(path_or_buf='er_status_all_data.csv', index=False)