# Generate CSV for CLAM dataset splitting (TCGA)
## Input:
1. Sample clustering results (cluster label): "../results/tcga/sample_clusters_{}_zscore_hc_ward.D2_euclidean_3_reorder.csv".format(gene_signature)
2. Slide id: ../../dataset_csv/tcga_hcc_feature_349.csv

## Output:
1. CSV for CLAM dataset splitting (case id, slide id, cluster): "../../dataset_csv/tcga_hcc_349_{}_highvsrest.csv".format(gene_signature)

In [1]:
import os
import pandas as pd

In [22]:
# Gene signature to process (signatures from Sangro's paper). Options:
#   6G_Interferon_Gamma, Gajewski_13G_Inflammatory, Inflammatory,
#   Interferon_Gamma_Biology, Ribas_10G_Interferon_Gamma, T-cell_Exhaustion
gene_signature = "Inflammatory"

# CSV holding all slide ids
idf = "../../dataset_csv/tcga_hcc_feature_349.csv"

# Clustering results (cluster label per sample) for the selected signature
labelf = "../results/tcga/sample_clusters_{}_zscore_hc_ward.D2_euclidean_3_reorder.csv".format(
    gene_signature
)

In [23]:
# Read both input tables, in this order:
#   labels <- clustering results (labelf)
#   df     <- CSV auto-generated by CLAM segmentation (idf), needed for the full slide name
labels, df = (pd.read_csv(csv_path) for csv_path in (labelf, idf))

In [24]:
# Quick look at both tables: shape first, then the first five rows
for frame in (labels, df):
    print(frame.shape)
    display(frame.head(5))

(336, 2)


Unnamed: 0,Sample,Cluster
0,A6M3.01A,Cluster Median
1,A7IJ.01A,Cluster Median
2,A1EC.01A,Cluster Median
3,A114.01A,Cluster Median
4,A110.01A,Cluster Median


(349, 1)


Unnamed: 0,slide_id
0,TCGA-CC-5259-01Z-00-DX1.D18B2F62-0CC0-4665-90E...
1,TCGA-DD-AADN-01Z-00-DX1.0C031278-22F9-40DE-9CF...
2,TCGA-FV-A4ZP-01Z-00-DX1.4B78C21C-8033-4870-82D...
3,TCGA-DD-AACI-01Z-00-DX1.C2931364-0514-49DB-AC9...
4,TCGA-DD-AADS-01Z-00-DX1.A147E3F6-065A-4313-A16...


In [25]:
# Re-format the sample names so they can be matched against the slide ids.
# Each name is cut down to characters [-8:-1] and "." is swapped for "-", which:
# 1 removes the trailing "A" or "B" after XXXX.XX
# 2 removes the "X" in front of all-numeric names
# 3 gives XXXX-XX, e.g. "A6M3.01A" -> "A6M3-01"
# Done on the whole "Sample" column at once instead of row by row;
# regex=False makes "." a literal dot rather than a match-any-character pattern.
labels["Sample"] = labels["Sample"].str[-8:-1].str.replace(".", "-", regex=False)
print(labels.shape)
display(labels.head(5))

(336, 2)


Unnamed: 0,Sample,Cluster
0,A6M3-01,Cluster Median
1,A7IJ-01,Cluster Median
2,A1EC-01,Cluster Median
3,A114-01,Cluster Median
4,A110-01,Cluster Median


In [26]:
# Collapse the "Median" and "Low" clusters into a single class;
# every other value in the table is left as it is.
merged_label = "Cluster Median + Low"
labels.replace(["Cluster Median", "Cluster Low"], merged_label, inplace=True)
display(labels.head(5))

Unnamed: 0,Sample,Cluster
0,A6M3-01,Cluster Median + Low
1,A7IJ-01,Cluster Median + Low
2,A1EC-01,Cluster Median + Low
3,A114-01,Cluster Median + Low
4,A110-01,Cluster Median + Low


In [27]:
# Build a new df holding the slide id plus two keys sliced out of it.
# For a slide id such as "TCGA-CC-5259-01Z-00-DX1....":
#   case_id = characters [0:12]  -> "TCGA-CC-5259"
#   Sample  = characters [8:15]  -> "5259-01" (used to match the label df)
# The slices are applied to the whole column at once; the original per-row loop
# also re-assigned slide_id to itself, which is not needed.
df1 = df.filter(items=["slide_id"], axis="columns")
df1["case_id"] = df1["slide_id"].str[0:12]  # case id
df1["Sample"] = df1["slide_id"].str[8:15]  # sample id which is used to match the label df

print(df1.shape)
display(df1.head(5))

(349, 3)


Unnamed: 0,slide_id,case_id,Sample
0,TCGA-CC-5259-01Z-00-DX1.D18B2F62-0CC0-4665-90E...,TCGA-CC-5259,5259-01
1,TCGA-DD-AADN-01Z-00-DX1.0C031278-22F9-40DE-9CF...,TCGA-DD-AADN,AADN-01
2,TCGA-FV-A4ZP-01Z-00-DX1.4B78C21C-8033-4870-82D...,TCGA-FV-A4ZP,A4ZP-01
3,TCGA-DD-AACI-01Z-00-DX1.C2931364-0514-49DB-AC9...,TCGA-DD-AACI,AACI-01
4,TCGA-DD-AADS-01Z-00-DX1.A147E3F6-065A-4313-A16...,TCGA-DD-AADS,AADS-01


In [28]:
# Attach the cluster label to every slide through the shared "Sample" key.
# Inner join: only samples found in both tables are kept, a sample that has
# several slides yields one row per slide, and the order of the left table
# (labels) is preserved.
results = labels.merge(df1, how="inner", on="Sample")
print(results.shape)
display(results.head(5))

(349, 4)


Unnamed: 0,Sample,Cluster,slide_id,case_id
0,A6M3-01,Cluster Median + Low,TCGA-RC-A6M3-01Z-00-DX1.5B63A6E7-F016-4CA5-9C6...,TCGA-RC-A6M3
1,A7IJ-01,Cluster Median + Low,TCGA-CC-A7IJ-01Z-00-DX1.525F3A40-46CC-4455-947...,TCGA-CC-A7IJ
2,A1EC-01,Cluster Median + Low,TCGA-DD-A1EC-01Z-00-DX1.3F1B660A-0AD8-4EC9-883...,TCGA-DD-A1EC
3,A114-01,Cluster Median + Low,TCGA-DD-A114-01Z-00-DX1.2A8C5243-29C4-419D-8FD...,TCGA-DD-A114
4,A110-01,Cluster Median + Low,TCGA-BC-A110-01Z-00-DX1.E46C2B24-A159-4970-82B...,TCGA-BC-A110


In [29]:
# Keep only the columns needed downstream, in the order case_id / slide_id / cluster
selected = results.filter(items=["case_id", "slide_id", "Cluster"], axis="columns")
print(selected.shape)
# Lower-case the label column name
results = selected.rename(columns={"Cluster": "cluster"})
print(results.shape)
display(results.head(5))

(349, 3)
(349, 3)


Unnamed: 0,case_id,slide_id,cluster
0,TCGA-RC-A6M3,TCGA-RC-A6M3-01Z-00-DX1.5B63A6E7-F016-4CA5-9C6...,Cluster Median + Low
1,TCGA-CC-A7IJ,TCGA-CC-A7IJ-01Z-00-DX1.525F3A40-46CC-4455-947...,Cluster Median + Low
2,TCGA-DD-A1EC,TCGA-DD-A1EC-01Z-00-DX1.3F1B660A-0AD8-4EC9-883...,Cluster Median + Low
3,TCGA-DD-A114,TCGA-DD-A114-01Z-00-DX1.2A8C5243-29C4-419D-8FD...,Cluster Median + Low
4,TCGA-BC-A110,TCGA-BC-A110-01Z-00-DX1.E46C2B24-A159-4970-82B...,Cluster Median + Low


In [30]:
# Write the final table to disk for CLAM dataset splitting.
# to_csv is called with its defaults, so the row index is written as well.
out_csv = "../../dataset_csv/tcga_hcc_349_{}_highvsrest.csv".format(gene_signature)
results.to_csv(out_csv)