# Get survival information from clinical data

In [1]:
import numpy as np
import pandas as pd

from pathme_forte.constants import *

In [2]:
# Paths to the per-dataset clinical data files; all of them live under
# <DATA>/tcga_datasets/<dataset>/<dataset>_tcga_clinical_data.tsv
_TCGA_DATASETS = os.path.join(DATA, 'tcga_datasets')

BRCA_CLINICAL_DATA = os.path.join(_TCGA_DATASETS, 'brca', 'brca_tcga_clinical_data.tsv')
LIHC_CLINICAL_DATA = os.path.join(_TCGA_DATASETS, 'lihc', 'lihc_tcga_clinical_data.tsv')
PRAD_CLINICAL_DATA = os.path.join(_TCGA_DATASETS, 'prad', 'prad_tcga_clinical_data.tsv')
KIRC_CLINICAL_DATA = os.path.join(_TCGA_DATASETS, 'kirc', 'kirc_tcga_clinical_data.tsv')
OV_CLINICAL_DATA = os.path.join(_TCGA_DATASETS, 'ov', 'ov_tcga_clinical_data.tsv')

In [3]:
def survival_data_to_csv(clinical_data, dataset):
    """Extract the survival columns of a clinical data file and write them to a TSV.

    The result is written to ``'{dataset}_survival_data.tsv'`` (tab-separated,
    with the DataFrame index as the first column). The path is relative to the
    current working directory unless ``dataset`` itself contains a directory.

    :param clinical_data: path to a tab-separated clinical data file that has the
        columns 'Days to Last Followup', 'Overall Survival (Months)' and
        'Overall Survival Status'
    :param dataset: prefix used for the name of the output file
    :raises KeyError: if one of the required columns is missing from the file
    """
    survival_columns = [
        'Days to Last Followup',
        'Overall Survival (Months)',
        'Overall Survival Status',
    ]

    # Read the clinical meta data file and keep only the survival columns.
    # The explicit copy makes the column assignments below operate on an
    # independent frame, which avoids pandas' SettingWithCopyWarning.
    clinical_data_df = pd.read_csv(clinical_data, sep='\t')[survival_columns].copy()

    # Convert survival months to days (30.42 days per month, 2 decimals).
    # This is computed before the months are blanked out below, so the
    # days column keeps its value for living patients as well.
    clinical_data_df['Survival (Days)'] = (
        clinical_data_df['Overall Survival (Months)'] * 30.42
    ).round(2)

    # If a patient is living, replace their overall survival time in months with NaN.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    is_living = clinical_data_df['Overall Survival Status'] == 'LIVING'
    clinical_data_df.loc[is_living, 'Overall Survival (Months)'] = np.nan

    # Rearrange columns into the order used in the output file
    clinical_data_df = clinical_data_df[[
        'Days to Last Followup',
        'Overall Survival (Months)',
        'Survival (Days)',
        'Overall Survival Status',
    ]]

    clinical_data_df.to_csv('{}_survival_data.tsv'.format(dataset), sep='\t')

In [4]:
# Write one '<dataset>_survival_data.tsv' file per clinical data file
for _dataset, _clinical_file in (
    ('brca', BRCA_CLINICAL_DATA),
    ('kirc', KIRC_CLINICAL_DATA),
    ('lihc', LIHC_CLINICAL_DATA),
    ('prad', PRAD_CLINICAL_DATA),
    ('ov', OV_CLINICAL_DATA),
):
    survival_data_to_csv(_clinical_file, _dataset)