In [27]:
#### preprocess the gene mutation data
import pandas as pd

root = "/mnt/gpfs01/lsf-workspace/u2070124/Code/cancer-metadata/tcga/"

# Combine the 14 tab-separated part files into one wide table keyed by
# SAMPLE_ID. `None` means "nothing merged yet": testing `merged_df.empty`
# instead would silently throw away the columns of a part that has no TCGA rows.
merged_df = None

for part in range(1, 15):
    part_path = root + f"tcga-mutations-part{part}.txt"
    part_df = pd.read_csv(part_path, sep="\t")

    # Only keep rows where SAMPLE_ID starts with "TCGA". na=False makes a
    # missing SAMPLE_ID count as "no match" (an NA in the mask would make the
    # boolean indexing raise); .copy() gives an independent frame so the
    # column assignment below does not write into a slice.
    part_df = part_df[part_df['SAMPLE_ID'].str.startswith('TCGA', na=False)].copy()

    # Shorten STUDY_ID to its first "_"-separated token, upper-cased.
    # The .str accessor passes missing values through as NaN.
    if 'STUDY_ID' in part_df.columns:
        part_df['STUDY_ID'] = part_df['STUDY_ID'].str.split('_').str[0].str.upper()

    if merged_df is None:
        # First part: nothing to merge with yet.
        merged_df = part_df
        continue

    # STUDY_ID already exists in merged_df and would be filtered out below.
    # Carry it under a temporary name so samples that first appear in this
    # part still get a STUDY_ID after the outer merge.
    carry_study_id = 'STUDY_ID' in part_df.columns and 'STUDY_ID' in merged_df.columns
    if carry_study_id:
        part_df = part_df.rename(columns={'STUDY_ID': '_STUDY_ID_FROM_PART'})

    # Keep the join key plus the columns that are not already in merged_df
    new_columns = [col for col in part_df.columns if col not in merged_df.columns or col == 'SAMPLE_ID']
    part_df = part_df[new_columns]

    # Outer merge on SAMPLE_ID so samples missing from either side are kept
    merged_df = pd.merge(merged_df, part_df, on='SAMPLE_ID', how='outer')

    if carry_study_id:
        # Fill only the gaps; STUDY_ID values that were already set win.
        part_study_id = merged_df.pop('_STUDY_ID_FROM_PART')
        merged_df['STUDY_ID'] = merged_df['STUDY_ID'].fillna(part_study_id)

# Reset index after merging
merged_df = merged_df.reset_index(drop=True)

def transform_value(value):
    """Encode one cell of the mutation table as a numeric flag.

    Parameters
    ----------
    value : object
        Raw cell content (a string read from the part files, or a missing
        value).

    Returns
    -------
    int or float
        0 for "WT", NaN for "NS" or for a value that is already missing,
        and 1 for anything else.
    """
    # Missing cells (None / NaN, e.g. the gaps an outer merge leaves behind)
    # must stay missing; without this check they fall through to the final
    # branch and are counted as 1. pd.isna goes first so a missing value is
    # never compared against a string.
    if pd.isna(value) or value == "NS":
        return float('nan')
    if value == "WT":
        return 0
    return 1

# Encode every gene column as 0 / 1 / NaN via transform_value; the
# "SAMPLE_ID" and "STUDY_ID" columns are left untouched.
columns_to_modify = [col for col in merged_df.columns if col not in ['SAMPLE_ID', 'STUDY_ID']]
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Check the class (not the instance) so a column that happens to be called
# "map" cannot be mistaken for the method on older pandas versions.
if hasattr(pd.DataFrame, "map"):
    merged_df[columns_to_modify] = merged_df[columns_to_modify].map(transform_value)
else:
    merged_df[columns_to_modify] = merged_df[columns_to_modify].applymap(transform_value)

# sort the genes alphabetically and put the two ID columns first
columns_to_modify = sorted(columns_to_modify)
merged_df = merged_df[['STUDY_ID', 'SAMPLE_ID'] + columns_to_modify]

# Write the encoded table to CSV without the row index
merged_df.to_csv("gene/data/tcga_all_gene_mutations.csv", index=False)

In [16]:
#### preprocess the gene CNV data
import pandas as pd

root = "/mnt/gpfs01/lsf-workspace/u2070124/Code/cancer-metadata/tcga/"

# Combine the 14 tab-separated part files into one wide table keyed by
# SAMPLE_ID. `None` means "nothing merged yet": testing `merged_df.empty`
# instead would silently throw away the columns of a part that has no TCGA rows.
merged_df = None

for part in range(1, 15):
    part_path = root + f"tcga-cna-part{part}.txt"
    part_df = pd.read_csv(part_path, sep="\t")

    # Only keep rows where SAMPLE_ID starts with "TCGA". na=False makes a
    # missing SAMPLE_ID count as "no match" (an NA in the mask would make the
    # boolean indexing raise); .copy() gives an independent frame so the
    # column assignment below does not write into a slice.
    part_df = part_df[part_df['SAMPLE_ID'].str.startswith('TCGA', na=False)].copy()

    # Shorten STUDY_ID to its first "_"-separated token, upper-cased.
    # The .str accessor passes missing values through as NaN.
    if 'STUDY_ID' in part_df.columns:
        part_df['STUDY_ID'] = part_df['STUDY_ID'].str.split('_').str[0].str.upper()

    if merged_df is None:
        # First part: nothing to merge with yet.
        merged_df = part_df
        continue

    # STUDY_ID already exists in merged_df and would be filtered out below.
    # Carry it under a temporary name so samples that first appear in this
    # part still get a STUDY_ID after the outer merge.
    carry_study_id = 'STUDY_ID' in part_df.columns and 'STUDY_ID' in merged_df.columns
    if carry_study_id:
        part_df = part_df.rename(columns={'STUDY_ID': '_STUDY_ID_FROM_PART'})

    # Keep the join key plus the columns that are not already in merged_df
    new_columns = [col for col in part_df.columns if col not in merged_df.columns or col == 'SAMPLE_ID']
    part_df = part_df[new_columns]

    # Outer merge on SAMPLE_ID so samples missing from either side are kept
    merged_df = pd.merge(merged_df, part_df, on='SAMPLE_ID', how='outer')

    if carry_study_id:
        # Fill only the gaps; STUDY_ID values that were already set win.
        part_study_id = merged_df.pop('_STUDY_ID_FROM_PART')
        merged_df['STUDY_ID'] = merged_df['STUDY_ID'].fillna(part_study_id)

# Reset index after merging
merged_df = merged_df.reset_index(drop=True)

# Gene columns are everything except the two identifier columns,
# taken in alphabetical order.
columns_to_modify = sorted(
    col for col in merged_df.columns if col not in ('SAMPLE_ID', 'STUDY_ID')
)

# Identifier columns first, then the sorted gene columns.
merged_df = merged_df[['STUDY_ID', 'SAMPLE_ID', *columns_to_modify]]

# Parse every gene column as a number; entries that cannot be parsed
# (the original note mentions "NP") become NaN.
merged_df[columns_to_modify] = merged_df[columns_to_modify].apply(
    pd.to_numeric, errors='coerce'
)

# Write the combined CNV table to CSV without the row index.
merged_df.to_csv("gene/data/tcga_all_gene_cnv.csv", index=False)

