# Examining gene and feature importance scores with and without germline data
- germline
- somatic
- somatic plus germline

Positive control to verify: do we see BRCA2 among the top genes for the germline data? If not, this suggests that we need to change how we process the germline data (e.g., pathogenic-variant filtering, gene filtering, grouping variants by type, etc.).

In [1]:
import torch
import random
import seaborn as sns
import pandas as pd
import os
import matplotlib.pyplot as plt
import prostate_data_loaders

import logging
# Root-logger configuration: timestamped records with a left-aligned,
# 8-character level name, shown at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

# basicConfig() does nothing when the root logger already has handlers,
# so the level is also set directly on the root logger.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

%load_ext autoreload
%autoreload 2

In [13]:
# `who` selects which `{who}_gene_importances.csv` file is read from each
# results folder; `dirs` lists the results folders (one per input type)
# that are compared below.
who = "val"
dirs = [
    'prostate_val_germline',
    'prostate_val_somatic',
    'prostate_val_germ_and_somatic',
]

logging.debug("Defining paths for germline data")
GERMLINE_DATADIR = "../../pnet_germline/data/"

logging.debug("Defining paths for the sample metadata")
# Germline <-> somatic sample ID map (going by the file name; confirm contents).
id_map_f = os.path.join(
    GERMLINE_DATADIR,
    "prostate/germline_somatic_id_map_outer_join.csv",
)
sample_metadata_f = os.path.join(
    GERMLINE_DATADIR,
    "prostate/pathogenic_variants_with_clinical_annotation_1341_aug2021_correlation.csv",
)

# Fetch the 'is_met' target keyed by Tumor_Sample_Barcode and expose it under
# the column name 'response', which the importance cells group by.
prostate_response = prostate_data_loaders.get_target(
    id_map_f,
    sample_metadata_f,
    id_to_use="Tumor_Sample_Barcode",
    target_col="is_met",
).rename(columns={'is_met': 'response'})


2023-11-07 17:01:26 INFO     Getting prediction target
2023-11-07 17:01:26 INFO     Loading the sample metadata DF that has all the IDs and also our target, metastatic status ('is_met')
2023-11-07 17:01:26 INFO     Loading the germline metadata file at ../../pnet_germline/data/prostate/pathogenic_variants_with_clinical_annotation_1341_aug2021_correlation.csv
2023-11-07 17:01:26 INFO     Generating the target DF (target column '{target_col}' indexed by '{id}')
2023-11-07 17:01:26 INFO     Target column value_counts: 0    704
1    626
Name: is_met, dtype: int64


In [35]:
# For every results folder, reduce the importance table to one value per
# gene: the difference between the mean importance of the two 'response'
# groups. The per-run Series are collected first and aligned afterwards.
imps_by_run = {}
for run_dir in dirs:
    raw_imps = pd.read_csv(f'../results/{run_dir}/{who}_gene_importances.csv').set_index('Unnamed: 0')
    # Index-aligned (left) join attaches each row's 'response' label; rows
    # without a label end up with NaN and are ignored by groupby.
    group_means = raw_imps.join(prostate_response).groupby('response').mean()
    # diff() along the rows, then row 1 = (second group mean) - (first group mean).
    # With 0/1 labels this is mean(response == 1) - mean(response == 0);
    # assumes exactly two response groups are present -- TODO confirm.
    imps = group_means.diff(axis=0).iloc[1]
    imps_by_run[run_dir] = imps

# Align all runs on the union of their genes in a single step. Assigning the
# Series column-by-column into an empty DataFrame would instead reindex every
# later run to the genes of the first run and silently drop the rest.
df_imps = pd.concat(imps_by_run, axis=1)
# Column-wise rank: rank 1 = largest importance gap within that run.
df_ranks = df_imps.rank(ascending=False)

logging.info("Averaged across trials, top importance genes")
# Row-wise mean = each gene's gap averaged over the runs listed in `dirs`.
df_imps.mean(axis=1).nlargest(20)



2023-11-07 17:07:23 INFO     Averaged across trials, top importance genes


In [41]:
# Show the ten genes with the largest importance gap for each run in turn.
# Both frames are re-sorted (and rebound) on every pass, so after the loop
# they remain ordered by the last entry of `dirs`.
for run_name in dirs:
    logging.info(f"Sorting by {run_name}")
    df_imps = df_imps.sort_values(run_name, ascending=False)
    df_ranks = df_ranks.sort_values(run_name)
    display(df_imps.iloc[:10])
    # The matching rank view is left disabled: display(df_ranks[:10])

2023-11-07 17:59:47 INFO     Sorting by prostate_val_germline


Unnamed: 0,prostate_val_germline,prostate_val_somatic,prostate_val_germ_and_somatic
PRSS1,1.678804,-1.2e-05,0.1243762
BRCA2,0.038028,0.000202,0.002465071
PMS2,0.008218,-0.000385,0.002310565
HLA-A,0.006194,0.0,0.02074946
HFE,0.004755,-0.000582,0.004897047
AKAP9,0.003908,9.7e-05,0.0002539032
SBDS,0.002364,7.5e-05,2.432148e-05
FBLN2,0.001596,0.0,5.257273e-05
PMS1,0.001003,2e-06,-4.831142e-06
WNK2,0.000812,0.002506,2.73736e-07


2023-11-07 17:59:47 INFO     Sorting by prostate_val_somatic


Unnamed: 0,prostate_val_germline,prostate_val_somatic,prostate_val_germ_and_somatic
RB1,0.0,0.01457,0.039277
APC,-0.001766,0.013602,0.018202
CCNC,0.0,0.007773,0.006523
CXCR4,0.0,0.007579,0.004587
SLC25A32,0.0,0.006808,8.3e-05
XPA,0.0,0.00636,0.000239
MAT1A,0.0,0.005423,0.000985
ITGA1,0.0,0.005306,-0.028851
FZD6,0.0,0.005244,9.6e-05
SGK3,0.0,0.005113,-4.2e-05


2023-11-07 17:59:47 INFO     Sorting by prostate_val_germ_and_somatic


Unnamed: 0,prostate_val_germline,prostate_val_somatic,prostate_val_germ_and_somatic
AR,0.0,-0.011289,0.51429
TP53,0.0,-0.026799,0.174181
PRSS1,1.678804,-1.2e-05,0.124376
MUC4,0.0,0.001992,0.072185
RAC1,0.0,0.0043,0.049191
RB1,0.0,0.01457,0.039277
GNAS,0.0,-0.002528,0.039031
AP2A2,0.0,0.00409,0.037736
OBSCN,0.0,0.000838,0.036086
PDGFA,0.0,0.000846,0.033259
