In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import r2_score

In [2]:
# Input files. All paths are relative, so they resolve against the directory
# the notebook kernel is started in.

# Expression table (loaded below with `our_gene_name` as its index column).
COUNTS_PATH = "../process_geuvadis_data/log_tpm/corrected_log_tpm.annot.csv.gz"

# Table whose `class` column assigns each gene to an evaluation group.
GENE_CLASS_PATH = "../finetuning/data/h5_bins_384_chrom_split/gene_class.csv"

# Prediction tables for the short-context (384 bins) and long-context (1Mb)
# settings; the ".no_cv" files are the variants produced without cross
# validation for hyperparameter selection.
SHORT_CONTEXT_PREDS_PATH = "h5_bins_384_chrom_split/predixcan_preds.384_bins.csv"
SHORT_CONTEXT_NO_CV_PREDS_PATH = "h5_bins_384_chrom_split/predixcan_preds.384_bins.no_cv.csv"
LONG_CONTEXT_PREDS_PATH = "h5_bins_384_chrom_split/predixcan_preds.1Mb.csv"
LONG_CONTEXT_NO_CV_PREDS_PATH = (
    "h5_bins_384_chrom_split/predixcan_preds.1Mb.no_cv.csv"
)

In [4]:
# Observed values, indexed by the `our_gene_name` column.
counts_df = pd.read_csv(COUNTS_PATH, index_col="our_gene_name")

# The four prediction tables are parsed identically: the first column of each
# file becomes the row index. They are read in the order listed.
(
    short_context_preds_df,
    short_context_no_cv_preds_df,
    long_context_preds_df,
    long_context_no_cv_preds_df,
) = [
    pd.read_csv(preds_path, index_col=0)
    for preds_path in (
        SHORT_CONTEXT_PREDS_PATH,
        SHORT_CONTEXT_NO_CV_PREDS_PATH,
        LONG_CONTEXT_PREDS_PATH,
        LONG_CONTEXT_NO_CV_PREDS_PATH,
    )
]

# Gene -> evaluation-group table; its `class` column is filtered on later.
class_df = pd.read_csv(GENE_CLASS_PATH, index_col=0)

In [5]:
def get_correlation(
    preds_df,
    counts_df,
    genes,
    correlation: str = "spearman",
    verbose: bool = False,
    n_expected_samples: int = 77,
):
    """Return the mean per-gene agreement between predictions and counts.

    For each gene, the non-missing entries of its row in ``preds_df`` are
    compared against the entries of its row in ``counts_df`` under the same
    column labels, and the resulting per-gene scores are averaged.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predictions, indexed by gene. Missing values in a row are dropped
        before scoring.
    counts_df : pandas.DataFrame
        Observed values, indexed by gene. It must contain every column label
        that survives ``dropna`` in the matching ``preds_df`` row.
    genes : iterable
        Row labels to score.
    correlation : {"spearman", "pearson", "r2"}
        Metric to compute. ``"r2"`` calls ``r2_score`` with the counts as the
        reference values and the predictions as the estimates.
    verbose : bool
        When true, print the mean score and the genes whose score was
        replaced by 0.0.
    n_expected_samples : int
        Number of values every gene is required to have after dropping
        missing predictions. Defaults to 77.

    Returns
    -------
    float
        Mean of the per-gene scores (``nan`` if ``genes`` is empty).

    Raises
    ------
    ValueError
        If ``correlation`` is not one of the supported names.
    AssertionError
        If a gene does not have exactly ``n_expected_samples`` values, or if
        a score is NaN although the gene's predictions are not all equal.
    """
    correlations = []
    constant_prediction_genes = []
    for gene in genes:
        preds = preds_df.loc[gene].dropna()
        counts = counts_df.loc[gene, preds.index]
        assert len(preds) == len(counts) == n_expected_samples, (
            f"{gene}: expected {n_expected_samples} samples, "
            f"got {len(preds)} predictions and {len(counts)} counts"
        )
        if correlation == "spearman":
            corr, _ = spearmanr(preds, counts)
        elif correlation == "pearson":
            corr, _ = pearsonr(preds, counts)
        elif correlation == "r2":
            corr = r2_score(counts, preds)
        else:
            raise ValueError(f"Unknown correlation {correlation}")
        if np.isnan(corr):
            # A NaN score is only accepted when every prediction is the same
            # value; such genes are scored as 0.0. `.iloc[0]` is required
            # here: `preds` is labelled by sample, so `preds[0]` would be a
            # label lookup (deprecated positional fallback in pandas 2.x,
            # KeyError in pandas 3).
            assert np.all(preds == preds.iloc[0])
            constant_prediction_genes.append(gene)
            corr = 0.0
        correlations.append(corr)

    mean_correlation = np.mean(correlations)
    if verbose:
        print(f"Mean correlation: {mean_correlation}")
        print(
            f"{len(constant_prediction_genes)} genes with constant predictions: {constant_prediction_genes}"
        )
    return mean_correlation

# Short context predictions (with cross validation for hyperparameter selection)

In [6]:
# Gene lists for the two evaluation groups named in the `class` column.
random_split_genes, yri_split_genes = (
    class_df.loc[class_df["class"] == split_name].index.tolist()
    for split_name in ("random_split", "yri_split")
)

# Score the short-context predictions on both groups, one metric at a time
# (random_split first, then yri_split).
random_split_spearman, yri_split_spearman = (
    get_correlation(short_context_preds_df, counts_df, split_genes, "spearman")
    for split_genes in (random_split_genes, yri_split_genes)
)
random_split_pearson, yri_split_pearson = (
    get_correlation(short_context_preds_df, counts_df, split_genes, "pearson")
    for split_genes in (random_split_genes, yri_split_genes)
)
random_split_r2, yri_split_r2 = (
    get_correlation(short_context_preds_df, counts_df, split_genes, "r2")
    for split_genes in (random_split_genes, yri_split_genes)
)

# One summary line per metric.
print(
    f"Spearman: random_split={random_split_spearman:.3f}, yri_split={yri_split_spearman:.3f}",
    f"Pearson: random_split={random_split_pearson:.3f}, yri_split={yri_split_pearson:.3f}",
    f"R2: random_split={random_split_r2:.3f}, yri_split={yri_split_r2:.3f}",
    sep="\n",
)



Spearman: random_split=0.273, yri_split=0.157
Pearson: random_split=0.287, yri_split=0.158
R2: random_split=0.083, yri_split=-0.036


# Long context predictions (with cross validation for hyperparameter selection)

In [7]:
# Gene lists for the two evaluation groups named in the `class` column.
# (The original cell computed `random_split_genes` twice in a row; the
# redundant second filter is dropped.)
random_split_genes, yri_split_genes = (
    class_df.loc[class_df["class"] == split_name].index.tolist()
    for split_name in ("random_split", "yri_split")
)

# Score the long-context predictions on both groups, one metric at a time
# (random_split first, then yri_split).
random_split_spearman, yri_split_spearman = (
    get_correlation(long_context_preds_df, counts_df, split_genes, "spearman")
    for split_genes in (random_split_genes, yri_split_genes)
)
random_split_pearson, yri_split_pearson = (
    get_correlation(long_context_preds_df, counts_df, split_genes, "pearson")
    for split_genes in (random_split_genes, yri_split_genes)
)
random_split_r2, yri_split_r2 = (
    get_correlation(long_context_preds_df, counts_df, split_genes, "r2")
    for split_genes in (random_split_genes, yri_split_genes)
)

# One summary line per metric.
print(
    f"Spearman: random_split={random_split_spearman:.3f}, yri_split={yri_split_spearman:.3f}",
    f"Pearson: random_split={random_split_pearson:.3f}, yri_split={yri_split_pearson:.3f}",
    f"R2: random_split={random_split_r2:.3f}, yri_split={yri_split_r2:.3f}",
    sep="\n",
)



Spearman: random_split=0.273, yri_split=0.159
Pearson: random_split=0.294, yri_split=0.153
R2: random_split=0.082, yri_split=-0.044


# Short context predictions (without cross validation for hyperparameter selection)

In [8]:
# Gene lists for the two evaluation groups named in the `class` column.
random_split_genes, yri_split_genes = (
    class_df.loc[class_df["class"] == split_name].index.tolist()
    for split_name in ("random_split", "yri_split")
)

# Score the short-context, no-CV predictions on both groups, one metric at a
# time (random_split first, then yri_split).
random_split_spearman, yri_split_spearman = (
    get_correlation(short_context_no_cv_preds_df, counts_df, split_genes, "spearman")
    for split_genes in (random_split_genes, yri_split_genes)
)
random_split_pearson, yri_split_pearson = (
    get_correlation(short_context_no_cv_preds_df, counts_df, split_genes, "pearson")
    for split_genes in (random_split_genes, yri_split_genes)
)
random_split_r2, yri_split_r2 = (
    get_correlation(short_context_no_cv_preds_df, counts_df, split_genes, "r2")
    for split_genes in (random_split_genes, yri_split_genes)
)

# One summary line per metric.
print(
    f"Spearman: random_split={random_split_spearman:.3f}, yri_split={yri_split_spearman:.3f}",
    f"Pearson: random_split={random_split_pearson:.3f}, yri_split={yri_split_pearson:.3f}",
    f"R2: random_split={random_split_r2:.3f}, yri_split={yri_split_r2:.3f}",
    sep="\n",
)



Spearman: random_split=0.261, yri_split=0.141
Pearson: random_split=0.276, yri_split=0.138
R2: random_split=0.038, yri_split=-0.414


# Long context predictions (without cross validation for hyperparameter selection)

In [9]:
# Gene lists for the two evaluation groups named in the `class` column.
random_split_genes, yri_split_genes = (
    class_df.loc[class_df["class"] == split_name].index.tolist()
    for split_name in ("random_split", "yri_split")
)

# Score the long-context, no-CV predictions on both groups, one metric at a
# time (random_split first, then yri_split).
random_split_spearman, yri_split_spearman = (
    get_correlation(long_context_no_cv_preds_df, counts_df, split_genes, "spearman")
    for split_genes in (random_split_genes, yri_split_genes)
)
random_split_pearson, yri_split_pearson = (
    get_correlation(long_context_no_cv_preds_df, counts_df, split_genes, "pearson")
    for split_genes in (random_split_genes, yri_split_genes)
)
random_split_r2, yri_split_r2 = (
    get_correlation(long_context_no_cv_preds_df, counts_df, split_genes, "r2")
    for split_genes in (random_split_genes, yri_split_genes)
)

# One summary line per metric.
print(
    f"Spearman: random_split={random_split_spearman:.3f}, yri_split={yri_split_spearman:.3f}",
    f"Pearson: random_split={random_split_pearson:.3f}, yri_split={yri_split_pearson:.3f}",
    f"R2: random_split={random_split_r2:.3f}, yri_split={yri_split_r2:.3f}",
    sep="\n",
)

Spearman: random_split=0.253, yri_split=0.139
Pearson: random_split=0.264, yri_split=0.132
R2: random_split=0.004, yri_split=-0.163
