In [14]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm

In [20]:
RANDOM_SPLIT_COUNTS_PATH = "/data/yosef3/scratch/ruchir/finetuning-enformer/meta_feature_prior/r1_random_split/r1_random_split_C_0.01_lr_0.001/preds/83-273168.counts.csv"
RANDOM_SPLIT_PREDS_PATH = "/data/yosef3/scratch/ruchir/finetuning-enformer/meta_feature_prior/r1_random_split/r1_random_split_C_0.01_lr_0.001/preds/83-273168.preds.csv"

POP_SPLIT_COUNTS_PATH = "/data/yosef3/scratch/ruchir/finetuning-enformer/meta_feature_prior/r1_population_split/r1_population_split_C_0.01_lr_0.001/preds/87-286088.counts.csv"
POP_SPLIT_PREDS_PATH = "/data/yosef3/scratch/ruchir/finetuning-enformer/meta_feature_prior/r1_population_split/r1_population_split_C_0.01_lr_0.001/preds/87-286088.preds.csv"

FINETUNING_GENE_CLASS_PATH = "/data/yosef3/users/ruchir/finetuning-enformer/finetuning/data/h5_bins_384_chrom_split/gene_class.csv"

In [27]:
gene_class_df = pd.read_csv(FINETUNING_GENE_CLASS_PATH, index_col=0)
finetuning_random_split_genes = gene_class_df[
    gene_class_df["class"] == "random_split"
].index.tolist()
finetuning_pop_split_genes = gene_class_df[
    gene_class_df["class"] == "yri_split"
].index.tolist()

# Random split

In [5]:
random_split_counts_df = pd.read_csv(RANDOM_SPLIT_COUNTS_PATH, index_col=0)
random_split_preds_df = pd.read_csv(RANDOM_SPLIT_PREDS_PATH, index_col=0)

In [13]:
def measure_correlations(counts_df, preds_df):
    pearsons = {}
    spearmans = {}
    for gene in tqdm(counts_df.index):
        Y = counts_df.loc[gene].dropna()
        Y_pred = preds_df.loc[gene].dropna()
        assert Y.index.equals(Y_pred.index)
        assert len(Y) == 77
        pearsons[gene] = pearsonr(Y.values, Y_pred.values)[0]
        spearmans[gene] = spearmanr(Y.values, Y_pred.values)[0]
    return pearsons, spearmans

In [25]:
random_split_pearsons, random_split_spearmans = measure_correlations(
    random_split_counts_df, random_split_preds_df
)

print(f"Pearson correlation: {np.nanmean(list(random_split_pearsons.values()))}")
print(f"Spearman correlation: {np.nanmean(list(random_split_spearmans.values()))}")

print(
    f"Pearson correlation for finetuning random split genes: {np.nanmean([random_split_pearsons.get(g, 0.0) for g in finetuning_random_split_genes])}"
)
print(
    f"Spearman correlation for finetuning random split genes: {np.nanmean([random_split_spearmans.get(g, 0.0) for g in finetuning_random_split_genes])}"
)

100%|██████████| 3252/3252 [00:08<00:00, 378.59it/s]

Pearson correlation: 0.2652272479458596
Spearman correlation: 0.25539968441787947
Pearson correlation for finetuning random split genes: 0.2683469321360736
Spearman correlation for finetuning random split genes: 0.24819871737353996





# Predictions split

In [17]:
pop_split_counts_df = pd.read_csv(POP_SPLIT_COUNTS_PATH, index_col=0)
pop_split_preds_df = pd.read_csv(POP_SPLIT_PREDS_PATH, index_col=0)

In [28]:
pop_split_pearsons, pop_split_spearmans = measure_correlations(
    pop_split_counts_df, pop_split_preds_df
)

print(f"Pearson correlation: {np.nanmean(list(pop_split_pearsons.values()))}")
print(f"Spearman correlation: {np.nanmean(list(pop_split_spearmans.values()))}")

print(
    f"Pearson correlation for finetuning population split genes: {np.nanmean([pop_split_pearsons.get(g, 0.0) for g in finetuning_pop_split_genes])}"
)
print(
    f"Spearman correlation for finetuning population split genes: {np.nanmean([pop_split_spearmans.get(g, 0.0) for g in finetuning_pop_split_genes])}"
)

  pearsons[gene] = pearsonr(Y.values, Y_pred.values)[0]
  spearmans[gene] = spearmanr(Y.values, Y_pred.values)[0]
100%|██████████| 3251/3251 [00:08<00:00, 385.17it/s]

Pearson correlation: 0.15432630195065317
Spearman correlation: 0.14937435451840073
Pearson correlation for finetuning population split genes: 0.14418616423331118
Spearman correlation for finetuning population split genes: 0.14290161174915936



