In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import importlib.resources as ir
from typing import Any

import pytest
from deepdiff import DeepDiff
from dill import load
from pyplier.plier_res import PLIERResults

from pathlib import Path
import pandas as pd
import pytest

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_dir = Path("tests").joinpath("data")
common_dir = data_dir.joinpath("common")
l1 = 18.43058
l2 = 36.86117
l3 = 0.0005530844

held_out_genes_df = pd.read_csv(common_dir.joinpath("plier_res_heldoutgenes.csv.gz"), index_col=0)
b_df = pd.read_csv(common_dir.joinpath("plier_res_b.csv.gz"), index_col=0)
c_df = pd.read_csv(common_dir.joinpath("plier_res_c.csv.gz"), index_col=0)
residual_df = pd.read_csv(common_dir.joinpath("plier_res_residual.csv.gz"), index_col=0)
u_df = pd.read_csv(common_dir.joinpath("plier_res_u.csv.gz"), index_col=0)
z_df = pd.read_csv(common_dir.joinpath("plier_res_z.csv.gz"), index_col=0)
uauc_df = pd.read_csv(common_dir.joinpath("plier_res_uauc.csv.gz"), index_col=0)
upval_df =pd.read_csv(common_dir.joinpath("plier_res_up.csv.gz"), index_col=0)
summary_df =pd.read_csv(common_dir.joinpath("plier_res_summary.csv.gz"), index_col=0)
with_prior = {
    "LV1": 1,
    "LV2": 2,
    "LV3": 3,
    "LV4": 4,
    "LV5": 5,
    "LV6": 6,
    "LV7": 7,
    "LV8": 8,
    "LV9": 9,
    "LV10": 10,
    "LV11": 11,
    "LV14": 14,
    "LV15": 15,
    "LV18": 18,
    "LV20": 20,
    "LV23": 23,
    "LV24": 24,
    "LV26": 26,
    "LV27": 27,
    "LV29": 29,
    "LV30": 30,
}

In [4]:
pr = PLIERResults(
        residual=residual_df,
        b=b_df,
        z=z_df,
        u=u_df,
        c=c_df,
        l1=l1,
        l2=l2,
        l3=l3,
        held_out_genes={
            k: g["value"].to_list()
            for k, g in held_out_genes_df.groupby("name")
        },
        with_prior=with_prior,
        uauc=uauc_df,
        up=upval_df,
        summary=summary_df,
    )

In [7]:
pr

b : 30 rows x 36 columns
z : 5892 rows x 30 columns
u : 606 rows x 30 columns
c : 5892 rows x 606 columns
uauc : 606 rows x 30 columns
up : 606 rows x 30 columns
summary : 64 rows x 4 columns
residual : 5892 rows x 36 columns
held_out_genes: 603
with_prior: 21
l1 is set to 18.4306
l2 is set to 36.8612
l3 is set to 0.0006

In [9]:
print(
        "b : 30 rows x 36 columns\n"
        "z : 5892 rows x 30 columns\n"
        "u : 606 rows x 30 columns\n"
        "c : 5892 rows x 606 columns\n"
        "uauc: 606 rows x 30 columns\n"
        "up: 606 rows x 30 columns\n"
        "summary: 64 rows x 4 columns\n"
        "residual: 5892 rows x 36 columns\n"
        "held_out_genes: 603\n"
        "with_prior: 21\n"
        "l1 is set to 18.4306\n"
        "l2 is set to 36.8612\n"
        "l3 is set to 0.0006"
    )


b : 30 rows x 36 columns
z : 5892 rows x 30 columns
u : 606 rows x 30 columns
c : 5892 rows x 606 columns
uauc: 606 rows x 30 columns
up: 606 rows x 30 columns
summary: 64 rows x 4 columns
residual: 5892 rows x 36 columns
held_out_genes: 603
with_prior: 21
l1 is set to 18.4306
l2 is set to 36.8612
l3 is set to 0.0006


In [11]:
pr

b: 30 rows x 36 columns
z: 5892 rows x 30 columns
u: 606 rows x 30 columns
c: 5892 rows x 606 columns
uauc: 606 rows x 30 columns
up: 606 rows x 30 columns
summary: 64 rows x 4 columns
residual: 5892 rows x 36 columns
held_out_genes: 603
with_prior: 21
l1 is set to 18.4306
l2 is set to 36.8612
l3 is set to 0.0006

In [13]:
pr_dict = pr.to_dict()

In [14]:
import dill
plier_res_dir = data_dir / "plier_res"
dict_file = plier_res_dir.joinpath("plier_res_dict.pkl")
with dict_file.open("rb") as df:
    pdt = dill.load(df)

In [15]:
pr_dict.keys()

dict_keys(['b', 'z', 'u', 'c', 'uauc', 'up', 'summary', 'residual', 'l1', 'l2', 'l3', 'held_out_genes', 'with_prior'])

In [16]:
pdt.keys()

dict_keys(['b', 'z', 'u', 'c', 'l1', 'l2', 'l3', 'heldoutgenes', 'withprior', 'residual', 'uauc', 'up', 'summary'])

In [20]:
pdt["with_prior"] = pdt["withprior"]
del(pdt["withprior"])

In [20]:
pdt["with_prior"] = pdt["withprior"]
del(pdt["withprior"])

In [17]:
pdt["held_out_genes"] = pdt["heldoutgenes"]
del(pdt["heldoutgenes"])

In [18]:
from deepdiff import DeepDiff

In [21]:
DeepDiff(pr_dict, pdt, ignore_order=True)

{'dictionary_item_added': [root['z']['V1'], root['z']['V2'], root['z']['V3'], root['z']['V4'], root['z']['V5'], root['z']['V6'], root['z']['V7'], root['z']['V8'], root['z']['V9'], root['z']['V10'], root['z']['V11'], root['z']['V12'], root['z']['V13'], root['z']['V14'], root['z']['V15'], root['z']['V16'], root['z']['V17'], root['z']['V18'], root['z']['V19'], root['z']['V20'], root['z']['V21'], root['z']['V22'], root['z']['V23'], root['z']['V24'], root['z']['V25'], root['z']['V26'], root['z']['V27'], root['z']['V28'], root['z']['V29'], root['z']['V30']],
 'dictionary_item_removed': [root['z']['LV1'], root['z']['LV2'], root['z']['LV3'], root['z']['LV4'], root['z']['LV5'], root['z']['LV6'], root['z']['LV7'], root['z']['LV8'], root['z']['LV9'], root['z']['LV10'], root['z']['LV11'], root['z']['LV12'], root['z']['LV13'], root['z']['LV14'], root['z']['LV15'], root['z']['LV16'], root['z']['LV17'], root['z']['LV18'], root['z']['LV19'], root['z']['LV20'], root['z']['LV21'], root['z']['LV22'], roo

In [None]:
pr_dict

In [26]:
pr_dict["z"]["LV1"]

{'GAS6': 0.0,
 'MMP14': 0.0075163030745041,
 'MARCKSL1': 0.0,
 'SPARC': 0.0,
 'CTSD': 0.0,
 'EPAS1': 0.0,
 'PALLD': 0.0409310928248674,
 'PHC2': 0.0,
 'LGALS3BP': 0.0,
 'SERPING1': 0.0,
 'TGM2': 0.0,
 'THBS1': 0.0720936585010918,
 'ITGB5': 0.0,
 'CREG1': 0.0,
 'CSTB': 0.0,
 'DAB2': 0.0311387482774109,
 'EMP1': 0.0760427817597717,
 'ETS2': 0.0,
 'GPX3': 0.126169450105299,
 'CST3': 0.0,
 'COL6A3': 0.154298580770912,
 'PPIF': 0.0039551573771543,
 'TGFBI': 0.0,
 'IGFBP4': 0.0,
 'GRSF1': 0.0,
 'DUSP3': 0.0,
 'PPP4R1': 0.0,
 'IER3': 0.0,
 'DLG5': 0.112203831158553,
 'TPD52': 0.0,
 'CD14': 0.0,
 'TOMM34': 0.0,
 'PON2': 0.0,
 'RRM2': 0.0684565384102243,
 'PPP1R12B': 0.0403581634431242,
 'DYNLT1': 0.0,
 'NID1': 0.0209792981383203,
 'LTF': 0.0,
 'PLTP': 0.0,
 'SEC14L1': 0.0368531395831298,
 'MX1': 0.0,
 'RALB': 0.0,
 'CHPF': 0.0,
 'LIMK2': 0.0039972445281023,
 'BLVRB': 0.0,
 'PDGFRB': 0.0,
 'UBE2B': 0.0,
 'TRAM2': 0.0355319406748625,
 'ADAM9': 0.111856368048325,
 'IFI27': 0.0101659477728992,
 'R

In [27]:
with dict_file.open("wb") as df:
    dill.dump(pr_dict, df)

In [30]:
pr.to_json("tests/data/plier_res/plier_res.json.gz")

True

In [33]:
pr.to_hdf5("tests/data/plier_res/plier_res.h5", overwrite=True)

In [36]:
summary_df

Unnamed: 0,LV index,AUC,p-value,FDR
REACTOME_GENERIC_TRANSCRIPTION_PATHWAY,29,0.590790,5.872015e-03,8.063065e-03
IRIS_Monocyte-Day0,5,0.800015,1.001095e-11,9.590097e-11
IRIS_Neutrophil-Resting,18,0.690748,2.819430e-08,1.365197e-07
DMAP_MONO1,5,0.842793,1.525878e-07,5.615231e-07
PID_IL6_7PATHWAY,2,0.712596,7.799121e-03,1.055175e-02
...,...,...,...,...
REACTOME_MITOCHONDRIAL_PROTEIN_IMPORT,30,0.762270,1.896318e-03,3.007953e-03
REACTOME_PEPTIDE_CHAIN_ELONGATION,30,0.949471,1.934557e-11,1.617993e-10
REACTOME_FORMATION_OF_THE_TERNARY_COMPLEX_AND_SUBSEQUENTLY_THE_43S_COMPLEX,30,0.998620,1.862630e-08,1.008011e-07
MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,30,0.886059,3.125977e-10,2.054214e-09


In [72]:
pr2 = PLIERResults.read_hdf5("tests/data/plier_res/plier_res.h5")

In [101]:
pd.api.types.is_numeric_dtype(pr.u["LV6"].dtype)

True

In [114]:
pr.u.apply(lambda x: x.astype(np.float64) if (np.mod(x, 1) != 0).any() else x.astype(np.int64), axis=0)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV21,LV22,LV23,LV24,LV25,LV26,LV27,LV28,LV29,LV30
IRIS_Bcell-Memory_IgG_IgA,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
IRIS_Bcell-Memory_IgM,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
IRIS_Bcell-naive,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
IRIS_CD4Tcell-N0,0.0,0.0,0.0,0.0,0.0,0.085437,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
IRIS_CD4Tcell-Th1-restimulated12hour,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
PID_BCR_5PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
PID_TELOMERASEPATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0


In [115]:
pr.u.dtypes == "float64"

LV1      True
LV2      True
LV3      True
LV4      True
LV5      True
LV6      True
LV7      True
LV8      True
LV9      True
LV10     True
LV11     True
LV12    False
LV13    False
LV14     True
LV15     True
LV16    False
LV17    False
LV18     True
LV19    False
LV20     True
LV21    False
LV22    False
LV23     True
LV24     True
LV25    False
LV26     True
LV27     True
LV28    False
LV29     True
LV30     True
dtype: bool

In [116]:
pr.u.astype(np.float64)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV21,LV22,LV23,LV24,LV25,LV26,LV27,LV28,LV29,LV30
IRIS_Bcell-Memory_IgG_IgA,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_Bcell-Memory_IgM,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_Bcell-naive,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_CD4Tcell-N0,0.0,0.0,0.0,0.0,0.0,0.085437,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_CD4Tcell-Th1-restimulated12hour,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PID_BCR_5PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PID_TELOMERASEPATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
pr.u

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV21,LV22,LV23,LV24,LV25,LV26,LV27,LV28,LV29,LV30
IRIS_Bcell-Memory_IgG_IgA,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
IRIS_Bcell-Memory_IgM,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
IRIS_Bcell-naive,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
IRIS_CD4Tcell-N0,0.0,0.0,0.0,0.0,0.0,0.085437,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
IRIS_CD4Tcell-Th1-restimulated12hour,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
PID_BCR_5PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
PID_TELOMERASEPATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0


In [93]:
pr.up.dtypes

LV1     float64
LV2     float64
LV3     float64
LV4     float64
LV5     float64
LV6     float64
LV7     float64
LV8     float64
LV9     float64
LV10    float64
LV11    float64
LV12      int64
LV13      int64
LV14    float64
LV15    float64
LV16      int64
LV17      int64
LV18    float64
LV19      int64
LV20    float64
LV21      int64
LV22      int64
LV23    float64
LV24    float64
LV25      int64
LV26    float64
LV27    float64
LV28      int64
LV29    float64
LV30    float64
dtype: object

In [73]:
pr2.summary.dtypes

LV index     object
AUC         float64
p-value     float64
FDR         float64
dtype: object

In [59]:
import numpy as np

In [65]:
summary_2_df = pr2.summary.astype({"LV index": np.float64}).astype({"LV index": np.int64}).astype({"LV index": str, "AUC": float, "p-value": float, "FDR": float})

In [70]:
summary_df = summary_df.astype({"LV index": str, "AUC": float, "p-value": float, "FDR": float})

In [71]:
pd.testing.assert_frame_equal(summary_2_df, summary_df)

In [42]:
pr2.summary.dtypes

LV index     object
AUC         float64
p-value     float64
FDR         float64
dtype: object

In [44]:
pr2.summary.astype({"LV index": str, "AUC": float, "p-value": float, "FDR": float}).dtypes

LV index     object
AUC         float64
p-value     float64
FDR         float64
dtype: object

In [45]:
summary_df.dtypes

LV index      int64
AUC         float64
p-value     float64
FDR         float64
dtype: object

In [47]:
summary_df["LV index"]

REACTOME_GENERIC_TRANSCRIPTION_PATHWAY                                        29
IRIS_Monocyte-Day0                                                             5
IRIS_Neutrophil-Resting                                                       18
DMAP_MONO1                                                                     5
PID_IL6_7PATHWAY                                                               2
                                                                              ..
REACTOME_MITOCHONDRIAL_PROTEIN_IMPORT                                         30
REACTOME_PEPTIDE_CHAIN_ELONGATION                                             30
REACTOME_FORMATION_OF_THE_TERNARY_COMPLEX_AND_SUBSEQUENTLY_THE_43S_COMPLEX    30
MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX                                       30
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE          30
Name: LV index, Length: 64, dtype: int64

In [53]:
pr2.summary["LV index"].astype(int)

ValueError: invalid literal for int() with base 10: '29.0'

In [24]:
pdt["z"]["V1"]

{'GAS6': 0.0,
 'MMP14': 0.0075163030745041,
 'MARCKSL1': 0.0,
 'SPARC': 0.0,
 'CTSD': 0.0,
 'EPAS1': 0.0,
 'PALLD': 0.0409310928248674,
 'PHC2': 0.0,
 'LGALS3BP': 0.0,
 'SERPING1': 0.0,
 'TGM2': 0.0,
 'THBS1': 0.0720936585010918,
 'ITGB5': 0.0,
 'CREG1': 0.0,
 'CSTB': 0.0,
 'DAB2': 0.0311387482774109,
 'EMP1': 0.0760427817597717,
 'ETS2': 0.0,
 'GPX3': 0.126169450105299,
 'CST3': 0.0,
 'COL6A3': 0.154298580770912,
 'PPIF': 0.0039551573771543,
 'TGFBI': 0.0,
 'IGFBP4': 0.0,
 'GRSF1': 0.0,
 'DUSP3': 0.0,
 'PPP4R1': 0.0,
 'IER3': 0.0,
 'DLG5': 0.112203831158553,
 'TPD52': 0.0,
 'CD14': 0.0,
 'TOMM34': 0.0,
 'PON2': 0.0,
 'RRM2': 0.0684565384102243,
 'PPP1R12B': 0.0403581634431242,
 'DYNLT1': 0.0,
 'NID1': 0.0209792981383203,
 'LTF': 0.0,
 'PLTP': 0.0,
 'SEC14L1': 0.0368531395831298,
 'MX1': 0.0,
 'RALB': 0.0,
 'CHPF': 0.0,
 'LIMK2': 0.0039972445281023,
 'BLVRB': 0.0,
 'PDGFRB': 0.0,
 'UBE2B': 0.0,
 'TRAM2': 0.0355319406748625,
 'ADAM9': 0.111856368048325,
 'IFI27': 0.0101659477728992,
 'R

In [119]:
expected_df = pd.read_csv(data_dir / "pathway_from_gmt" / "reactome_wide.csv.gz", index_col = 0)

In [120]:
expected_df.dtypes

REACTOME_2_LTR_CIRCLE_FORMATION                                      int64
REACTOME_ABACAVIR_METABOLISM                                         int64
REACTOME_ABACAVIR_TRANSMEMBRANE_TRANSPORT                            int64
REACTOME_ABACAVIR_TRANSPORT_AND_METABOLISM                           int64
REACTOME_ABC_FAMILY_PROTEINS_MEDIATED_TRANSPORT                      int64
                                                                     ...  
REACTOME_YAP1_AND_WWTR1_TAZ_STIMULATED_GENE_EXPRESSION               int64
REACTOME_ZBP1_DAI_MEDIATED_INDUCTION_OF_TYPE_I_IFNS                  int64
REACTOME_ZINC_EFFLUX_AND_COMPARTMENTALIZATION_BY_THE_SLC30_FAMILY    int64
REACTOME_ZINC_INFLUX_INTO_CELLS_BY_THE_SLC39_GENE_FAMILY             int64
REACTOME_ZINC_TRANSPORTERS                                           int64
Length: 1604, dtype: object