In [1]:
from dataclasses import dataclass
from typing import Dict, List, Union
import numpy as np
import pandas as pd
from collections import defaultdict
from pathlib import Path
import gzip
import joblib
import json

In [11]:
class PLIERResults(object):
    def __init__(
        self,
        B: pd.DataFrame = pd.DataFrame(),
        Z: pd.DataFrame = pd.DataFrame(),
        U: pd.DataFrame = pd.DataFrame(),
        C: pd.DataFrame = pd.DataFrame(),
        L1: float = 0.0,
        L2: float = 0.0,
        L3: float = 0.0,
        heldOutGenes: Dict[str, List[str]] = defaultdict(list),
        withPrior: Dict[str, int] = defaultdict(int),
        Uauc: pd.DataFrame = pd.DataFrame(),
        Up: pd.DataFrame = pd.DataFrame(),
        summary: pd.DataFrame = pd.DataFrame(),
        residual: pd.DataFrame = pd.DataFrame(),
    ):
        self.residual = residual
        self.B = B
        self.Z = Z
        self.U = U
        self.C = C
        self.L1 = L1
        self.L2 = L2
        self.L3 = L3
        self.heldOutGenes = heldOutGenes
        self.withPrior = withPrior
        self.Uauc = Uauc
        self.Up = Up
        self.summary = summary

    def __repr__(self) -> str:
        return (
            f"B : {self.B.shape[0]} rows x {self.B.shape[1]} columns\n"
            f"Z : {self.Z.shape[0]} rows x {self.Z.shape[1]} columns\n"
            f"U : {self.U.shape[0]} rows x {self.U.shape[1]} columns\n"
            f"C : {self.C.shape[0]} rows x {self.C.shape[1]} columns\n"
            f"heldOutGenes: {len(self.heldOutGenes)}\n"
            f"withPrior: {len(self.withPrior)}\n"
            f"Uauc: {self.Uauc.shape[0]} rows x {self.Uauc.shape[1]} columns\n"
            f"Up: {self.Up.shape[0]} rows x {self.Up.shape[1]} columns\n"
            f"summary: {self.summary.shape[0]} rows x {self.summary.shape[1]} columns\n"
            f"residual: {self.residual.shape[0]} rows x {self.residual.shape[1]} columns\n"
            f"L1 is set to {self.L1:.4f}\n"
            f"L2 is set to {self.L2:.4f}\n"
            f"L3 is set to {self.L3:.4f}"
        )

    def __str__(self) -> str:
        return self.__repr__()

    def __eq__(self, other) -> bool:
        if (
            np.isclose(self.B, other.B).all()
            and np.isclose(self.Z, other.Z).all()
            and np.isclose(self.U, other.U).all()
            and np.isclose(self.C, other.C).all()
            and self.L1 == other.L1
            and self.L2 == other.L2
            and self.L3 == other.L3
            and self.heldOutGenes == other.heldOutGenes
            and self.withPrior == other.withPrior
            and np.isclose(self.Uauc, other.Uauc).all()
            and np.isclose(self.Up, other.Up).all()
            and np.isclose(self.summary, other.summary).all()
            and np.isclose(self.residual, other.residual).all()
        ):
            return True
        else:
            if not np.isclose(self.B, other.B).all():
                print("B is unequal")
            elif not np.isclose(self.Z, other.Z).all():
                print("Z is unequal")
            elif not np.isclose(self.U, other.U).all():
                print("U is unequal")
            elif not np.isclose(self.C, other.C).all():
                print("C is unequal")
            elif self.L1 != other.L1:
                print("L1 is unequal")
            elif self.L2 != other.L2:
                print("L2 is unequal")
            elif self.L3 != other.L3:
                print("L3 is unequal")
            elif self.heldOutGenes != other.heldOutGenes:
                print("heldOutGenes is unequal")
            elif self.withPrior != other.withPrior:
                print("withPrior is unequal")
            elif not np.isclose(self.Uauc, other.Uauc).all():
                print("Uauc is unequal")
            elif not np.isclose(self.Up, other.Up).all():
                print("Up is unequal")
            elif not np.isclose(self.summary, other.summary).all():
                print("summary is unequal")
            elif not np.isclose(self.residual, other.residual).all():
                print("residual is unequal")
            return False

    def to_dict(self):
        return {
            "B": self.B.to_dict(),
            "Z": self.Z.to_dict(),
            "U": self.U.to_dict(),
            "C": self.C.to_dict(),
            "L1": self.L1,
            "L2": self.L2,
            "L3": self.L3,
            "heldOutGenes": self.heldOutGenes,
            "withPrior": self.withPrior,
            "residual": self.residual.to_dict(),
            "Uauc": self.Uauc.to_dict(),
            "Up": self.Up.to_dict(),
            "summary": self.summary.to_dict(),
        }

    def to_disk(self, loc: Path, compress: bool = False) -> bool:
        if not isinstance(loc, Path):
            loc = Path(loc)
        if compress or loc.suffix == ".gz":
            with gzip.open(loc, "wt", encoding="UTF-8") as zipfile:
                json.dump(self.to_dict(), zipfile)
        else:
            with open(loc, "w") as jsonfile:
                json.dump(obj=self.to_dict(), fp=jsonfile)
        return True

    @classmethod
    def from_dict(cls, source):
        pr = cls()
        if "B" in source:
            pr.B = pd.DataFrame.from_dict(source["B"])
        if "Z" in source:
            pr.Z = pd.DataFrame.from_dict(source["Z"])
        if "U" in source:
            pr.U = pd.DataFrame.from_dict(source["U"])
        if "C" in source:
            pr.C = pd.DataFrame.from_dict(source["C"])

        if "L1" in source:
            pr.L1 = source["L1"]
        if "L2" in source:
            pr.L2 = source["L2"]
        if "L3" in source:
            pr.L3 = source["L3"]

        if "heldOutGenes" in source:
            pr.heldOutGenes = source["heldOutGenes"]
        if "withPrior" in source:
            pr.withPrior = source["withPrior"]

        if "residual" in source:
            pr.residual = pd.DataFrame.from_dict(source["residual"])
        if "Uauc" in source:
            pr.Uauc = pd.DataFrame.from_dict(source["Uauc"])
        if "Up" in source:
            pr.Up = pd.DataFrame.from_dict(source["Up"])
        if "summary" in source:
            pr.summary = pd.DataFrame.from_dict(source["summary"])
        return pr

    @classmethod
    def from_disk(cls, loc: Union[Path, str]):
        if str(loc).endswith(".gz"):
            with gzip.open(loc, "rt", encoding="UTF-8") as infile:
                input_dict = json.load(fp=infile)
        else:
            try:
                with open(loc, "r") as infile:
                    input_dict = json.load(fp=infile)
            except UnicodeDecodeError:
                with gzip.open(loc, "rt", encoding="UTF-8") as infile:
                    input_dict = json.load(fp=infile)
        pr = cls().from_dict(input_dict)
        return pr

In [3]:
data_dir = Path("/workspaces/pyplier/tests/data")

In [12]:
project_folder = Path("/workspaces/pyplier/tests")
heldOutGenes_file = project_folder / "data" / "common" / "plierRes_heldoutgenes.csv.gz"
B_file = project_folder / "data" / "common" / "plierRes_b.csv.gz"
C_file = project_folder / "data" / "common" / "plierRes_c.csv.gz"
residual_file = project_folder / "data" / "common" / "plierRes_residual.csv.gz"
U_file = project_folder / "data" / "common" / "plierRes_u.csv.gz"
Z_file = project_folder / "data" / "common" / "plierRes_z.csv.gz"

Uauc_file = project_folder / "data" / "common" / "plierRes_uauc.csv.gz"
Upval_file = project_folder / "data" / "common" / "plierRes_up.csv.gz"
summary_file = project_folder / "data" / "common" / "plierRes_summary.csv.gz"

L1 = 18.43058
L2 = 36.86117
L3 = 0.0004307425
withPrior = {
    "LV1": 1,
    "LV2": 2,
    "LV3": 3,
    "LV4": 4,
    "LV5": 5,
    "LV6": 6,
    "LV7": 7,
    "LV8": 8,
    "LV9": 9,
    "LV10": 10,
    "LV11": 11,
    "LV14": 14,
    "LV15": 15,
    "LV18": 18,
    "LV20": 20,
    "LV23": 23,
    "LV24": 24,
    "LV26": 26,
    "LV27": 27,
    "LV29": 29,
    "LV30": 30,
}


pr = PLIERResults(
    residual=pd.read_csv(residual_file, index_col=0),
    B=pd.read_csv(B_file, index_col=0),
    Z=pd.read_csv(Z_file, index_col=0),
    U=pd.read_csv(U_file, index_col=0),
    C=pd.read_csv(C_file, index_col=0),
    L1=L1,
    L2=L2,
    L3=L3,
    heldOutGenes={
        k: g["value"].tolist()
        for k, g in pd.read_csv(heldOutGenes_file, index_col=0).groupby("name")
    },
    withPrior=withPrior,
    Uauc=pd.read_csv(Uauc_file, index_col=0),
    Up=pd.read_csv(Upval_file, index_col=0),
    summary=pd.read_csv(summary_file, index_col=0),
)

In [13]:
prod = PLIERResults.from_disk(
    Path("/workspaces/pyplier/tests/data/plierRes/plierRes.json.gz")
)

In [15]:
prod

B : 30 rows x 36 columns
Z : 5892 rows x 30 columns
U : 606 rows x 30 columns
C : 5892 rows x 606 columns
heldOutGenes: 603
withPrior: 21
Uauc: 606 rows x 30 columns
Up: 606 rows x 30 columns
summary: 64 rows x 4 columns
residual: 5892 rows x 36 columns
L1 is set to 18.4306
L2 is set to 36.8612
L3 is set to 0.0004

In [16]:
prod_dict = prod.to_dict()

In [23]:
import dill

In [24]:
with open(
    "/workspaces/pyplier/tests/data/plierRes/plierRes_dict.pkl", "wb"
) as pickle_dict:
    dill.dump(prod_dict, pickle_dict)

In [25]:
with open("/workspaces/pyplier/tests/data/plierRes/plierRes_dict.pkl", "rb") as pf:
    pickle_dict = dill.load(pf)

In [26]:
prod_dict == pickle_dict

True

In [27]:
np.isclose(prod_dict, pickle_dict)

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [28]:
import pytest

In [29]:
pytest.assertDictEqual(prod_dict, pickle_dict)

AttributeError: module 'pytest' has no attribute 'assertDictEqual'

In [14]:
prod == pr

True

In [31]:
prod_dict.keys()

dict_keys(['B', 'Z', 'U', 'C', 'L1', 'L2', 'L3', 'heldOutGenes', 'withPrior', 'residual', 'Uauc', 'Up', 'summary'])

In [36]:
from deepdiff import DeepDiff

In [42]:
dict_diff = DeepDiff(
    prod_dict,
    pickle_dict,
    ignore_order=False,
    max_diffs=100,
    cache_size=5000,
)

DeepDiff has reached the max number of diffs of 100. You can possibly get more accurate results by increasing the max_diffs parameter.


In [43]:
dict_diff = {}

In [41]:
dict_diff

{}

In [8]:
pd.testing.assert_frame_equal(pr.B, prod.B, rtol=1e-3, atol=1e-5)

In [75]:
pd.testing.assert_frame_equal(prod.B, pr.B)

In [86]:
pr.summary.iloc[:, 3].equals(prod.summary.iloc[:, 3])

False

In [87]:
pr.summary.iloc[:, 3][pr.summary.iloc[:, 3] != prod.summary.iloc[:, 3]]

DMAP_ERY3                    2.587629e-45
IRIS_Bcell-Memory_IgG_IgA    1.629453e-09
Name: FDR, dtype: float64

In [88]:
prod.summary.iloc[:, 3][pr.summary.iloc[:, 3] != prod.summary.iloc[:, 3]]

DMAP_ERY3                    2.587629e-45
IRIS_Bcell-Memory_IgG_IgA    1.629453e-09
Name: FDR, dtype: float64

In [89]:
prod.summary.iloc[:, 3][
    pr.summary.iloc[:, 3] != prod.summary.iloc[:, 3]
] == prod.summary.iloc[:, 3][pr.summary.iloc[:, 3] != prod.summary.iloc[:, 3]]

DMAP_ERY3                    True
IRIS_Bcell-Memory_IgG_IgA    True
Name: FDR, dtype: bool

In [93]:
np.where(pr.summary.iloc[:, 3] != prod.summary.iloc[:, 3])

(array([18, 37]),)

In [94]:
pr.summary.iloc[[18, 37], 3]

DMAP_ERY3                    2.587629e-45
IRIS_Bcell-Memory_IgG_IgA    1.629453e-09
Name: FDR, dtype: float64

In [99]:
pd.testing.assert_series_equal(
    left=prod.summary.iloc[[18, 37], 3], right=pr.summary.iloc[[18, 37], 3]
)

In [100]:
pd.testing.assert_series_equal(
    left=prod.summary.iloc[:, 3], right=pr.summary.iloc[:, 3]
)

In [101]:
pd.testing.assert_frame_equal(left=prod.summary, right=pr.summary)

In [102]:
pd.testing.assert_frame_equal(prod.summary, pr.summary, rtol=1e-3, atol=1e-5)

In [76]:
if not pr.B.equals(prod.B):
    print("B is unequal")
elif not pr.Z.equals(prod.Z):
    print("Z is unequal")
elif not pr.U.equals(prod.U):
    print("U is unequal")
elif not pr.C.equals(prod.C):
    print("C is unequal")
elif pr.L1 != prod.L1:
    print("L1 is unequal")
elif pr.L2 != prod.L2:
    print("L2 is unequal")
elif pr.L3 != prod.L3:
    print("L3 is unequal")
elif pr.heldOutGenes != prod.heldOutGenes:
    print("heldOutGenes is unequal")
elif pr.withPrior != prod.withPrior:
    print("withPrior is unequal")
elif not pr.Uauc.equals(prod.Uauc):
    print("Uauc is unequal")
elif not pr.Up.equals(prod.Up):
    print("Up is unequal")
elif not pr.summary.equals(prod.summary):
    print("summary is unequal")
elif not pr.residual.equals(prod.residual):
    print("residual is unequal")

summary is unequal


In [77]:
pr.summary.head(n=30) == prod.summary.head(n=30)

Unnamed: 0,LV index,AUC,p-value,FDR
REACTOME_GENERIC_TRANSCRIPTION_PATHWAY,True,True,True,True
IRIS_Monocyte-Day0,True,True,True,True
IRIS_Neutrophil-Resting,True,True,True,True
DMAP_MONO1,True,True,True,True
PID_IL6_7PATHWAY,True,True,True,True
REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA,True,True,True,True
MIPS_SPLICEOSOME,True,True,True,True
REACTOME_SIGNALING_BY_RHO_GTPASES,True,True,True,True
IRIS_DendriticCell-LPSstimulated,True,True,True,True
KEGG_LYSOSOME,True,True,True,True


In [80]:
pr.summary.head(n=64).tail(n=30) == prod.summary.head(n=64).tail(n=30)

Unnamed: 0,LV index,AUC,p-value,FDR
DMAP_MEGA2,True,True,True,True
DMAP_TCELLA1,True,True,True,True
DMAP_TCELLA3,True,True,True,True
IRIS_Bcell-Memory_IgG_IgA,True,True,True,False
IRIS_Bcell-Memory_IgM,True,True,True,True
IRIS_Bcell-naive,True,True,True,True
DMAP_BCELLA1,True,True,True,True
KEGG_VASCULAR_SMOOTH_MUSCLE_CONTRACTION,True,True,True,True
KEGG_ECM_RECEPTOR_INTERACTION,True,True,True,True
KEGG_PURINE_METABOLISM,True,True,True,True


In [79]:
pr.summary.shape

(64, 4)

In [38]:
pr.summary.loc["DMAP_ERY3", :]

LV index    1.100000e+01
AUC         9.311594e-01
p-value     2.812640e-47
FDR         2.587629e-45
Name: DMAP_ERY3, dtype: float64

In [39]:
prod.summary.loc["DMAP_ERY3", :]

LV index    1.100000e+01
AUC         9.311594e-01
p-value     2.812640e-47
FDR         2.587629e-45
Name: DMAP_ERY3, dtype: float64

In [43]:
pr.summary.loc["DMAP_ERY3", "FDR"]

2.5876290233528305e-45

In [44]:
prod.summary.loc["DMAP_ERY3", "FDR"]

2.5876290233528302e-45

In [30]:
prod.summary

Unnamed: 0,LV index,AUC,p-value,FDR
REACTOME_GENERIC_TRANSCRIPTION_PATHWAY,29,0.590790,5.872015e-03,8.063065e-03
IRIS_Monocyte-Day0,5,0.800015,1.001095e-11,9.590097e-11
IRIS_Neutrophil-Resting,18,0.690748,2.819430e-08,1.365197e-07
DMAP_MONO1,5,0.842793,1.525878e-07,5.615231e-07
PID_IL6_7PATHWAY,2,0.712596,7.799121e-03,1.055175e-02
...,...,...,...,...
REACTOME_MITOCHONDRIAL_PROTEIN_IMPORT,30,0.762270,1.896318e-03,3.007953e-03
REACTOME_PEPTIDE_CHAIN_ELONGATION,30,0.949471,1.934557e-11,1.617993e-10
REACTOME_FORMATION_OF_THE_TERNARY_COMPLEX_AND_SUBSEQUENTLY_THE_43S_COMPLEX,30,0.998620,1.862630e-08,1.008011e-07
MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,30,0.886059,3.125977e-10,2.054214e-09


In [25]:
prod.summary.to_csv(project_folder / "data" / "common" / "plierRes_summary.csv.gz")