In [75]:
import warnings
from copy import deepcopy
from typing import Optional, TypedDict, Union

import numpy as np
import pandas as pd
import dask as ds
import dask.dataframe as dd
import dask.array as da
import vaex as vx
from pyplier import PLIERResults
from glmnet import ElasticNet
from tqdm.auto import trange, tqdm

In [None]:
from gzip import open as g

In [65]:
Z = dd.read_csv("tests/data/solveU/Z.csv.gz",blocksize=None).set_index("gene").repartition(npartitions=8)
Chat = dd.read_csv("tests/data/solveU/Chat.csv.gz",blocksize=None).set_index("pathway").repartition(npartitions=8)
priorMat = dd.read_csv("tests/data/solveU/priorMat.csv.gz",blocksize=None).set_index("gene").repartition(npartitions=8)
penalty_factor = np.loadtxt("tests/data/solveU/penalty_factor.csv.gz")

# vaex cannot handle gzipped files on its own
# Z_vx = vx.from_pandas(pd.read_csv("tests/data/solveU/Z.csv.gz",index_col="gene"), copy_index=True)
# Chat_vx = vx.from_pandas(pd.read_csv("tests/data/solveU/Chat.csv.gz", index_col="pathway"), copy_index=True)
# priorMat_vx = vx.from_pandas(pd.read_csv("tests/data/solveU/priorMat.csv.gz", index_col="gene"), copy_index=True)
# penalty_factor_vx = np.loadtxt("tests/data/solveU/penalty_factor.csv.gz")

In [64]:
U = dd.read_csv("tests/data/solveU/U_complete.csv.gz", assume_missing=True,blocksize=None).set_index("pathway").repartition(npartitions=8)

In [66]:
priorMat.columns.name = "pathway"

In [67]:
pathwaySelection = "fast"
glm_alpha = 0.9
maxPath = 10
target_frac = 0.7
L3 = None

In [68]:
from scipy.stats import rankdata

In [69]:
class solveUReturnDict(TypedDict):
    U: pd.DataFrame
    L3: float

In [78]:
Ur = da.matmul(Chat.to_dask_array(lengths=True), Z.to_dask_array(lengths=True))

In [79]:
Ur

Unnamed: 0,Array,Chunk
Bytes,142.03 kiB,17.81 kiB
Shape,"(606, 30)","(76, 30)"
Count,15 Graph Layers,8 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 142.03 kiB 17.81 kiB Shape (606, 30) (76, 30) Count 15 Graph Layers 8 Chunks Type float64 numpy.ndarray",30  606,

Unnamed: 0,Array,Chunk
Bytes,142.03 kiB,17.81 kiB
Shape,"(606, 30)","(76, 30)"
Count,15 Graph Layers,8 Chunks
Type,float64,numpy.ndarray


In [83]:
Ur_ranks = rankdata(-Ur, axis=0)

In [84]:
Ur_ranks

array([[474., 593., 500., ..., 468., 477., 363.],
       [600., 384., 566., ..., 381., 142., 407.],
       [460., 105., 546., ..., 502.,  23., 154.],
       ...,
       [473., 184., 387., ..., 421., 225., 331.],
       [485., 341., 481., ...,  49., 246., 405.],
       [ 52.,  78., 126., ..., 104., 439., 160.]])

In [85]:
if pathwaySelection != "fast":
    iip = np.where([Ur.min(axis=1) <= maxPath])[1]

In [8]:
def new_solveU(
    Z,
    Chat,
    priorMat,
    penalty_factor,
    pathwaySelection: str = "fast",
    glm_alpha: float = 0.9,
    maxPath: int = 10,
    target_frac: float = 0.7,
    L3: Optional[float] = None,
) -> solveUReturnDict:
    """[summary]

    Parameters
    ----------
    Z : [type]
        current Z estimate
    Chat : [type]
        the inverse of the C matrix
    priorMat : [type]
        the prior pathway or C matrix
    penalty_factor : [type]
        Penalties for different pathways, must have size priorMat.shape[1].
    pathwaySelection : str, optional
        Method to use for pathway selection., by default "fast"
    glm_alpha : float, optional
        The elsatic net alpha parameter, by default 0.9
    maxPath : int, optional
        The maximum number of pathways to consider, by default 10
    target_frac : float, optional
        The target fraction on non-zero columns of, by default 0.7
    L3 : float, optional
        Solve with a given L3, otherwise search, by default None

    Returns
    -------
    [type]
        [description]
    """
    Ur = Chat @ Z  # get U by OLS

    Ur = Ur.rank(axis="index", ascending=False)  # rank

    if pathwaySelection != "fast":
        iip = np.where([Ur.min(axis=1) <= maxPath])[1]

    results = dict()
    
    if L3 is None:
        U = np.zeros(shape=(priorMat.shape[1], Z.shape[1]))
        
        lambdas = np.exp(np.arange(start=-4, stop=-12.125, step=-0.125))
        results = dict()
        lMat = np.full((len(lambdas), Z.shape[1]), np.nan)
        gres = ElasticNet(
            lambda_path=lambdas,
            lower_limits=0,
            standardize=False,
            fit_intercept=True,
            alpha=glm_alpha,
            max_features=150,
        )

        for i in range(Z.shape[1]):   
            if pathwaySelection == "fast":
                iip = np.where([Ur.iloc[:, i] <= maxPath])[1]

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                gres.fit(
                    y=Z.iloc[:, i],
                    X=priorMat.iloc[:, iip],
                    relative_penalties=penalty_factor[iip],
                )

            gres.iip = iip
            lMat[:, i] = np.sum(np.where(gres.coef_path_ > 0, 1, 0), axis=0)
            results[i] = deepcopy(gres)

        fracs = np.mean(np.where(lMat > 0, 1, 0), axis=1)
        iibest = np.where(abs(target_frac - fracs) == abs((target_frac - fracs)).min())[
            0
        ][0]

        # yeah, so this is not very pythonic, but it matches the R code
        # TODO: replace this with something like our original attempt
        for i in trange(Z.shape[1]):
            U[results[i].iip, i] = results[i].coef_path_[:, iibest]

        U = pd.DataFrame(U, index=priorMat.columns, columns=Z.columns).fillna(0)
        L3 = lambdas[iibest]
    else:
        # do one fit with a given lambda
        gres = ElasticNet(
            lambda_path=[L3 * 0.9, L3, L3 * 1.1],
            lower_limits=0,
            standardize=False,
            fit_intercept=True,
            alpha=glm_alpha,
            max_features=150,
        )

        for i in range(Z.shape[1]):
            if pathwaySelection == "fast":
                iip = np.where([Ur.iloc[:, i] <= maxPath])[1]

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                # try:
                gres.fit(
                    y=Z.iloc[:, i],
                    X=priorMat.iloc[:, iip],
                    relative_penalties=penalty_factor[iip],
                )
            results[i] = pd.Series(
                data=gres.coef_path_[:,1], index=Ur.index[iip]
            )

        U = pd.DataFrame(results, index=priorMat.columns).fillna(0)

    return solveUReturnDict(U=U, L3=L3)


In [9]:
def old_solveU(
    Z,
    Chat,
    priorMat,
    penalty_factor,
    pathwaySelection: str = "fast",
    glm_alpha: float = 0.9,
    maxPath: int = 10,
    target_frac: float = 0.7,
    L3: Optional[float] = None,
) -> solveUReturnDict:
    """[summary]

    Parameters
    ----------
    Z : [type]
        current Z estimate
    Chat : [type]
        the inverse of the C matrix
    priorMat : [type]
        the prior pathway or C matrix
    penalty_factor : [type]
        Penalties for different pathways, must have size priorMat.shape[1].
    pathwaySelection : str, optional
        Method to use for pathway selection., by default "fast"
    glm_alpha : float, optional
        The elsatic net alpha parameter, by default 0.9
    maxPath : int, optional
        The maximum number of pathways to consider, by default 10
    target_frac : float, optional
        The target fraction on non-zero columns of, by default 0.7
    L3 : float, optional
        Solve with a given L3, otherwise search, by default None

    Returns
    -------
    [type]
        [description]
    """
    Ur = Chat @ Z  # get U by OLS

    Ur = Ur.rank(axis="index", ascending=False)  # rank
    Urm = Ur.min(axis=1)

    U = pd.DataFrame(np.zeros(shape=(priorMat.shape[1], Z.shape[1])))
    if L3 is None:
        lambdas = np.exp(np.arange(start=-4, stop=-12.125, step=-0.125))
        results = dict()
        lMat = np.full((len(lambdas), Z.shape[1]), np.nan)

        for i in range(Z.shape[1]):
            if pathwaySelection == "fast":
                iip = np.where([Ur.iloc[:, i] <= maxPath])[1]
            else:
                iip = np.where([Urm <= maxPath])[1]

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                gres = ElasticNet(
                    lambda_path=lambdas,
                    lower_limits=0,
                    standardize=False,
                    fit_intercept=True,
                    alpha=glm_alpha,
                    max_features=150,
                )

                gres.fit(
                    y=Z.iloc[:, i].astype(np.float64).values,
                    X=priorMat.iloc[:, iip].astype(np.float64).values,
                    relative_penalties=[penalty_factor[_] for _ in iip],
                )

            gres.iip = iip
            lMat[:, i] = np.sum(np.where(gres.coef_path_ > 0, 1, 0), axis=0)
            results[i] = deepcopy(gres)

        fracs = np.mean(np.where(lMat > 0, 1, 0), axis=1)
        iibest = np.where(abs(target_frac - fracs) == abs((target_frac - fracs)).min())[
            0
        ][0]

        # yeah, so this is not very pythonic, but it matches the R code
        # TODO: replace this with something like our original attempt
        for i in trange(Z.shape[1]):
            U.iloc[results[i].iip, i] = results[i].coef_path_[:, iibest]

        U.index = priorMat.columns
        U.columns = Z.columns
        # try:
        #     U = (pd.DataFrame(
        #             index=(priorMat.columns.set_names("pathway")).merge(pd.DataFrame(data={
        #                 i: pd.Series(
        #                     data=results[i].coef_path_[:, iibest],
        #                     index=Ur.index[results[i].iip].set_names("pathway")
        #                     )
        #                 for i in range(Z.shape[1])
        #             }, ),
        #             on="pathway",
        #             how="left",
        #         ).fillna(0)))
        # except KeyError:
        #     print("oops!")
        #     print(
        #         pd.DataFrame(data={
        #             i: pd.Series(
        #                 data=results[i].coef_path_[:, iibest],
        #                 index=Ur.index[results[i].iip],
        #             )
        #             for i in range(Z.shape[1])
        #         }, ).index.name)
        #     print(pd.DataFrame(index=priorMat.columns).index.name)

        # what is the point of this?  It is never used!
        # Utmp = solveU(Z, Chat, priorMat, penalty.factor,
        #     pathwaySelection = "fast", glm_alpha = 0.9, maxPath = 10,
        #     L3 = lambdas[iibest]
        #     )

        # stop()
        return solveUReturnDict(U=U, L3=lambdas[iibest])
    else:
        # do one fit with a given lambda
        results = dict()
        for i in range(Z.shape[1]):
            if pathwaySelection == "fast":
                iip = np.where([Ur.iloc[:, i] <= maxPath])[1]
            else:
                iip = np.where([Urm <= maxPath])[1]

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                gres = ElasticNet(
                    lambda_path=[L3 * 0.9, L3, L3 * 1.1],
                    lower_limits=0,
                    standardize=False,
                    fit_intercept=True,
                    alpha=glm_alpha,
                    max_features=150,
                )

                # try:
                gres.fit(
                    y=Z.iloc[:, i].astype(np.float64).values,
                    X=priorMat.iloc[:, iip].astype(np.float64).values,
                    relative_penalties=[penalty_factor[_] for _ in iip],
                )
            # except TypeError:
            #     print(f"iip: {iip}")
            #     print(f"sliced: {[penalty_factor[_] for _ in iip]}")
            #     print(f"penalty_factor: {penalty_factor}")

            # try:
            results[i] = pd.Series(
                data=[_[1] for _ in gres.coef_path_], index=Ur.index[iip]
            )
            # except AttributeError:
            #     print(dir(gres))

            # U[iip, i] = [_[1] for _ in gres.coef_path_]

        U = (
            pd.DataFrame(index=priorMat.columns.set_names("pathway"))
            .merge(
                pd.DataFrame(
                    {i: results[i] for i in range(Z.shape[1])},
                ).rename_axis(index="pathway", axis="index"),
                on="pathway",
                how="left",
            )
            .fillna(0)
        )

        return solveUReturnDict(U=U, L3=L3)

In [9]:
pathwaySelection: str = "fast"
glm_alpha: float = 0.9
maxPath: int = 10
target_frac: float = 0.7
L3: Optional[float] = None

In [None]:
Chat.repartition(npartitions=16).to_dask_array()

Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan, 5892)","(nan, 5892)"
Count,5 Graph Layers,16 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes unknown unknown Shape (nan, 5892) (nan, 5892) Count 5 Graph Layers 16 Chunks Type float64 numpy.ndarray",,

Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan, 5892)","(nan, 5892)"
Count,5 Graph Layers,16 Chunks
Type,float64,numpy.ndarray


In [34]:
Chat.compute()

Unnamed: 0_level_0,GAS6,MMP14,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,SERPING1,...,LDHA,LDHB,ACAP1,ACAP2,ACAP3,CFL2,CFL1,SELL,GNGT2,SERPINH1
pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BIOCARTA_BCR_PATHWAY,-0.000364,0.000548,-0.000282,-0.001829,0.000391,0.001880,0,-0.000405,0.000096,-0.001209,...,0.000991,0.000160,-0.000358,0.000177,0.000177,-0.000001,-0.000307,-0.000408,0.003318,-0.000732
BIOCARTA_BIOPEPTIDES_PATHWAY,0.000052,-0.001438,-0.000771,-0.001178,-0.000395,0.001926,0,0.000515,0.000038,-0.001865,...,0.000980,-0.000224,0.000103,0.000343,0.000343,0.000181,-0.004209,0.001216,0.002241,0.000378
BIOCARTA_CARM_ER_PATHWAY,-0.000193,-0.001541,0.000236,0.000855,-0.003961,0.001471,0,-0.000020,0.000024,0.000160,...,0.000617,0.000302,0.000214,0.000017,0.000017,-0.001011,0.001109,-0.000046,0.001038,0.000026
BIOCARTA_CHREBP2_PATHWAY,0.000139,-0.000094,0.000393,0.000642,-0.002909,-0.000710,0,-0.000062,0.000065,0.000307,...,0.001622,0.000357,0.000033,-0.000034,-0.000034,-0.000051,0.004231,0.000055,-0.005164,-0.000330
BIOCARTA_DEATH_PATHWAY,-0.000084,0.000166,-0.000357,-0.000796,0.001487,-0.000756,0,0.000255,-0.000225,-0.000573,...,0.000030,-0.000235,-0.000291,-0.000219,-0.000219,-0.004631,0.001043,-0.000495,-0.000563,-0.000155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ST_JNK_MAPK_PATHWAY,-0.000619,-0.004488,-0.000671,0.000315,0.000007,0.000999,0,0.000413,-0.000609,-0.000389,...,-0.002565,-0.001803,-0.000003,-0.000314,-0.000314,-0.001390,-0.002971,-0.000323,0.000080,0.000009
ST_P38_MAPK_PATHWAY,0.000375,0.000684,-0.002885,-0.000099,0.002332,-0.000342,0,-0.000119,0.000239,0.000110,...,-0.000019,0.000736,0.000245,-0.000319,-0.000319,-0.000807,-0.003931,-0.000008,0.000306,0.000539
ST_PHOSPHOINOSITIDE_3_KINASE_PATHWAY,0.000103,-0.001940,0.000703,0.001493,-0.001961,0.001128,0,0.000096,-0.000134,-0.001157,...,0.001179,0.000665,0.000415,0.000250,0.000250,0.001638,0.000057,-0.001160,-0.001830,-0.000746
ST_T_CELL_SIGNAL_TRANSDUCTION,0.000487,-0.001473,-0.000769,0.000046,0.000566,-0.000396,0,-0.000242,0.000037,0.000232,...,-0.000013,0.000052,-0.000234,-0.000046,-0.000046,-0.001121,0.000516,-0.000350,0.000832,0.000702


In [36]:
Chat.to_dask_array().compute()

array([[-3.63912742e-04,  5.47980923e-04, -2.81629603e-04, ...,
        -4.08018386e-04,  3.31811611e-03, -7.31603148e-04],
       [ 5.19826852e-05, -1.43809192e-03, -7.71412873e-04, ...,
         1.21564986e-03,  2.24073411e-03,  3.78269397e-04],
       [-1.92957775e-04, -1.54148704e-03,  2.35794020e-04, ...,
        -4.59427420e-05,  1.03785155e-03,  2.64017502e-05],
       ...,
       [ 1.03029894e-04, -1.93977886e-03,  7.03200077e-04, ...,
        -1.15972020e-03, -1.83026929e-03, -7.46044767e-04],
       [ 4.87018435e-04, -1.47340150e-03, -7.68641327e-04, ...,
        -3.49936948e-04,  8.31783819e-04,  7.01757019e-04],
       [ 8.42933019e-05,  5.35105410e-04, -1.70064119e-04, ...,
        -6.72343953e-04,  1.17692491e-03, -1.32486623e-04]])

In [39]:
ds.array.matmul(
    Chat.to_dask_array().compute(),
    Z.to_dask_array().compute()
)

Unnamed: 0,Array,Chunk
Bytes,142.03 kiB,142.03 kiB
Shape,"(606, 30)","(606, 30)"
Count,4 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 142.03 kiB 142.03 kiB Shape (606, 30) (606, 30) Count 4 Graph Layers 1 Chunks Type float64 numpy.ndarray",30  606,

Unnamed: 0,Array,Chunk
Bytes,142.03 kiB,142.03 kiB
Shape,"(606, 30)","(606, 30)"
Count,4 Graph Layers,1 Chunks
Type,float64,numpy.ndarray


In [22]:
Z.head()

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,0.0,0.0,0.002858,0.0,0.007514,0.003213,0.0,0.0,0.027707,0.0,...,0.0,0.006169,0.0,0.0,0.005294,0.001581,0.0,0.0,0.0,0.007603
AAAS,0.0,0.0,0.015572,0.0,0.002834,0.001994,0.0,0.006436,0.0,0.0,...,0.000339,0.0,0.006605,0.006283,0.004323,0.0,0.012863,0.008488,0.002946,0.0
AANAT,0.0,0.006308,0.016629,0.005841,0.0,0.0,0.000709,0.0,0.0,0.0,...,0.033281,0.006352,0.0,0.0,0.0,0.013674,0.009992,0.013903,0.0,0.048256
AARS,0.0027,0.0,0.006869,0.0,0.0,0.000326,0.0,0.020173,0.013918,0.005037,...,0.0,0.0,0.010956,0.004082,0.018543,0.003313,0.0,0.0,0.0,0.0
AARS2,0.0,0.001811,0.031294,0.009151,0.0,0.003764,0.002944,0.002544,0.013016,0.004083,...,0.013575,0.016008,0.0,0.02382,0.005846,0.003274,0.0,0.023125,0.011936,0.0014


In [23]:
Ur = ds.array.matmul(Chat, Z)  # get U by OLS

ValueError: Cannot change dimensions from [5892] to []

In [16]:
Ur.head()

Unnamed: 0_level_0,A2M,AAAS,AANAT,AARS,AARS2,AASDH,AASDHPPT,AATK,ABAT,ABCA1,...,ZNF703,ZNF708,ZNF773,ZNF92,ZNFX1,ZNRD1,ZW10,ZWILCH,ZWINT,ZYX
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,,,,,,,,,,,...,,,,,,,,,,
AAAS,,,,,,,,,,,...,,,,,,,,,,
AANAT,,,,,,,,,,,...,,,,,,,,,,
AARS,,,,,,,,,,,...,,,,,,,,,,
AARS2,,,,,,,,,,,...,,,,,,,,,,


In [9]:
Ur = Ur.rank(axis="index", ascending=False)  # rank
Urm = Ur.min(axis=1)

In [9]:
U = pd.DataFrame(np.zeros(shape=(priorMat.shape[1], Z.shape[1])))
if L3 is None:
    lambdas = np.exp(np.arange(start=-4, stop=-12.125, step=-0.125))
    results = dict()
    lMat = np.full((len(lambdas), Z.shape[1]), np.nan)

    for i in range(Z.shape[1]):
        if pathwaySelection == "fast":
            iip = np.where([Ur.iloc[:, i] <= maxPath])[1]
        else:
            iip = np.where([Urm <= maxPath])[1]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gres = ElasticNet(
                lambda_path=lambdas,
                lower_limits=0,
                standardize=False,
                fit_intercept=True,
                alpha=glm_alpha,
                max_features=150,
            )

            gres.fit(
                y=Z.iloc[:, i].astype(np.float64).values,
                X=priorMat.iloc[:, iip].astype(np.float64).values,
                relative_penalties=[penalty_factor[_] for _ in iip],
            )

        gres.iip = iip
        lMat[:, i] = np.sum(np.where(gres.coef_path_ > 0, 1, 0), axis=0)
        results[i] = deepcopy(gres)

    fracs = np.mean(np.where(lMat > 0, 1, 0), axis=1)
    iibest = np.where(abs(target_frac - fracs) == abs((target_frac - fracs)).min())[
        0
    ][0]

    # yeah, so this is not very pythonic, but it matches the R code
    # TODO: replace this with something like our original attempt
    for i in trange(Z.shape[1]):
        U.iloc[results[i].iip, i] = results[i].coef_path_[:, iibest]

    U.index = priorMat.columns
    U.columns = Z.columns
    # try:
    #     U = (pd.DataFrame(
    #             index=(priorMat.columns.set_names("pathway")).merge(pd.DataFrame(data={
    #                 i: pd.Series(
    #                     data=results[i].coef_path_[:, iibest],
    #                     index=Ur.index[results[i].iip].set_names("pathway")
    #                     )
    #                 for i in range(Z.shape[1])
    #             }, ),
    #             on="pathway",
    #             how="left",
    #         ).fillna(0)))
    # except KeyError:
    #     print("oops!")
    #     print(
    #         pd.DataFrame(data={
    #             i: pd.Series(
    #                 data=results[i].coef_path_[:, iibest],
    #                 index=Ur.index[results[i].iip],
    #             )
    #             for i in range(Z.shape[1])
    #         }, ).index.name)
    #     print(pd.DataFrame(index=priorMat.columns).index.name)

    # what is the point of this?  It is never used!
    # Utmp = solveU(Z, Chat, priorMat, penalty.factor,
    #     pathwaySelection = "fast", glm_alpha = 0.9, maxPath = 10,
    #     L3 = lambdas[iibest]
    #     )

    # stop()
    return solveUReturnDict(U=U, L3=lambdas[iibest])
else:
    # do one fit with a given lambda
    results = dict()
    for i in range(Z.shape[1]):
        if pathwaySelection == "fast":
            iip = np.where([Ur.iloc[:, i] <= maxPath])[1]
        else:
            iip = np.where([Urm <= maxPath])[1]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gres = ElasticNet(
                lambda_path=[L3 * 0.9, L3, L3 * 1.1],
                lower_limits=0,
                standardize=False,
                fit_intercept=True,
                alpha=glm_alpha,
                max_features=150,
            )

            # try:
            gres.fit(
                y=Z.iloc[:, i].astype(np.float64).values,
                X=priorMat.iloc[:, iip].astype(np.float64).values,
                relative_penalties=[penalty_factor[_] for _ in iip],
            )
        # except TypeError:
        #     print(f"iip: {iip}")
        #     print(f"sliced: {[penalty_factor[_] for _ in iip]}")
        #     print(f"penalty_factor: {penalty_factor}")

        # try:
        results[i] = pd.Series(
            data=[_[1] for _ in gres.coef_path_], index=Ur.index[iip]
        )
        # except AttributeError:
        #     print(dir(gres))

        # U[iip, i] = [_[1] for _ in gres.coef_path_]

    U = (
        pd.DataFrame(index=priorMat.columns.set_names("pathway"))
        .merge(
            pd.DataFrame(
                {i: results[i] for i in range(Z.shape[1])},
            ).rename_axis(index="pathway", axis="index"),
            on="pathway",
            how="left",
        )
        .fillna(0)
    )

    return solveUReturnDict(U=U, L3=L3)

In [12]:
%%timeit -n 10
solveu_new_res = new_solveU(
    Z,
    Chat,
    priorMat,
    penalty_factor,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 64660.39it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 62446.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 65775.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 58826.14it/s]
100%|███████████

4.69 s ± 278 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)





In [13]:
%%timeit -n 10
solveu_old_res = old_solveU(
    Z,
    Chat,
    priorMat,
    penalty_factor,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 6726.67it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 5625.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 6470.69it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 6502.12it/s]
100%|███████████

4.11 s ± 159 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)





In [17]:
Z.index.to_numpy()

array(['GAS6', 'MMP14', 'MARCKSL1', ..., 'SELL', 'GNGT2', 'SERPINH1'],
      dtype=object)

In [68]:
solveu_df_res_l3_old = solveU(
    Z,
    Chat,
    priorMat,
    penalty_factor,
    L3= 0.0000019304541362277093,
)

In [69]:
solveu_df_res_l3_old['U']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IRIS_Bcell-Memory_IgG_IgA,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
IRIS_Bcell-Memory_IgM,0.0,0.0,0.004193,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
IRIS_Bcell-naive,0.0,0.0,0.004399,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
IRIS_CD4Tcell-N0,0.0,0.0,0.000000,0.0,0.0,0.008638,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
IRIS_CD4Tcell-Th1-restimulated12hour,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.007342,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0034,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
PID_BCR_5PATHWAY,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
PID_TELOMERASEPATHWAY,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [73]:
solveu_df_res_l3_new = solveU(
    Z,
    Chat,
    priorMat,
    penalty_factor,
    L3= 0.0000019304541362277093,
)

In [75]:
solveu_df_res_l3_new['U']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IRIS_Bcell-Memory_IgG_IgA,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
IRIS_Bcell-Memory_IgM,0.0,0.0,0.004193,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
IRIS_Bcell-naive,0.0,0.0,0.004399,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
IRIS_CD4Tcell-N0,0.0,0.0,0.000000,0.0,0.0,0.008638,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
IRIS_CD4Tcell-Th1-restimulated12hour,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.007342,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0034,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
PID_BCR_5PATHWAY,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
PID_TELOMERASEPATHWAY,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [107]:
pd.DataFrame(solveu_arr_res["U"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000693,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.000835,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
602,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
603,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
604,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
%%timeit -n50
Z.iloc[:, 0].astype(np.float64).values

115 µs ± 5.95 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


In [17]:
%%timeit -n50
Z.iloc[:, 0].to_numpy()

48.4 µs ± 7 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


In [81]:
%%timeit -n100
U.iloc[:,0]

82.7 µs ± 2.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [82]:
U_arr = U.to_numpy()

In [83]:
%%timeit -n100
U_arr[:,0]

226 ns ± 20.9 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
Ur = Chat @ Z

TypeError: unsupported operand type(s) for @: 'DataFrameLocal' and 'DataFrameLocal'

In [49]:
Z = pd.read_csv("tests/data/solveU/Z.csv.gz",index_col="gene")
Chat = pd.read_csv("tests/data/solveU/Chat.csv.gz", index_col="pathway")
priorMat = pd.read_csv("tests/data/solveU/priorMat.csv.gz", index_col="gene")

In [45]:
Chat_mm = np.memmap("chat_memmap", dtype="float32", mode="w+", shape=Chat.shape)

In [46]:
Chat_mm[:] = Chat.to_numpy()

In [47]:
Chat_mm

memmap([[-1.7110870e-03, -3.3942747e-04, -7.6252496e-04, ...,
          8.4461510e-04,  8.1852864e-05, -2.9508781e-04],
        [-4.8422936e-04, -1.1777684e-04,  1.5851384e-04, ...,
         -1.6482534e-04,  2.5636892e-04, -2.8725798e-04],
        [-1.9961270e-03, -1.5610259e-04,  7.9156773e-04, ...,
         -1.7482779e-04, -1.6023851e-03,  4.0516508e-04],
        ...,
        [ 1.9467092e-04, -2.5748387e-03, -5.4456391e-03, ...,
         -1.1928794e-03, -5.8357610e-04, -3.7277608e-05],
        [-1.0896098e-05, -5.1027059e-04, -1.6774171e-03, ...,
          4.5249774e-04,  6.2461587e-04,  4.3883896e-04],
        [ 6.1849445e-05, -2.6981994e-03,  1.0508702e-03, ...,
         -1.8412851e-03,  2.6885581e-03,  5.2014185e-04]], dtype=float32)

In [50]:
Z_mm = np.memmap("z_memmap.dat", dtype="float32", mode="w+", shape=Z.shape)

In [51]:
Z_mm[:] = Z.to_numpy()

In [52]:
Ur = Chat_mm @ Z_mm

In [53]:
Ur

array([[ 1.5274601e-03, -5.3456315e-04, -8.8955229e-04, ...,
         2.4586052e-03,  2.1628144e-03, -2.2459673e-03],
       [-5.1806436e-04, -3.8182622e-04,  5.3602234e-03, ...,
        -6.7306380e-04,  1.3734872e-03,  2.7820517e-03],
       [-4.1583629e-04, -4.0209983e-05,  5.9829471e-03, ...,
         9.4579073e-04, -6.3950632e-04, -5.5906345e-04],
       ...,
       [ 2.3865569e-03,  1.8074773e-03,  2.5417584e-03, ...,
         1.2286623e-03,  1.4312689e-03,  9.6427504e-04],
       [ 4.6515819e-03, -1.0741022e-03,  1.3789063e-03, ...,
         1.2962836e-03, -1.5030848e-04,  1.5498644e-03],
       [-1.8760606e-03, -1.6183223e-03, -5.0713090e-05, ...,
        -4.3211764e-04,  8.9682144e-06, -3.6528480e-04]], dtype=float32)

In [None]:
from wendelin.bigarray.array_zodb import ZBigArray
