In [91]:
import random
from math import floor, ceil

import numpy as np
import pandas as pd
from icontract import require
from numpy.random import default_rng
from rich import print as rprint
from scipy.linalg import solve, svd
from sklearn.utils.extmath import randomized_svd
from tqdm.auto import trange, tqdm

from pyplier.commonRows import commonRows
from pyplier.crossVal import crossVal
from pyplier.getAUC import getAUC
from pyplier.nameB import nameB
from pyplier.num_pc import num_pc
from pyplier.pinv_ridge import pinv_ridge
from pyplier.solveU import solveU
from pyplier.PLIERRes import PLIERResults
from pyplier.utils import crossprod, rowNorm, setdiff, tcrossprod

In [2]:
from functools import wraps


def add_method(cls):
    """#from https://stackoverflow.com/a/59089116
    can't say I completely understand this at the moment, but this allows us to bind
    a new function to an existing class definition
    so, for instance, I can add this `filter_by` function to pandas.DataFrame
    without creating a new class that inherits from DataFrame
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)

        setattr(cls, func.__name__, wrapper)
        return func

    return decorator

In [3]:
@add_method(pd.DataFrame)
def peek(df, nrow=5, ncol=5):
    return df.iloc[range(nrow), range(ncol)]

In [4]:
from pathlib import Path

In [207]:
data_dir = Path("/workspaces/pyplier/tests/data/plier/")

In [208]:
allPaths = pd.read_csv(data_dir / "allPaths.csv", index_col=0, header=0)
dataWholeBlood = pd.read_csv(data_dir / "dataWholeBlood.csv", index_col=0, header=0)

In [209]:
data: pd.DataFrame = dataWholeBlood
priorMat: pd.DataFrame = allPaths
svdres = None
num_LVs: float = None
L1: float = None
L2: float = None
L3: float = None
frac: float = 0.7
max_iter: int = 350
trace: bool = False
scale: bool = True
Chat = None
maxPath: int = 10
doCrossval: bool = True
penalty_factor: np.ndarray = None
glm_alpha: float = 0.9
minGenes: int = 10
tol: float = 1e-06
seed: int = 123456
allGenes: bool = False
rseed: int = None
pathwaySelection: str = "complete"

In [210]:
if penalty_factor is None:
    penalty_factor = np.ones(priorMat.shape[1])

In [211]:
if scale:
    Y = rowNorm(data)
else:
    Y = data

In [104]:
peek(Y, 10, 10)

Unnamed: 0_level_0,BD8001,BD8002,BD8003,BD8004,BD8005,BD8006,BD8007,BD8008,BD8009,BD8010
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GAS6,-1.505242,0.188428,1.382268,-0.88604,0.218978,-1.659888,2.796534,0.82921,-1.649642,-0.699206
MMP14,-1.362254,1.426659,-0.112956,-0.547275,-0.65712,0.623815,0.162686,0.399899,-0.959655,2.260257
MARCKSL1,-1.559936,1.24754,-2.110556,-0.72144,-0.058441,0.154611,1.487958,-0.045567,1.409842,0.235813
SPARC,0.465116,-0.849214,0.790658,0.906851,0.163648,0.117905,-1.060804,-1.701798,0.920468,-1.14101
CTSD,0.024942,0.206589,-1.452483,-1.10291,0.034775,-0.016322,0.844618,0.483747,-0.650142,1.753523
EPAS1,0.974633,-0.710964,0.16812,1.471965,0.520998,0.564923,0.506347,-0.219945,-0.47501,-2.630611
PALLD,2.701797,0.695396,-0.889765,1.075408,0.449772,-1.300666,-1.216742,-0.679337,-0.92498,-1.319492
PHC2,-0.723359,-0.627006,-0.385875,0.577158,0.094306,-0.55281,-0.200565,0.694888,-1.408114,-0.358773
LGALS3BP,-1.948132,-0.570603,-1.008182,-0.684096,0.635633,-0.888391,0.853966,-0.419334,1.235992,0.416686
SERPING1,-1.678543,-1.192548,0.143102,1.446946,-0.620316,-1.512766,-0.17019,-0.939155,1.35624,1.111385


In [105]:
allGenes

False

In [212]:
if (priorMat.shape[0] != data.shape[0]) or not all(priorMat.index == data.index):
    if not allGenes:
        cm = data.index.intersection(priorMat.index)
        rprint(f"Selecting common genes: {len(cm)}")
        priorMat = priorMat.loc[cm, :]
        Y = Y.loc[cm, :]
    else:
        extra_genes = setdiff(data.index, priorMat.index)
        eMat = pd.DataFrame(
            data=np.zeros((len(extra_genes), priorMat.shape[1])),
            columns=priorMat.columns,
            index=extra_genes,
        )
        priorMat = pd.concat([priorMat, eMat], axis=0)
        priorMat = priorMat.loc[data.index, :]

Selecting common genes: 5892


In [213]:
numGenes = priorMat.sum(axis="rows")

In [107]:
numGenes

IRIS_Bcell-Memory_IgG_IgA                         63
IRIS_Bcell-Memory_IgM                             61
IRIS_Bcell-naive                                  70
IRIS_CD4Tcell-N0                                  35
IRIS_CD4Tcell-Th1-restimulated12hour              23
                                                ... 
PID_IL4_2PATHWAY                                  46
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR    119
PID_BCR_5PATHWAY                                  66
PID_TELOMERASEPATHWAY                             63
PID_PI3KPLCTRKPATHWAY                             30
Length: 606, dtype: int64

In [214]:
iibad = numGenes[numGenes < minGenes].index

In [134]:
iibad

Index(['DMAP_CMP', 'DMAP_NKA4', 'DMAP_PRE_BCELL2'], dtype='object')

In [135]:
priorMat.loc[:, iibad].sum(axis="rows")

DMAP_CMP           6
DMAP_NKA4          6
DMAP_PRE_BCELL2    6
dtype: int64

In [215]:
heldOutGenes = dict()
iibad = numGenes[numGenes < minGenes].index
priorMat.loc[:, iibad] = 0
rprint(f"Removing {len(iibad)} pathways with too few genes")

Removing 3 pathways with too few genes


In [216]:
priorMatCV = priorMat.copy(deep=True)

In [217]:
iiposs_1 = [
    "SPDYE1",
    "HEY1",
    "FCRL1",
    "AFF3",
    "COCH",
    "PARM1",
    "MS4A1",
    "KIAA0125",
    "E2F5",
    "SLC15A2",
    "COBLL1",
    "DENND5B",
]

In [218]:
iiposs_2 = [
    "HLA-DOA",
    "BCL11A",
    "AFF3",
    "TCF4",
    "COBLL1",
    "FAM129C",
    "QRSL1",
    "PAX5",
    "PKIG",
    "BLNK",
    "SLC4A4",
    "TPD52",
]

In [219]:
seed

123456

In [220]:
random.seed(123456)

In [221]:
j = 0

In [222]:
current_col = priorMatCV.iloc[:, j]

In [223]:
current_col

gene
GAS6        0
MMP14       0
MARCKSL1    0
SPARC       0
CTSD        0
           ..
CFL2        0
CFL1        0
SELL        0
GNGT2       0
SERPINH1    0
Name: IRIS_Bcell-Memory_IgG_IgA, Length: 5892, dtype: int64

In [224]:
current_col.sum()

63

In [225]:
current_col.where(lambda x: x > 0).dropna().index

Index(['TPD52', 'PKIG', 'COBLL1', 'PAWR', 'ALOX5', 'CD79A', 'COCH', 'POU2AF1',
       'CD79B', 'SLC15A2', 'CR2', 'PNOC', 'ADAM28', 'BLK', 'CD19', 'TFEC',
       'BLNK', 'PDLIM1', 'GGA2', 'MEF2C', 'LILRA4', 'SLC4A4', 'RHOB', 'TCF4',
       'IGJ', 'ZNF528', 'CD72', 'HHEX', 'QRSL1', 'OSBPL10', 'GRAMD1C',
       'BCL11A', 'EAF2', 'BANK1', 'VPREB3', 'KIAA0125', 'FCRL2', 'E2F5',
       'PAX5', 'COL4A3', 'CD22', 'HEY1', 'TLR10', 'FCRL5', 'PARM1', 'PLEKHG1',
       'HLA-DOA', 'CPNE5', 'AFF3', 'RALGPS2', 'RAB30', 'BTNL9', 'DENND5B',
       'MS4A1', 'MARCH1', 'EML6', 'FAM129C', 'EBF1', 'PRICKLE1', 'SPDYE1',
       'FCRLA', 'FCRL1', 'CR1'],
      dtype='object', name='gene')

In [179]:
len(current_col.where(lambda x: x > 0).dropna().index)

61

In [180]:
iipos = current_col.where(lambda x: x > 0).dropna().index

In [181]:
random.sample(list(iipos), k=floor(len(iipos) / 5))

['DENND5B',
 'IGJ',
 'COBLL1',
 'FAM129C',
 'ADAM28',
 'BTNL9',
 'PAWR',
 'OSBPL10',
 'PCDH9',
 'EML6',
 'BCL11A',
 'MARCH1']

In [182]:
iiposs = random.sample(list(iipos), k=floor(len(iipos) / 5))

In [183]:
iiposs

['PNOC',
 'VPREB3',
 'FCRL2',
 'MS4A1',
 'COBLL1',
 'POU2AF1',
 'DENND5B',
 'ALOX5',
 'EBF1',
 'PCDH9',
 'FCRL5',
 'CD79A']

In [227]:
priorMatCV.loc[iiposs_2, priorMatCV.columns[1]] = 0

In [228]:
priorMatCV.iloc[:, 1].sum()

49

In [231]:
crossprod(priorMatCV.iloc[:, 0:2])

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM
IRIS_Bcell-Memory_IgG_IgA,51,40
IRIS_Bcell-Memory_IgM,40,49


In [229]:
a = priorMatCV.iloc[:, 0:2].sum(axis="columns")
a[a > 1]

gene
PAWR        2
ALOX5       2
CD79A       2
POU2AF1     2
CD79B       2
CR2         2
PNOC        2
ADAM28      2
BLK         2
CD19        2
TFEC        2
PDLIM1      2
GGA2        2
MEF2C       2
LILRA4      2
RHOB        2
IGJ         2
ZNF528      2
CD72        2
HHEX        2
OSBPL10     2
EAF2        2
BANK1       2
VPREB3      2
FCRL2       2
COL4A3      2
CD22        2
TLR10       2
FCRL5       2
PLEKHG1     2
CPNE5       2
RALGPS2     2
RAB30       2
BTNL9       2
MARCH1      2
EML6        2
EBF1        2
PRICKLE1    2
FCRLA       2
CR1         2
dtype: int64

In [188]:
priorMatCV.loc[iiposs, priorMatCV.columns[1]]

gene
PNOC       0
VPREB3     0
FCRL2      0
MS4A1      0
COBLL1     0
POU2AF1    0
DENND5B    0
ALOX5      0
EBF1       0
PCDH9      0
FCRL5      0
CD79A      0
Name: IRIS_Bcell-Memory_IgM, dtype: int64

In [92]:
if doCrossval:
    priorMatCV = priorMat.copy(deep=True)
    if seed is not None:
        random.seed(seed)
    for j in tqdm(priorMatCV.columns):
        iipos = priorMatCV.loc[:, j].where(lambda x: x > 0).dropna().index
        iiposs = random.sample(list(iipos), k=ceil(len(iipos) / 5))
        priorMatCV.loc[iiposs, j] = 0
        heldOutGenes[j] = list(iiposs)
    C = priorMatCV.copy(deep=True)
else:
    C = priorMat.copy(deep=True)

100%|##########| 606/606 [00:01<00:00, 440.55it/s]


In [55]:
C.peek()

Unnamed: 0_level_0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GAS6,0,0,0,0,0
MMP14,0,0,0,0,0
MARCKSL1,0,0,0,0,0
SPARC,0,0,0,0,0
CTSD,0,0,0,0,0


In [93]:
C.sum(axis=0)

IRIS_Bcell-Memory_IgG_IgA                        50
IRIS_Bcell-Memory_IgM                            48
IRIS_Bcell-naive                                 56
IRIS_CD4Tcell-N0                                 28
IRIS_CD4Tcell-Th1-restimulated12hour             18
                                                 ..
PID_IL4_2PATHWAY                                 36
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR    95
PID_BCR_5PATHWAY                                 52
PID_TELOMERASEPATHWAY                            50
PID_PI3KPLCTRKPATHWAY                            24
Length: 606, dtype: int64

In [57]:
iiposs

['BAD', 'PLCG1', 'YWHAB', 'CCND1', 'PIK3R1', 'YWHAZ']

In [89]:
ns = data.shape[1]
Bdiff = -1
BdiffTrace = np.ndarray((0,), dtype=np.float64)
BdiffCount = 0

In [31]:
%store C

Stored 'C' (DataFrame)


In [94]:
if Chat is None:
    Cp = crossprod(C)
    Chat = pinv_ridge(Cp, 5) @ C.transpose()

### crossprod(C.iloc[:,0:2])

In [30]:
%store Cp

Stored 'Cp' (DataFrame)


In [29]:
pinv_ridge(Cp, 5).peek()

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour
IRIS_Bcell-Memory_IgG_IgA,0.0005087729,8.131516e-19,-1.897354e-19,3.366109e-18,7.335305e-18
IRIS_Bcell-Memory_IgM,-5.692060999999999e-19,0.0007100352,1.7415e-18,-5.044928e-18,-9.896098e-18
IRIS_Bcell-naive,-1.185846e-19,-1.142647e-18,0.001250532,2.062525e-18,6.213834e-18
IRIS_CD4Tcell-N0,-1.046086e-18,2.552957e-18,-1.832132e-18,0.001370106,1.3166280000000001e-17
IRIS_CD4Tcell-Th1-restimulated12hour,-2.552957e-18,4.694151e-18,-5.246522e-18,-1.2332800000000001e-17,0.001476022


In [28]:
Cp.peek()

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour
IRIS_Bcell-Memory_IgG_IgA,51,39,35,1,0
IRIS_Bcell-Memory_IgM,39,49,34,0,0
IRIS_Bcell-naive,35,34,56,1,0
IRIS_CD4Tcell-N0,1,0,1,28,1
IRIS_CD4Tcell-Th1-restimulated12hour,0,0,0,1,19


In [27]:
Chat.peek()

gene,GAS6,MMP14,MARCKSL1,SPARC,CTSD
IRIS_Bcell-Memory_IgG_IgA,2.725752e-18,-7.250602e-18,6.131672e-17,-3.218725e-19,-4.1565600000000006e-17
IRIS_Bcell-Memory_IgM,-8.036649e-18,6.439144000000001e-17,-1.044663e-16,1.0760710000000001e-17,4.593629e-17
IRIS_Bcell-naive,7.657390000000001e-18,-5.0375590000000004e-17,4.266505e-17,-8.450848e-18,9.153885e-18
IRIS_CD4Tcell-N0,1.018663e-17,-3.361111e-17,4.92772e-17,9.666552e-18,-3.8472240000000004e-18
IRIS_CD4Tcell-Th1-restimulated12hour,9.262226e-18,-1.8791270000000002e-17,1.669423e-18,1.036599e-17,-6.983787e-17


In [26]:
C.iloc[:, 1][C.iloc[:, 1] == 1]

gene
TPD52       1
COBLL1      1
PAWR        1
ALOX5       1
CD79A       1
POU2AF1     1
CR2         1
PNOC        1
BLK         1
CD19        1
BLNK        1
PDLIM1      1
GGA2        1
MEF2C       1
LILRA4      1
SLC4A4      1
RHOB        1
TCF4        1
CD72        1
HHEX        1
QRSL1       1
OSBPL10     1
BCL11A      1
EAF2        1
PCDH9       1
KIAA0125    1
FCRL2       1
E2F5        1
PAX5        1
COL4A3      1
CD22        1
FCRL5       1
PLEKHG1     1
HLA-DOA     1
CPNE5       1
AFF3        1
RALGPS2     1
RAB30       1
BTNL9       1
DENND5B     1
MS4A1       1
MARCH1      1
EML6        1
FAM129C     1
EBF1        1
PRICKLE1    1
SPDYE1      1
FCRLA       1
FCRL1       1
Name: IRIS_Bcell-Memory_IgM, dtype: int64

In [27]:
heldOutGenes["IRIS_Bcell-Memory_IgM"]

['PKIG',
 'CD79B',
 'TFEC',
 'HEY1',
 'BANK1',
 'ADAM28',
 'CR1',
 'VPREB3',
 'SLC15A2',
 'ZNF528',
 'TLR10',
 'IGJ']

In [28]:
crossprod(priorMat.iloc[:, 0:2])

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM
IRIS_Bcell-Memory_IgG_IgA,63,60
IRIS_Bcell-Memory_IgM,60,61


In [29]:
C.iloc[:, 0:2]

Unnamed: 0_level_0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
GAS6,0,0
MMP14,0,0
MARCKSL1,0,0
SPARC,0,0
CTSD,0,0
...,...,...
CFL2,0,0
CFL1,0,0
SELL,0,0
GNGT2,0,0


In [30]:
c0 = C.iloc[:, 0]
len(c0[c0 > 0])

51

In [31]:
c1 = C.iloc[:, 1]
len(c1[c1 > 0])

49

In [32]:
len(commonRows(c0[c0 > 0], c1[c1 > 0]))

39

In [33]:
crossprod(C.iloc[:, 0])

51

In [34]:
Cp.peek()

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour
IRIS_Bcell-Memory_IgG_IgA,51,39,35,1,0
IRIS_Bcell-Memory_IgM,39,49,34,0,0
IRIS_Bcell-naive,35,34,56,1,0
IRIS_CD4Tcell-N0,1,0,1,28,1
IRIS_CD4Tcell-Th1-restimulated12hour,0,0,0,1,19


In [35]:
C.to_csv("/workspaces/pyplier/inprogress_C.csv")

In [36]:
C.transpose() @ C

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour,IRIS_CD4Tcell-Th1-restimulated48hour,IRIS_CD4Tcell-Th2-restimulated12hour,IRIS_CD4Tcell-Th2-restimulated48hour,IRIS_CD8Tcell-N0,IRIS_DendriticCell-Control,...,KEGG_GNRH_SIGNALING_PATHWAY,KEGG_BASAL_TRANSCRIPTION_FACTORS,REACTOME_SYNTHESIS_OF_DNA,KEGG_HEMATOPOIETIC_CELL_LINEAGE,KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY,PID_IL4_2PATHWAY,REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,PID_BCR_5PATHWAY,PID_TELOMERASEPATHWAY,PID_PI3KPLCTRKPATHWAY
IRIS_Bcell-Memory_IgG_IgA,51,39,35,1,0,0,0,0,1,3,...,0,0,0,3,0,0,4,4,0,0
IRIS_Bcell-Memory_IgM,39,49,34,0,0,0,0,0,0,2,...,0,0,0,3,0,0,3,3,0,0
IRIS_Bcell-naive,35,34,56,1,0,0,0,0,1,3,...,0,0,0,3,0,1,4,4,0,0
IRIS_CD4Tcell-N0,1,0,1,28,1,1,1,1,13,0,...,0,0,0,0,1,0,0,0,0,0
IRIS_CD4Tcell-Th1-restimulated12hour,0,0,0,1,19,13,12,11,2,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0,0,1,0,1,0,0,0,1,3,...,1,0,0,0,3,37,1,4,1,2
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,4,3,4,0,0,0,0,0,0,0,...,8,0,29,0,13,1,96,16,2,8
PID_BCR_5PATHWAY,4,3,4,0,0,0,0,0,0,0,...,9,0,0,1,19,4,16,53,5,6
PID_TELOMERASEPATHWAY,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,4,1,2,5,51,1


In [37]:
C.iloc[:, 9][:6]

gene
GAS6        1
MMP14       0
MARCKSL1    0
SPARC       0
CTSD        0
EPAS1       1
Name: IRIS_DendriticCell-Control, dtype: int64

In [38]:
priorMat

Unnamed: 0_level_0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour,IRIS_CD4Tcell-Th1-restimulated48hour,IRIS_CD4Tcell-Th2-restimulated12hour,IRIS_CD4Tcell-Th2-restimulated48hour,IRIS_CD8Tcell-N0,IRIS_DendriticCell-Control,...,KEGG_GNRH_SIGNALING_PATHWAY,KEGG_BASAL_TRANSCRIPTION_FACTORS,REACTOME_SYNTHESIS_OF_DNA,KEGG_HEMATOPOIETIC_CELL_LINEAGE,KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY,PID_IL4_2PATHWAY,REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,PID_BCR_5PATHWAY,PID_TELOMERASEPATHWAY,PID_PI3KPLCTRKPATHWAY
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GAS6,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
MMP14,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
MARCKSL1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SPARC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CTSD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CFL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CFL1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SELL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GNGT2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# compute svd and use that as the starting point

if (svdres is not None) and (svdres["v"] != Y.shape[1]):
    rprint("SVD V has the wrong number of columns")
    svdres = None

In [40]:
if Y.isnull().to_numpy().any():
    Y.fillna(0, inplace=True)

In [41]:
ns

36

In [42]:
if svdres is None:
    svdres = dict()
    rprint("Computing SVD")
    if ns > 500:
        rprint("Using rsvd")
        svdres["u"], svdres["d"], svdres["v"] = randomized_svd(
            M=Y.values, n_components=min(ns, max(200, ns / 4)), n_iter=3
        )
    else:
        svdres["u"], svdres["d"], svdres["v"] = svd(
            Y, lapack_driver="gesdd"
        )  # the gesvd driver flips the sign for components > 6 in the v matrix as compared to R's svd function
    rprint("Done")
    svdres["v"] = svdres[
        "v"
    ].transpose()  # as compared to the output from R's svd, the v matrix is transposed.  Took me too long to figure this one out.

Computing SVD
Done


In [43]:
svdres = dict()

In [44]:
svdres["u"], svdres["d"], svdres["v"] = svd(Y, lapack_driver="gesdd")
svdres["v"] = svdres["v"].transpose()

In [45]:
svdres["d"]

array([2.42480269e+02, 1.94446624e+02, 1.38734479e+02, 9.62937294e+01,
       8.60279523e+01, 8.49543370e+01, 7.69335717e+01, 7.19435065e+01,
       6.61213541e+01, 6.35644091e+01, 6.14925578e+01, 5.65143694e+01,
       5.38383741e+01, 5.25222680e+01, 5.01026519e+01, 4.91091749e+01,
       4.87219534e+01, 4.75482839e+01, 4.62771417e+01, 4.47270154e+01,
       4.40225039e+01, 4.19869624e+01, 4.19641647e+01, 4.05841743e+01,
       3.97234713e+01, 3.90811883e+01, 3.84237880e+01, 3.80371928e+01,
       3.73715006e+01, 3.68611679e+01, 3.63211657e+01, 3.62545359e+01,
       3.45800852e+01, 3.02587925e+01, 2.09847670e-12, 6.54042087e-13])

In [46]:
pd.DataFrame(svdres["u"]).peek()

Unnamed: 0,0,1,2,3,4
0,-0.012749,-0.001176,0.003769,0.004952,0.006982
1,-0.003189,-0.007479,0.000564,0.018872,0.018141
2,-0.015081,-0.013293,-0.015643,0.003901,-0.002168
3,-0.003258,-0.000187,-0.00477,-0.012724,-0.052747
4,-0.018336,-0.007475,-0.016042,0.005463,0.012875


In [47]:
pd.DataFrame(svdres["v"]).peek()

Unnamed: 0,0,1,2,3,4
0,0.177491,-0.030616,-0.117442,-0.290516,0.091422
1,0.093049,-0.184942,-0.158544,-0.009213,0.175392
2,0.21991,0.123374,0.098651,-0.466178,-0.191182
3,0.315039,0.281003,-0.268667,0.169794,-0.235057
4,0.045651,0.053069,-0.043128,0.199062,0.041016


In [48]:
c1 = C.iloc[:, 1]
len(c1[c1 > 0])

49

In [49]:
if num_LVs is None:
    num_LVs = num_pc(svdres) * 2
    num_LVs = min(num_LVs, floor(Y.shape[1] * 0.9))
    rprint(f"The number of LVs is set to {num_LVs}")

Smoothing data
The number of LVs is set to 30


In [50]:
if L2 is None:
    L2 = svdres["d"][num_LVs]
    rprint(f"L2 is set to {L2}")

L2 is set to 36.321165678805464


In [51]:
if L1 is None:
    L1 = L2 / 2
    rprint(f"L1 is set to {L1}")

L1 is set to 18.160582839402732


In [52]:
B = (
    svdres["v"][0 : Y.shape[1], 0:num_LVs] @ np.diag(svdres["d"][0:num_LVs])
).transpose()

In [53]:
pd.DataFrame(B).peek()

Unnamed: 0,0,1,2,3,4
0,43.038151,22.562642,53.323955,76.390763,11.069577
1,-5.953241,-35.961385,23.98967,54.640167,10.319179
2,-16.293202,-21.995528,13.686335,-37.273319,-5.983304
3,-27.974824,-0.887174,-44.890009,16.3501,19.168408
4,7.864873,15.088596,-16.447021,-20.221474,3.528512


In [54]:
# following two lines are equivalent to R's diag(x)
# numpy.fill_diagonal modifies in place and does not
# return a value, thus this workaround
diag_mat = np.zeros((num_LVs, num_LVs))
np.fill_diagonal(diag_mat, val=1)

In [55]:
# for R's solve(), if b is missing, it uses the identity matrix of a
# scipy.linalg.solve does not have a default for b, so just give it one
Z = pd.DataFrame(
    data=np.dot(np.dot(Y, B.T), solve(a=np.dot(B, B.T) + L1 * diag_mat, b=diag_mat)),
    index=Y.index,
)

In [56]:
Z.peek()

Unnamed: 0_level_0,0,1,2,3,4
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GAS6,-0.012745,-0.001176,0.003765,0.004943,0.006965
MMP14,-0.003188,-0.007476,0.000563,0.018836,0.018096
MARCKSL1,-0.015076,-0.013287,-0.015628,0.003894,-0.002163
SPARC,-0.003257,-0.000186,-0.004765,-0.012699,-0.052618
CTSD,-0.01833,-0.007472,-0.016027,0.005452,0.012843


In [57]:
Z = Z.where(cond=lambda x: x > 0, other=0)

In [58]:
Z.peek()

Unnamed: 0_level_0,0,1,2,3,4
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GAS6,0.0,0.0,0.003765,0.004943,0.006965
MMP14,0.0,0.0,0.000563,0.018836,0.018096
MARCKSL1,0.0,0.0,0.0,0.003894,0.0
SPARC,0.0,0.0,0.0,0.0,0.0
CTSD,0.0,0.0,0.0,0.005452,0.012843


In [59]:
if rseed is not None:
    rprint("using random start")
    random.seed(rseed)

    rng = default_rng()
    rng.shuffle(B, axis=1)  # B = t(apply(B, 1, sample))
    rng.shuffle(Z, axis=0)  # Z = apply(Z, 2, sample)
    Z = Z.transpose()

In [60]:
pd.DataFrame(B).peek()

Unnamed: 0,0,1,2,3,4
0,43.038151,22.562642,53.323955,76.390763,11.069577
1,-5.953241,-35.961385,23.98967,54.640167,10.319179
2,-16.293202,-21.995528,13.686335,-37.273319,-5.983304
3,-27.974824,-0.887174,-44.890009,16.3501,19.168408
4,7.864873,15.088596,-16.447021,-20.221474,3.528512


In [61]:
U = np.zeros((C.shape[1], num_LVs))  # matrix(0, nrow = ncol(C), ncol = num_LVs)

In [62]:
rprint(
    f"errorY (SVD based:best possible) = {((Y - np.dot(Z, B))**2).to_numpy().mean():.4f}"
)

errorY (SVD based:best possible) = 0.4971


In [63]:
iter_full_start = 20
iter_full = 20

In [64]:
if L3 is not None:
    L3_given = True
else:
    L3_given = False

In [65]:
Z.values.shape

(5892, 30)

In [66]:
Chat.values.shape

(606, 5892)

In [67]:
Chat.shape

(606, 5892)

In [68]:
Chat @ Z

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
IRIS_Bcell-Memory_IgG_IgA,1.000405e-04,5.481798e-05,3.188418e-04,5.044950e-05,1.633849e-04,3.665334e-05,5.669352e-05,2.184170e-04,5.403841e-05,1.318956e-04,...,1.809738e-04,1.124966e-04,1.060570e-04,5.879469e-05,1.366929e-04,1.224630e-04,1.095878e-04,2.023413e-04,1.669709e-04,1.412092e-04
IRIS_Bcell-Memory_IgM,1.385244e-04,4.865657e-05,4.328783e-04,6.784649e-05,2.046401e-04,4.330335e-05,7.507667e-05,2.494727e-04,3.936053e-05,1.403287e-04,...,2.328004e-04,1.506087e-04,1.227417e-04,5.513670e-05,1.569304e-04,1.480886e-04,1.160207e-04,2.608981e-04,2.509090e-04,2.259427e-04
IRIS_Bcell-naive,2.493831e-04,1.910287e-04,7.885930e-04,2.129919e-04,2.868003e-04,1.065891e-04,1.594260e-04,3.643027e-04,8.391094e-05,2.930973e-04,...,5.247019e-04,4.011681e-04,2.044819e-04,1.691251e-04,3.771610e-04,3.024921e-04,2.177804e-04,4.237669e-04,4.144953e-04,4.163181e-04
IRIS_CD4Tcell-N0,1.629837e-04,2.392004e-05,2.831261e-04,3.044253e-04,4.357449e-05,7.522025e-04,1.801700e-04,2.389711e-04,2.060039e-04,1.757012e-04,...,2.404473e-04,1.235847e-04,1.287112e-04,3.123383e-04,2.562014e-04,1.184722e-04,2.201274e-04,2.179220e-04,3.244629e-04,1.917469e-04
IRIS_CD4Tcell-Th1-restimulated12hour,1.967148e-04,2.385727e-07,4.611196e-05,1.558193e-04,3.955815e-05,2.650390e-04,5.910047e-05,1.894302e-04,2.427090e-04,4.687973e-04,...,1.228944e-04,1.437667e-04,1.029085e-04,2.699654e-04,2.047187e-04,9.883514e-05,3.612428e-04,2.085838e-04,1.727691e-04,2.461351e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,2.038875e-02,1.905467e-02,1.654567e-02,1.266022e-02,8.914451e-03,2.054138e-02,1.095910e-02,1.415547e-02,9.367604e-03,1.933328e-02,...,1.394392e-02,2.346763e-02,1.032142e-02,1.052857e-02,1.034996e-02,4.882922e-03,1.938760e-02,1.513501e-02,1.418706e-02,6.351903e-03
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,3.841793e-02,2.281962e-02,3.632389e-02,3.316824e-02,3.112777e-02,2.277363e-02,2.693636e-02,2.641151e-02,2.319848e-02,3.691013e-02,...,1.788364e-02,2.741813e-02,2.372276e-02,3.403576e-02,3.172569e-02,2.263190e-02,2.913796e-02,2.505851e-02,2.891982e-02,2.212264e-02
PID_BCR_5PATHWAY,1.506341e-14,1.278457e-14,1.310142e-14,9.668812e-15,6.286858e-15,5.198314e-15,7.378589e-15,1.066893e-14,5.083860e-15,1.025090e-14,...,5.532951e-15,6.628203e-15,6.372931e-15,7.190074e-15,7.257729e-15,7.726720e-15,4.053071e-15,9.695886e-15,9.223234e-15,5.896388e-15
PID_TELOMERASEPATHWAY,1.281986e-15,4.913564e-16,6.001983e-16,6.729016e-16,3.606905e-16,5.341733e-16,7.008052e-16,7.890998e-16,2.897650e-16,4.683847e-16,...,3.543062e-16,5.295582e-16,5.962335e-16,6.869541e-16,8.579453e-16,4.424853e-16,5.790321e-16,5.245298e-16,1.683205e-16,6.240167e-16


In [69]:
# %store Z
# %store Chat
# %store C
# %store penalty_factor
# %store pathwaySelection
# %store glm_alpha
# %store maxPath
# %store frac

In [70]:
for i in trange(max_iter):
    if i >= iter_full_start:
        if i == iter_full and not L3_given:
            # update L3 to the target fraction
            try:
                Ulist = solveU(
                    Z=Z,
                    Chat=Chat,
                    priorMat=C,
                    penalty_factor=penalty_factor,
                    pathwaySelection=pathwaySelection,
                    glm_alpha=glm_alpha,
                    maxPath=maxPath,
                    target_frac=frac,
                )
            except AttributeError:
                raise
            U = Ulist["U"]
            L3 = Ulist["L3"]
            rprint(f"New L3 is {L3}")
            iter_full = iter_full + iter_full_start
        else:
            U = solveU(
                Z,
                Chat,
                C,
                penalty_factor,
                pathwaySelection,
                glm_alpha,
                maxPath,
                L3=L3,
            )["U"]

        Z1 = tcrossprod(Y, B)
        Z2 = L1 * (C @ U)

        Z1_nonzero = np.argwhere(np.asarray(Z1.T.stack()) > 0).flatten()
        Z2_nonzero = np.argwhere(np.asarray(Z2.T.stack()) > 0).flatten()

        ratio = np.median(
            np.divide(Z2, np.asarray(Z1))
            .T.stack()
            .values[np.intersect1d(Z2_nonzero, Z1_nonzero)]
        )

        Z = (Z1 + Z2) @ solve(a=(tcrossprod(B) + L1 * diag_mat), b=diag_mat)
    else:
        Z = tcrossprod(Y, B) @ solve(a=(tcrossprod(B) + L1 * diag_mat), b=diag_mat)

    Z[Z < 0] = 0

    oldB = B.copy()
    B = solve(a=(Z.transpose() @ Z + L2 * diag_mat), b=diag_mat) @ Z.transpose() @ Y

    Bdiff = ((B - oldB) ** 2).to_numpy().sum() / (B**2).to_numpy().sum()
    BdiffTrace = np.append(BdiffTrace, Bdiff)

    if trace & (i >= iter_full_start):
        rprint(
            f"iter {i} errorY = {np.mean((Y - Z @ B)**2):.4f} prior information ratio= {round(ratio,2)} Bdiff = {Bdiff:.4f} Bkappa= {np.linalg.cond(B):.4f};pos. col. U = {sum(U.sum(axis='index') > 0)}"
        )
    elif trace:
        rprint(
            f"iter {i} errorY = {np.mean((Y - Z @ B)**2):.4f} Bdiff = {np.linalg.cond(Bdiff):.4f} Bkappa = {np.linalg.cond(B):.4f}"
        )

    if (i > 52) and (Bdiff > BdiffTrace[i - 50]):
        BdiffCount += 1
        rprint("Bdiff is not decreasing")
    elif BdiffCount > 1:
        BdiffCount -= 1

    if Bdiff < tol:
        rprint(f"converged at iteration {i}")
        break
    if BdiffCount > 5:
        rprint(f"converged at iteration {i} Bdiff is not decreasing")
        break

  4%|4         | 15/350 [00:00<00:04, 69.63it/s]
100%|##########| 30/30 [00:00<00:00, 5124.38it/s]


New L3 is 0.0003801289578694637


 11%|#1        | 40/350 [00:21<04:43,  1.09it/s]
100%|##########| 30/30 [00:00<00:00, 5752.71it/s]


New L3 is 0.00043074254057568753


 17%|#7        | 60/350 [00:43<04:43,  1.02it/s]
100%|##########| 30/30 [00:00<00:00, 7373.09it/s]


New L3 is 0.00033546262790251185


 23%|##2       | 80/350 [01:06<04:59,  1.11s/it]
100%|##########| 30/30 [00:00<00:00, 7049.65it/s]


New L3 is 0.0003801289578694637


 29%|##8       | 100/350 [01:27<03:17,  1.27it/s]
100%|##########| 30/30 [00:00<00:00, 8510.02it/s]


New L3 is 0.0003801289578694637


 34%|###4      | 120/350 [01:46<03:32,  1.08it/s]
100%|##########| 30/30 [00:00<00:00, 6461.06it/s]


New L3 is 0.000488095243523415


 40%|####      | 140/350 [02:08<03:02,  1.15it/s]
100%|##########| 30/30 [00:00<00:00, 3424.95it/s]


New L3 is 0.00043074254057568753


 41%|####1     | 144/350 [02:15<04:42,  1.37s/it]

Bdiff is not decreasing


 41%|####1     | 145/350 [02:16<04:16,  1.25s/it]

Bdiff is not decreasing


 42%|####1     | 146/350 [02:17<04:00,  1.18s/it]

Bdiff is not decreasing


 42%|####2     | 147/350 [02:18<03:54,  1.15s/it]

Bdiff is not decreasing


 42%|####2     | 148/350 [02:20<03:58,  1.18s/it]

Bdiff is not decreasing


 43%|####2     | 149/350 [02:21<03:50,  1.15s/it]

Bdiff is not decreasing
converged at iteration 149 Bdiff is not decreasing




 43%|####2     | 149/350 [02:22<03:11,  1.05it/s]


In [71]:
U.index = priorMat.columns
U.columns = [f"LV{_+1}" for _ in range(num_LVs)]
Z.columns = [f"LV{_+1}" for _ in range(num_LVs)]

In [72]:
B.index = [f"LV{_+1}" for _ in range(num_LVs)]

In [73]:
out = PLIERResults(
    residual=(Y - (Z.values @ B.values)),
    B=B,
    Z=Z,
    U=U,
    C=C,
    L1=L1,
    L2=L2,
    L3=L3,
    heldOutGenes=heldOutGenes,
)

In [74]:
%store out
%store priorMat
%store priorMatCV

Stored 'out' (PLIERResults)
Stored 'priorMat' (DataFrame)
Stored 'priorMatCV' (DataFrame)


In [75]:
out

B : 30 rows x 36 columns
Z : 5892 rows x 30 columns
U : 606 rows x 30 columns
C : 5892 rows x 606 columns
heldOutGenes: 606
withPrior: 0
Uauc: 0 rows x 0 columns
Up: 0 rows x 0 columns
summary: 0 rows x 0 columns
residual: 5892 rows x 36 columns
L1 is set to 18.1606
L2 is set to 36.3212
L3 is set to 0.0004

In [105]:
from typing import Dict
from collections.abc import Iterable

import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
from tqdm.auto import tqdm

from pyplier.AUC import AUC
from pyplier.copyMat import copyMat
from pyplier.PLIERRes import PLIERResults


def crossVal(
    plierRes: PLIERResults, priorMat: pd.DataFrame, priorMatcv: pd.DataFrame
) -> Dict[str, pd.DataFrame]:
    """
    title crossVal

    param priorMat the real prior info matrix
    param priorMatcv the zeroed-out prior info matrix used for PLIER
    computations

    """
    out = pd.DataFrame(
        data=np.empty(shape=(0, 4)), columns=["pathway", "LV index", "AUC", "p-value"]
    )
    out_dict = dict()
    ii = plierRes.U.loc[:, plierRes.U.sum(axis=0) > 0].columns
    Uauc = pd.DataFrame(
        np.zeros(shape=plierRes.U.shape),
        index=plierRes.U.index,
        columns=plierRes.U.columns,
    )
    Up = pd.DataFrame(
        np.ones(shape=plierRes.U.shape),
        index=plierRes.U.index,
        columns=plierRes.U.columns,
    )

    for i in tqdm(ii):
        iipath = plierRes.U.loc[(plierRes.U.loc[:, i] > 0), i].index
        if len(iipath) > 1:
            for j in tqdm(iipath):
                a = (
                    priorMat.loc[:, iipath]
                    .sum(axis=1)
                    .where(lambda x: x == 0)
                    .dropna()
                    .index
                )
                b = priorMat.loc[:, j].where(lambda x: x > 0).dropna().index
                c = priorMatcv.loc[:, j].where(lambda x: x == 0).dropna().index
                iiheldout = a.union(b.intersection(c))
                aucres = AUC(priorMat.loc[iiheldout, j], plierRes.Z.loc[iiheldout, i])
                out_dict[j] = {
                    "pathway": j,
                    "LV index": i,
                    "AUC": aucres["auc"],
                    "p-value": aucres["pval"],
                }
                Uauc.loc[j, i] = aucres["auc"]
                Up.loc[j, i] = aucres["pval"]

        else:
            j = iipath[0]
            a = priorMat.loc[:, iipath].where(lambda x: x == 0).dropna().index
            b = priorMat.loc[:, j].where(lambda x: x > 0).dropna().index
            c = priorMatcv.loc[:, j].where(lambda x: x == 0).dropna().index
            iiheldout = a.union(b.intersection(c))

            aucres = AUC(priorMat.loc[iiheldout, j], plierRes.Z.loc[iiheldout, i])
            if isinstance(j, Iterable) and not isinstance(j, str):
                for _ in j:
                    out_dict[_] = {
                        "pathway": _,
                        "LV index": i,
                        "AUC": aucres["auc"],
                        "p-value": aucres["pval"],
                    }
                    Uauc.loc[_, i] = aucres["auc"]
                    Up.loc[_, i] = aucres["pval"]
            elif isinstance(j, str):
                out_dict[j] = {
                    "pathway": j,
                    "LV index": i,
                    "AUC": aucres["auc"],
                    "p-value": aucres["pval"],
                }
                Uauc.loc[j, i] = aucres["auc"]
                Up.loc[j, i] = aucres["pval"]

    out = pd.DataFrame.from_dict(out_dict, orient="index")
    _, fdr, *_ = multipletests(out.loc[:, "p-value"], method="fdr_bh")
    out.loc[:, "FDR"] = fdr
    return {"Uauc": Uauc, "Upval": Up, "summary": out}

In [106]:
if doCrossval:
    outAUC = crossVal(plierRes=out, priorMat=priorMat, priorMatcv=priorMatCV)
else:
    rprint("Not using cross-validation. AUCs and p-values may be over-optimistic")
    outAUC = getAUC(out, Y, priorMat)

  0%|          | 0/19 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A
100%|##########| 3/3 [00:00<00:00, 16.79it/s][A
  5%|5         | 1/19 [00:00<00:03,  5.43it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
100%|##########| 2/2 [00:00<00:00, 10.72it/s][A
 11%|#         | 2/19 [00:00<00:03,  5.33it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
100%|##########| 2/2 [00:00<00:00, 14.42it/s][A
 16%|#5        | 3/19 [00:00<00:02,  6.01it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|##        | 1/5 [00:00<00:00,  7.09it/s][A
 60%|######    | 3/5 [00:00<00:00, 12.00it/s][A
100%|##########| 5/5 [00:00<00:00, 13.46it/s][A
 21%|##1       | 4/19 [00:00<00:03,  4.03it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 40%|####      | 2/5 [00:00<00:00, 18.61it/s][A
100%|##########| 5/5 [00:00<00:00, 18.45it/s][A
 32%|###1      | 6/19 [00:01<00:02,  4.73it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A
 29%|##8       | 2/7 [00:00<00:00, 14.58it/s][A
 57%|#####7    | 4/7 [00:00<00:00, 16.2

In [82]:
outAUC.keys()

dict_keys(['Uauc', 'Upval', 'summary'])

In [107]:
(outAUC["Upval"] == 1).all()

LV1     False
LV2     False
LV3     False
LV4     False
LV5      True
LV6     False
LV7     False
LV8     False
LV9     False
LV10    False
LV11     True
LV12     True
LV13     True
LV14    False
LV15    False
LV16    False
LV17     True
LV18     True
LV19     True
LV20    False
LV21     True
LV22     True
LV23    False
LV24    False
LV25     True
LV26     True
LV27    False
LV28    False
LV29    False
LV30    False
dtype: bool

In [108]:
outAUC["Upval"]["LV1"][outAUC["Upval"]["LV1"] != 1]

REACTOME_GENERIC_TRANSCRIPTION_PATHWAY                 0.135079
REACTOME_IMMUNE_SYSTEM                                 0.141883
REACTOME_TRANSMEMBRANE_TRANSPORT_OF_SMALL_MOLECULES    0.107460
Name: LV1, dtype: float64

In [92]:
from typing import List

import numpy as np

from pyplier.console import console
from pyplier.PLIERRes import PLIERResults


def nameB(
    plierRes: PLIERResults, top: int = 1, fdr_cutoff: float = 0.01, use: str = None
) -> List[str]:
    """
    Rename latent variables to match the pathways that appear to correlate
    Number of pathways used in the name is controlled by `top`
    """
    if use is None:
        use = "coef"
    elif use not in ("coef", "AUC"):
        raise ValueError(
            "only 'coef' and 'AUC' are the only valid options for the 'use' argument"
        )

    names = list()

    if use == "coef":
        Uuse = plierRes.U.copy(deep=True)
    else:
        Uuse = plierRes.Uauc.copy(deep=True)

    if plierRes.Up is not None:
        pval_cutoff = max(
            plierRes.summary.loc[plierRes.summary["FDR"] < fdr_cutoff, "p-value"]
        )
        Uuse[plierRes.Up > pval_cutoff] = 0
    else:
        console("[red]No p-values in PLIER object: using coefficients only[/]")

    mm = Uuse.apply(func=np.max, axis=0)

    for i in range(plierRes.U.shape[1]):
        if mm[i] > 0:
            names.append(
                f"{i+1},"
                + ",".join(
                    Uuse.iloc[:, i]
                    .sort_values(ascending=False)
                    .where(lambda x: x > 0)
                    .index[0:top]
                )
            )
            # this should give us something like "LV1,REACTOME_GENERIC_TRANSCRIPTION_PATHWAY"
            # this also will only return pathways with some correlation - if there is 0, it will get dropped and the
            # [0:top] is ignored, grabbing just as much as it can
        elif max(plierRes.U.iloc[:, i]) > 0:
            names.append(
                f"{i+1},"
                + ",".join(
                    plierRes.U.iloc[:, i]
                    .sort_values(ascending=False)
                    .where(lambda x: x > 0)
                    .index[0:top]
                )
            )
        else:
            names.append(f"LV {i+1}")

    return names

In [109]:
out.withPrior = U.sum(axis="index")[U.sum(axis="index") > 0].to_dict()
out.Uauc = outAUC["Uauc"]
out.Up = outAUC["Upval"]
out.summary = outAUC["summary"]
tt = out.Uauc.max(axis="index")
rprint(f"There are {sum(tt > 0.7)} LVs with AUC > 0.70")

# return out

There are 5 LVs with AUC > 0.70


In [110]:
outAUC["Uauc"]

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV21,LV22,LV23,LV24,LV25,LV26,LV27,LV28,LV29,LV30
IRIS_Bcell-Memory_IgG_IgA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_Bcell-Memory_IgM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_Bcell-naive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_CD4Tcell-N0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_CD4Tcell-Th1-restimulated12hour,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PID_BCR_5PATHWAY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PID_TELOMERASEPATHWAY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
outAUC["Uauc"]["LV2"][outAUC["Uauc"]["LV2"] > 0.0]

REACTOME_HEMOSTASIS            0.513373
KEGG_MAPK_SIGNALING_PATHWAY    0.539267
Name: LV2, dtype: float64

In [99]:
out.Uauc.max(axis="index")

LV1     0.535370
LV2     0.536111
LV3     0.527583
LV4     0.675721
LV5     0.000000
LV6     0.573108
LV7     0.602947
LV8     0.789125
LV9     0.609793
LV10    0.614747
LV11    0.000000
LV12    0.000000
LV13    0.000000
LV14    0.516963
LV15    0.520314
LV16    0.494157
LV17    0.000000
LV18    0.000000
LV19    0.000000
LV20    0.719989
LV21    0.000000
LV22    0.000000
LV23    0.918386
LV24    0.620513
LV25    0.000000
LV26    0.000000
LV27    0.970264
LV28    0.548726
LV29    0.636876
LV30    0.998991
dtype: float64

In [112]:
out.Uauc.max(axis="index")

LV1     0.546680
LV2     0.539267
LV3     0.529351
LV4     0.685181
LV5     0.000000
LV6     0.573108
LV7     0.611415
LV8     0.800222
LV9     0.609793
LV10    0.620626
LV11    0.000000
LV12    0.000000
LV13    0.000000
LV14    0.516963
LV15    0.520314
LV16    0.494157
LV17    0.000000
LV18    0.000000
LV19    0.000000
LV20    0.724360
LV21    0.000000
LV22    0.000000
LV23    0.935673
LV24    0.625654
LV25    0.000000
LV26    0.000000
LV27    0.974347
LV28    0.548726
LV29    0.641004
LV30    0.999470
dtype: float64

In [97]:
U["LV1"][U["LV1"] != 0.0]

REACTOME_GENERIC_TRANSCRIPTION_PATHWAY                 0.032281
REACTOME_IMMUNE_SYSTEM                                 0.003309
REACTOME_TRANSMEMBRANE_TRANSPORT_OF_SMALL_MOLECULES    0.001338
Name: LV1, dtype: float64

In [52]:
out.B.index = nameB(out)

ValueError: max() arg is an empty sequence

In [54]:
out.Up

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV21,LV22,LV23,LV24,LV25,LV26,LV27,LV28,LV29,LV30
IRIS_Bcell-Memory_IgG_IgA,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
IRIS_Bcell-Memory_IgM,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
IRIS_Bcell-naive,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
IRIS_CD4Tcell-N0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
IRIS_CD4Tcell-Th1-restimulated12hour,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B,,,,,,,,,,,...,,,,,,,,,,
F,,,,,,,,,,,...,,,,,,,,,,
D,,,,,,,,,,,...,,,,,,,,,,
U,,,,,,,,,,,...,,,,,,,,,,


In [53]:
out.summary

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
REACTOME_IMMUNE_SYSTEM,REACTOME_IMMUNE_SYSTEM,LV27,0.5,,
KEGG_PATHWAYS_IN_CANCER,KEGG_PATHWAYS_IN_CANCER,LV29,0.5,,
REACTOME_SIGNALLING_BY_NGF,REACTOME_SIGNALLING_BY_NGF,LV29,0.5,,
REACTOME_TRANSMEMBRANE_TRANSPORT_OF_SMALL_MOLECULES,REACTOME_TRANSMEMBRANE_TRANSPORT_OF_SMALL_MOLE...,LV1,0.5,,
REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM,REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM,LV27,0.5,,
...,...,...,...,...,...
KEGG_RIBOSOME,KEGG_RIBOSOME,LV30,0.5,,
MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,LV30,0.5,,
MIPS_RIBOSOME_CYTOPLASMIC,MIPS_RIBOSOME_CYTOPLASMIC,LV30,0.5,,
REACTOME_INFLUENZA_LIFE_CYCLE,REACTOME_INFLUENZA_LIFE_CYCLE,LV30,0.5,,
