In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.linalg import solve

In [31]:
from statsmodels.stats.multitest import multipletests

In [2]:
dataPath = Path("/home/milo/workspace/pyplier/tests/data/common")
data = pd.read_csv(filepath_or_buffer = dataPath / "data.csv", index_col=0)

In [3]:
plierRes_b = pd.read_csv(filepath_or_buffer = dataPath / "plierRes_b.csv", index_col=0)
plierRes_c = pd.read_csv(filepath_or_buffer = dataPath / "plierRes_c.csv", index_col=0)
plierRes_residual = pd.read_csv(filepath_or_buffer = dataPath / "plierRes_residual.csv", index_col=0)
plierRes_u = pd.read_csv(filepath_or_buffer = dataPath / "plierRes_u.csv", index_col=0)
plierRes_z = pd.read_csv(filepath_or_buffer = dataPath / "plierRes_z.csv", index_col=0)

priorMat = pd.read_csv(filepath_or_buffer=dataPath / "priorMat.csv", index_col=0)

In [5]:
target_Uauc = pd.read_csv("~/workspace/pyplier/tests/data/getAUC/aucresults_Uauc.csv", index_col=0)
target_Upval = pd.read_csv("~/workspace/pyplier/tests/data/getAUC/aucresults_Upval.csv", index_col=0)
target_summary = pd.read_csv("~/workspace/pyplier/tests/data/getAUC/aucresults_summary.csv", index_col=0)

In [6]:
B = plierRes_b
Z = plierRes_z
U = plierRes_u
Zcv = Z.copy(deep=True)
k = Z.shape[1]
L1 = 18.5633
L2 = 37.12661

In [7]:
from typing import Optional, Union

import numpy as np
import pandas as pd
# from numba import jit


# @jit(nopython=True)
def crossprod(
    mat1: Union[np.array, pd.DataFrame],
    mat2: Optional[Union[np.array, pd.DataFrame]] = None,
) -> Union[np.array, pd.DataFrame]:

    if mat2 is None:
        return mat1.transpose() @ mat1
    else:
        return mat1.tranpose() @ mat2


# @jit(nopython=True)
def tcrossprod(
    mat1: Union[np.array, pd.DataFrame],
    mat2: Optional[Union[np.array, pd.DataFrame]] = None,
) -> Union[np.array, pd.DataFrame]:

    if mat2 is None:
        return mat1 @ mat1.transpose()
    else:
        return mat1 @ mat2.tranpose()


In [8]:
def copyMat(df: pd.DataFrame, zero: bool = False) -> pd.DataFrame:
    if zero:
        dfnew = pd.DataFrame(
            np.zeros(shape=df.shape), index=df.index, columns=df.columns
        )
    else:
        dfnew = df.copy(deep=True)

    return dfnew

In [9]:
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, norm


def AUC(labels: pd.Series, values: pd.Series) -> Dict[str, float]:
    posii = labels[labels > 0]
    negii = labels[labels <= 0]
    posn = len(posii)
    negn = len(negii)
    posval = values[posii.index]
    negval = values[negii.index]
    if posn > 0 and negn > 0:
        statistic, pvalue = mannwhitneyu(posval, negval, alternative="greater")
        conf_int_low, conf_int_high = mannwhitneyu_conf_int(posval, negval)
        res = {
            "low": conf_int_low,
            "high": conf_int_high,
            "auc": (statistic / (posn * negn)),
            "pval": pvalue,
        }
    else:
        res = {"auc": 0.5, "pval": np.nan}

    return res


def mannwhitneyu_conf_int(
    x: np.array, y: np.array, alpha: float = 0.05
) -> Tuple[float, float]:
    """
    see: https://www.ncbi.nlm.nih.gov/labs/pmc/articles/PMC2545906/pdf/bmj00286-0037.pdf
    """
    n = len(x)
    m = len(y)

    N = norm.ppf(1 - alpha / 2)

    diffs = sorted([i - j for i in x for j in y])

    # For an approximate 100(1-a)% confidence interval first calculate K:
    nm = n * m
    top = nm * (n + m + 1)
    right = N * np.sqrt(top / 12)
    left = (n * m) / 2
    K = left - right

    # The Kth smallest to the Kth largest of the n x m differences
    # lx and ly should be > ~20
    return (diffs[round(K)], diffs[len(diffs) - round(K)])


In [10]:
i = 1

In [11]:
((data.shape[0]/5)-1)*5+i

5960.0

In [12]:
from math import floor

In [13]:
floor(data.shape[0]/5) - 1

1191

In [14]:
ii = [(_ * 5 + i)-1 for _ in range(floor(data.shape[0]/5)) if (_ * 5 + i) <= Z.shape[0]]

In [15]:
len(ii)

1192

In [16]:
ii[-1]

5955

In [17]:
Z.loc[~Z.index.isin(Z.iloc[ii,:].index)].shape

(4772, 30)

In [18]:
Z.iloc[ii,:].index

Index(['GAS6', 'EPAS1', 'TGM2', 'DAB2', 'COL6A3', 'DUSP3', 'CD14', 'DYNLT1',
       'MX1', 'PDGFRB',
       ...
       'UPK3A', 'BEND5', 'FAIM3', 'FBXL8', 'CXorf57', 'TRPM6', 'GPR19',
       'FLVCR2', 'TMEM156', 'RRP12'],
      dtype='object', name='index', length=1192)

In [19]:
Z_not_ii = Z.loc[~Z.index.isin(Z.iloc[ii,:].index)]

In [20]:
Z_not_ii.shape

(4772, 30)

In [21]:
data_not_ii = data.loc[~data.index.isin(data.iloc[ii,:].index)]

In [22]:
solve_a = crossprod(Z_not_ii) + L2 * np.identity(k)

In [23]:
Bcv = solve(solve_a, np.identity(solve_a.shape[0])) @ Z_not_ii.transpose() @ data_not_ii

In [24]:
solve_Bcv = tcrossprod(Bcv) + L1 * np.identity(k)

In [25]:
Zcv.iloc[ii,:] = data.iloc[ii,:] @ Bcv.transpose() @ solve(solve_Bcv, np.identity(solve_Bcv.shape[0]))

In [26]:
for i in range(5):
    ii = [
        (_ * 5 + i) - 1
        for _ in range(floor(data.shape[0] / 5))
        if (_ * 5 + i) <= Z.shape[0]
    ]
    Z_not_ii = Z.loc[~Z.index.isin(Z.iloc[ii, :].index)]
    data_not_ii = data.loc[~data.index.isin(data.iloc[ii, :].index)]
    solve_a = crossprod(Z_not_ii) + L2 * np.identity(k)
    Bcv = (
        solve(solve_a, np.identity(solve_a.shape[0]))
        @ Z_not_ii.transpose()
        @ data_not_ii
    )
    solve_Bcv = tcrossprod(Bcv) + L1 * np.identity(k)
    Zcv.iloc[ii, :] = (
        data.iloc[ii, :]
        @ Bcv.transpose()
        @ solve(solve_Bcv, np.identity(solve_Bcv.shape[0]))
    )

In [51]:
out = pd.DataFrame(
    data=np.empty(shape=(0, 4)), columns=["pathway", "LV index", "AUC", "p-value"]
)
ii = U.loc[:, np.sum(a=U, axis=0) > 0].columns
Uauc = copyMat(df=U, zero=True)
Up = copyMat(df=U, zero=True)

In [28]:
ii

Index(['LV1', 'LV2', 'LV4', 'LV5', 'LV6', 'LV7', 'LV8', 'LV9', 'LV10', 'LV11',
       'LV13', 'LV14', 'LV15', 'LV20', 'LV21', 'LV23', 'LV26', 'LV27', 'LV28',
       'LV29'],
      dtype='object')

In [52]:
for i in ii:
    iipath = U.loc[(U.loc[:, i] > 0), i].index
    for j in iipath:
        aucres = AUC(priorMat.loc[:, j], Zcv.loc[:, i])

        out = out.append(
            other=pd.DataFrame(
                {
                    "pathway": [j],
                    "LV index": [i],
                    "AUC": [aucres["auc"]],
                    "p-value": [aucres["pval"]],
                }
            )
        )

        Uauc.loc[j, i] = aucres["auc"]
        Up.loc[j, i] = aucres["pval"]

In [30]:
priorMat

Unnamed: 0_level_0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour,IRIS_CD4Tcell-Th1-restimulated48hour,IRIS_CD4Tcell-Th2-restimulated12hour,IRIS_CD4Tcell-Th2-restimulated48hour,IRIS_CD8Tcell-N0,IRIS_DendriticCell-Control,...,KEGG_GNRH_SIGNALING_PATHWAY,KEGG_BASAL_TRANSCRIPTION_FACTORS,REACTOME_SYNTHESIS_OF_DNA,KEGG_HEMATOPOIETIC_CELL_LINEAGE,KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY,PID_IL4_2PATHWAY,REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,PID_BCR_5PATHWAY,PID_TELOMERASEPATHWAY,PID_PI3KPLCTRKPATHWAY
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GAS6,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
MMP14,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
MARCKSL1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SPARC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CTSD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRPM4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LAIR2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZNF135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MARCH3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
out

Unnamed: 0,pathway,LV index,AUC,p-value
0,REACTOME_GENERIC_TRANSCRIPTION_PATHWAY,LV1,0.642935,5.979941e-16
0,IRIS_Monocyte-Day0,LV2,0.771498,6.281456e-36
0,IRIS_Neutrophil-Resting,LV2,0.917797,6.403945e-126
0,SVM Neutrophils,LV2,0.942809,4.964885e-31
0,REACTOME_SIGNALLING_BY_NGF,LV4,0.593512,6.194152e-06
...,...,...,...,...
0,KEGG_PARKINSONS_DISEASE,LV28,0.835713,4.672248e-29
0,MIPS_SPLICEOSOME,LV28,0.606528,1.125169e-05
0,MIPS_55S_RIBOSOME_MITOCHONDRIAL,LV28,0.881966,3.253850e-29
0,KEGG_SPLICEOSOME,LV28,0.660846,7.638110e-10


In [54]:
_, fdr, *_ = multipletests(out.loc[:,"p-value"], method="fdr_bh")

In [55]:
fdr

array([1.23223027e-015, 4.74598906e-035, 4.35468244e-124, 2.59701681e-030,
       7.26210923e-006, 2.45637916e-008, 5.17463180e-039, 1.58963671e-016,
       3.81293538e-010, 3.44386577e-009, 1.37360744e-024, 3.31669933e-024,
       4.12091815e-007, 8.55780598e-007, 1.14634430e-004, 2.75187491e-008,
       5.00271435e-016, 2.49137134e-026, 3.06291279e-005, 1.59041880e-005,
       1.13808191e-005, 5.87082558e-008, 9.56713743e-009, 9.73158666e-011,
       7.13906531e-014, 7.13906531e-014, 3.06291279e-005, 1.98661703e-023,
       3.04182720e-017, 3.68181678e-013, 1.30310369e-003, 6.10949607e-021,
       4.30906802e-099, 7.63627963e-097, 3.53370870e-099, 1.91875078e-013,
       1.67858225e-017, 4.01425630e-002, 3.61599687e-026, 1.28892241e-026,
       1.09047732e-030, 1.58963671e-016, 1.22477443e-015, 5.52130610e-017,
       6.40817291e-004, 2.19546372e-019, 5.56606028e-010, 2.28890661e-012,
       5.70223154e-018, 6.49946109e-012, 9.40009580e-015, 3.02027724e-047,
       4.13162794e-039, 4

In [56]:
out.loc[:, "FDR"] = fdr

In [57]:
out.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
0,REACTOME_GENERIC_TRANSCRIPTION_PATHWAY,LV1,0.642935,5.979941e-16,1.23223e-15
0,IRIS_Monocyte-Day0,LV2,0.771498,6.281456e-36,4.745989e-35
0,IRIS_Neutrophil-Resting,LV2,0.917797,6.403945e-126,4.354682e-124
0,SVM Neutrophils,LV2,0.942809,4.964885e-31,2.5970169999999998e-30
0,REACTOME_SIGNALLING_BY_NGF,LV4,0.593512,6.194152e-06,7.262109e-06


In [42]:
target_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,REACTOME_GENERIC_TRANSCRIPTION_PATHWAY,1,0.642498,5.945028e-16,1.225036e-15
2,IRIS_Monocyte-Day0,2,0.770966,6.286414e-36,4.749735e-35
3,IRIS_Neutrophil-Resting,2,0.91713,6.657733e-126,4.527259e-124
4,SVM Neutrophils,2,0.942135,5.021067e-31,2.6264039999999998e-30
5,REACTOME_SIGNALLING_BY_NGF,4,0.5932,6.065664e-06,7.111468e-06


In [45]:
out.to_csv("/home/milo/workspace/pyplier/tests/data/getAUC/aucresults_summary.csv")

In [47]:
Uauc.to_csv("/home/milo/workspace/pyplier/tests/data/getAUC/aucresults_uauc.csv")

In [58]:
Uauc

Unnamed: 0_level_0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV21,LV22,LV23,LV24,LV25,LV26,LV27,LV28,LV29,LV30
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IRIS_Bcell-Memory_IgG_IgA,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_Bcell-Memory_IgM,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_Bcell-naive,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_CD4Tcell-N0,0.0,0.0,0.0,0.0,0.0,0.785914,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_CD4Tcell-Th1-restimulated12hour,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PID_BCR_5PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PID_TELOMERASEPATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
Up

Unnamed: 0_level_0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV21,LV22,LV23,LV24,LV25,LV26,LV27,LV28,LV29,LV30
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IRIS_Bcell-Memory_IgG_IgA,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_Bcell-Memory_IgM,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_Bcell-naive,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_CD4Tcell-N0,0.0,0.0,0.0,0.0,0.0,2.582899e-09,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IRIS_CD4Tcell-Th1-restimulated12hour,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PID_IL4_2PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PID_BCR_5PATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PID_TELOMERASEPATHWAY,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
Up.to_csv("/home/milo/workspace/pyplier/tests/data/getAUC/aucresults_upval.csv")

In [63]:
target_summary

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,REACTOME_GENERIC_TRANSCRIPTION_PATHWAY,1,0.642498,5.945028e-16,1.225036e-15
2,IRIS_Monocyte-Day0,2,0.770966,6.286414e-36,4.749735e-35
3,IRIS_Neutrophil-Resting,2,0.917130,6.657733e-126,4.527259e-124
4,SVM Neutrophils,2,0.942135,5.021067e-31,2.626404e-30
5,REACTOME_SIGNALLING_BY_NGF,4,0.593200,6.065664e-06,7.111468e-06
...,...,...,...,...,...
64,KEGG_PARKINSONS_DISEASE,28,0.835149,4.666201e-29,2.115345e-28
65,MIPS_SPLICEOSOME,28,0.606130,1.121658e-05,1.271212e-05
66,MIPS_55S_RIBOSOME_MITOCHONDRIAL,28,0.881357,3.265641e-29,1.586168e-28
67,KEGG_SPLICEOSOME,28,0.660422,7.588976e-10,1.032101e-09


In [72]:
out = out.reset_index().drop(columns="index")

In [73]:
out.columns == target_summary.columns

array([ True,  True,  True,  True,  True])

In [80]:
target_summary = target_summary.reset_index().drop(columns="index")

In [83]:
pd.testing.assert_frame_equal(out, target_summary)

AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="LV index") are different

Attribute "dtype" are different
[left]:  object
[right]: int64