In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
def copyMat(df: pd.DataFrame, zero: bool = False) -> pd.DataFrame:
    if zero:
        dfnew = pd.DataFrame(
            np.zeros(shape=df.shape), index=df.index, columns=df.columns
        )
    else:
        dfnew = df.copy(deep=True)
    return dfnew

In [3]:
from typing import Dict, List, Tuple
import numpy as np
from scipy.stats import mannwhitneyu, norm


def AUC(labels: pd.Series, values: pd.Series) -> Dict[str, float]:
    posii = labels[labels > 0]
    negii = labels[labels <= 0]
    posn = len(posii)
    negn = len(negii)
    posval = values[posii.index]
    negval = values[negii.index]
    if posn > 0 and negn > 0:
        statistic, pvalue = mannwhitneyu(posval, negval, alternative="greater")
        conf_int_low, conf_int_high = mannwhitneyu_conf_int(posval, negval)
        res = {
            "low": conf_int_low,
            "high": conf_int_high,
            "auc": (statistic / (posn * negn)),
            "pval": pvalue,
        }
    else:
        res = {"auc": 0.5, "pval": np.nan}

    return res


def mannwhitneyu_conf_int(
    x: np.array, y: np.array, alpha: float = 0.05
) -> Tuple[float, float]:
    n = len(x)
    m = len(y)

    N = norm.var(1 - alpha / 2)

    diffs = sorted([i - j for i in x for j in y])

    # For an approximate 100(1-a)% confidence interval first calculate K:

    nm = n * m
    top = nm * (n + m + 1)
    right = N * np.sqrt(top / 12)
    left = (n * m) / 2
    K = left - right

    k = int(round(n * m / 2 - (N * (n * m * (n + m + 1) / 12) ** 0.5)))

    # The Kth smallest to the Kth largest of the n x m differences
    # lx and ly should be > ~20
    CI = (diffs[k], diffs[len(diffs) - k])
    return CI

In [4]:
x = posval
y = negval

NameError: name 'posval' is not defined

In [None]:
negval

In [5]:
group1 = [38, 26, 29, 41, 36, 31, 32, 30, 35, 33]
group2 = [45, 28, 27, 38, 40, 42, 39, 39, 34, 45]

In [6]:
x = group1
y = group2

In [7]:
n = len(x)
m = len(y)

In [8]:
N = norm.ppf(1 - 0.05 / 2)

In [9]:
diffs = sorted([i - j for i in x for j in y])

In [10]:
nm = n * m
top = nm * (n + m + 1)
right = N * np.sqrt(top / 12)
left = (n * m) / 2
K = left - right

In [11]:
K

24.072113591318864

In [12]:
(diffs[round(K)], diffs[len(diffs) - round(K)])

(-10, 1)

In [13]:
n = len(x)
m = len(y)

N = norm.var(1 - 0.05 / 2)

diffs = sorted([i - j for i in x for j in y])

# For an approximate 100(1-a)% confidence interval first calculate K:

k = int(round(n * m / 2 - (N * (n * m * (n + m + 1) / 12) ** 0.5)))

In [14]:
# The Kth smallest to the Kth largest of the n x m differences
# lx and ly should be > ~20
CI = (diffs[k], diffs[len(diffs) - k])

In [15]:
CI

(-8, -3)

In [17]:
dataPath = Path("/home/milo/workspace/pyplier/tests/data/crossVal")
data = pd.read_csv(filepath_or_buffer=dataPath.joinpath("data.csv"), index_col=0)

In [18]:
data

Unnamed: 0,BD8001,BD8002,BD8003,BD8004,BD8005,BD8006,BD8007,BD8008,BD8009,BD8010,...,BD8031,BD8032,BD8033,BD8034,BD8038,BD8041,BD8042,BD8043,BD8044,BD8045
GAS6,-1.505242,0.188428,1.382268,-0.886040,0.218978,-1.659888,2.796534,0.829210,-1.649642,-0.699206,...,0.167190,-0.443537,-0.413695,-0.033962,-0.003753,-0.562015,-1.457108,0.254652,-0.945488,1.285548
MMP14,-1.362254,1.426659,-0.112956,-0.547275,-0.657120,0.623815,0.162686,0.399899,-0.959655,2.260257,...,-0.136098,-0.776283,0.401661,-0.029038,-1.121673,-0.977343,-1.771282,0.948462,0.723387,0.895571
MARCKSL1,-1.559936,1.247540,-2.110556,-0.721440,-0.058441,0.154611,1.487958,-0.045567,1.409842,0.235813,...,-2.010731,1.165882,-0.192181,-0.207469,-1.077898,0.035008,-0.486909,-0.433482,0.683989,2.203569
SPARC,0.465116,-0.849214,0.790658,0.906851,0.163648,0.117905,-1.060804,-1.701798,0.920468,-1.141010,...,-2.644481,-0.004298,0.902712,-0.857343,-0.450150,-0.669798,0.684989,-0.941961,-0.446238,0.891104
CTSD,0.024942,0.206589,-1.452483,-1.102910,0.034775,-0.016322,0.844618,0.483747,-0.650142,1.753523,...,-0.776214,0.643598,-0.739288,-0.768372,-1.777247,0.343815,-0.374175,0.949864,0.449915,2.400907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRPM4,-0.324201,-0.774261,-0.128307,-0.540268,1.317625,0.363039,-0.019754,1.232543,-1.814046,1.174763,...,0.255074,0.288240,-1.268223,0.669198,0.361300,-0.930080,0.117951,-0.105594,-0.502940,1.052108
LAIR2,-0.394921,0.851458,-2.219011,1.623721,-2.460171,-0.319912,0.197574,0.627586,-0.413208,0.143300,...,-1.237987,-0.185263,0.607988,0.617346,-0.586569,-1.212232,0.003727,0.496207,-0.900624,0.757522
ZNF135,0.545218,-0.707431,0.174777,-0.063097,-0.423388,0.432197,-0.180568,0.141946,-0.231568,0.471386,...,-0.121949,-0.133662,-0.745435,0.416144,-0.409687,-1.083912,-0.355580,0.583608,1.140988,-0.355580
MARCH3,1.117124,-0.882370,0.078125,0.027025,-0.312435,-0.634321,-0.208948,-0.704559,1.774945,1.256757,...,-0.669562,0.712260,0.291270,0.131922,0.131922,-1.069672,0.535645,-0.186776,-2.450563,0.417511


In [19]:
plierRes_b = pd.read_csv(
    filepath_or_buffer=dataPath.joinpath("plierRes_b.csv"), index_col=0
)
plierRes_c = pd.read_csv(
    filepath_or_buffer=dataPath.joinpath("plierRes_c.csv"), index_col=0
)
plierRes_residual = pd.read_csv(
    filepath_or_buffer=dataPath.joinpath("plierRes_residual.csv"), index_col=0
)
plierRes_u = pd.read_csv(
    filepath_or_buffer=dataPath.joinpath("plierRes_u.csv"), index_col=0
)
plierRes_z = pd.read_csv(
    filepath_or_buffer=dataPath.joinpath("plierRes_z.csv"), index_col=0
)

In [20]:
heldoutgenes = pd.read_csv(filepath_or_buffer=dataPath.joinpath("heldOutGenes.csv"))

In [21]:
heldoutgenes

Unnamed: 0,name,value
0,IRIS_Bcell-Memory_IgG_IgA,SPDYE1
1,IRIS_Bcell-Memory_IgG_IgA,HEY1
2,IRIS_Bcell-Memory_IgG_IgA,FCRL1
3,IRIS_Bcell-Memory_IgG_IgA,AFF3
4,IRIS_Bcell-Memory_IgG_IgA,COCH
...,...,...
8004,PID_PI3KPLCTRKPATHWAY,PIK3R1
8005,PID_PI3KPLCTRKPATHWAY,GRB2
8006,PID_PI3KPLCTRKPATHWAY,SOS1
8007,PID_PI3KPLCTRKPATHWAY,FOXO3


In [22]:
heldOutGenes = {k: g["value"].tolist() for k, g in heldoutgenes.groupby("name")}

In [26]:
heldOutGenes

{'BIOCARTA_BCR_PATHWAY': ['MAPK14', 'CD79B', 'CALM2', 'PPP3CB', 'JUN', 'RAF1'],
 'BIOCARTA_BIOPEPTIDES_PATHWAY': ['RAF1',
  'MYLK',
  'GNB1',
  'MAPK3',
  'CAMK2D',
  'HRAS',
  'PLCG1'],
 'BIOCARTA_CARM_ER_PATHWAY': ['HDAC5',
  'NCOR2',
  'PHB2',
  'CCND1',
  'HDAC9',
  'SRA1'],
 'BIOCARTA_CHREBP2_PATHWAY': ['YWHAG',
  'PPP2R2B',
  'PPP2R5B',
  'PPP2CA',
  'PPP2R4',
  'PPP2R5C'],
 'BIOCARTA_DEATH_PATHWAY': ['CYCS', 'LMNA', 'BID', 'CHUK', 'CASP10', 'BIRC2'],
 'BIOCARTA_EGF_PATHWAY': ['PRKCA', 'JUN', 'ELK1', 'RAF1', 'GRB2', 'RASA1'],
 'BIOCARTA_FAS_PATHWAY': ['ARHGDIB', 'MAP3K1', 'CFLAR', 'DFFA', 'SPTAN1'],
 'BIOCARTA_FCER1_PATHWAY': ['NFATC3',
  'FCER1A',
  'PPP3CB',
  'MAPK3',
  'NFATC2',
  'MAPK1',
  'PIK3R1'],
 'BIOCARTA_FMLP_PATHWAY': ['NFATC3',
  'RAC1',
  'RAF1',
  'CALM1',
  'MAP2K1',
  'MAPK3'],
 'BIOCARTA_GPCR_PATHWAY': ['GNAS',
  'PPP3CA',
  'PLCG1',
  'MAP2K1',
  'PRKAR1B',
  'GNAQ'],
 'BIOCARTA_HIVNEF_PATHWAY': ['CFLAR',
  'TRADD',
  'PSEN1',
  'CASP2',
  'PSEN2',
  'CHUK',


In [23]:
plierRes = {
    "B": plierRes_b,
    "C": plierRes_c,
    "U": plierRes_u,
    "Z": plierRes_z,
    "residual": plierRes_residual,
    "heldOutGenes": heldOutGenes,
}