Updates to gaussian imputation code
Reworking of the Gaussian imputation code to use numeric (positional)
indexing, so that columns with non-unique names are handled correctly.
mfitzp committed Nov 26, 2015
1 parent 51cf9f0 commit 8c3f89d
Showing 2 changed files with 199 additions and 0 deletions.
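
For context on the "non-unique columns" note in the commit message: MaxQuant output tables can contain duplicate column labels, and pandas label-based selection returns every matching column at once, so per-column operations need positional (iloc) indexing. A minimal illustration with hypothetical data:

import numpy as np
import pandas as pd

# Two replicate columns share the same label, as can happen in MaxQuant exports
df = pd.DataFrame(np.random.rand(3, 3),
                  columns=['Intensity A', 'Intensity A', 'Intensity B'])

print(df['Intensity A'].shape)  # (3, 2): label selection returns BOTH columns
print(df.iloc[:, 0].shape)      # (3,): positional indexing isolates one column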
100 changes: 100 additions & 0 deletions maxquant/imputation.py
@@ -0,0 +1,100 @@
"""
Algorithms for imputing missing values in data
"""
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression


def gaussian(df, width=0.3, downshift=-1.8, prefix=None):
    """
    Impute missing values by drawing from a normal distribution.

    :param df: DataFrame of measured values; NaN marks a missing entry
    :param width: Scale factor for the imputed distribution, relative to the
        standard deviation of the measured values. A single number, or a list
        with one value per column.
    :param downshift: Shift of the imputed distribution below the mean of the
        measured values, in units of standard deviation. A single number, or a
        list with one value per column.
    :param prefix: Only impute columns whose names start with this prefix
    :return: (df, imputed) -- the imputed DataFrame and a boolean mask marking
        which entries were imputed
    """

    df = df.copy()

    imputed = df.isnull()  # Keep track of what's real

    # Select columns by position, not label, so duplicate column names work
    if prefix:
        mask = np.array([l.startswith(prefix) for l in df.columns.values])
        mycols = np.arange(0, df.shape[1])[mask]
    else:
        mycols = np.arange(0, df.shape[1])

    if not isinstance(width, list):
        width = [width] * len(mycols)
    elif len(mycols) != len(width):
        raise ValueError("Length of iterable 'width' does not match # of columns")

    if not isinstance(downshift, list):
        downshift = [downshift] * len(mycols)
    elif len(mycols) != len(downshift):
        raise ValueError("Length of iterable 'downshift' does not match # of columns")

    for n, i in enumerate(mycols):
        data = df.iloc[:, i]
        mask = data.isnull().values
        mean = data.mean(axis=0)
        stddev = data.std(axis=0)

        # Centre and width of the imputed distribution for this column
        m = mean + downshift[n] * stddev
        s = stddev * width[n]

        # Generate a random value for every row; only the missing ones are kept
        values = np.random.normal(loc=m, scale=s, size=df.shape[0])

        # Fill in the missing entries of this column by position
        df.iloc[mask, i] = values[mask]

    return df, imputed
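
A minimal usage sketch for gaussian, on hypothetical log-scale intensity data (the column names and values are illustrative only; the defaults width=0.3 and downshift=-1.8 match the settings popularised by Perseus for left-censored missingness):

import numpy as np
import pandas as pd

# Hypothetical log2 intensities; NaN marks values below the detection limit
data = pd.DataFrame({'Intensity A': [25.1, np.nan, 24.3, 26.0],
                     'Intensity B': [24.8, 25.5, np.nan, 25.9]})

df_imp, imputed = gaussian(data, width=0.3, downshift=-1.8, prefix='Intensity')
print(df_imp)         # missing entries replaced by draws from the shifted normal
print(imputed.sum())  # number of imputed values per column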


def pls(df):
    """
    A simple implementation of a least-squares approach to imputation using
    partial least squares regression (PLS).
    """
    df = df.copy()
    df[np.isinf(df)] = np.nan

    # Rows with no missing values are used to train the regression
    dfo = df.dropna(how='any', axis=0)
    dfo = dfo.astype(np.float64)

    dfi = df.copy()
    imputed = df.isnull()  # Keep track of what's real

    # List of proteins (rows) with missing values
    missing_values = df[df.isnull().any(axis=1)].index
    ix_mask = np.arange(0, df.shape[1])
    total_n = len(missing_values)

    plsr = PLSRegression(n_components=2)

    for n, p in enumerate(missing_values):
        # Fit a model for this protein (row) against the complete rows
        target = df.loc[p].values.copy().T
        ixes = ix_mask[np.isnan(target)]

        # Fill missing values with the row median for the fit
        target[np.isnan(target)] = np.nanmedian(target)
        plsr.fit(dfo.values.T, target)

        # For each missing value, predict an imputed value from the column data
        for ix in ixes:
            imputv = plsr.predict(dfo.iloc[:, ix].values.reshape(1, -1))[0][0]
            # Assign by position; avoids the deprecated chained .ix access
            dfi.iloc[dfi.index.get_loc(p), ix] = imputv

        print("%d%%" % ((n / total_n) * 100), end="\r")

    return dfi, imputed
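
And a minimal usage sketch for pls, again with hypothetical data (rows are proteins, columns are samples; at least some rows must be complete, since they train the regression):

import numpy as np
import pandas as pd

# Hypothetical quantification matrix: 50 proteins x 6 samples
data = pd.DataFrame(np.random.rand(50, 6) + 24,
                    index=['Protein%d' % i for i in range(50)],
                    columns=['Sample%d' % i for i in range(6)])
data.iloc[3, 2] = np.nan  # knock out one value to impute

df_imp, imputed = pls(data)
print(df_imp.iloc[3, 2])  # value predicted by the PLS model of complete rows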
99 changes: 99 additions & 0 deletions maxquant/utils.py
@@ -0,0 +1,99 @@
import numpy as np
import scipy.interpolate

def qvalues(pv, m=None, verbose=False, lowmem=False, pi0=None):
    """
    Estimates q-values from p-values.

    Copyright (c) 2012, Nicolo Fusi, University of Sheffield
    All rights reserved.

    Args
    =====
    pv: array of p-values
    m: number of tests. If not specified m = pv.size
    verbose: print verbose messages? (default False)
    lowmem: use memory-efficient in-place algorithm
    pi0: if None, it's estimated as suggested in Storey and Tibshirani, 2003.
         For most GWAS this is not necessary, since pi0 is extremely likely
         to be 1.
    """

    assert pv.min() >= 0 and pv.max() <= 1, "p-values should be between 0 and 1"

    original_shape = pv.shape
    pv = pv.ravel()  # flattened view where possible, cheaper than flatten()

    if m is None:
        m = float(len(pv))
    else:
        # the user has supplied an m
        m *= 1.0

    if pi0 is None and len(pv) < 100:
        # if the number of hypotheses is small, just set pi0 to 1
        pi0 = 1.0
    elif pi0 is None:
        # evaluate pi0 for a range of lambdas
        lam = np.arange(0, 0.90, 0.01)
        counts = np.array([(pv > l).sum() for l in lam])
        pi0 = np.array([counts[i] / (m * (1 - lam[i])) for i in range(len(lam))])

        # fit a cubic spline and take the estimate at the largest lambda
        tck = scipy.interpolate.splrep(lam, pi0, k=3)
        pi0 = scipy.interpolate.splev(lam[-1], tck)

        if pi0 > 1:
            if verbose:
                print("got pi0 > 1 (%.3f) while estimating qvalues, setting it to 1" % pi0)
            pi0 = 1.0

    assert 0 <= pi0 <= 1, "pi0 is not between 0 and 1: %f" % pi0

    if lowmem:
        # low memory version, only uses 1 pv and 1 qv matrices
        qv = np.zeros((len(pv),))
        last_pv = pv.argmax()
        qv[last_pv] = (pi0 * pv[last_pv] * m) / float(m)
        pv[last_pv] = -np.inf
        prev_qv = qv[last_pv]
        for i in range(int(len(pv)) - 2, -1, -1):
            cur_max = pv.argmax()
            qv_i = pi0 * m * pv[cur_max] / float(i + 1)
            pv[cur_max] = -np.inf
            qv[cur_max] = min(qv_i, prev_qv)
            prev_qv = qv[cur_max]
    else:
        p_ordered = np.argsort(pv)
        pv = pv[p_ordered]
        qv = pi0 * m / len(pv) * pv
        qv[-1] = min(qv[-1], 1.0)

        for i in range(len(pv) - 2, -1, -1):
            qv[i] = min(pi0 * m * pv[i] / (i + 1.0), qv[i + 1])

        # reorder qvalues to match the input order
        qv_temp = qv.copy()
        qv = np.zeros_like(qv)
        qv[p_ordered] = qv_temp

    # reshape qvalues to the shape of the input
    qv = qv.reshape(original_shape)

    return qv
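
A minimal usage sketch for qvalues (uniform random p-values, purely illustrative):

import numpy as np

pv = np.random.uniform(0, 1, size=5000)  # hypothetical p-values
qv = qvalues(pv)
print(qv.min(), qv.max())  # q-values follow the p-value order and are capped at 1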
