diff --git a/maxquant/imputation.py b/maxquant/imputation.py new file mode 100644 index 0000000..dd056fd --- /dev/null +++ b/maxquant/imputation.py @@ -0,0 +1,100 @@ +""" +Algorithms for imputing missing values in data +""" +import pandas as pd +import numpy as np +import scipy as sp +from sklearn.cross_decomposition import PLSRegression + + +def gaussian(df, width=0.3, downshift=-1.8, prefix=None): + """ + Impute missing values by drawing from a normal distribution + + :param df: + :param width: Scale factor for the imputed distribution relative to the standard deviation of measured values. Can be a single number or list of one per column. + :param downshift: Shift the imputed values down, in units of std. dev. Can be a single number or list of one per column + :param prefix: The column prefix for imputed columns + :return: + """ + + df = df.copy() + + imputed = df.isnull() # Keep track of what's real + + if prefix: + mask = np.array([l.startswith(prefix) for l in df.columns.values]) + mycols = np.arange(0, df.shape[1])[mask] + else: + mycols = np.arange(0, df.shape[1]) + + + if type(width) is not list: + width = [width] * len(mycols) + + elif len(mycols) != len(width): + raise ValueError("Length of iterable 'width' does not match # of columns") + + if type(downshift) is not list: + downshift = [downshift] * len(mycols) + + elif len(mycols) != len(downshift): + raise ValueError("Length of iterable 'downshift' does not match # of columns") + + for i in mycols: + data = df.iloc[:, i] + mask = data.isnull() + mean = data.mean(axis=0) + stddev = data.std(axis=0) + + m = mean + downshift[i]*stddev + s = stddev*width[i] + + # Generate a list of random numbers for filling in + values = np.random.normal(loc=m, scale=s, size=df.shape[1]) + + # Now fill them in + df.values[i, mask] = values[mask] + + return df, imputed + + +def pls(df): + """ + A simple implementation of a least-squares approach to imputation using partial least squares + regression (PLS). + + """ + df = df.copy() + df[np.isinf(df)] = np.nan + + dfo = df.dropna(how='any', axis=0) + dfo = dfo.astype(np.float64) + + dfi = df.copy() + imputed = df.isnull() #Keep track of what's real + + # List of proteins with missing values in their rows + missing_values = df[ np.sum(np.isnan(df), axis=1) > 0 ].index + ix_mask = np.arange(0, df.shape[1]) + total_n = len(missing_values) + + plsr = PLSRegression(n_components=2) + + for n, p in enumerate(missing_values): + # Generate model for this protein from missing data + target = df.loc[p].values.copy().T + ixes = ix_mask[ np.isnan(target) ] + + # Fill missing values with row median for calculation + target[np.isnan(target)] = np.nanmedian(target) + plsr.fit(dfo.values.T, target) + + # For each missing value, calculate imputed value from the column data input + for ix in ixes: + imputv = plsr.predict(dfo.iloc[:, ix])[0] + dfi.loc[p].ix[ix] = imputv + + print("%d%%" % ((n/total_n)*100), end="\r") + + return dfi, imputed diff --git a/maxquant/utils.py b/maxquant/utils.py new file mode 100644 index 0000000..46420a7 --- /dev/null +++ b/maxquant/utils.py @@ -0,0 +1,99 @@ +import numpy as np +import scipy as sp +import sys, pickle, pdb +import scipy.stats as st +import scipy.interpolate + +def qvalues(pv, m = None, verbose = False, lowmem = False, pi0 = None): + """ + Copyright (c) 2012, Nicolo Fusi, University of Sheffield + All rights reserved. + + Estimates q-values from p-values + + Args + ===== + + m: number of tests. If not specified m = pv.size + verbose: print verbose messages? (default False) + lowmem: use memory-efficient in-place algorithm + pi0: if None, it's estimated as suggested in Storey and Tibshirani, 2003. + For most GWAS this is not necessary, since pi0 is extremely likely to be + 1 + + """ + + + assert(pv.min() >= 0 and pv.max() <= 1), "p-values should be between 0 and 1" + + original_shape = pv.shape + pv = pv.ravel() # flattens the array in place, more efficient than flatten() + + if m == None: + m = float(len(pv)) + else: + # the user has supplied an m + m *= 1.0 + + # if the number of hypotheses is small, just set pi0 to 1 + if len(pv) < 100 and pi0 == None: + pi0 = 1.0 + elif pi0 != None: + pi0 = pi0 + else: + # evaluate pi0 for different lambdas + pi0 = [] + lam = sp.arange(0, 0.90, 0.01) + counts = sp.array([(pv > i).sum() for i in sp.arange(0, 0.9, 0.01)]) + + for l in range(len(lam)): + pi0.append(counts[l]/(m*(1-lam[l]))) + + pi0 = sp.array(pi0) + + # fit natural cubic spline + tck = sp.interpolate.splrep(lam, pi0, k = 3) + pi0 = sp.interpolate.splev(lam[-1], tck) + + if pi0 > 1: + if verbose: + print("got pi0 > 1 (%.3f) while estimating qvalues, setting it to 1" % pi0) + + pi0 = 1.0 + + assert(pi0 >= 0 and pi0 <= 1), "pi0 is not between 0 and 1: %f" % pi0 + + + if lowmem: + # low memory version, only uses 1 pv and 1 qv matrices + qv = sp.zeros((len(pv),)) + last_pv = pv.argmax() + qv[last_pv] = (pi0*pv[last_pv]*m)/float(m) + pv[last_pv] = -sp.inf + prev_qv = last_pv + for i in range(int(len(pv))-2, -1, -1): + cur_max = pv.argmax() + qv_i = (pi0*m*pv[cur_max]/float(i+1)) + pv[cur_max] = -sp.inf + qv_i1 = prev_qv + qv[cur_max] = min(qv_i, qv_i1) + prev_qv = qv[cur_max] + + else: + p_ordered = sp.argsort(pv) + pv = pv[p_ordered] + qv = pi0 * m/len(pv) * pv + qv[-1] = min(qv[-1],1.0) + + for i in range(len(pv)-2, -1, -1): + qv[i] = min(pi0*m*pv[i]/(i+1.0), qv[i+1]) + + # reorder qvalues + qv_temp = qv.copy() + qv = sp.zeros_like(qv) + qv[p_ordered] = qv_temp + + # reshape qvalues + qv = qv.reshape(original_shape) + + return qv \ No newline at end of file