Updates to gaussian imputation code
Reworking of the Gaussian imputation code to use numeric (positional)
indexing, so that columns with non-unique names are handled correctly.
mfitzp committed Nov 26, 2015
1 parent 51cf9f0 commit 8c3f89d
Showing 2 changed files with 199 additions and 0 deletions.
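
For context on the "non-unique columns" note in the commit message: MaxQuant output tables can contain duplicate column labels, and pandas label-based selection returns every matching column at once, so per-column operations need positional (iloc) indexing. A minimal illustration with hypothetical data:

import numpy as np
import pandas as pd

# Two replicate columns share the same label, as can happen in MaxQuant exports
df = pd.DataFrame(np.random.rand(3, 3),
                  columns=['Intensity A', 'Intensity A', 'Intensity B'])

print(df['Intensity A'].shape)  # (3, 2): label selection returns BOTH columns
print(df.iloc[:, 0].shape)      # (3,): positional indexing isolates one column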
100 changes: 100 additions & 0 deletions maxquant/imputation.py
@@ -0,0 +1,100 @@
"""
Algorithms for imputing missing values in data
"""
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression


def gaussian(df, width=0.3, downshift=-1.8, prefix=None):
    """
    Impute missing values by drawing from a normal distribution.

    :param df: DataFrame of measured values; NaN marks a missing entry
    :param width: Scale factor for the imputed distribution, relative to the
        standard deviation of the measured values. A single number, or a list
        with one value per column.
    :param downshift: Shift of the imputed distribution below the mean of the
        measured values, in units of standard deviation. A single number, or a
        list with one value per column.
    :param prefix: Only impute columns whose names start with this prefix
    :return: (df, imputed) -- the imputed DataFrame and a boolean mask marking
        which entries were imputed
    """

    df = df.copy()

    imputed = df.isnull()  # Keep track of what's real

    # Select columns by position, not label, so duplicate column names work
    if prefix:
        mask = np.array([l.startswith(prefix) for l in df.columns.values])
        mycols = np.arange(0, df.shape[1])[mask]
    else:
        mycols = np.arange(0, df.shape[1])

    if not isinstance(width, list):
        width = [width] * len(mycols)
    elif len(mycols) != len(width):
        raise ValueError("Length of iterable 'width' does not match # of columns")

    if not isinstance(downshift, list):
        downshift = [downshift] * len(mycols)
    elif len(mycols) != len(downshift):
        raise ValueError("Length of iterable 'downshift' does not match # of columns")

    for n, i in enumerate(mycols):
        data = df.iloc[:, i]
        mask = data.isnull().values
        mean = data.mean(axis=0)
        stddev = data.std(axis=0)

        # Centre and width of the imputed distribution for this column
        m = mean + downshift[n] * stddev
        s = stddev * width[n]

        # Generate a random value for every row; only the missing ones are kept
        values = np.random.normal(loc=m, scale=s, size=df.shape[0])

        # Fill in the missing entries of this column by position
        df.iloc[mask, i] = values[mask]

    return df, imputed
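
A minimal usage sketch for gaussian, on hypothetical log-scale intensity data (the column names and values are illustrative only; the defaults width=0.3 and downshift=-1.8 match the settings popularised by Perseus for left-censored missingness):

import numpy as np
import pandas as pd

# Hypothetical log2 intensities; NaN marks values below the detection limit
data = pd.DataFrame({'Intensity A': [25.1, np.nan, 24.3, 26.0],
                     'Intensity B': [24.8, 25.5, np.nan, 25.9]})

df_imp, imputed = gaussian(data, width=0.3, downshift=-1.8, prefix='Intensity')
print(df_imp)         # missing entries replaced by draws from the shifted normal
print(imputed.sum())  # number of imputed values per column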


def pls(df):
    """
    A simple implementation of a least-squares approach to imputation using
    partial least squares regression (PLS).
    """
    df = df.copy()
    df[np.isinf(df)] = np.nan

    # Rows with no missing values are used to train the regression
    dfo = df.dropna(how='any', axis=0)
    dfo = dfo.astype(np.float64)

    dfi = df.copy()
    imputed = df.isnull()  # Keep track of what's real

    # List of proteins (rows) with missing values
    missing_values = df[df.isnull().any(axis=1)].index
    ix_mask = np.arange(0, df.shape[1])
    total_n = len(missing_values)

    plsr = PLSRegression(n_components=2)

    for n, p in enumerate(missing_values):
        # Fit a model for this protein (row) against the complete rows
        target = df.loc[p].values.copy().T
        ixes = ix_mask[np.isnan(target)]

        # Fill missing values with the row median for the fit
        target[np.isnan(target)] = np.nanmedian(target)
        plsr.fit(dfo.values.T, target)

        # For each missing value, predict an imputed value from the column data
        for ix in ixes:
            imputv = plsr.predict(dfo.iloc[:, ix].values.reshape(1, -1))[0][0]
            # Assign by position; avoids the deprecated chained .ix access
            dfi.iloc[dfi.index.get_loc(p), ix] = imputv

        print("%d%%" % ((n / total_n) * 100), end="\r")

    return dfi, imputed
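
And a minimal usage sketch for pls, again with hypothetical data (rows are proteins, columns are samples; at least some rows must be complete, since they train the regression):

import numpy as np
import pandas as pd

# Hypothetical quantification matrix: 50 proteins x 6 samples
data = pd.DataFrame(np.random.rand(50, 6) + 24,
                    index=['Protein%d' % i for i in range(50)],
                    columns=['Sample%d' % i for i in range(6)])
data.iloc[3, 2] = np.nan  # knock out one value to impute

df_imp, imputed = pls(data)
print(df_imp.iloc[3, 2])  # value predicted by the PLS model of complete rows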
99 changes: 99 additions & 0 deletions maxquant/utils.py
@@ -0,0 +1,99 @@
import numpy as np
import scipy.interpolate

def qvalues(pv, m=None, verbose=False, lowmem=False, pi0=None):
    """
    Estimates q-values from p-values.

    Copyright (c) 2012, Nicolo Fusi, University of Sheffield
    All rights reserved.

    Args
    =====
    pv: array of p-values
    m: number of tests. If not specified m = pv.size
    verbose: print verbose messages? (default False)
    lowmem: use memory-efficient in-place algorithm
    pi0: if None, it's estimated as suggested in Storey and Tibshirani, 2003.
         For most GWAS this is not necessary, since pi0 is extremely likely
         to be 1.
    """

    assert pv.min() >= 0 and pv.max() <= 1, "p-values should be between 0 and 1"

    original_shape = pv.shape
    pv = pv.ravel()  # flattened view where possible, cheaper than flatten()

    if m is None:
        m = float(len(pv))
    else:
        # the user has supplied an m
        m *= 1.0

    if pi0 is None and len(pv) < 100:
        # if the number of hypotheses is small, just set pi0 to 1
        pi0 = 1.0
    elif pi0 is None:
        # evaluate pi0 for a range of lambdas
        lam = np.arange(0, 0.90, 0.01)
        counts = np.array([(pv > l).sum() for l in lam])
        pi0 = np.array([counts[i] / (m * (1 - lam[i])) for i in range(len(lam))])

        # fit a cubic spline and take the estimate at the largest lambda
        tck = scipy.interpolate.splrep(lam, pi0, k=3)
        pi0 = scipy.interpolate.splev(lam[-1], tck)

        if pi0 > 1:
            if verbose:
                print("got pi0 > 1 (%.3f) while estimating qvalues, setting it to 1" % pi0)
            pi0 = 1.0

    assert 0 <= pi0 <= 1, "pi0 is not between 0 and 1: %f" % pi0

    if lowmem:
        # low memory version, only uses 1 pv and 1 qv matrices
        qv = np.zeros((len(pv),))
        last_pv = pv.argmax()
        qv[last_pv] = (pi0 * pv[last_pv] * m) / float(m)
        pv[last_pv] = -np.inf
        prev_qv = qv[last_pv]
        for i in range(int(len(pv)) - 2, -1, -1):
            cur_max = pv.argmax()
            qv_i = pi0 * m * pv[cur_max] / float(i + 1)
            pv[cur_max] = -np.inf
            qv[cur_max] = min(qv_i, prev_qv)
            prev_qv = qv[cur_max]
    else:
        p_ordered = np.argsort(pv)
        pv = pv[p_ordered]
        qv = pi0 * m / len(pv) * pv
        qv[-1] = min(qv[-1], 1.0)

        for i in range(len(pv) - 2, -1, -1):
            qv[i] = min(pi0 * m * pv[i] / (i + 1.0), qv[i + 1])

        # reorder qvalues to match the input order
        qv_temp = qv.copy()
        qv = np.zeros_like(qv)
        qv[p_ordered] = qv_temp

    # reshape qvalues to the shape of the input
    qv = qv.reshape(original_shape)

    return qv
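
A minimal usage sketch for qvalues (uniform random p-values, purely illustrative):

import numpy as np

pv = np.random.uniform(0, 1, size=5000)  # hypothetical p-values
qv = qvalues(pv)
print(qv.min(), qv.max())  # q-values follow the p-value order and are capped at 1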
