-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Some reworking of gaussian imputation code to use numeric indexing (for non-unique columns).
- Loading branch information
Showing
2 changed files
with
199 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
""" | ||
Algorithms for imputing missing values in data | ||
""" | ||
import pandas as pd | ||
import numpy as np | ||
import scipy as sp | ||
from sklearn.cross_decomposition import PLSRegression | ||
|
||
|
||
def gaussian(df, width=0.3, downshift=-1.8, prefix=None): | ||
""" | ||
Impute missing values by drawing from a normal distribution | ||
:param df: | ||
:param width: Scale factor for the imputed distribution relative to the standard deviation of measured values. Can be a single number or list of one per column. | ||
:param downshift: Shift the imputed values down, in units of std. dev. Can be a single number or list of one per column | ||
:param prefix: The column prefix for imputed columns | ||
:return: | ||
""" | ||
|
||
df = df.copy() | ||
|
||
imputed = df.isnull() # Keep track of what's real | ||
|
||
if prefix: | ||
mask = np.array([l.startswith(prefix) for l in df.columns.values]) | ||
mycols = np.arange(0, df.shape[1])[mask] | ||
else: | ||
mycols = np.arange(0, df.shape[1]) | ||
|
||
|
||
if type(width) is not list: | ||
width = [width] * len(mycols) | ||
|
||
elif len(mycols) != len(width): | ||
raise ValueError("Length of iterable 'width' does not match # of columns") | ||
|
||
if type(downshift) is not list: | ||
downshift = [downshift] * len(mycols) | ||
|
||
elif len(mycols) != len(downshift): | ||
raise ValueError("Length of iterable 'downshift' does not match # of columns") | ||
|
||
for i in mycols: | ||
data = df.iloc[:, i] | ||
mask = data.isnull() | ||
mean = data.mean(axis=0) | ||
stddev = data.std(axis=0) | ||
|
||
m = mean + downshift[i]*stddev | ||
s = stddev*width[i] | ||
|
||
# Generate a list of random numbers for filling in | ||
values = np.random.normal(loc=m, scale=s, size=df.shape[1]) | ||
|
||
# Now fill them in | ||
df.values[i, mask] = values[mask] | ||
|
||
return df, imputed | ||
|
||
|
||
def pls(df): | ||
""" | ||
A simple implementation of a least-squares approach to imputation using partial least squares | ||
regression (PLS). | ||
""" | ||
df = df.copy() | ||
df[np.isinf(df)] = np.nan | ||
|
||
dfo = df.dropna(how='any', axis=0) | ||
dfo = dfo.astype(np.float64) | ||
|
||
dfi = df.copy() | ||
imputed = df.isnull() #Keep track of what's real | ||
|
||
# List of proteins with missing values in their rows | ||
missing_values = df[ np.sum(np.isnan(df), axis=1) > 0 ].index | ||
ix_mask = np.arange(0, df.shape[1]) | ||
total_n = len(missing_values) | ||
|
||
plsr = PLSRegression(n_components=2) | ||
|
||
for n, p in enumerate(missing_values): | ||
# Generate model for this protein from missing data | ||
target = df.loc[p].values.copy().T | ||
ixes = ix_mask[ np.isnan(target) ] | ||
|
||
# Fill missing values with row median for calculation | ||
target[np.isnan(target)] = np.nanmedian(target) | ||
plsr.fit(dfo.values.T, target) | ||
|
||
# For each missing value, calculate imputed value from the column data input | ||
for ix in ixes: | ||
imputv = plsr.predict(dfo.iloc[:, ix])[0] | ||
dfi.loc[p].ix[ix] = imputv | ||
|
||
print("%d%%" % ((n/total_n)*100), end="\r") | ||
|
||
return dfi, imputed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import numpy as np | ||
import scipy as sp | ||
import sys, pickle, pdb | ||
import scipy.stats as st | ||
import scipy.interpolate | ||
|
||
def qvalues(pv, m = None, verbose = False, lowmem = False, pi0 = None): | ||
""" | ||
Copyright (c) 2012, Nicolo Fusi, University of Sheffield | ||
All rights reserved. | ||
Estimates q-values from p-values | ||
Args | ||
===== | ||
m: number of tests. If not specified m = pv.size | ||
verbose: print verbose messages? (default False) | ||
lowmem: use memory-efficient in-place algorithm | ||
pi0: if None, it's estimated as suggested in Storey and Tibshirani, 2003. | ||
For most GWAS this is not necessary, since pi0 is extremely likely to be | ||
1 | ||
""" | ||
|
||
|
||
assert(pv.min() >= 0 and pv.max() <= 1), "p-values should be between 0 and 1" | ||
|
||
original_shape = pv.shape | ||
pv = pv.ravel() # flattens the array in place, more efficient than flatten() | ||
|
||
if m == None: | ||
m = float(len(pv)) | ||
else: | ||
# the user has supplied an m | ||
m *= 1.0 | ||
|
||
# if the number of hypotheses is small, just set pi0 to 1 | ||
if len(pv) < 100 and pi0 == None: | ||
pi0 = 1.0 | ||
elif pi0 != None: | ||
pi0 = pi0 | ||
else: | ||
# evaluate pi0 for different lambdas | ||
pi0 = [] | ||
lam = sp.arange(0, 0.90, 0.01) | ||
counts = sp.array([(pv > i).sum() for i in sp.arange(0, 0.9, 0.01)]) | ||
|
||
for l in range(len(lam)): | ||
pi0.append(counts[l]/(m*(1-lam[l]))) | ||
|
||
pi0 = sp.array(pi0) | ||
|
||
# fit natural cubic spline | ||
tck = sp.interpolate.splrep(lam, pi0, k = 3) | ||
pi0 = sp.interpolate.splev(lam[-1], tck) | ||
|
||
if pi0 > 1: | ||
if verbose: | ||
print("got pi0 > 1 (%.3f) while estimating qvalues, setting it to 1" % pi0) | ||
|
||
pi0 = 1.0 | ||
|
||
assert(pi0 >= 0 and pi0 <= 1), "pi0 is not between 0 and 1: %f" % pi0 | ||
|
||
|
||
if lowmem: | ||
# low memory version, only uses 1 pv and 1 qv matrices | ||
qv = sp.zeros((len(pv),)) | ||
last_pv = pv.argmax() | ||
qv[last_pv] = (pi0*pv[last_pv]*m)/float(m) | ||
pv[last_pv] = -sp.inf | ||
prev_qv = last_pv | ||
for i in range(int(len(pv))-2, -1, -1): | ||
cur_max = pv.argmax() | ||
qv_i = (pi0*m*pv[cur_max]/float(i+1)) | ||
pv[cur_max] = -sp.inf | ||
qv_i1 = prev_qv | ||
qv[cur_max] = min(qv_i, qv_i1) | ||
prev_qv = qv[cur_max] | ||
|
||
else: | ||
p_ordered = sp.argsort(pv) | ||
pv = pv[p_ordered] | ||
qv = pi0 * m/len(pv) * pv | ||
qv[-1] = min(qv[-1],1.0) | ||
|
||
for i in range(len(pv)-2, -1, -1): | ||
qv[i] = min(pi0*m*pv[i]/(i+1.0), qv[i+1]) | ||
|
||
# reorder qvalues | ||
qv_temp = qv.copy() | ||
qv = sp.zeros_like(qv) | ||
qv[p_ordered] = qv_temp | ||
|
||
# reshape qvalues | ||
qv = qv.reshape(original_shape) | ||
|
||
return qv |