moving docstring to google format
maxibor committed Jun 11, 2019
1 parent 49d00f2 commit 39e8101
Showing 5 changed files with 189 additions and 150 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
@@ -52,6 +52,7 @@
     'sphinx.ext.mathjax',
     'sphinx.ext.viewcode',
     'sphinx.ext.githubpages',
+    'sphinx.ext.napoleon'
 ]
 
 # Add any paths that contain templates here, relative to this directory.
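For reference, sphinx.ext.napoleon is the Sphinx extension that lets autodoc parse Google-style docstrings like the ones introduced throughout this commit. A minimal sketch of the target format (the function is illustrative, not from the codebase):

    def add_counts(a, b):
        """Add two OTU counts.

        Args:
            a (int): first count
            b (int): second count

        Returns:
            int: sum of the two counts
        """
        return a + b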
2 changes: 1 addition & 1 deletion sourcepredict
@@ -47,7 +47,7 @@ Homepage & Documentation: github.com/maxibor/sourcepredict
     '-n',
     dest="normalization",
     default='GMPR',
-    help="Normalization method (RLE | CLR | Subsample | GMPR). Default = GMPR")
+    help="Normalization method (RLE | Subsample | GMPR). Default = GMPR")
 parser.add_argument(
     '-dt',
     dest="distance",
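This help-string edit drops CLR, matching the deletion of CLR_normalize in sourcepredictlib/normalize.py below. As a side note, argparse could also enforce the remaining values; a hypothetical tightening that is not part of this commit:

    parser.add_argument(
        '-n',
        dest="normalization",
        default='GMPR',
        choices=['RLE', 'Subsample', 'GMPR'],  # hypothetical: reject removed methods early
        help="Normalization method (RLE | Subsample | GMPR). Default = GMPR")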
103 changes: 62 additions & 41 deletions sourcepredictlib/ml.py
@@ -27,15 +27,17 @@
 class sourceunknown():
 
     def __init__(self, source, sink, labels):
-        """
-        Init of sourceunknown object
+        """Init of sourceunknown object
+
         Combines sink and source in one pd Dataframe
+
         Args:
-            - source(str): training data csv file with OTUs at index,
+            source(str): training data csv file with OTUs at index,
                 Samples as columns
-            - sink(str): test data csv file with OTUs at index,
+            sink(str): test data csv file with OTUs at index,
                 Samples as columns
-            - labels(str): labels csv file with Samples in first column,
+            labels(str): labels csv file with Samples in first column,
                 class in 2nd column
         """
@@ -52,15 +54,17 @@ def __repr__(self):
         return(f'A sourceforest object of source {self.source} and sink {self.tmp_sink}')
 
     def add_unknown(self, alpha, seed):
-        """
+        """Add unknown
+
         Create unknown Samples from test sample
         N Random samples are created with N being average of class counts
         For each random sample's OTU, count is taken from normal distrib with a
         mean of test OTU count.
+
         Args:
-            - alpha(float): proportion of each OTU count from test samples
+            alpha(float): proportion of each OTU count from test samples
                 to include in unknown sample
-            - seed(int): seed for random number generator
+            seed(int): seed for random number generator
         """
 
         np.random.seed = seed
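A rough sketch of the sampling scheme this docstring describes, assuming a pandas Series of OTU counts for one sink sample (all names and sizes here are illustrative, not the collapsed implementation):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)        # seeded random number generator
    sink_counts = pd.Series([120, 30, 7])  # toy OTU counts of one sink sample
    alpha, n_unknown = 0.1, 5              # n_unknown ~ average class size in the tool

    unknown = pd.DataFrame({
        f"UNK_{i}": rng.normal(loc=sink_counts * alpha, scale=1).clip(min=0).round()
        for i in range(n_unknown)})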
@@ -85,12 +89,14 @@ def add_unknown(self, alpha, seed):
             data=['unknown']*len(unk_labs), index=unk_labs)
 
     def normalize(self, method, threads):
-        """
+        """Sample count normalization
+
         Performs normalization of the count data to balance coverage differences
         and missing OTUs
+
         Args:
-            - method(str): normalization method
-            - threads(int): number of threads for parallelization
+            method(str): normalization method
+            threads(int): number of threads for parallelization
         """
         if method == 'RLE':
             self.normalized = normalize.RLE_normalize(self.combined)
@@ -116,10 +122,10 @@ def normalize(self, method, threads):
         self.y_unk = self.y_unk.append(self.unk_labs)
 
     def compute_distance(self, rank='species'):
-        """
-        Sample pairwise distance computation
+        """Sample pairwise distance computation
+
         Args:
-            - rank(str): Taxonomic rank to keep for filtering OTUs
+            rank(str): Taxonomic rank to keep for filtering OTUs
         """
 
         # Getting a single Taxonomic rank
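A hedged sketch of that rank filtering with ete3's NCBITaxa (which this module uses), assuming the OTU index holds NCBI taxids:

    from ete3 import NCBITaxa

    ncbi = NCBITaxa()
    taxids = [int(t) for t in combined.index]  # assumption: index = NCBI taxids
    ranks = ncbi.get_rank(taxids)              # {taxid: rank}
    keep = [t for t, r in ranks.items() if r == 'species']
    combined_species = combined.loc[keep]      # species-level OTUs only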
@@ -138,11 +144,13 @@ def compute_distance(self, rank='species'):
 
     def embed(self, out_csv, seed, n_comp=200):
         """
         Embedding of a distance matrix in lower dimensions
+
         Args:
-            - out_csv(str): Path to file for writing out embedding coordinates
-            - seed(int): seed for random number generator
-            - n_comp(int): dimension of embedding
+            out_csv(str): Path to file for writing out embedding coordinates
+            seed(int): seed for random number generator
+            n_comp(int): dimension of embedding
         """
 
         embed = skbio_mds(
@@ -167,15 +175,17 @@ def embed(self, out_csv, seed, n_comp=200):
         to_write.to_csv(out_csv)
 
     def ml(self, seed, threads):
-        """
+        """KNN machine learning
+
         KNN machine learning to predict unknown proportion
         Correction of predicted probabilities with Platt scaling from sklearn
         Training on 64% of data, validation on 16%, test on 20%
+
         Args:
-            - seed(int) seed for random number generator
-            - threads(int) number of threads for parallelization
+            seed(int): seed for random number generator
+            threads(int): number of threads for parallelization
         Returns:
-            - predictions(dict): Probability/proportion of each class
+            predictions(dict): Probability/proportion of each class
         """
         train_features, test_features, train_labels, test_labels = train_test_split(
             self.source.drop('labels', axis=1), self.source.loc[:, 'labels'], test_size=0.2, random_state=seed)
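A sketch of the KNN-plus-Platt-scaling pattern the docstring describes, with scikit-learn's CalibratedClassifierCV fitted on the validation split (hyperparameters are illustrative, not the collapsed code):

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier(n_neighbors=10, n_jobs=threads)
    knn.fit(train_features, train_labels)
    # method='sigmoid' is Platt scaling; cv='prefit' calibrates an already-fit model
    calibrated = CalibratedClassifierCV(knn, method='sigmoid', cv='prefit')
    calibrated.fit(validation_features, validation_labels)
    predictions = dict(zip(calibrated.classes_,
                           calibrated.predict_proba(test_features).mean(axis=0)))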
@@ -205,18 +215,22 @@ def ml(self, seed, threads):
 
 class sourcemap():
     def __init__(self, source, sink, labels, norm_method, threads=4):
-        '''
-        Init of sourceumap object
+        """Init of sourcemap object
+
         Combines sink and source in one pd Dataframe
+
         Args:
-            - source(str): training data csv file with OTUs at index,
+            source (str): training data csv file with OTUs at index,
                 Samples as columns
-            - sink(str): test data csv file with OTUs at index,
+            sink (str): test data csv file with OTUs at index,
                 Samples as columns
-            - labels(str): labels csv file with Samples in first column,
+            labels (str): labels csv file with Samples in first column,
                 class in 2nd column
-            - norm_method(str): normalization method
-        '''
+            norm_method (str): normalization method
+            threads (int, optional): number of processes for parallelization. Defaults to 4.
+        """
 
         self.train = pd.read_csv(source, index_col=0)
         self.test = pd.read_csv(sink, index_col=0)
         combined = self.train.merge(
@@ -237,12 +251,13 @@ class in 2nd column
         self.labels = labels.loc[self.train.columns, 'labels']
 
     def compute_distance(self, distance_method, rank='species'):
-        """
-        Sample pairwise distance computation
+        """Sample pairwise distance computation
+
         Args:
-            - distance_method(str): distance method used
-            - rank(str): Taxonomic rank to keep for filtering OTUs
+            distance_method (str): distance method
+            rank (str, optional): Taxonomic rank to keep for filtering OTUs. Defaults to 'species'.
         """
 
         # Getting a single Taxonomic rank
         ncbi = NCBITaxa()
         only_rank = []
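For intuition, a generic sketch of building the sample-by-sample distance DataFrame with SciPy; the actual method dispatches on distance_method and uses scikit-bio (the weighted UniFrac result is stored as self.skbio_wu below):

    import pandas as pd
    from scipy.spatial.distance import pdist, squareform

    samples = combined.T  # assumption: transpose so rows are samples
    dm = squareform(pdist(samples.values, metric='braycurtis'))
    wu = pd.DataFrame(dm, index=samples.index, columns=samples.index)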
@@ -261,14 +276,17 @@ def compute_distance(self, distance_method, rank='species'):
         self.wu = self.skbio_wu.to_data_frame()
 
     def embed(self, method, out_csv, seed, n_comp=200):
-        """
+        """Distance matrix embedding
+
         Embedding of a distance matrix in lower dimensions
+
         Args:
-            - method(str): method used for embedding
-            - out_csv(str): Path to file for writing out embedding coordinates
-            - seed(int): seed for random number generator
-            - n_comp(int): dimension of embedding
+            method (str): embedding method
+            out_csv (str): Path to file for writing out embedding coordinates
+            seed (int): seed for random number generator
+            n_comp (int, optional): dimension of embedding. Defaults to 200.
         """
 
         cols = [f"PC{i}" for i in range(1, n_comp+1)]
 
         if method == 'UMAP':
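A minimal sketch of what the UMAP branch (collapsed in this view) amounts to, assuming the precomputed distance matrix self.wu and the cols list defined just above; parameter values are illustrative:

    import pandas as pd
    import umap

    reducer = umap.UMAP(n_components=n_comp, metric='precomputed', random_state=seed)
    coords = reducer.fit_transform(self.wu.values)
    self.my_embed = pd.DataFrame(coords, index=self.wu.index, columns=cols)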
@@ -311,17 +329,20 @@ def embed(self, method, out_csv, seed, n_comp=200):
         self.sink = self.my_embed.drop(self.train_samples, axis=0)
 
     def knn_classification(self, kfold, threads, seed):
-        """
+        """Performs KNN classification
+
         KNN machine learning to predict unknown proportion
         Correction of predicted probabilities with Platt scaling from sklearn
         Training on 64% of data, validation on 16%, test on 20%
+
         Args:
-            - kfold
-            - threads(int) number of threads for parallelization
-            - seed(int) seed for random number generator
+            kfold (int): number of cross validation folds
+            threads (int): number of processes for parallelization
+            seed (int): seed for random number generator
+
         Returns:
-            - predictions(dict): Probability/proportion of each class
+            predictions(dict): Probability/proportion of each class
         """
 
         train_features, test_features, train_labels, test_labels = train_test_split(
             self.source.drop('labels', axis=1), self.source.loc[:, 'labels'], test_size=0.2, random_state=seed)
         train_features, validation_features, train_labels, validation_labels = train_test_split(
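Relative to sourceunknown.ml(), the new kfold argument suggests k-fold tuning of the classifier; a sketch of the usual scikit-learn wiring (the parameter grid is illustrative):

    from sklearn.model_selection import GridSearchCV
    from sklearn.neighbors import KNeighborsClassifier

    grid = GridSearchCV(
        KNeighborsClassifier(),
        param_grid={'n_neighbors': range(3, 30, 2)},
        cv=kfold,        # number of cross validation folds
        n_jobs=threads)  # parallel processes
    grid.fit(train_features, train_labels)
    best_knn = grid.best_estimator_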
76 changes: 46 additions & 30 deletions sourcepredictlib/normalize.py
@@ -7,13 +7,15 @@


 def RLE_normalize(pd_dataframe):
-    '''
-    Normalize with Relative Log Expression
-    INPUT:
-        pd_dataframe(pandas DataFrame): Colums as Samples, Rows as OTUs
-    OUTPUT:
-        step7(pandas DataFrame): RLE Normalized. Colums as Samples, Rows as OTUs
-    '''
+    """Normalize with Relative Log Expression
+
+    Args:
+        pd_dataframe (pandas DataFrame): OTU count dataframe,
+            columns as Samples, Rows as OTUs
+    Returns:
+        pandas DataFrame: RLE Normalized dataframe. Columns as Samples, Rows as OTUs
+    """
 
     step1 = pd_dataframe.apply(np.log, 0)
     step2 = step1.apply(np.average, 1)
     step3 = step2[step2.replace([np.inf, -np.inf], np.nan).notnull()]
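The step1...step7 chain (partly collapsed here) is a median-of-ratios scaling. A self-contained sketch of the same idea under the same layout (columns as Samples, rows as OTUs), not the verbatim code:

    import numpy as np
    import pandas as pd

    def rle_normalize(counts: pd.DataFrame) -> pd.DataFrame:
        log_counts = np.log(counts)                       # zeros become -inf
        ref = log_counts.mean(axis=1)                     # per-OTU mean log count
        ref = ref.replace([np.inf, -np.inf], np.nan).dropna()
        log_ratios = log_counts.loc[ref.index].sub(ref, axis=0)
        size_factors = np.exp(log_ratios.median(axis=0))  # one factor per sample
        return counts.div(size_factors, axis=1)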
@@ -27,27 +29,37 @@ def RLE_normalize(pd_dataframe):
 
 
 def subsample_normalize_pd(pd_dataframe):
-    '''
-    Normalize with Subsampling
-    INPUT:
-        pd_dataframe(pandas DataFrame): Colums as Samples, Rows as OTUs
-    OUTPUT:
-        step7(pandas DataFrame): SubSample Normalized. Colums as Samples, Rows as OTUs
-    '''
+    """Normalize with Subsampling
+
+    Args:
+        pd_dataframe (pandas DataFrame): OTU count dataframe,
+            columns as Samples, Rows as OTUs
+    Returns:
+        pandas DataFrame: Subsample Normalized dataframe. Columns as Samples, Rows as OTUs
+    """
 
     def subsample_normalize(serie, omax):
-        '''
+        """Subsample normalization column wise
+
         imin: minimum of input range
         imax: maximum of input range
        omin: minimum of output range
         omax: maximum of output range
-        x in [imin,imax]
+        x in [imin, imax]
         f(x) in [omin, omax]
 
                    x - imin
-        f(x) = ------------- x (omax - omin) + omin
+        f(x) = ------------- x(omax - omin) + omin
                 imax - imin
-        '''
+
+        Args:
+            serie (pandas Series): Individual Sample Column
+            omax (int): maximum of output range
+        Returns:
+            pandas Series: normalized pandas Series
+        """
 
         imin = min(serie)
         imax = max(serie)
         omin = 0
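Written out as code, the scaling formula above becomes (a sketch assuming a pandas Series and integer output bounds):

    import pandas as pd

    def min_max_scale(serie: pd.Series, omax: int, omin: int = 0) -> pd.Series:
        imin, imax = serie.min(), serie.max()
        return (serie - imin) / (imax - imin) * (omax - omin) + omin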
@@ -67,18 +79,16 @@ def subsample_normalize(serie, omax):
     return(step3)


-def CLR_normalize(pd_dataframe):
-    d = pd_dataframe
-    d = d+1
-    step1_1 = d.apply(np.log, 0)
-    step1_2 = step1_1.apply(np.average, 0)
-    step1_3 = step1_2.apply(np.exp)
-    step2 = d.divide(step1_3, 1)
-    step3 = step2.apply(np.log, 0)
-    return(step3)


 def gmpr_size_factor(col, ar):
+    """Generate GMPR size factor
+
+    Args:
+        col (int): index of the column to compute the size factor for
+        ar (numpy array): numpy array of OTU counts,
+            columns as Samples, Rows as OTUs
+    Returns:
+        float: GMPR size factor per column
+    """
     pr = np.apply_along_axis(lambda x: np.divide(ar[:, col], x), 0, ar)
     pr[np.isinf(pr)] = np.nan
     pr[pr == 0] = np.nan
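The rest of the function is collapsed; per the GMPR paper, the size factor is the geometric mean of a sample's median pairwise ratios, so the computation would finish roughly like this (my reading, not necessarily the hidden code):

    medians = np.nanmedian(pr, axis=0)        # median ratio to each other sample
    sf = np.exp(np.nanmean(np.log(medians)))  # geometric mean of the medians
    return sf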
@@ -87,12 +97,18 @@ def gmpr_size_factor(col, ar):


 def GMPR_normalize(df, process):
-    """
+    """Compute GMPR normalization
+
     Global Mean of Pairwise Ratios
     Chen, L., Reeve, J., Zhang, L., Huang, S., Wang, X., & Chen, J. (2018).
     GMPR: A robust normalization method for zero-inflated count data
     with application to microbiome sequencing data.
     PeerJ, 6, e4600.
+
+    Args:
+        df (pandas Dataframe): OTU count dataframe,
+            columns as Samples, Rows as OTUs
+        process (int): number of processes for parallelization
     """
     ar = np.asarray(df)

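The parallel section is collapsed, but given the process argument a multiprocessing.Pool over columns is the natural shape; a hypothetical sketch reusing gmpr_size_factor (numpy is already imported as np in this module):

    from functools import partial
    from multiprocessing import Pool

    def gmpr_normalize_sketch(df, process=4):
        ar = np.asarray(df)
        with Pool(process) as p:
            sf = p.map(partial(gmpr_size_factor, ar=ar), range(ar.shape[1]))
        return df.div(sf, axis=1)  # divide each sample column by its size factor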