moving docstring to google format
maxibor committed Jun 11, 2019
1 parent 49d00f2 commit 39e8101
Showing 5 changed files with 189 additions and 150 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
@@ -52,6 +52,7 @@
     'sphinx.ext.mathjax',
     'sphinx.ext.viewcode',
     'sphinx.ext.githubpages',
+    'sphinx.ext.napoleon'
 ]
 
 # Add any paths that contain templates here, relative to this directory.
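For reference, sphinx.ext.napoleon is the Sphinx extension that lets autodoc parse Google-style docstrings like the ones introduced throughout this commit. A minimal sketch of the target format (the function is illustrative, not from the codebase):

    def add_counts(a, b):
        """Add two OTU counts.

        Args:
            a (int): first count
            b (int): second count

        Returns:
            int: sum of the two counts
        """
        return a + b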
2 changes: 1 addition & 1 deletion sourcepredict
@@ -47,7 +47,7 @@ Homepage & Documentation: github.com/maxibor/sourcepredict
     '-n',
     dest="normalization",
     default='GMPR',
-    help="Normalization method (RLE | CLR | Subsample | GMPR). Default = GMPR")
+    help="Normalization method (RLE | Subsample | GMPR). Default = GMPR")
 parser.add_argument(
     '-dt',
     dest="distance",
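This help-string edit drops CLR, matching the deletion of CLR_normalize in sourcepredictlib/normalize.py below. As a side note, argparse could also enforce the remaining values; a hypothetical tightening that is not part of this commit:

    parser.add_argument(
        '-n',
        dest="normalization",
        default='GMPR',
        choices=['RLE', 'Subsample', 'GMPR'],  # hypothetical: reject removed methods early
        help="Normalization method (RLE | Subsample | GMPR). Default = GMPR")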
103 changes: 62 additions & 41 deletions sourcepredictlib/ml.py
@@ -27,15 +27,17 @@
 class sourceunknown():
 
     def __init__(self, source, sink, labels):
-        """
-        Init of sourceunknown object
+        """Init of sourceunknown object
+
         Combines sink and source in one pd Dataframe
+
         Args:
-            - source(str): training data csv file with OTUs at index,
+            source(str): training data csv file with OTUs at index,
                 Samples as columns
-            - sink(str): test data csv file with OTUs at index,
+            sink(str): test data csv file with OTUs at index,
                 Samples as columns
-            - labels(str): labels csv file with Samples in first column,
+            labels(str): labels csv file with Samples in first column,
                 class in 2nd column
         """
@@ -52,15 +54,17 @@ def __repr__(self):
         return(f'A sourceforest object of source {self.source} and sink {self.tmp_sink}')
 
     def add_unknown(self, alpha, seed):
-        """
+        """Add unknown
+
         Create unknown Samples from test sample
         N Random samples are created with N being average of class counts
         For each random sample's OTU, count is taken from normal distrib with a
         mean of test OTU count.
+
         Args:
-            - alpha(float): proportion of each OTU count from test samples
+            alpha(float): proportion of each OTU count from test samples
                 to include in unknown sample
-            - seed(int): seed for random number generator
+            seed(int): seed for random number generator
         """
 
         np.random.seed = seed
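A rough sketch of the sampling scheme this docstring describes, assuming a pandas Series of OTU counts for one sink sample (all names and sizes here are illustrative, not the collapsed implementation):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)        # seeded random number generator
    sink_counts = pd.Series([120, 30, 7])  # toy OTU counts of one sink sample
    alpha, n_unknown = 0.1, 5              # n_unknown ~ average class size in the tool

    unknown = pd.DataFrame({
        f"UNK_{i}": rng.normal(loc=sink_counts * alpha, scale=1).clip(min=0).round()
        for i in range(n_unknown)})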
@@ -85,12 +89,14 @@ def add_unknown(self, alpha, seed):
             data=['unknown']*len(unk_labs), index=unk_labs)
 
     def normalize(self, method, threads):
-        """
+        """Sample count normalization
+
         Performs normalization of the count data to balance coverage differences
         and missing OTUs
+
         Args:
-            - method(str): normalization method
-            - threads(int): number of threads for parallelization
+            method(str): normalization method
+            threads(int): number of threads for parallelization
         """
         if method == 'RLE':
             self.normalized = normalize.RLE_normalize(self.combined)
@@ -116,10 +122,10 @@ def normalize(self, method, threads):
         self.y_unk = self.y_unk.append(self.unk_labs)
 
     def compute_distance(self, rank='species'):
-        """
-        Sample pairwise distance computation
+        """Sample pairwise distance computation
+
         Args:
-            - rank(str): Taxonomic rank to keep for filtering OTUs
+            rank(str): Taxonomic rank to keep for filtering OTUs
         """
 
         # Getting a single Taxonomic rank
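A hedged sketch of that rank filtering with ete3's NCBITaxa (which this module uses), assuming the OTU index holds NCBI taxids:

    from ete3 import NCBITaxa

    ncbi = NCBITaxa()
    taxids = [int(t) for t in combined.index]  # assumption: index = NCBI taxids
    ranks = ncbi.get_rank(taxids)              # {taxid: rank}
    keep = [t for t, r in ranks.items() if r == 'species']
    combined_species = combined.loc[keep]      # species-level OTUs only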
@@ -138,11 +144,13 @@ def compute_distance(self, rank='species'):
 
     def embed(self, out_csv, seed, n_comp=200):
         """
         Embedding of a distance matrix in lower dimensions
+
         Args:
-            - out_csv(str): Path to file for writing out embedding coordinates
-            - seed(int): seed for random number generator
-            - n_comp(int): dimension of embedding
+            out_csv(str): Path to file for writing out embedding coordinates
+            seed(int): seed for random number generator
+            n_comp(int): dimension of embedding
         """
 
         embed = skbio_mds(
@@ -167,15 +175,17 @@ def embed(self, out_csv, seed, n_comp=200):
         to_write.to_csv(out_csv)
 
     def ml(self, seed, threads):
-        """
+        """KNN machine learning
+
         KNN machine learning to predict unknown proportion
         Correction of predicted probabilities with Platt scaling from sklearn
         Training on 64% of data, validation on 16%, test on 20%
+
         Args:
-            - seed(int) seed for random number generator
-            - threads(int) number of threads for parallelization
+            seed(int): seed for random number generator
+            threads(int): number of threads for parallelization
         Returns:
-            - predictions(dict): Probability/proportion of each class
+            predictions(dict): Probability/proportion of each class
         """
         train_features, test_features, train_labels, test_labels = train_test_split(
             self.source.drop('labels', axis=1), self.source.loc[:, 'labels'], test_size=0.2, random_state=seed)
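A sketch of the KNN-plus-Platt-scaling pattern the docstring describes, with scikit-learn's CalibratedClassifierCV fitted on the validation split (hyperparameters are illustrative, not the collapsed code):

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier(n_neighbors=10, n_jobs=threads)
    knn.fit(train_features, train_labels)
    # method='sigmoid' is Platt scaling; cv='prefit' calibrates an already-fit model
    calibrated = CalibratedClassifierCV(knn, method='sigmoid', cv='prefit')
    calibrated.fit(validation_features, validation_labels)
    predictions = dict(zip(calibrated.classes_,
                           calibrated.predict_proba(test_features).mean(axis=0)))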
@@ -205,18 +215,22 @@ def ml(self, seed, threads):
 
 class sourcemap():
     def __init__(self, source, sink, labels, norm_method, threads=4):
-        '''
-        Init of sourceumap object
+        """Init of sourcemap object
+
         Combines sink and source in one pd Dataframe
+
         Args:
-            - source(str): training data csv file with OTUs at index,
+            source (str): training data csv file with OTUs at index,
                 Samples as columns
-            - sink(str): test data csv file with OTUs at index,
+            sink (str): test data csv file with OTUs at index,
                 Samples as columns
-            - labels(str): labels csv file with Samples in first column,
+            labels (str): labels csv file with Samples in first column,
                 class in 2nd column
-            - norm_method(str): normalization method
-        '''
+            norm_method (str): normalization method
+            threads (int, optional): number of processes for parallelization. Defaults to 4.
+        """
 
         self.train = pd.read_csv(source, index_col=0)
         self.test = pd.read_csv(sink, index_col=0)
         combined = self.train.merge(
@@ -237,12 +251,13 @@ class in 2nd column
         self.labels = labels.loc[self.train.columns, 'labels']
 
     def compute_distance(self, distance_method, rank='species'):
-        """
-        Sample pairwise distance computation
+        """Sample pairwise distance computation
+
         Args:
-            - distance_method(str): distance method used
-            - rank(str): Taxonomic rank to keep for filtering OTUs
+            distance_method (str): distance method
+            rank (str, optional): Taxonomic rank to keep for filtering OTUs. Defaults to 'species'.
         """
 
         # Getting a single Taxonomic rank
         ncbi = NCBITaxa()
         only_rank = []
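For intuition, a generic sketch of building the sample-by-sample distance DataFrame with SciPy; the actual method dispatches on distance_method and uses scikit-bio (the weighted UniFrac result is stored as self.skbio_wu below):

    import pandas as pd
    from scipy.spatial.distance import pdist, squareform

    samples = combined.T  # assumption: transpose so rows are samples
    dm = squareform(pdist(samples.values, metric='braycurtis'))
    wu = pd.DataFrame(dm, index=samples.index, columns=samples.index)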
@@ -261,14 +276,17 @@ def compute_distance(self, distance_method, rank='species'):
         self.wu = self.skbio_wu.to_data_frame()
 
     def embed(self, method, out_csv, seed, n_comp=200):
-        """
+        """Distance matrix embedding
+
         Embedding of a distance matrix in lower dimensions
+
         Args:
-            - method(str): method used for embedding
-            - out_csv(str): Path to file for writing out embedding coordinates
-            - seed(int): seed for random number generator
-            - n_comp(int): dimension of embedding
+            method (str): embedding method
+            out_csv (str): Path to file for writing out embedding coordinates
+            seed (int): seed for random number generator
+            n_comp (int, optional): dimension of embedding. Defaults to 200.
         """
 
         cols = [f"PC{i}" for i in range(1, n_comp+1)]
 
         if method == 'UMAP':
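A minimal sketch of what the UMAP branch (collapsed in this view) amounts to, assuming the precomputed distance matrix self.wu and the cols list defined just above; parameter values are illustrative:

    import pandas as pd
    import umap

    reducer = umap.UMAP(n_components=n_comp, metric='precomputed', random_state=seed)
    coords = reducer.fit_transform(self.wu.values)
    self.my_embed = pd.DataFrame(coords, index=self.wu.index, columns=cols)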
@@ -311,17 +329,20 @@ def embed(self, method, out_csv, seed, n_comp=200):
         self.sink = self.my_embed.drop(self.train_samples, axis=0)
 
     def knn_classification(self, kfold, threads, seed):
-        """
+        """Performs KNN classification
+
         KNN machine learning to predict unknown proportion
         Correction of predicted probabilities with Platt scaling from sklearn
         Training on 64% of data, validation on 16%, test on 20%
+
         Args:
-            - kfold
-            - threads(int) number of threads for parallelization
-            - seed(int) seed for random number generator
+            kfold (int): number of cross validation folds
+            threads (int): number of processes for parallelization
+            seed (int): seed for random number generator
+
         Returns:
-            - predictions(dict): Probability/proportion of each class
+            predictions(dict): Probability/proportion of each class
         """
 
         train_features, test_features, train_labels, test_labels = train_test_split(
             self.source.drop('labels', axis=1), self.source.loc[:, 'labels'], test_size=0.2, random_state=seed)
         train_features, validation_features, train_labels, validation_labels = train_test_split(
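Relative to sourceunknown.ml(), the new kfold argument suggests k-fold tuning of the classifier; a sketch of the usual scikit-learn wiring (the parameter grid is illustrative):

    from sklearn.model_selection import GridSearchCV
    from sklearn.neighbors import KNeighborsClassifier

    grid = GridSearchCV(
        KNeighborsClassifier(),
        param_grid={'n_neighbors': range(3, 30, 2)},
        cv=kfold,        # number of cross validation folds
        n_jobs=threads)  # parallel processes
    grid.fit(train_features, train_labels)
    best_knn = grid.best_estimator_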
76 changes: 46 additions & 30 deletions sourcepredictlib/normalize.py
@@ -7,13 +7,15 @@


 def RLE_normalize(pd_dataframe):
-    '''
-    Normalize with Relative Log Expression
-    INPUT:
-        pd_dataframe(pandas DataFrame): Colums as Samples, Rows as OTUs
-    OUTPUT:
-        step7(pandas DataFrame): RLE Normalized. Colums as Samples, Rows as OTUs
-    '''
+    """Normalize with Relative Log Expression
+
+    Args:
+        pd_dataframe (pandas DataFrame): OTU count dataframe,
+            columns as Samples, Rows as OTUs
+    Returns:
+        pandas DataFrame: RLE Normalized dataframe. Columns as Samples, Rows as OTUs
+    """
 
     step1 = pd_dataframe.apply(np.log, 0)
     step2 = step1.apply(np.average, 1)
     step3 = step2[step2.replace([np.inf, -np.inf], np.nan).notnull()]
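The step1...step7 chain (partly collapsed here) is a median-of-ratios scaling. A self-contained sketch of the same idea under the same layout (columns as Samples, rows as OTUs), not the verbatim code:

    import numpy as np
    import pandas as pd

    def rle_normalize(counts: pd.DataFrame) -> pd.DataFrame:
        log_counts = np.log(counts)                       # zeros become -inf
        ref = log_counts.mean(axis=1)                     # per-OTU mean log count
        ref = ref.replace([np.inf, -np.inf], np.nan).dropna()
        log_ratios = log_counts.loc[ref.index].sub(ref, axis=0)
        size_factors = np.exp(log_ratios.median(axis=0))  # one factor per sample
        return counts.div(size_factors, axis=1)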
@@ -27,27 +29,37 @@ def RLE_normalize(pd_dataframe):
 
 
 def subsample_normalize_pd(pd_dataframe):
-    '''
-    Normalize with Subsampling
-    INPUT:
-        pd_dataframe(pandas DataFrame): Colums as Samples, Rows as OTUs
-    OUTPUT:
-        step7(pandas DataFrame): SubSample Normalized. Colums as Samples, Rows as OTUs
-    '''
+    """Normalize with Subsampling
+
+    Args:
+        pd_dataframe (pandas DataFrame): OTU count dataframe,
+            columns as Samples, Rows as OTUs
+    Returns:
+        pandas DataFrame: Subsample Normalized dataframe. Columns as Samples, Rows as OTUs
+    """
 
     def subsample_normalize(serie, omax):
-        '''
+        """Subsample normalization column wise
+
         imin: minimum of input range
         imax: maximum of input range
        omin: minimum of output range
         omax: maximum of output range
-        x in [imin,imax]
+        x in [imin, imax]
         f(x) in [omin, omax]
 
                    x - imin
-        f(x) = ------------- x (omax - omin) + omin
+        f(x) = ------------- x(omax - omin) + omin
                 imax - imin
-        '''
+
+        Args:
+            serie (pandas Series): Individual Sample Column
+            omax (int): maximum of output range
+        Returns:
+            pandas Series: normalized pandas Series
+        """
 
         imin = min(serie)
         imax = max(serie)
         omin = 0
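Written out as code, the scaling formula above becomes (a sketch assuming a pandas Series and integer output bounds):

    import pandas as pd

    def min_max_scale(serie: pd.Series, omax: int, omin: int = 0) -> pd.Series:
        imin, imax = serie.min(), serie.max()
        return (serie - imin) / (imax - imin) * (omax - omin) + omin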
@@ -67,18 +79,16 @@ def subsample_normalize(serie, omax):
     return(step3)


-def CLR_normalize(pd_dataframe):
-    d = pd_dataframe
-    d = d+1
-    step1_1 = d.apply(np.log, 0)
-    step1_2 = step1_1.apply(np.average, 0)
-    step1_3 = step1_2.apply(np.exp)
-    step2 = d.divide(step1_3, 1)
-    step3 = step2.apply(np.log, 0)
-    return(step3)


 def gmpr_size_factor(col, ar):
+    """Generate GMPR size factor
+
+    Args:
+        col (int): index of the column to compute the size factor for
+        ar (numpy array): numpy array of OTU counts,
+            columns as Samples, Rows as OTUs
+    Returns:
+        float: GMPR size factor per column
+    """
     pr = np.apply_along_axis(lambda x: np.divide(ar[:, col], x), 0, ar)
     pr[np.isinf(pr)] = np.nan
     pr[pr == 0] = np.nan
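The rest of the function is collapsed; per the GMPR paper, the size factor is the geometric mean of a sample's median pairwise ratios, so the computation would finish roughly like this (my reading, not necessarily the hidden code):

    medians = np.nanmedian(pr, axis=0)        # median ratio to each other sample
    sf = np.exp(np.nanmean(np.log(medians)))  # geometric mean of the medians
    return sf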
@@ -87,12 +97,18 @@ def gmpr_size_factor(col, ar):


 def GMPR_normalize(df, process):
-    """
+    """Compute GMPR normalization
+
     Global Mean of Pairwise Ratios
     Chen, L., Reeve, J., Zhang, L., Huang, S., Wang, X., & Chen, J. (2018).
     GMPR: A robust normalization method for zero-inflated count data
     with application to microbiome sequencing data.
     PeerJ, 6, e4600.
+
+    Args:
+        df (pandas Dataframe): OTU count dataframe,
+            columns as Samples, Rows as OTUs
+        process (int): number of processes for parallelization
     """
     ar = np.asarray(df)

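The parallel section is collapsed, but given the process argument a multiprocessing.Pool over columns is the natural shape; a hypothetical sketch reusing gmpr_size_factor (numpy is already imported as np in this module):

    from functools import partial
    from multiprocessing import Pool

    def gmpr_normalize_sketch(df, process=4):
        ar = np.asarray(df)
        with Pool(process) as p:
            sf = p.map(partial(gmpr_size_factor, ar=ar), range(ar.shape[1]))
        return df.div(sf, axis=1)  # divide each sample column by its size factor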