merge from master

maxibor · Oct 29, 2019 · 7c50be8 · 7c50be8
2 parents dd4b3d2 + 468e331
commit 7c50be8
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 13 deletions.
diff --git a/.coverage b/.coverage
@@ -1 +1,5 @@
-!coverage.py: This is a private format, don't read it directly!{"lines":{"/Users/borry/Documents/GitHub/sourcepredict/sourcepredictlib/utils.py":[3,4,5,6,260,136,9,137,138,261,264,142,145,18,19,20,21,22,23,26,157,158,159,163,166,38,39,40,41,42,43,46,177,178,179,183,186,66,206,84,93,94,95,96,97,98,99,228,222,102,225,223,240,113,114,115,116,117,246,241,243,121,124],"/Users/borry/Documents/GitHub/sourcepredict/sourcepredictlib/ml.py":[5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,28,29,30,32,33,34,36,51,52,53,54,55,56,57,58,60,63,77,79,81,82,83,84,85,87,88,89,90,91,93,94,95,96,97,99,109,111,113,114,115,116,117,121,122,123,124,125,126,127,131,133,136,137,138,140,151,152,153,154,155,157,158,160,161,162,163,165,171,185,186,187,188,190,191,193,194,195,196,197,198,199,200,202,203,204,205,206,207,208,211,212,229,230,231,232,233,235,237,238,239,240,243,244,245,246,247,249,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,281,283,295,297,301,302,303,304,305,306,307,308,309,310,315,316,317,318,319,320,322,331,332,333,334,336,337,338,340,341,343,344,345,346,347,348,349,350,351,353,368,369,370,371,373,375,394,395,396,397,399,400,401,402,404,405,406,408,409,410,411,412,413],"/Users/borry/Documents/GitHub/sourcepredict/sourcepredictlib/normalize.py":[3,4,5,6,9,21,22,23,24,25,26,27,28,29,30,33,43,65,66,67,68,69,70,73,75,76,78,79,80,81,84,94,95,96,97,98,101,115,117,118,119,121]}}
+<<<<<<< HEAD
+!coverage.py: This is a private format, don't read it directly!{"lines":{"/Users/borry/Documents/GitHub/sourcepredict/sourcepredictlib/utils.py":[3,4,5,6,260,136,9,137,138,261,264,142,145,18,19,20,21,22,23,26,157,158,159,163,166,38,39,40,41,42,43,46,177,178,179,183,186,66,206,84,93,94,95,96,97,98,99,228,222,102,225,223,240,113,114,115,116,117,246,241,243,121,124],"/Users/borry/Documents/GitHub/sourcepredict/sourcepredictlib/ml.py":[5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,28,29,30,32,33,34,36,51,52,53,54,55,56,57,58,60,63,77,79,81,82,83,84,85,87,88,89,90,91,93,94,95,96,97,99,109,111,113,114,115,116,117,121,122,123,124,125,126,127,131,133,136,137,138,140,151,152,153,154,155,157,158,160,161,162,163,165,171,185,186,187,188,190,191,193,194,195,196,197,198,199,200,202,203,204,205,206,207,208,211,212,229,230,231,232,233,235,237,238,239,240,243,244,245,246,247,249,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,281,283,295,297,301,302,303,304,305,306,307,308,309,310,315,316,317,318,319,320,322,331,332,333,334,336,337,338,340,341,343,344,345,346,347,348,349,350,351,353,368,369,370,371,373,375,394,395,396,397,399,400,401,402,404,405,406,408,409,410,411,412,413],"/Users/borry/Documents/GitHub/sourcepredict/sourcepredictlib/normalize.py":[3,4,5,6,9,21,22,23,24,25,26,27,28,29,30,33,43,65,66,67,68,69,70,73,75,76,78,79,80,81,84,94,95,96,97,98,101,115,117,118,119,121]}}
+=======
+!coverage.py: This is a private format, don't read it directly!{"lines":{"/projects1/users/borry/18_sourcepredict/sourcepredictlib/ml.py":[3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,25,26,27,29,31,34,36,60,63,99,133,140,171,210,211,247,281,334,228,229,230,231,232,234,236,237,238,239,242,243,244,245,51,52,53,54,55,56,57,58,77,79,81,82,83,84,85,87,88,89,90,91,93,94,95,96,97,109,111,113,114,115,116,117,121,122,123,124,125,126,127,131,136,137,138,151,152,153,154,155,157,158,160,161,162,163,165,184,185,186,187,189,190,192,193,194,195,196,197,198,199,201,202,203,204,205,206,207,256,257,258,259,260,262,263,261,264,265,266,267,268,269,270,279,293,295,299,300,301,302,313,314,315,320,329,330,331,332,303,304,305,306,307,308,316,317,318,349,350,351,352,354,356,375,376,377,378,380,381,382,383,385,386,387,389,390,391,392,393,394],"/projects1/users/borry/18_sourcepredict/sourcepredictlib/normalize.py":[3,4,5,6,9,33,84,101,115,117,118,119,121,21,22,23,24,25,26,27,28,29,30,43,75,76,78,79,65,66,67,68,69,70,73,80,81,94,95,96,97,98],"/projects1/users/borry/18_sourcepredict/sourcepredictlib/utils.py":[3,4,5,6,9,26,46,66,84,106,127,148,168,190,208,75,76,77,78,79,80,81,38,39,40,41,42,43,18,19,20,21,22,23,95,96,98,99,103,97,118,119,120,124,139,140,141,145,159,160,161,165,184,187,185,202,203,205,222,223,226]}}
+>>>>>>> 468e331da275343b1fd57fc870e4414ed88ec7b5
diff --git a/docs/index.rst b/docs/index.rst
@@ -24,6 +24,7 @@ __ homepage_
    add_new_sources
    run_example
    mixed_prop
+   CDI_analysis
 
 
 Indices and tables

diff --git a/sourcepredict b/sourcepredict
@@ -55,6 +55,12 @@ Homepage & Documentation: github.com/maxibor/sourcepredict
         default='weighted_unifrac',
         help="Distance method. (unweighted_unifrac | weighted_unifrac) Default = weighted_unifrac"
     )
+    parser.add_argument(
+        '-r',
+        dest="tax_rank",
+        default='species',
+        help="Taxonomic rank to use for Unifrac distances. Default = species"
+    )
     parser.add_argument(
         '-me',
         dest="method",
@@ -120,6 +126,7 @@ Homepage & Documentation: github.com/maxibor/sourcepredict
     labels = args.labels
     seed = int(args.seed)
     distance = args.distance
+    rank = args.tax_rank
     method = args.method
     ml = args.learning
     neighbors = int(args.neighbors)
@@ -130,13 +137,13 @@ Homepage & Documentation: github.com/maxibor/sourcepredict
     kfold = int(args.kfold)
     threads = int(args.threads)
 
-    return(sink, alpha, normalization, sources, labels, seed, distance, method, ml, neighbors, weights, dim, output, embed, kfold, threads)
+    return(sink, alpha, normalization, sources, labels, seed, distance, rank, ml, method, neighbors, weights, dim, output, embed, kfold, threads)
 
 
 if __name__ == "__main__":
     version = "0.34"
     warnings.filterwarnings("ignore")
-    SINK, ALPHA, NORMALIZATION, SOURCES, LABELS, SEED, DISTANCE, METHOD, ML, NEIGHBORS, WEIGTHS, DIM, OUTPUT, EMBED_CSV, KFOLD, THREADS = _get_args()
+    SINK, ALPHA, NORMALIZATION, SOURCES, LABELS, SEED, DISTANCE, RANK, ML, METHOD, NEIGHBORS, WEIGTHS, DIM, OUTPUT, EMBED_CSV, KFOLD, THREADS = _get_args()
     SEED = utils.check_gen_seed(SEED)
     np.random.seed(SEED)
     embed_method = utils.check_embed(METHOD)
@@ -146,7 +153,6 @@ if __name__ == "__main__":
     predictions = {}
     distance_method = utils.check_distance(DISTANCE)
     weigth = utils.check_weigths(WEIGTHS)
-    tax_rank = "species"
     samp_pred = {}
     print("Step 1: Checking for unknown proportion")
     if ALPHA == 0:
@@ -173,8 +179,8 @@ if __name__ == "__main__":
     print("Step 2: Checking for source proportion")
     sm = sourcemap(source=SOURCES, sink=SINK, labels=LABELS,
                    norm_method=normalization, threads=THREADS)
-    print(f"\tComputing {distance_method} distance on {tax_rank} rank")
-    sm.compute_distance(distance_method=distance_method, rank=tax_rank)
+    print(f"\tComputing {distance_method} distance on {RANK} rank")
+    sm.compute_distance(distance_method=distance_method, rank=RANK)
     print(f"\t{embed_method} embedding in {DIM} dimension{utils.plural(DIM)}")
     sm.embed(n_comp=DIM, method=embed_method, seed=SEED, out_csv=EMBED_CSV)
     if ml == 'knn':

diff --git a/sourcepredictlib/ml.py b/sourcepredictlib/ml.py
@@ -124,7 +124,7 @@ def normalize(self, method, threads):
                                 ['unknown'] * self.ref_u.shape[1], index=self.normalized_ref_u.columns, name='labels')
         try:
             self.sink = self.normalized.drop(
-                self.ref.columns, axis=1).T
+                self.ref.columns, axis=1, errors='ignore').T
         except KeyError:
             print(f"ERROR: Test sample present in training dataset")
             sys.exit(1)
@@ -157,7 +157,7 @@ def embed(self, seed, n_comp=200, out_csv=None):
         self.my_embed = my_embed
         self.my_embed.set_index(self.bc.index, inplace=True)
 
-        self.ref_u = self.my_embed.drop(self.tmp_sink.columns, axis=0)
+        self.ref_u = self.my_embed.drop(self.tmp_sink.columns, axis=0, errors = 'ignore')
         self.ref_u = self.ref_u.merge(
             self.labels.to_frame(), left_index=True, right_index=True)
         self.sink = self.my_embed.loc[self.tmp_sink.columns, :]
@@ -330,8 +330,8 @@ def embed(self, method, out_csv, seed, n_comp=200):
 
         self.ref_t = self.my_embed.drop(self.test_samples, axis=0)
         self.ref_t = self.ref_t.merge(
-            self.labels.to_frame(), left_index=True, right_index=True)
-        self.sink_t = self.my_embed.drop(self.train_samples, axis=0)
+            self.labels.to_frame(), left_index=True, right_index=True).dropna(axis=0)
+        self.sink_t = self.my_embed.drop(self.train_samples, axis=0).dropna(axis=0)
 
     def gmm_classification(self, seed):
         train_t_features, test_t_features, train_t_labels, test_t_labels = train_test_split(

diff --git a/sourcepredictlib/normalize.py b/sourcepredictlib/normalize.py
@@ -27,7 +27,7 @@ def RLE_normalize(pd_dataframe):
     step5 = step4.apply(np.median, 0)
     step6 = step5.apply(np.exp)
     step7 = pd_dataframe.divide(step6, 1).apply(round, 1)
-    return(step7)
+    return(step7.dropna(axis=1))
 
 
 def subsample_normalize_pd(pd_dataframe):
@@ -78,7 +78,7 @@ def subsample_normalize(serie, omax):
     step2 = pd_dataframe.apply(
         subsample_normalize, axis=0, args=(themax,))
     step3 = step2.apply(np.floor, axis=1)
-    return(step3)
+    return(step3.dropna(axis=1))
 
 
 def gmpr_size_factor(col, ar):
@@ -118,4 +118,4 @@ def GMPR_normalize(df, process):
     with multiprocessing.Pool(process) as p:
         sf = p.map(gmpr_sf_partial, list(range(np.shape(ar)[1])))
 
-    return(pd.DataFrame(np.divide(ar, sf), index=df.index, columns=df.columns))
+    return(pd.DataFrame(np.divide(ar, sf), index=df.index, columns=df.columns).dropna(axis=1))