Skip to content

Commit

Permalink
working on unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
maxibor committed Jun 20, 2019
1 parent 18e45c3 commit 078e549
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 6 deletions.
2 changes: 1 addition & 1 deletion paper/paper.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
title: 'Sourcepredict: Prediction of metagenomic sample sources using machine learning algorithms'
title: 'Sourcepredict: Prediction of metagenomic sample sources using dimension reduction followed by machine learning classification'
tags:
- microbiome
- source tracking
Expand Down
14 changes: 10 additions & 4 deletions sourcepredictlib/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,18 @@
from io import StringIO
import umap
import warnings
import os
import sys

from collections import Counter
from . import normalize

from . import utils

parentScriptDir = "/".join(os.path.dirname(
os.path.realpath(__file__)).split("/")[:-1])
sys.path.append(parentScriptDir+"/sourcepredictlib")

import normalize

import utils


class sourceunknown():
Expand Down Expand Up @@ -54,7 +60,7 @@ def __repr__(self):
return(f'A sourceforest object of source {self.ref} and sink {self.tmp_sink}')

def add_unknown(self, alpha, seed):
"""Add unkown
"""Add unknown samples
Create unknown Samples from test sample
N Random samples are created with N being average of class counts
Expand Down
4 changes: 3 additions & 1 deletion sourcepredictlib/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ def RLE_normalize(pd_dataframe):
columns as Samples, Rows as OTUs
Returns:
pandas DataFrame: RLE Normalized dataframe. Columns as Samples, Rows as OTUs
Example:
>>> RLE_normalize(pd.DataFrame)
"""

step1 = pd_dataframe.apply(np.log, 0)
Expand Down Expand Up @@ -83,7 +85,7 @@ def gmpr_size_factor(col, ar):
"""Generate GMPR size factor
Args:
col (list): individual columms of the numpy array
col (int): column index of the numpy array
ar (numpy array): numpy array of OTU counts,
columns as Samples, Rows as OTUs
Returns:
Expand Down
18 changes: 18 additions & 0 deletions sourcepredictlib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ def check_norm(method):
method(str): Normalization method
Returns:
str: capitalized normalization method name
Example:
>>> check_norm('rle')
'RLE'
"""

methods = ['RLE', 'SUBSAMPLE', 'GMPR']
Expand All @@ -106,6 +109,9 @@ def check_embed(method):
method(str): Embedding method
Returns:
str: capitalized embedding method name
Example:
>>> check_embed('tsne')
'TSNE'
"""

methods = ['TSNE', 'UMAP', 'MDS']
Expand All @@ -124,6 +130,9 @@ def check_distance(method):
method(str): distance method
Returns:
str: lowercased distance method name
Example:
>>> check_distance('Weighted_unifrac')
'weighted_unifrac'
"""

methods = ['weighted_unifrac', 'unweighted_unifrac']
Expand All @@ -146,6 +155,9 @@ def check_gen_seed(seed, amin=1, amax=10000):
Defaults to 10000.
Returns:
int: random seed sampled between 1 and 10000
Example:
>>> check_gen_seed(42)
42
"""

if seed is None:
Expand All @@ -161,6 +173,9 @@ def plural(count):
count(int): number of occurrences
Returns:
str: '' or 's'
Example:
>>> plural(3)
's'
"""

if count == 1:
Expand All @@ -178,6 +193,9 @@ def _get_basename(file_name):
file_name(str): path to file
Returns:
str: file basename
Example:
>>> _get_basename('/path/to/myfile.txt')
'myfile'
"""

if ("/") in file_name:
Expand Down
9 changes: 9 additions & 0 deletions tests/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Manual smoke test: extend the module search path so that the modules in
# sourcepredictlib can be imported by bare name, print the resulting search
# path, then import ml.
import sys
import os
# Take the directory that contains this file and drop its last
# "/"-separated component, i.e. step up one directory level.
parentScriptDir = "/".join(os.path.dirname(
os.path.realpath(__file__)).split("/")[:-1])
# Appended (not prepended), so entries already on sys.path are searched first.
sys.path.append(parentScriptDir+"/sourcepredictlib")

# Show the effective search path to help diagnose import failures.
print(sys.path)

# Must come after the sys.path change above, otherwise the bare name would
# not resolve to the module in sourcepredictlib.
import ml
71 changes: 71 additions & 0 deletions tests/test_ml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import sys
import os
import pandas as pd
import random

parentScriptDir = "/".join(os.path.dirname(
os.path.realpath(__file__)).split("/")[:-1])
sys.path.append(parentScriptDir+"/sourcepredictlib")
random.seed(42)

import ml
import utils


def test_sourceunknown_init():
    """Check the tables built by ``ml.sourceunknown`` from the example data.

    The source, label and sink CSV files are read from the ``data``
    directory located next to this test directory. The test asserts the
    shapes of the ``ref``, ``y``, ``y_unk``, ``tmp_sink`` and ``combined``
    attributes of the constructed object.
    """
    # Build the data directory once instead of repeating the expression for
    # every file; the resulting path strings are unchanged.
    data_dir = os.path.dirname(os.path.abspath(__file__)) + '/../data'
    labels = data_dir + '/modern_gut_microbiomes_labels.csv'
    sources = data_dir + '/modern_gut_microbiomes_sources.csv'
    sink_file = data_dir + '/test/dog_test_sample.csv'

    # Only the first sink returned by split_sinks is used here.
    sink = utils.split_sinks(sink_file)[0]

    su = ml.sourceunknown(source=sources, sink=sink, labels=labels)

    assert su.ref.shape == (5664, 432)
    assert su.y.shape == (432,)
    assert su.y_unk.shape == (432,)
    assert su.tmp_sink.shape == (570, 1)
    assert su.combined.shape == (5664, 433)

    # hash() of a str is salted per interpreter process unless the
    # PYTHONHASHSEED environment variable is set before start-up; assigning a
    # local variable of that name (as this test previously did) has no
    # effect. The recorded value is presumably the one obtained with seed 0,
    # so it is only compared when that seed is actually in force.
    if os.environ.get('PYTHONHASHSEED') == '0':
        assert hash(str(su.combined)) == -1867655657877779130


# def test_sourceunknown_add_unkown():
# PYTHONHASHSEED = 0
# labels = os.path.dirname(os.path.abspath(
# __file__)) + '/../data/modern_gut_microbiomes_labels.csv'
# sources = os.path.dirname(os.path.abspath(
# __file__))+'/../data/modern_gut_microbiomes_sources.csv'
# sink_file = os.path.dirname(os.path.abspath(
# __file__))+'/../data/test/dog_test_sample.csv'
# sink = utils.split_sinks(sink_file)[0]

# su = ml.sourceunknown(source=sources, sink=sink, labels=labels)
# su.add_unknown(alpha=0.1, seed=42)

# assert su.ref_u.shape == (570, 144)
# assert su.ref_u.dtypes ==
# assert hash(str(su.ref_u.columns)) == 5867343156924504419
# assert hash(str(su.ref_u.index)) == 4313932402357376923
# assert hash(str(su.ref_u_labs)) == -5906979299339891562


# def test_sourceunknown_normalized():

# labels = os.path.dirname(os.path.abspath(
# __file__)) + '/data/modern_gut_microbiomes_labels.csv'
# sources = os.path.dirname(os.path.abspath(
# __file__))+'/data/modern_gut_microbiomes_sources.csv'
# sink_file = os.path.dirname(os.path.abspath(
# __file__))+'data/test/dog_test_sample.csv'
# sink = utils.split_sinks(sink_file)[0]

# su = ml.sourceunknown(source=sources, sink=sink, labels=labels)
# su.add_unknown(alpha=0.1, seed=42)
# su_rle = su.normalize(method='rle', threads=1)
# su_subsample = su.normalize(method='subsample', threads=1)
# su_gmpr = su.normalize(method='gmpr', threads=1)
62 changes: 62 additions & 0 deletions tests/test_normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import sys
import os
import numpy as np
import pandas as pd

parentScriptDir = "/".join(os.path.dirname(
os.path.realpath(__file__)).split("/")[:-1])
sys.path.append(parentScriptDir+"/sourcepredictlib")

import normalize


def test_RLE():
    """Test RLE normalization on a 3x3 count table.

    The normalized table is compared value by value with the expected one.
    The previous assertion reduced both tables to a single boolean with
    ``.all().all()`` and compared those booleans, so it passed for any pair
    of tables without zeros regardless of their contents.
    """
    input_df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    output_df = pd.DataFrame(
        [[1.0, 2.0, 2.0], [5.0, 5.0, 5.0], [9.0, 8.0, 7.0]])

    result = normalize.RLE_normalize(input_df)

    # np.asarray drops index/column labels so that only the numeric content
    # is compared; allclose tolerates floating-point rounding noise.
    assert np.allclose(np.asarray(result), np.asarray(output_df))


def test_subsample():
    """Test subsample normalization on a 3x3 count table.

    Only the truthiness summary of the two tables is compared: each side is
    reduced with ``.all().all()`` to one boolean saying whether every cell
    is non-zero.
    """
    counts = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    expected = pd.DataFrame(
        [[0.0, 0.0, 0.0], [4.0, 4.0, 4.0], [9.0, 9.0, 9.0]])

    observed = normalize.subsample_normalize_pd(counts)

    # NOTE(review): this compares two booleans, not the cell values, so it
    # only checks that the result contains at least one zero (the expected
    # table does). Presumably the subsampling is random; confirm whether a
    # seeded, value-level comparison is possible.
    observed_all_nonzero = observed.all().all()
    expected_all_nonzero = expected.all().all()
    assert observed_all_nonzero == expected_all_nonzero


def test_gmpr_size_factor():
    """Test the GMPR size factor computed for column index 1 of a 3x3 array.

    The result is a float, so it is compared with a tolerance instead of
    exact equality, which could fail on harmless last-digit differences
    between platforms or library versions.
    """
    input_ar = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    output = 1.0137003325955667

    size_factor = normalize.gmpr_size_factor(col=1, ar=input_ar)

    assert np.isclose(size_factor, output)


def test_GMPR():
    """Test GMPR normalization on a 3x3 count table.

    The function is called with 1 and then 2 as its second argument and both
    results are compared value by value with the same expected table. The
    previous assertions compared only the ``.all().all()`` booleans of the
    two tables, which does not verify any normalized value.
    """
    input_df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    output_df = pd.DataFrame(
        [[1.2331060371652351, 1.9729696594643762, 2.4662120743304703],
         [4.932424148660941, 4.932424148660941, 4.932424148660941],
         [8.631742260156646, 7.891878637857505, 7.398636222991411]])
    expected = np.asarray(output_df)

    # The result must not depend on the value of the second argument.
    for second_arg in (1, 2):
        result = normalize.GMPR_normalize(input_df, second_arg)
        assert np.allclose(np.asarray(result), expected)
16 changes: 16 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import sys
import os

parentScriptDir = "/".join(os.path.dirname(
os.path.realpath(__file__)).split("/")[:-1])
sys.path.append(parentScriptDir+"/sourcepredictlib")

import utils


def test_checks():
    """Exercise the argument-checking helpers of ``utils``.

    Each helper is called with one sample argument and its return value is
    compared with the expected normalized value. ``check_gen_seed`` is also
    called with ``seed=None`` and must then return a plain ``int``.
    """
    # (helper, argument, expected return value), evaluated in this order.
    cases = (
        (utils.check_norm, 'rle', 'RLE'),
        (utils.check_embed, 'tsne', 'TSNE'),
        (utils.check_distance, 'Weighted_unifrac', 'weighted_unifrac'),
        (utils.check_gen_seed, 42, 42),
    )
    for helper, argument, expected in cases:
        assert helper(argument) == expected

    generated_seed = utils.check_gen_seed(seed=None)
    assert type(generated_seed) is int

0 comments on commit 078e549

Please sign in to comment.