diff --git a/README.md b/README.md
index ada86b41..ef6d502a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# peppy python package
-[![Documentation Status](http://readthedocs.org/projects/pep/badge/?version=latest)](http://peppy.readthedocs.io/en/latest/?badge=latest) [![Build Status](https://travis-ci.org/pepkit/peppy.svg?branch=master)](https://travis-ci.org/pepkit/peppy)
+[![Documentation Status](http://readthedocs.org/projects/pep/badge/?version=latest)](http://peppy.readthedocs.io/en/latest/?badge=latest) [![Build Status](https://travis-ci.org/pepkit/peppy.svg?branch=master)](https://travis-ci.org/pepkit/peppy) [![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io)
`peppy` is the official python package for reading **Portable Encapsulated Projects** or **PEP**s in `python`.
diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst
index 208576ec..57342964 100644
--- a/doc/source/changelog.rst
+++ b/doc/source/changelog.rst
@@ -1,6 +1,21 @@
Changelog
******************************
+- **v0.19** (*2019-01-16*):
+
+ - Changed
+
+ - ``Project`` construction no longer requires sample annotations sheet.
+
+ - Specification of assembly/ies in project config outside of ``implied_attributes`` is deprecated.
+
+ - ``implied_columns`` and ``derived_columns`` are deprecated in favor of ``implied_attributes`` and ``derived_attributes``.
+
+ - New
+
+ - Added ``activate_subproject`` method to ``Project``.
+
+
- **v0.18.2** (*2018-07-23*):
- Fixed
@@ -22,7 +37,7 @@ Changelog
- Add ``get_sample`` and ``get_samples`` functions to ``Project`` objects.
- - Add ``get_subsamples``and ``get_subsample`` functions to both ``Project`` and ``Sample`` objects.
+ - Add ``get_subsamples`` and ``get_subsample`` functions to both ``Project`` and ``Sample`` objects.
- Subsamples are now objects that can be retrieved individually by name, with the ``subsample_name`` as the index column header.
diff --git a/doc/source/jupyter/subannotation.ipynb b/doc/source/jupyter/subannotation.ipynb
index 1ff384e4..681addde 100644
--- a/doc/source/jupyter/subannotation.ipynb
+++ b/doc/source/jupyter/subannotation.ipynb
@@ -6,8 +6,28 @@
"source": [
"# Sample subannotation\n",
"\n",
+ "The PEPs that these examples are based on are available in the [example_peps repository](https://github.com/pepkit/example_peps).\n",
+ "\n",
"This vignette will show you how sample subannotations work in a series of examples.\n",
"\n",
+ "Import libraries and set the working directory"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import peppy\n",
+ "os.chdir(\"/Users/mstolarczyk/Uczelnia/UVA/\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"## Example 1: basic sample subannotation table\n",
"\n",
"Example 1 demonstrates how a `sample_subannotation` is used. In this example, 2 samples have multiple input files that need merging (`frog_1` and `frog_2`), while 1 sample (`frog_3`) does not. Therefore, `frog_3` specifies its file in the `sample_annotation` table, while the others leave that field blank and instead specify several files in the `sample_subannotation`."
@@ -15,7 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -24,20 +44,19 @@
"'data/frog1a_data.txt data/frog1b_data.txt data/frog1c_data.txt'"
]
},
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "import peppy\n",
- "p1 = peppy.Project(\"example_subannotation1/project_config.yaml\")\n",
+ "p1 = peppy.Project(\"example_peps/example_subannotation1/project_config.yaml\")\n",
"p1.samples[0].file"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -66,38 +85,38 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation2/../data/frog1a_data.txt /home/mjs5kd/UVA/example_peps/example_subannotation2/../data/frog1b_data.txt /home/mjs5kd/UVA/example_peps/example_subannotation2/../data/frog1c_data.txt'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation2/../data/frog1a_data.txt /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation2/../data/frog1b_data.txt /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation2/../data/frog1c_data.txt'"
]
},
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import peppy\n",
- "p2 = peppy.Project(\"example_subannotation2/project_config.yaml\")\n",
+ "p2 = peppy.Project(\"example_peps/example_subannotation2/project_config.yaml\")\n",
"p2.samples[0].file"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation2/../data/frog2a_data.txt /home/mjs5kd/UVA/example_peps/example_subannotation2/../data/frog2b_data.txt'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation2/../data/frog2a_data.txt /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation2/../data/frog2b_data.txt'"
]
},
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -108,16 +127,16 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation2/../data/frog3_data.txt'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation2/../data/frog3_data.txt'"
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -128,16 +147,16 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation2/../data/frog4_data.txt'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation2/../data/frog4_data.txt'"
]
},
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -157,37 +176,37 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation3/../data/frog1a_data.txt /home/mjs5kd/UVA/example_peps/example_subannotation3/../data/frog1b_data.txt /home/mjs5kd/UVA/example_peps/example_subannotation3/../data/frog1c_data.txt'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation3/../data/frog1a_data.txt /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation3/../data/frog1b_data.txt /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation3/../data/frog1c_data.txt'"
]
},
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "p3 = peppy.Project(\"example_subannotation3/project_config.yaml\")\n",
+ "p3 = peppy.Project(\"example_peps/example_subannotation3/project_config.yaml\")\n",
"p3.samples[0].file"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation3/../data/frog2_data.txt /home/mjs5kd/UVA/example_peps/example_subannotation3/../data/frog2a_data.txt /home/mjs5kd/UVA/example_peps/example_subannotation3/../data/frog2b_data.txt'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation3/../data/frog2_data.txt /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation3/../data/frog2a_data.txt /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation3/../data/frog2b_data.txt'"
]
},
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -198,16 +217,16 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation3/../data/frog3_data.txt'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation3/../data/frog3_data.txt'"
]
},
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -218,16 +237,16 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation3/../data/frog4_data.txt'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation3/../data/frog4_data.txt'"
]
},
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -247,7 +266,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -256,19 +275,19 @@
"'frog1a_data.txt frog1b_data.txt frog1c_data.txt'"
]
},
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "p4 = peppy.Project(\"example_subannotation4/project_config.yaml\")\n",
+ "p4 = peppy.Project(\"example_peps/example_subannotation4/project_config.yaml\")\n",
"p4.samples[0].read1"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -277,7 +296,7 @@
"'frog1a_data2.txt frog1b_data2.txt frog1b_data2.txt'"
]
},
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -297,37 +316,37 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog1a_R1.fq.gz /home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog1b_R1.fq.gz /home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog1c_R1.fq.gz'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog1a_R1.fq.gz /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog1b_R1.fq.gz /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog1c_R1.fq.gz'"
]
},
- "execution_count": 13,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "p5 = peppy.Project(\"example_subannotation5/project_config.yaml\")\n",
+ "p5 = peppy.Project(\"example_peps/example_subannotation5/project_config.yaml\")\n",
"p5.samples[0].read1"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog1a_R2.fq.gz /home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog1b_R2.fq.gz /home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog1c_R2.fq.gz'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog1a_R2.fq.gz /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog1b_R2.fq.gz /Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog1c_R2.fq.gz'"
]
},
- "execution_count": 14,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -338,16 +357,16 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog2_R1.fq.gz'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog2_R1.fq.gz'"
]
},
- "execution_count": 15,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -358,16 +377,16 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog2_R2.fq.gz'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog2_R2.fq.gz'"
]
},
- "execution_count": 16,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -378,16 +397,16 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog3_R1.fq.gz'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog3_R1.fq.gz'"
]
},
- "execution_count": 17,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -398,16 +417,16 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog3_R2.fq.gz'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog3_R2.fq.gz'"
]
},
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -418,16 +437,16 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog4_R1.fq.gz'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog4_R1.fq.gz'"
]
},
- "execution_count": 19,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -438,16 +457,16 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/mjs5kd/UVA/example_peps/example_subannotation5/../data/frog4_R2.fq.gz'"
+ "'/Users/mstolarczyk/Uczelnia/UVA/example_peps/example_subannotation5/../data/frog4_R2.fq.gz'"
]
},
- "execution_count": 20,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -455,13 +474,6 @@
"source": [
"p5.samples[3].read2"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/doc/source/jupyter/subprojects.ipynb b/doc/source/jupyter/subprojects.ipynb
new file mode 100644
index 00000000..f1c2303b
--- /dev/null
+++ b/doc/source/jupyter/subprojects.ipynb
@@ -0,0 +1,372 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Subprojects\n",
+ "\n",
+ "The PEP that this example is based on is available in the [example_peps repository](https://github.com/pepkit/example_peps) in the [example_subprojects1 folder](https://github.com/pepkit/example_peps/tree/master/example_subprojects1).\n",
+ "\n",
+ "The example below demonstrates how and why to use the subprojects functionality to **define numerous similar projects in a single project config file**. This functionality is extremely convenient when one has to define projects with small settings discrepancies, like different attributes in the annotation sheet; for example, libraries `ABCD` and `EFGH` instead of the original `RRBS`.\n",
+ "\n",
+ "Import libraries and set the working directory:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import peppy\n",
+ "os.chdir(\"/Users/mstolarczyk/Uczelnia/UVA/\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Code\n",
+ "\n",
+ "Read in the project metadata by specifying the path to the `project_config.yaml`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p_subproj = peppy.Project(\"example_peps/example_subprojects1/project_config.yaml\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To see whether there are any subprojects available within the `project_config.yaml` file run the following command:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's inspect the sample annotation sheet."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sample_name | \n",
+ " library | \n",
+ " organism | \n",
+ " time | \n",
+ " file_path | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " pig_0h | \n",
+ " RRBS | \n",
+ " pig | \n",
+ " 0 | \n",
+ " source1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " pig_1h | \n",
+ " RRBS | \n",
+ " pig | \n",
+ " 1 | \n",
+ " source1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " frog_0h | \n",
+ " RRBS | \n",
+ " frog | \n",
+ " 0 | \n",
+ " source1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " frog_1h | \n",
+ " RRBS | \n",
+ " frog | \n",
+ " 1 | \n",
+ " source1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sample_name library organism time file_path\n",
+ "0 pig_0h RRBS pig 0 source1\n",
+ "1 pig_1h RRBS pig 1 source1\n",
+ "2 frog_0h RRBS frog 0 source1\n",
+ "3 frog_1h RRBS frog 1 source1"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p_subproj.sheet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'newLib': {'metadata': {'sample_annotation': 'sample_annotation_newLib.csv'}}, 'newLib2': {'metadata': {'sample_annotation': 'sample_annotation_newLib2.csv'}}}"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p_subproj.subprojects"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As you can see, there are two subprojects available: `newLib` and `newLib2`. Nonetheless, only the main one is \"active\".\n",
+ "\n",
+ "Each of the subprojects can be activated with the following command:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p_subproj.activate_subproject(\"newLib\")\n",
+ "p_subproj.activate_subproject(\"newLib2\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's inspect the sample annotation sheet when the `newLib2` subproject is active."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sample_name | \n",
+ " library | \n",
+ " organism | \n",
+ " time | \n",
+ " file_path | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " pig_0h | \n",
+ " EFGH | \n",
+ " pig | \n",
+ " 0 | \n",
+ " source1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " pig_1h | \n",
+ " EFGH | \n",
+ " pig | \n",
+ " 1 | \n",
+ " source1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " frog_0h | \n",
+ " EFGH | \n",
+ " frog | \n",
+ " 0 | \n",
+ " source1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " frog_1h | \n",
+ " EFGH | \n",
+ " frog | \n",
+ " 1 | \n",
+ " source1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sample_name library organism time file_path\n",
+ "0 pig_0h EFGH pig 0 source1\n",
+ "1 pig_1h EFGH pig 1 source1\n",
+ "2 frog_0h EFGH frog 0 source1\n",
+ "3 frog_1h EFGH frog 1 source1"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p_subproj.sheet"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The PEP\n",
+ "\n",
+ "The `library` attribute in each sample has changed from `RRBS` to `EFGH`. This behavior was specified in the `project_config.yaml` that points to a different `sample_annotation_newLib2.csv` with a changed `library` attribute."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "metadata:\n",
+ " sample_annotation: sample_annotation.csv\n",
+ " output_dir: $HOME/hello_looper_results\n",
+ "\n",
+ "derived_attributes: [file_path]\n",
+ "data_sources:\n",
+ " source1: /data/lab/project/{organism}_{time}h.fastq\n",
+ " source2: /path/from/collaborator/weirdNamingScheme_{external_id}.fastq\n",
+ "\n",
+ "subprojects:\n",
+ " newLib:\n",
+ " metadata:\n",
+ " sample_annotation: sample_annotation_newLib.csv\n",
+ " newLib2:\n",
+ " metadata:\n",
+ " sample_annotation: sample_annotation_newLib2.csv\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open(\"example_peps/example_subprojects1/project_config.yaml\") as f:\n",
+ " print(f.read())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "sample_name,library,organism,time,file_path\n",
+ "pig_0h,EFGH,pig,0,source1\n",
+ "pig_1h,EFGH,pig,1,source1\n",
+ "frog_0h,EFGH,frog,0,source1\n",
+ "frog_1h,EFGH,frog,1,source1\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open(\"example_peps/example_subprojects1/sample_annotation_newLib2.csv\") as f:\n",
+ " print(f.read())"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/doc/source/jupyter/tutorial.ipynb b/doc/source/jupyter/tutorial.ipynb
index 1a06dc2d..1dbc59d3 100644
--- a/doc/source/jupyter/tutorial.ipynb
+++ b/doc/source/jupyter/tutorial.ipynb
@@ -6,6 +6,8 @@
"source": [
"# Basic PEP example\n",
"\n",
+ "The PEP that this example is based on is available in the [example_peps repository](https://github.com/pepkit/example_peps) in the [example_basic](https://github.com/pepkit/example_peps/tree/master/example_basic) folder.\n",
+ "\n",
"This vignette will show you a simple example PEP-formatted project, and how to read it into python using the `peppy` package.\n",
"\n",
"\n",
diff --git a/peppy/__init__.py b/peppy/__init__.py
index 284cdac6..6a63fa00 100644
--- a/peppy/__init__.py
+++ b/peppy/__init__.py
@@ -14,12 +14,13 @@
from ._version import __version__
from .attribute_dict import AttributeDict
from .const import *
+from .exceptions import PeppyError
from .project import Project, ProjectContext
from .sample import Sample, Subsample
__classes__ = ["AttributeDict", "Project", "Sample"]
-__all__ = __classes__
+__all__ = __classes__ + ["PeppyError"]
LOGGING_LEVEL = "INFO"
@@ -41,7 +42,7 @@
def setup_peppy_logger(level, additional_locations=None, devmode=False):
"""
- Establish a logger for a pe.
+ Establish a project logger.
This configures a logger to provide information about pep models.
Verbosity, destination(s) for messages, and message text format are
diff --git a/peppy/_version.py b/peppy/_version.py
index b0d7306a..5fb6b765 100644
--- a/peppy/_version.py
+++ b/peppy/_version.py
@@ -1 +1 @@
-__version__ = "0.18.2"
+__version__ = "0.19"
diff --git a/peppy/attribute_dict.py b/peppy/attribute_dict.py
index cb549f75..3061ff73 100644
--- a/peppy/attribute_dict.py
+++ b/peppy/attribute_dict.py
@@ -9,7 +9,9 @@
from pandas import Series
-from .utils import copy
+from .const import DERIVATIONS_DECLARATION, IMPLICATIONS_DECLARATION
+from .utils import \
+ copy, has_null_value, non_null_value, warn_derived_cols, warn_implied_cols
ATTRDICT_METADATA = {"_force_nulls": False, "_attribute_identity": False}
@@ -67,6 +69,7 @@ def add_entries(self, entries):
:param Iterable[(object, object)] | Mapping | pandas.Series entries:
collection of pairs of keys and values
+ :return AttributeDict: the updated instance (None if entries is None)
"""
if entries is None:
return
@@ -82,6 +85,27 @@ def add_entries(self, entries):
# Assume we now have pairs; allow corner cases to fail hard here.
for key, value in entries_iter:
self.__setitem__(key, value)
+ return self
+
+
+ def is_null(self, item):
+ """
+ Conjunction of presence in underlying mapping and value being None
+
+ :param object item: Key to check for presence and null value
+ :return bool: True iff the item is present and has null value
+ """
+ return has_null_value(item, self)
+
+
+ def non_null(self, item):
+ """
+ Conjunction of presence in underlying mapping and value not being None
+
+ :param object item: Key to check for presence and non-null value
+ :return bool: True iff the item is present and has non-null value
+ """
+ return non_null_value(item, self)
def __setattr__(self, key, value):
@@ -141,6 +165,12 @@ def __setitem__(self, key, value):
:raises _MetadataOperationException: if attempt is made
to set value for privileged metadata key
"""
+ if key == "derived_columns":
+ warn_derived_cols()
+ key = DERIVATIONS_DECLARATION
+ elif key == "implied_columns":
+ warn_implied_cols()
+ key = IMPLICATIONS_DECLARATION
if isinstance(value, Mapping):
try:
# Combine AttributeDict instances.
diff --git a/peppy/const.py b/peppy/const.py
index 2d810014..62b091ed 100644
--- a/peppy/const.py
+++ b/peppy/const.py
@@ -14,9 +14,10 @@
# Project-related
DATA_SOURCES_SECTION = "data_sources"
-IMPLICATIONS_DECLARATION = "implied_columns"
+DERIVATIONS_DECLARATION = "derived_attributes"
+IMPLICATIONS_DECLARATION = "implied_attributes"
SAMPLE_INDEPENDENT_PROJECT_SECTIONS = \
- ["metadata", "derived_columns", IMPLICATIONS_DECLARATION, "trackhubs"]
+ ["metadata", DERIVATIONS_DECLARATION, IMPLICATIONS_DECLARATION, "trackhubs"]
PROJECT_CONSTANTS = ["DATA_SOURCES_SECTION", "IMPLICATIONS_DECLARATION",
"SAMPLE_INDEPENDENT_PROJECT_SECTIONS"]
diff --git a/peppy/exceptions.py b/peppy/exceptions.py
new file mode 100644
index 00000000..ad9fcc1c
--- /dev/null
+++ b/peppy/exceptions.py
@@ -0,0 +1,12 @@
+""" Custom error types """
+
+from abc import ABCMeta
+
+
+class PeppyError(Exception):
+ """ Base error type for peppy custom errors. """
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, msg):
+ super(PeppyError, self).__init__(msg)
diff --git a/peppy/project.py b/peppy/project.py
index 52c2dc7d..6dec8024 100644
--- a/peppy/project.py
+++ b/peppy/project.py
@@ -55,6 +55,7 @@
from collections import Iterable, Mapping
else:
from collections.abc import Iterable, Mapping
+import warnings
import pandas as pd
import yaml
@@ -64,12 +65,17 @@
COMPUTE_SETTINGS_VARNAME, DATA_SOURCE_COLNAME, \
DEFAULT_COMPUTE_RESOURCES_NAME, IMPLICATIONS_DECLARATION, \
SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME
+from .exceptions import PeppyError
from .sample import merge_sample, Sample
from .utils import \
- add_project_sample_constants, alpha_cased, copy, fetch_samples, is_url
+ add_project_sample_constants, alpha_cased, copy, fetch_samples, is_url, \
+ non_null_value, warn_derived_cols, warn_implied_cols
MAX_PROJECT_SAMPLES_REPR = 12
+GENOMES_KEY = "genomes"
+TRANSCRIPTOMES_KEY = "transcriptomes"
+IDEALLY_IMPLIED = [GENOMES_KEY, TRANSCRIPTOMES_KEY]
_LOGGER = logging.getLogger(__name__)
@@ -120,7 +126,7 @@ def __exit__(self, *args):
@copy
class Project(AttributeDict):
"""
- A class to model a Project.
+ A class to model a Project (collection of samples and metadata).
:param config_file: Project config file (YAML).
:type config_file: str
@@ -165,7 +171,7 @@ class Project(AttributeDict):
"""
- DERIVED_COLUMNS_DEFAULT = [DATA_SOURCE_COLNAME]
+ DERIVED_ATTRIBUTES_DEFAULT = [DATA_SOURCE_COLNAME]
def __init__(self, config_file, subproject=None,
@@ -195,15 +201,16 @@ def __init__(self, config_file, subproject=None,
default_compute, when_missing=no_environment_exception)
# Load settings from environment yaml for local compute infrastructure.
+ compute_env_file = compute_env_file or os.getenv(self.compute_env_var)
if compute_env_file:
- _LOGGER.debug("Updating environment settings based on file '%s'",
- compute_env_file)
- self.update_environment(compute_env_file)
-
+ if os.path.isfile(compute_env_file):
+ self.update_environment(compute_env_file)
+ else:
+ _LOGGER.warning("Compute env path isn't a file: {}".
+ format(compute_env_file))
else:
- _LOGGER.info("Using default {envvar}. You may set environment "
- "variable {envvar} to configure environment "
- "settings.".format(envvar=self.compute_env_var))
+ _LOGGER.info("No compute env file was provided and {} is unset; "
+ "using default".format(self.compute_env_var))
# Initialize default compute settings.
_LOGGER.debug("Establishing project compute settings")
@@ -216,7 +223,7 @@ def __init__(self, config_file, subproject=None,
if no_compute_exception:
no_compute_exception(message)
else:
- _LOGGER.warn(message)
+ _LOGGER.warning(message)
else:
_LOGGER.debug("Compute: %s", str(self.compute))
@@ -233,7 +240,7 @@ def __init__(self, config_file, subproject=None,
_LOGGER.info("Using subproject: '{}'".format(subproject))
self.parse_config_file(subproject)
- if "data_sources" in self:
+ if self.non_null("data_sources"):
# Expand paths now, so that it's not done for every sample.
for src_key, src_val in self.data_sources.items():
src_val = os.path.expandvars(src_val)
@@ -254,11 +261,11 @@ def __init__(self, config_file, subproject=None,
# Establish derived columns.
try:
# Do not duplicate derived column names.
- self.derived_columns.extend(
- [colname for colname in self.DERIVED_COLUMNS_DEFAULT
- if colname not in self.derived_columns])
+ self.derived_attributes.extend(
+ [colname for colname in self.DERIVED_ATTRIBUTES_DEFAULT
+ if colname not in self.derived_attributes])
except AttributeError:
- self.derived_columns = self.DERIVED_COLUMNS_DEFAULT
+ self.derived_attributes = self.DERIVED_ATTRIBUTES_DEFAULT
self.finalize_pipelines_directory()
@@ -269,29 +276,18 @@ def __init__(self, config_file, subproject=None,
self.metadata.pipelines_dir))
path_anns_file = self.metadata.sample_annotation
- _LOGGER.debug("Reading sample annotations sheet: '%s'", path_anns_file)
- try:
+ if path_anns_file:
+ _LOGGER.debug("Reading sample annotations sheet: '%s'", path_anns_file)
_LOGGER.info("Setting sample sheet from file '%s'", path_anns_file)
- self.sheet = check_sample_sheet(path_anns_file)
- except IOError:
- _LOGGER.error("Alleged annotations file doesn't exist: '%s'",
- path_anns_file)
- anns_folder_path = os.path.dirname(path_anns_file)
- try:
- annotations_file_folder_contents = \
- os.listdir(anns_folder_path)
- except OSError:
- _LOGGER.error("Annotations file folder doesn't exist either: "
- "'%s'", anns_folder_path)
- else:
- _LOGGER.error("Annotations file folder's contents: {}".
- format(annotations_file_folder_contents))
- raise
+ self._sheet = self.parse_sample_sheet(path_anns_file)
+ else:
+ _LOGGER.warning("No sample annotations sheet in config")
+ self._sheet = None
self.sample_subannotation = None
# Basic sample maker will handle name uniqueness check.
- if defer_sample_construction:
+ if defer_sample_construction or self._sheet is None:
self._samples = None
else:
self._set_basic_samples()
@@ -299,11 +295,13 @@ def __init__(self, config_file, subproject=None,
def __repr__(self):
""" Representation in interpreter. """
+ if len(self) == 0:
+ return "{}"
samples_message = "{} (from '{}')". \
format(self.__class__.__name__, self.config_file)
try:
num_samples = len(self._samples)
- except AttributeError:
+ except (AttributeError, TypeError):
pass
else:
samples_message += " with {} sample(s)".format(num_samples)
@@ -337,14 +335,50 @@ def constants(self):
@property
def default_compute_envfile(self):
- """ Path to default compute environment settings file. """
+ """
+ Path to default compute environment settings file.
+
+ :return str: Path to this project's default compute env config file.
+ """
return os.path.join(
self.templates_folder, "default_compute_settings.yaml")
+ @property
+ def derived_columns(self):
+ """
+ Collection of sample attributes for which value of each is derived from elsewhere
+
+ :return list[str]: sample attribute names for which value is derived
+ """
+ warn_derived_cols()
+ try:
+ return self.derived_attributes
+ except AttributeError:
+ return []
+
+
+ @property
+ def implied_columns(self):
+ """
+ Collection of sample attributes for which value of each is implied by other(s)
+
+ :return Mapping: implied attribute declarations; an empty AttributeDict if none are declared
+ """
+ warn_implied_cols()
+ try:
+ return self.implied_attributes
+ except AttributeError:
+ return AttributeDict()
+
+
@property
def num_samples(self):
- """ Number of samples available in this Project. """
+ """
+ Count the number of samples available in this Project.
+
+ :return int: number of samples available in this Project.
+ """
return sum(1 for _ in self.sample_names)
@@ -424,13 +458,22 @@ def samples(self):
:return Iterable[Sample]: Sample instance for each
of this Project's samples
"""
- if self._samples is None:
- _LOGGER.debug("Building basic sample object(s) for %s",
- self.__class__.__name__)
- self._set_basic_samples()
return self._samples
+ @property
+ def sheet(self):
+ """
+ Annotations/metadata sheet describing this Project's samples.
+
+ :return pandas.core.frame.DataFrame: table of samples in this Project
+ """
+ from copy import copy as cp
+ if self._sheet is None:
+ self._sheet = self.parse_sample_sheet(self.metadata.sample_annotation)
+ return cp(self._sheet)
+
+
@property
def templates_folder(self):
"""
@@ -482,9 +525,9 @@ def get_sample(self, sample_name):
:return Sample: The requested Sample object
"""
- samples = self.get_samples(sample_name)
+ samples = self.get_samples([sample_name])
if len(samples) > 1:
- _LOGGER.warn("More than one sample was detected; returning the first")
+ _LOGGER.warning("More than one sample was detected; returning the first")
if len(samples) == 0:
raise ValueError("Project has no sample named {name}.".format(name=sample_name))
@@ -492,6 +535,23 @@ def get_sample(self, sample_name):
return samples[0]
+ def activate_subproject(self, subproject):
+ """
+ Activate a subproject.
+
+ This method will update Project attributes, adding new values
+ associated with the subproject indicated, and in case of collision with
+ an existing key/attribute the subproject's value will be favored.
+
+ :param str subproject: A string with a subproject name to be activated
+ :return Project: A Project with the selected subproject activated
+ """
+ conf_file = self.config_file
+ self.clear()
+ self.__init__(conf_file, subproject)
+ return self
+
+
def get_samples(self, sample_names):
"""
Returns a list of sample objects given a list of sample names
@@ -536,9 +596,9 @@ def _check_unique_samples(self):
repeats = {name: n for name, n in Counter(
s.name for s in self._samples).items() if n > 1}
if repeats:
- histogram_text = "\n".join(
+ hist_text = "\n".join(
"{}: {}".format(name, n) for name, n in repeats.items())
- _LOGGER.warn("Non-unique sample names:\n{}".format(histogram_text))
+ _LOGGER.warning("Non-unique sample names:\n{}".format(hist_text))
def finalize_pipelines_directory(self, pipe_path=""):
@@ -648,7 +708,7 @@ def make_project_dirs(self):
try:
os.makedirs(folder_path)
except OSError as e:
- _LOGGER.warn("Could not create project folder: '%s'",
+ _LOGGER.warning("Could not create project folder: '%s'",
str(e))
@@ -667,13 +727,13 @@ def _set_basic_samples(self):
except KeyError:
_LOGGER.debug("No sample subannotations")
else:
- _LOGGER.warn("Switch to 'sample_subannotation' in lieu of "
- "'merge_table.'")
+ _LOGGER.warning("'merge_table' attribute is deprecated. Please use "
+ "'sample_subannotation' instead.")
if self.sample_subannotation is None:
if sub_ann and os.path.isfile(sub_ann):
_LOGGER.info("Reading subannotations: %s", sub_ann)
- self.sample_subannotation = pd.read_table(
+ self.sample_subannotation = pd.read_csv(
sub_ann, sep=None, engine="python")
_LOGGER.debug("Subannotations shape: {}".
format(self.sample_subannotation.shape))
@@ -703,14 +763,13 @@ def _prep_samples(self):
# Add values that are constant across this Project's samples.
sample = add_project_sample_constants(sample, self)
- # TODO: use implied_columns in 0.8.
sample.set_genome(self.get("genomes"))
sample.set_transcriptome(self.get("transcriptomes"))
_LOGGER.debug("Merging sample '%s'", sample.name)
- sample.infer_columns(self.get(IMPLICATIONS_DECLARATION))
+ sample.infer_attributes(self.get(IMPLICATIONS_DECLARATION))
merge_sample(sample, self.sample_subannotation,
- self.data_sources, self.derived_columns)
+ self.data_sources, self.derived_attributes)
_LOGGER.debug("Setting sample file paths")
sample.set_file_paths(self)
# Hack for backwards-compatibility
@@ -732,6 +791,7 @@ def parse_config_file(self, subproject=None):
"""
Parse provided yaml config file and check required fields exist.
+ :param str subproject: Name of subproject to activate, optional
:raises KeyError: if config file lacks required section(s)
"""
@@ -740,6 +800,9 @@ def parse_config_file(self, subproject=None):
with open(self.config_file, 'r') as conf_file:
config = yaml.safe_load(conf_file)
+ for msg in suggest_implied_attributes(config):
+ warnings.warn(msg, DeprecationWarning)
+
_LOGGER.debug("{} config data: {}".format(
self.__class__.__name__, config))
@@ -752,19 +815,27 @@ def parse_config_file(self, subproject=None):
self.__class__.__name__, len(self.keys()), self.keys()))
# Overwrite any config entries with entries in the subproject.
- if "subprojects" in config and subproject:
+ if non_null_value("subprojects", config) and subproject:
_LOGGER.debug("Adding entries for subproject '{}'".
format(subproject))
- subproj_updates = config['subprojects'][subproject]
+ try:
+ subproj_updates = config['subprojects'][subproject]
+ except KeyError:
+ raise Exception(
+ "Unknown subproject ({}); defined subprojects: {}".format(
+ subproject, ", ".join([sp for sp in config["subprojects"]])))
_LOGGER.debug("Updating with: {}".format(subproj_updates))
self.add_entries(subproj_updates)
+ elif subproject:
+ _LOGGER.warning("Subproject {} requested but no subprojects "
+ "are defined".format(subproject))
else:
- _LOGGER.debug("No subproject")
+ _LOGGER.debug("No subproject requested")
# In looper 0.4, for simplicity the paths section was eliminated.
# For backwards compatibility, mirror the paths section into metadata.
if "paths" in config:
- _LOGGER.warn(
+ _LOGGER.warning(
"Paths section in project config is deprecated. "
"Please move all paths attributes to metadata section. "
"This option will be removed in future versions.")
@@ -868,9 +939,7 @@ def parse_config_file(self, subproject=None):
# Required variables check
if not hasattr(self.metadata, SAMPLE_ANNOTATIONS_KEY):
- raise _MissingMetadataException(
- missing_section=SAMPLE_ANNOTATIONS_KEY,
- path_config_file=self.config_file)
+ self.metadata.sample_annotation = None
def set_compute(self, setting):
@@ -914,9 +983,7 @@ def set_compute(self, setting):
def set_project_permissions(self):
- """
- Make the project's public_html folder executable.
- """
+ """ Make the project's public_html folder executable. """
try:
os.chmod(self.trackhubs.trackhub_dir, 0o0755)
except OSError:
@@ -993,47 +1060,75 @@ def _handle_missing_env_attrs(self, env_settings_file, when_missing):
message = "'{}' lacks environment attributes: {}". \
format(env_settings_file, missing_env_attrs)
if when_missing is None:
- _LOGGER.warn(message)
+ _LOGGER.warning(message)
else:
when_missing(message)
+ @staticmethod
+ def parse_sample_sheet(sample_file, dtype=str):
+ """
+ Read the sample annotations file, warning if a required column is missing.
+
+ :param str sample_file: path to sample annotations file.
+ :param type dtype: data type for CSV read.
+ :raises Project.MissingSampleSheetError: if given annotations file can't be read.
+ :return pandas.core.frame.DataFrame: parsed table; missing required column(s) only trigger a warning.
+ """
+ # Although no null value replacements or supplements are being passed,
+ # toggling the keep_default_na value to False solved an issue with 'nan'
+ # and/or 'None' as an argument for an option in the pipeline command
+ # that's generated from a Sample's attributes.
+ #
+ # See https://github.com/pepkit/peppy/issues/159 for the original issue
+ # and https://github.com/pepkit/peppy/pull/160 for the pull request
+ # that resolved it.
+ try:
+ df = pd.read_csv(sample_file, sep=None, dtype=dtype, index_col=False,
+ engine="python", keep_default_na=False)
+ except IOError:
+ raise Project.MissingSampleSheetError(sample_file)
+ else:
+ _LOGGER.info("Setting sample sheet from file '%s'", sample_file)
+ missing = {SAMPLE_NAME_COLNAME} - set(df.columns)
+ if len(missing) != 0:
+ _LOGGER.warning(
+ "Annotation sheet ('{}') is missing column(s):\n{}\n"
+ "It has: {}".format(sample_file, "\n".join(missing),
+ ", ".join(list(df.columns))))
+ return df
+
+
+ class MissingMetadataException(PeppyError):
+ """ Project needs certain metadata. """
+ def __init__(self, missing_section, path_config_file=None):
+ reason = "Project configuration lacks required metadata section {}".\
+ format(missing_section)
+ if path_config_file:
+ reason += "; used config file '{}'".format(path_config_file)
+ super(Project.MissingMetadataException, self).__init__(reason)
+
+
+ class MissingSampleSheetError(PeppyError):
+ """ Represent case in which sample sheet is specified but nonexistent. """
+ def __init__(self, sheetfile):
+ super(Project.MissingSampleSheetError, self).__init__(
+ "Missing sample annotation sheet ({}); a project need not use "
+ "a sample sheet, but if it does the file must exist."
+ .format(sheetfile))
+
+
-def check_sample_sheet(sample_file, dtype=str):
+def suggest_implied_attributes(prj):
"""
- Check if csv file exists and has all required columns.
+ If given project contains what could be implied attributes, suggest that.
- :param str sample_file: path to sample annotations file.
- :param type dtype: data type for CSV read.
- :raises IOError: if given annotations file can't be read.
- :raises ValueError: if required column(s) is/are missing.
+ :param Iterable prj: Intent is a Project, but this could be any iterable
+ of strings to check for suitability of declaration as implied attr
+ :return list[str]: (likely empty) list of warning messages about project
+ config keys that could be implied attributes
"""
- # Although no null value replacements or supplements are being passed,
- # toggling the keep_default_na value to False solved an issue with 'nan'
- # and/or 'None' as an argument for an option in the pipeline command
- # that's generated from a Sample's attributes.
- #
- # See https://github.com/pepkit/peppy/issues/159 for the original issue
- # and https://github.com/pepkit/peppy/pull/160 for the pull request
- # that resolved it.
- df = pd.read_table(sample_file, sep=None, dtype=dtype,
- index_col=False, engine="python", keep_default_na=False)
- req = [SAMPLE_NAME_COLNAME]
- missing = set(req) - set(df.columns)
- if len(missing) != 0:
- raise ValueError(
- "Annotation sheet ('{}') is missing column(s):\n{}\nIt has: {}".
- format(sample_file, "\n".join(missing),
- ", ".join(list(df.columns))))
- return df
-
-
-
-class _MissingMetadataException(Exception):
- """ Project needs certain metadata. """
- def __init__(self, missing_section, path_config_file=None):
- reason = "Project configuration lacks required metadata section {}".\
- format(missing_section)
- if path_config_file:
- reason += "; used config file '{}'".format(path_config_file)
- super(_MissingMetadataException, self).__init__(reason)
+ def suggest(key):
+ return "To declare {}, consider using {}".format(
+ key, IMPLICATIONS_DECLARATION)
+ return [suggest(k) for k in prj if k in IDEALLY_IMPLIED]
diff --git a/peppy/sample.py b/peppy/sample.py
index 5c5020b0..3fc18a65 100644
--- a/peppy/sample.py
+++ b/peppy/sample.py
@@ -21,7 +21,7 @@
ALL_INPUTS_ATTR_NAME, DATA_SOURCE_COLNAME, DATA_SOURCES_SECTION, \
REQUIRED_INPUTS_ATTR_NAME, SAMPLE_EXECUTION_TOGGLE, VALID_READ_TYPES
from .utils import check_bam, check_fastq, copy, get_file_size, \
- grab_project_data, is_url,parse_ftype, sample_folder
+ grab_project_data, parse_ftype, sample_folder
COL_KEY_SUFFIX = "_key"
@@ -205,7 +205,7 @@ def determine_missing_requirements(self):
# set_pipeline_attributes must be run first.
if not hasattr(self, "required_inputs"):
- _LOGGER.warn("You must run set_pipeline_attributes "
+ _LOGGER.warning("You must run set_pipeline_attributes "
"before determine_missing_requirements")
return null_return
@@ -316,11 +316,11 @@ def get_sheet_dict(self):
originally provided via the sample sheet (i.e., the a map-like
representation of the instance, excluding derived items)
"""
- return OrderedDict([[k, getattr(self, k)]
- for k in self.sheet_attributes])
+ return OrderedDict(
+ [[k, getattr(self, k)] for k in self.sheet_attributes])
- def infer_columns(self, implications):
+ def infer_attributes(self, implications):
"""
Infer value for additional field(s) from other field(s).
@@ -480,7 +480,7 @@ def locate_data_source(self, data_sources, column_name=DATA_SOURCE_COLNAME,
try:
# Grab a temporary dictionary of sample attributes and update these
# with any provided extra variables to use in the replacement.
- # This is necessary for derived_columns in the merge table.
+ # This is necessary for derived_attributes in the merge table.
# Here the copy() prevents the actual sample from being
# updated by update().
temp_dict = self.__dict__.copy()
@@ -490,7 +490,7 @@ def locate_data_source(self, data_sources, column_name=DATA_SOURCE_COLNAME,
_LOGGER.debug("Pre-glob: %s", val)
val_globbed = sorted(glob.glob(val))
if not val_globbed:
- _LOGGER.warn("Unmatched regex-like: '%s'", val)
+ _LOGGER.warning("Unmatched regex-like: '%s'", val)
else:
val = " ".join(val_globbed)
_LOGGER.debug("Post-glob: %s", val)
@@ -524,7 +524,7 @@ def set_file_paths(self, project=None):
project = project or self.prj
- for col in project.get("derived_columns", []):
+ for col in project.get("derived_attributes", []):
# Only proceed if the specified column exists
# and was not already merged or derived.
if not hasattr(self, col):
@@ -659,16 +659,14 @@ def set_pipeline_attributes(
# read_type, read_length, paired.
self.ngs_inputs = self.get_attr_values("ngs_inputs_attr")
- set_rtype = False
+ set_rtype_reason = ""
if not hasattr(self, "read_type"):
set_rtype_reason = "read_type not yet set"
- set_rtype = True
elif not self.read_type or self.read_type.lower() \
not in VALID_READ_TYPES:
set_rtype_reason = "current read_type is invalid: '{}'". \
format(self.read_type)
- set_rtype = True
- if set_rtype:
+ if set_rtype_reason:
_LOGGER.debug(
"Setting read_type for %s '%s': %s",
self.__class__.__name__, self.name, set_rtype_reason)
@@ -747,7 +745,7 @@ def set_read_type(self, rlen_sample_size=10, permissive=True):
except NotImplementedError as e:
if not permissive:
raise
- _LOGGER.warn(e.message)
+ _LOGGER.warning(e.message)
return
except IOError:
if not permissive:
@@ -798,7 +796,7 @@ def set_read_type(self, rlen_sample_size=10, permissive=True):
setattr(self, feature, feat_val)
if getattr(self, feature) is None and len(existing_files) > 0:
- _LOGGER.warn("Not all input files agree on '%s': '%s'",
+ _LOGGER.warning("Not all input files agree on '%s': '%s'",
feature, self.name)
@@ -879,7 +877,7 @@ def obj2dict(obj, name=None,
for k, v in obj.__dict__.items() if
k not in to_skip}
elif isinstance(obj, Series):
- _LOGGER.warn("Serializing series as mapping, not array-like")
+ _LOGGER.warning("Serializing series as mapping, not array-like")
return obj.to_dict()
elif hasattr(obj, 'dtype'): # numpy data types
# TODO: this fails with ValueError for multi-element array.
@@ -929,16 +927,24 @@ def obj2dict(obj, name=None,
outfile.write(yaml_data)
- def update(self, newdata):
+ def update(self, newdata, **kwargs):
"""
Update Sample object with attributes from a dict.
"""
- for key, value in newdata.items():
- setattr(self, key, value)
+ duplicates = [k for k in set(newdata.keys()) & set(kwargs.keys())
+ if newdata[k] != kwargs[k]]
+ if len(duplicates) != 0:
+ raise ValueError("{} duplicate keys with different values: {}".
+ format(len(duplicates), ", ".join(duplicates)))
+ for k, v in newdata.items():
+ setattr(self, k, v)
+ for k, v in kwargs.items():
+ setattr(self, k, v)
-def merge_sample(sample, sample_subann, data_sources=None, derived_columns=None):
+def merge_sample(sample, sample_subann,
+ data_sources=None, derived_attributes=None):
"""
Use merge table (subannotation) data to augment/modify Sample.
@@ -946,9 +952,9 @@ def merge_sample(sample, sample_subann, data_sources=None, derived_columns=None)
:param sample_subann: data with which to alter Sample
:param Mapping data_sources: collection of named paths to data locations,
optional
- :param Iterable[str] derived_columns: names of columns for which
+ :param Iterable[str] derived_attributes: names of attributes for which
corresponding Sample attribute's value is data-derived, optional
- :return Set[str]: names of columns that were merged
+ :return Set[str]: names of columns/attributes that were merged
"""
merged_attrs = {}
@@ -966,9 +972,9 @@ def merge_sample(sample, sample_subann, data_sources=None, derived_columns=None)
format(data_sources))
# Hash derived columns for faster lookup in case of many samples/columns.
- derived_columns = set(derived_columns or [])
- _LOGGER.debug("Merging Sample with derived columns: {}".
- format(derived_columns))
+ derived_attributes = set(derived_attributes or [])
+ _LOGGER.debug("Merging Sample with derived attributes: {}".
+ format(derived_attributes))
sample_name = getattr(sample, SAMPLE_NAME_COLNAME)
sample_indexer = sample_subann[SAMPLE_NAME_COLNAME] == sample_name
@@ -988,7 +994,6 @@ def merge_sample(sample, sample_subann, data_sources=None, derived_columns=None)
merged_attrs = {key: "" for key in this_sample_rows.columns}
subsamples = []
_LOGGER.debug(this_sample_rows)
- subsample_count = 0
for subsample_row_id, row in this_sample_rows.iterrows():
try:
row['subsample_name']
@@ -1004,7 +1009,7 @@ def merge_sample(sample, sample_subann, data_sources=None, derived_columns=None)
# during-iteration change of dictionary size.
for attr_name in this_sample_rows.columns:
if attr_name == SAMPLE_NAME_COLNAME or \
- attr_name not in derived_columns:
+ attr_name not in derived_attributes:
_LOGGER.log(5, "Skipping merger of attribute '%s'", attr_name)
continue
@@ -1019,9 +1024,9 @@ def merge_sample(sample, sample_subann, data_sources=None, derived_columns=None)
extra_vars=rowdata) # 1)
rowdata[attr_name] = data_src_path
- _LOGGER.log(5, "Adding derived columns")
+ _LOGGER.log(5, "Adding derived attributes")
- for attr in derived_columns:
+ for attr in derived_attributes:
# Skip over any attributes that the sample lacks or that are
# covered by the data from the current (row's) data.
diff --git a/peppy/utils.py b/peppy/utils.py
index 7b088c1a..4da2be0f 100644
--- a/peppy/utils.py
+++ b/peppy/utils.py
@@ -12,10 +12,12 @@
from urlparse import urlparse
else:
from urllib.parse import urlparse
-
-
+if sys.version_info < (3, 3):
+ from collections import Sized
+else:
+ from collections.abc import Sized
+import warnings
import yaml
-
from .const import GENERIC_PROTOCOL_KEY, SAMPLE_INDEPENDENT_PROJECT_SECTIONS
@@ -112,6 +114,17 @@ def check_sample_sheet_row_count(sheet, filepath):
+def coll_like(c):
+ """
+ Determine whether an object is collection-like.
+
+ :param object c: Object to test as collection
+ :return bool: Whether the argument is a (non-string) collection
+ """
+ return isinstance(c, Iterable) and not isinstance(c, str)
+
+
+
def copy(obj):
def copy(self):
"""
@@ -245,6 +258,18 @@ def grab_project_data(prj):
+def has_null_value(k, m):
+ """
+ Determine whether a mapping has a null value for a given key.
+
+ :param Hashable k: Key to test for null value
+ :param Mapping m: Mapping to test for null value for given key
+ :return bool: Whether given mapping contains given key with null value
+ """
+ return k in m and _is_null(m[k])
+
+
+
def import_from_source(module_filepath):
"""
Import a module from a particular filesystem location.
@@ -295,6 +320,18 @@ def is_url(maybe_url):
+def non_null_value(k, m):
+ """
+ Determine whether a mapping has a non-null value for a given key.
+
+ :param Hashable k: Key to test for non-null value
+ :param Mapping m: Mapping to test for non-null value for given key
+ :return bool: Whether given mapping contains given key with non-null value
+ """
+ return k in m and not _is_null(m[k])
+
+
+
def parse_ftype(input_file):
"""
Checks determine filetype from extension.
@@ -380,6 +417,28 @@ def standard_stream_redirector(stream):
+def warn_derived_cols():
+ """ Produce deprecation warning about derived columns. """
+ _warn_cols_to_attrs("derived")
+
+
+def warn_implied_cols():
+ """ Produce deprecation warning about implied columns. """
+ _warn_cols_to_attrs("implied")
+
+
+def _is_null(x):
+ """ Whether an object is effectively null """
+ return x in [None, ""] or (coll_like(x) and isinstance(x, Sized) and 0 == len(x))
+
+
+def _warn_cols_to_attrs(prefix):
+ """ Produce deprecation warning about 'columns' rather than 'attributes' """
+ warnings.warn("{pfx}_columns should be encoded and referenced "
+ "as {pfx}_attributes".format(pfx=prefix), DeprecationWarning)
+
+
+
class CommandChecker(object):
"""
Validate PATH availability of executables referenced by a config file.
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
index 9fc487d2..8e0796aa 100644
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@@ -1,2 +1,2 @@
mock>=2.0.0
-pytest>=3.0.7
+pytest==3.10.1
diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 0dbaf66b..9f0c760d 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -1,2 +1,2 @@
coveralls>=1.1
-pytest-cov>=2.4.0
+pytest-cov==2.6.1
diff --git a/setup.py b/setup.py
index 8d99b9d3..b0ebc769 100644
--- a/setup.py
+++ b/setup.py
@@ -5,17 +5,25 @@
import sys
+REQDIR = "requirements"
+
+
+def read_reqs(reqs_name):
+ deps = []
+ with open(os.path.join(REQDIR, "requirements-{}.txt".format(reqs_name)), 'r') as f:
+ for l in f:
+ if not l.strip():
+ continue
+ #deps.append(l.split("=")[0].rstrip("<>"))
+ deps.append(l)
+ return deps
+
+
# Additional keyword arguments for setup().
extra = {}
# Ordinary dependencies
-DEPENDENCIES = []
-with open("requirements/requirements-all.txt", "r") as reqs_file:
- for line in reqs_file:
- if not line.strip():
- continue
- #DEPENDENCIES.append(line.split("=")[0].rstrip("<>"))
- DEPENDENCIES.append(line)
+DEPENDENCIES = read_reqs("all")
# numexpr for pandas
try:
@@ -54,9 +62,10 @@ def get_static(name, condition=None):
try:
import pypandoc
long_description = pypandoc.convert_file('README.md', 'rst')
-except(IOError, ImportError):
+except(IOError, ImportError, OSError):
long_description = open('README.md').read()
+
setup(
name="peppy",
packages=["peppy"],
@@ -77,7 +86,7 @@ def get_static(name, condition=None):
scripts=scripts,
include_package_data=True,
test_suite="tests",
- tests_require=(["mock", "pytest"]),
+ tests_require=read_reqs("dev"),
setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []),
**extra
)
diff --git a/tests/conftest.py b/tests/conftest.py
index 4f4084c7..db5c81a0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -35,14 +35,14 @@
pipeline_interfaces: pipelines
sample_subannotation: merge.csv
-derived_columns: [{derived_column_names}]
+derived_attributes: [{derived_attribute_names}]
data_sources:
src1: "{basedir}/data/{sample_name}{col_modifier}.txt"
src3: "{basedir}/data/{sample_name}.txt"
src2: "{basedir}/data/{sample_name}-bamfile.bam"
-implied_columns:
+implied_attributes:
sample_name:
a:
genome: hg38
@@ -127,7 +127,7 @@
"testngs.sh": FILE_BY_SAMPLE
}
-SAMPLE_ANNOTATION_LINES = """sample_name,library,file,file2,organism,nonmerged_col,data_source,dcol2
+SAMPLE_ANNOTATION_LINES = """sample_name,protocol,file,file2,organism,nonmerged_col,data_source,dcol2
a,testlib,src3,src3,,src3,src3,
b,testlib,,,,src3,src3,src1
c,testlib,src3,src3,,src3,src3,
@@ -170,7 +170,7 @@
}
COMPARISON_FUNCTIONS = ["__eq__", "__ne__", "__len__",
"keys", "values", "items"]
-COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"]
+COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "protocol"]
PROJECT_CONFIG_DATA = {"metadata": {"sample_annotation": "annotations.csv"}}
@@ -312,8 +312,8 @@ class _DataSourceFormatMapping(dict):
mechanism that pep uses to derive columns, but it's also the
core string formatting mechanism.
"""
- def __missing__(self, derived_column):
- return "{" + derived_column + "}"
+ def __missing__(self, derived_attribute):
+ return "{" + derived_attribute + "}"
@@ -333,8 +333,8 @@ def _write_temp(lines, dirpath, fname):
:return str: full path to written file
"""
basedir_replacement = _DataSourceFormatMapping(basedir=dirpath)
- derived_columns_replacement = _DataSourceFormatMapping(
- **{"derived_column_names": ", ".join(DERIVED_COLNAMES)}
+ derived_attributes_replacement = _DataSourceFormatMapping(
+ **{"derived_attribute_names": ", ".join(DERIVED_COLNAMES)}
)
filepath = os.path.join(dirpath, fname)
data_source_formatter = string.Formatter()
@@ -342,12 +342,14 @@ def _write_temp(lines, dirpath, fname):
with open(filepath, 'w') as tmpf:
for l in lines:
if "{basedir}" in l:
- l = data_source_formatter.vformat(
+ out = data_source_formatter.vformat(
l, (), basedir_replacement)
- elif "{derived_column_names}" in l:
- l = data_source_formatter.vformat(
- l, (), derived_columns_replacement)
- tmpf.write(l)
+ elif "{derived_attribute_names}" in l:
+ out = data_source_formatter.vformat(
+ l, (), derived_attributes_replacement)
+ else:
+ out = l
+ tmpf.write(out)
num_lines += 1
_LOGGER.debug("Wrote %d line(s) to disk: '%s'", num_lines, filepath)
return filepath
@@ -442,6 +444,14 @@ def write_project_files(request):
+@pytest.fixture(scope="function")
+def subannotation_filepath(tmpdir):
+ """ Write sample subannotations (temp) file and return path to it. """
+ return _write_temp(SAMPLE_SUBANNOTATION_LINES,
+ dirpath=tmpdir.strpath, fname=MERGE_TABLE_FILENAME)
+
+
+
# Placed here (rather than near top of file) for data/use locality.
_TEST_DATA_FOLDER = "data"
_BAMFILE_PATH = os.path.join(os.path.dirname(__file__),
@@ -505,8 +515,8 @@ def proj(request):
Create project instance using data from file pointed to by request class.
To use this fixture, the test case must reside within a class that
- defines a "project_config_file" attribute. This is best done by marking
- the class with "@pytest.mark.usefixtures("write_project_files")"
+ defines a "project_config_file" attribute. This is most easily done by
+ marking the class with "@pytest.mark.usefixtures('write_project_files')"
:param pytest._pytest.fixtures.SubRequest request: test case requesting
a project instance
diff --git a/tests/helpers.py b/tests/helpers.py
index 6a2d2687..edffcf1b 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -4,6 +4,7 @@
import itertools
import numpy as np
import pytest
+import peppy
__author__ = "Vince Reuter"
@@ -61,3 +62,45 @@ def powerset(items, min_items=0, include_full_pop=True):
nonempty_powerset = partial(powerset, min_items=1)
+
+
+
+class TempLogFileHandler(object):
+ """ Context manager for temporary file handler logging attachment """
+
+ def __init__(self, filepath, level, mode='w'):
+ """
+ Create the temporary file handler by providing path and level
+
+ :param str filepath: Path to file to use for logging handler.
+ :param str | int level: Minimal severity level for file handler.
+ :param str mode: Mode in which to create the file handler.
+ """
+ self.logfile = filepath
+ self._level = level
+ self._mode = mode
+ self._used = False
+
+ def __enter__(self):
+ """ Add the handler to project module's logger, and update state. """
+ import logging
+ if self._used:
+ raise Exception("Cannot reuse a {}".format(self.__class__.__name__))
+ handler = logging.FileHandler(self.logfile, mode='w')
+ handler.setLevel(self._level)
+ peppy.project._LOGGER.handlers.append(handler)
+ self._used = True
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ """ Remove the added file handler from the logger. """
+ del peppy.project._LOGGER.handlers[-1]
+
+ @property
+ def messages(self):
+ """ Open the handler's underlying file and read the messages. """
+ if not self._used:
+ raise Exception(
+ "Attempted to read messages from unused logfile: "
+ "{}", self.logfile)
+ with open(self.logfile, 'r') as f:
+ return f.readlines()
diff --git a/tests/models/independent/test_AttributeDict.py b/tests/models/independent/test_AttributeDict.py
index d62ecbba..85bb0881 100644
--- a/tests/models/independent/test_AttributeDict.py
+++ b/tests/models/independent/test_AttributeDict.py
@@ -546,12 +546,48 @@ def test_attribute_access(
assert expected == observed
+class NullityTests:
+ """ Tests of null/non-null values """
+
+ _KEYNAMES = ["sample_name", "protocol", "arbitrary_attribute"]
+
+ @pytest.mark.parametrize(
+ argnames="item", argvalues=ATTRDICT_METADATA.keys())
+ def test_metadata_are_non_null(self, item):
+ """ Test the special/reserved AD keys """
+ assert AttributeDict().non_null(item)
+ assert not AttributeDict().is_null(item)
+
+ @pytest.mark.parametrize(argnames="item", argvalues=_KEYNAMES)
+ def test_missing_is_neither_null_nor_non_null(self, item):
+ """ Value of absent key is neither null nor non-null """
+ ad = AttributeDict()
+ assert not ad.is_null(item) and not ad.non_null(item)
+
+ @pytest.mark.parametrize(argnames="item", argvalues=_KEYNAMES)
+ def test_is_null(self, item):
+ """ Null-valued key/item evaluates as such. """
+ ad = AttributeDict()
+ ad[item] = None
+ assert ad.is_null(item) and not ad.non_null(item)
+
+ @pytest.mark.parametrize(
+ argnames=["k", "v"],
+ argvalues=list(zip(_KEYNAMES, ["sampleA", "WGBS", "random"])))
+ def test_non_null(self, k, v):
+ """ AD is sensitive to value updates """
+ ad = AttributeDict()
+ assert not ad.is_null(k) and not ad.non_null(k)
+ ad[k] = None
+ assert ad.is_null(k) and not ad.non_null(k)
+ ad[k] = v
+ assert not ad.is_null(k) and ad.non_null(k)
+
@pytest.mark.usefixtures("write_project_files")
class SampleYamlTests:
""" AttributeDict metadata only appear in YAML if non-default. """
-
@pytest.mark.parametrize(
argnames="metadata_attribute", argvalues=ATTRDICT_METADATA.keys(),
ids=lambda attr_name: " metadata item = {} ".format(attr_name))
@@ -561,7 +597,6 @@ def test_all_defaults_no_metadata(self, tmpdir, proj, metadata_attribute):
filepath = os.path.join(tmpdir.strpath, "sample{}.yaml".format(i))
lines, _ = self._yaml_data(sample, filepath)
assert all([metadata_attribute not in line for line in lines])
-
@staticmethod
def _yaml_data(sample, filepath, section_to_change=None,
@@ -585,3 +620,11 @@ def _yaml_data(sample, filepath, section_to_change=None,
with open(filepath, 'r') as f:
lines = f.readlines()
return lines, data
+
+
+@pytest.mark.parametrize(
+ ["func", "exp"],
+ [(repr, "{}"), (str, AttributeDict().__class__.__name__ + ": {}")])
+def test_text_repr_empty(func, exp):
+ """ Empty AttributeDict is correctly represented as text. """
+ assert exp == func(AttributeDict())
diff --git a/tests/models/independent/test_Project.py b/tests/models/independent/test_Project.py
index b6038b8e..3f59aab1 100644
--- a/tests/models/independent/test_Project.py
+++ b/tests/models/independent/test_Project.py
@@ -3,21 +3,21 @@
import copy
import logging
import os
+import warnings
import mock
from numpy import random as nprand
import pytest
import yaml
-import peppy
from peppy import AttributeDict, Project, Sample
-from peppy.const import SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME
-from peppy.project import _MissingMetadataException
+from peppy.const import IMPLICATIONS_DECLARATION, SAMPLE_ANNOTATIONS_KEY
+from peppy.project import GENOMES_KEY, TRANSCRIPTOMES_KEY
from peppy.sample import COL_KEY_SUFFIX
from tests.conftest import \
DERIVED_COLNAMES, EXPECTED_MERGED_SAMPLE_FILES, \
MERGED_SAMPLE_INDICES, NUM_SAMPLES
-from tests.helpers import named_param
+from tests.helpers import named_param, TempLogFileHandler
__author__ = "Vince Reuter"
@@ -25,28 +25,32 @@
+_GENOMES = {"human": "hg19", "mouse": "mm10"}
+_TRASCRIPTOMES = {"human": "hg19_cdna", "mouse": "mm10_cdna"}
+
+
+
@pytest.fixture(scope="function")
def project_config_data():
""" Provide some basic data for a Project configuration. """
return {
"metadata": {
- SAMPLE_ANNOTATIONS_KEY: "sample-anns-filler.csv",
+ SAMPLE_ANNOTATIONS_KEY: "samples.csv",
"output_dir": "$HOME/sequencing/output",
"pipeline_interfaces": "${CODE}/pipelines"},
"data_sources": {"arbitrary": "placeholder/data/{filename}"},
- "genomes": {"human": "hg19", "mouse": "mm10"},
- "transcriptomes": {"human": "hg19_cdna", "mouse": "mm10_cdna"}}
+ }
def pytest_generate_tests(metafunc):
""" Dynamic parameterization/customization for tests in this module. """
- if metafunc.cls == DerivedColumnsTests:
- # Parameterize derived columns tests over whether the specification
- # is explicit (vs. implied), and which default column to validate.
+ if metafunc.cls == DerivedAttributesTests:
+ # Parameterize derived attribute tests over whether the specification
+ # is explicit (vs. implied), and which default attribute to validate.
metafunc.parametrize(
argnames="case_type",
- argvalues=DerivedColumnsTests.DERIVED_COLUMNS_CASE_TYPES,
+ argvalues=DerivedAttributesTests.DERIVED_ATTRIBUTES_CASE_TYPES,
ids=lambda case_type: "case_type={}".format(case_type))
@@ -71,7 +75,7 @@ def test_no_samples(self, path_empty_project):
ids=lambda lazy: "lazy={}".format(lazy))
def test_no_sample_subannotation_in_config(
self, tmpdir, spec_type, lazy, proj_conf_data, path_sample_anns):
- """ Merge table attribute remains null if config lacks subannotation. """
+ """ Subannotation attribute remains null if config lacks subannotation. """
metadata = proj_conf_data["metadata"]
try:
assert "sample_subannotation" in metadata
@@ -94,14 +98,6 @@ def test_no_sample_subannotation_in_config(
assert p.sample_subannotation is None
- @pytest.mark.skip("Not implemented")
- def test_sample_subannotation_construction(
- self, tmpdir, project_config_data):
- """ Merge table is constructed iff samples are constructed. """
- # TODO: implement
- pass
-
-
def test_counting_samples_doesnt_create_samples(
self, sample_annotation_lines,
path_project_conf, path_sample_anns):
@@ -154,18 +150,16 @@ class ProjectRequirementsTests:
""" Tests for a Project's set of requirements. """
- def test_lacks_sample_annotations(
+ def test_lacks_sample_annotation(
self, project_config_data, env_config_filepath, tmpdir):
- """ Lack of sample annotations precludes Project construction. """
-
+ """ Project can be built without sample annotations. """
# Remove sample annotations KV pair from config data for this test.
del project_config_data["metadata"][SAMPLE_ANNOTATIONS_KEY]
-
- # Write the config and assert the expected exception for Project ctor.
+ # Write the (sans-annotations) config and assert Project is created.
conf_path = _write_project_config(
project_config_data, dirpath=tmpdir.strpath)
- with pytest.raises(_MissingMetadataException):
- Project(conf_path, default_compute=env_config_filepath)
+ prj = Project(conf_path, default_compute=env_config_filepath)
+ assert isinstance(prj, Project)
def test_minimal_configuration_doesnt_fail(
@@ -257,29 +251,22 @@ def test_nonexistent_env_settings_file(
misnamed_envconf = os.path.join(envconf_dirpath, envconf_filename)
# Create and add log message handler for expected errors.
- logfile = tmpdir.join("project-error-messages.log").strpath
- expected_error_message_handler = logging.FileHandler(logfile, mode='w')
- expected_error_message_handler.setLevel(logging.ERROR)
- peppy.project._LOGGER.handlers.append(expected_error_message_handler)
-
- # Create Project, expecting to generate error messages.
- project = Project(minimal_project_conf_path,
- default_compute=misnamed_envconf)
+ log = tmpdir.join("project-error-messages.log").strpath
+ logview = TempLogFileHandler(log, level=logging.ERROR)
- # Remove the temporary message handler.
- del peppy.project._LOGGER.handlers[-1]
+ with logview:
+ # Create Project, expecting to generate error messages.
+ project = Project(
+ minimal_project_conf_path, default_compute=misnamed_envconf)
# Ensure nulls for all relevant Project attributes.
self._assert_null_compute_environment(project)
+
# We should have two error messages, describing the exception caught
# during default environment parsing and that it couldn't be set.
- with open(logfile, 'r') as messages:
- exception_messages = messages.readlines()
- try:
- assert 2 == len(exception_messages)
- except AssertionError:
- print("Exception messages: {}".format(exception_messages))
- raise
+ exception_messages = logview.messages
+ assert 2 == len(exception_messages), \
+ "Exception messages: {}".format(exception_messages)
def test_project_environment_uses_default_environment_settings(
@@ -337,11 +324,11 @@ def default_compute_settings(project):
-class DerivedColumnsTests:
- """ Tests for the behavior of Project's derived_columns attribute. """
+class DerivedAttributesTests:
+ """ Tests for the behavior of Project's derived_attributes attribute. """
- ADDITIONAL_DERIVED_COLUMNS = ["arbitrary1", "filler2", "placeholder3"]
- DERIVED_COLUMNS_CASE_TYPES = ["implicit", "disjoint", "intersection"]
+ ADDITIONAL_DERIVED_ATTRIBUTES = ["arbitrary1", "filler2", "placeholder3"]
+ DERIVED_ATTRIBUTES_CASE_TYPES = ["implicit", "disjoint", "intersection"]
def create_project(
@@ -354,68 +341,68 @@ def create_project(
:param str default_env_path: path to the default environment config
file to pass to Project constructor
:param str case_type: type of test case to execute; this determines
- how to specify the derived columns in the config file
+ how to specify the derived attribute in the config file
:param str dirpath: path in which to write config file
:return (Iterable[str], Project): collection of names of derived
- columns to expect, along with Project instance with which to test
+ attribute to expect, along with Project instance with which to test
"""
# Ensure valid parameterization.
- if case_type not in self.DERIVED_COLUMNS_CASE_TYPES:
+ if case_type not in self.DERIVED_ATTRIBUTES_CASE_TYPES:
raise ValueError(
- "Unexpected derived_columns case type: '{}' (known={})".
- format(case_type, self.DERIVED_COLUMNS_CASE_TYPES))
+ "Unexpected derived_attributes case type: '{}' (known={})".
+ format(case_type, self.DERIVED_ATTRIBUTES_CASE_TYPES))
# Parameterization specifies expectation and explicit specification.
- expected_derived_columns = copy.copy(Project.DERIVED_COLUMNS_DEFAULT)
+ expected_derived_attributes = copy.copy(Project.DERIVED_ATTRIBUTES_DEFAULT)
if case_type == "implicit":
- # Negative control; ensure config data lacks derived columns.
- assert "derived_columns" not in project_config_data
+ # Negative control; ensure config data lacks derived attributes.
+ assert "derived_attributes" not in project_config_data
else:
- explicit_derived_columns = \
- copy.copy(self.ADDITIONAL_DERIVED_COLUMNS)
- expected_derived_columns.extend(self.ADDITIONAL_DERIVED_COLUMNS)
- # Determine explicit inclusion of default derived columns.
+ explicit_derived_attributes = \
+ copy.copy(self.ADDITIONAL_DERIVED_ATTRIBUTES)
+ expected_derived_attributes.extend(self.ADDITIONAL_DERIVED_ATTRIBUTES)
+ # Determine explicit inclusion of default derived attributes.
if case_type == "intersection":
- explicit_derived_columns.extend(
- Project.DERIVED_COLUMNS_DEFAULT)
- project_config_data["derived_columns"] = explicit_derived_columns
+ explicit_derived_attributes.extend(
+ Project.DERIVED_ATTRIBUTES_DEFAULT)
+ project_config_data["derived_attributes"] = explicit_derived_attributes
# Write the config and build the Project.
conf_file_path = _write_project_config(
project_config_data, dirpath=dirpath)
- with mock.patch("peppy.project.check_sample_sheet"):
+ with mock.patch("peppy.project.Project.parse_sample_sheet"):
project = Project(conf_file_path, default_compute=default_env_path)
- return expected_derived_columns, project
+ return expected_derived_attributes, project
- def test_default_derived_columns_always_present(self,
+ def test_default_derived_attributes_always_present(self,
env_config_filepath, project_config_data, case_type, tmpdir):
- """ Explicit or implicit, default derived columns are always there. """
+ """ Explicit or implicit, default derived attributes are always there. """
- expected_derived_columns, project = self.create_project(
+ expected_derived_attributes, project = self.create_project(
project_config_data=project_config_data,
default_env_path=env_config_filepath,
case_type=case_type, dirpath=tmpdir.strpath)
# Rough approximation of order-agnostic validation of
# presence and number agreement for all elements.
- assert len(expected_derived_columns) == len(project.derived_columns)
- assert set(expected_derived_columns) == set(project.derived_columns)
+ assert len(expected_derived_attributes) == len(project.derived_attributes)
+ assert set(expected_derived_attributes) == set(project.derived_attributes)
- def test_default_derived_columns_not_duplicated(self,
+ def test_default_derived_attributes_not_duplicated(self,
env_config_filepath, project_config_data, case_type, tmpdir):
- """ Default derived columns are not added if already present. """
+ """ Default derived attributes are not added if already present. """
from collections import Counter
_, project = self.create_project(
project_config_data=project_config_data,
default_env_path=env_config_filepath,
case_type=case_type, dirpath=tmpdir.strpath)
- num_occ_by_derived_column = Counter(project.derived_columns)
- for default_derived_colname in Project.DERIVED_COLUMNS_DEFAULT:
- assert 1 == num_occ_by_derived_column[default_derived_colname]
+ num_occ_by_derived_attribute = Counter(project.derived_attributes)
+ for default_derived_colname in Project.DERIVED_ATTRIBUTES_DEFAULT:
+ assert 1 == num_occ_by_derived_attribute[default_derived_colname]
@@ -597,7 +584,7 @@ def observed_argstring_elements(
conf_file_path = _write_project_config(confdata, dirpath=confpath)
# Subvert requirement for sample annotations file.
- with mock.patch("peppy.project.check_sample_sheet"):
+ with mock.patch("peppy.project.Project.parse_sample_sheet"):
project = Project(conf_file_path, default_compute=envpath)
argstring = project.get_arg_string(pipeline)
@@ -683,25 +670,20 @@ def test_merge_samples_negative(self, proj, sample_index):
@pytest.mark.parametrize(argnames="sample_index",
argvalues=MERGED_SAMPLE_INDICES)
def test_data_sources_derivation(self, proj, sample_index):
- """ Samples in merge file, check data_sources --> derived_columns. """
- # Make sure these columns were merged:
- merged_columns = filter(
- lambda col_key: (col_key != "col_modifier") and
- not col_key.endswith(COL_KEY_SUFFIX),
- proj.samples[sample_index].merged_cols.keys())
+ """ Samples in merge file, check data_sources --> derived_attributes. """
# Order may be lost due to mapping.
# We don't care about that here, or about duplicates.
- expected = set(DERIVED_COLNAMES)
- observed = set(merged_columns)
+ required = set(DERIVED_COLNAMES)
+ observed = {k for k in proj.samples[sample_index].merged_cols.keys()
+ if k != "col_modifier" and not k.endswith(COL_KEY_SUFFIX)}
# Observed may include additional things (like auto-added subsample_name)
- for val in expected:
- assert val in observed
+ assert required == (required & observed)
@named_param(argnames="sample_index", argvalues=MERGED_SAMPLE_INDICES)
- def test_derived_columns_sample_subannotation_sample(
+ def test_derived_attributes_sample_subannotation_sample(
self, proj, sample_index):
- """ Make sure derived columns works on merged table. """
+ """ Make sure derived attributes works on merged table. """
observed_merged_sample_filepaths = \
[os.path.basename(f) for f in
proj.samples[sample_index].file2.split(" ")]
@@ -717,8 +699,8 @@ def test_unmerged_samples_lack_merged_cols(self, proj, sample_index):
assert not proj.samples[sample_index].merged_cols
- def test_duplicate_derived_columns_still_derived(self, proj):
- """ Duplicated derived columns can still be derived. """
+ def test_duplicate_derived_attributes_still_derived(self, proj):
+ """ Duplicated derived attributes can still be derived. """
sample_index = 2
observed_nonmerged_col_basename = \
os.path.basename(proj.samples[sample_index].nonmerged_col)
@@ -728,6 +710,180 @@ def test_duplicate_derived_columns_still_derived(self, proj):
+class SubprojectActivationTest:
+    """ Test cases for the effect of activating a subproject. """
+
+    # Name of the config entry that every subproject below declares.
+    MARK_NAME = "marker"
+    # Content of the config's subprojects section: subproject name --> entries.
+    SUBPROJ_SECTION = {
+        "neurons": {MARK_NAME: "NeuN"}, "astrocytes": {MARK_NAME: "GFAP"},
+        "oligodendrocytes": {MARK_NAME: "NG2"}, "microglia": {MARK_NAME: "Iba1"}
+    }
+
+
+    @pytest.mark.parametrize("sub", SUBPROJ_SECTION.keys())
+    def test_subproj_activation_returns_project(self, tmpdir, sub):
+        """ Subproject activation returns the project instance. """
+        prj = self.make_proj(tmpdir.strpath, incl_subs=True)
+        updated_prj = prj.activate_subproject(sub)
+        assert updated_prj is prj
+
+
+    @pytest.mark.parametrize(
+        argnames="attr", argvalues=["permissive", "file_checks"])
+    @pytest.mark.parametrize("sub", SUBPROJ_SECTION.keys())
+    def test_sp_act_resets_all_attributes(self, tmpdir, attr, sub):
+        """ Subproject activation restores a modified non-config attribute. """
+        prj = self.make_proj(tmpdir.strpath, incl_subs=True)
+        original = prj[attr]
+        # Flip the attribute so that a reset is observable.
+        prj[attr] = not original
+        assert prj[attr] is not original
+        prj.activate_subproject(sub)
+        # After activation the attribute is back at its pre-flip value.
+        assert prj[attr] is original
+
+
+    @pytest.mark.parametrize("sub", SUBPROJ_SECTION.keys())
+    def test_subproj_activation_adds_new_config_entries(self, tmpdir, sub):
+        """ Previously nonexistent entries are added by subproject. """
+        prj = self.make_proj(tmpdir.strpath, incl_subs=True)
+        assert self.MARK_NAME not in prj
+        prj.activate_subproject(sub)
+        assert self.MARK_NAME in prj
+        assert self.SUBPROJ_SECTION[sub][self.MARK_NAME] == prj[self.MARK_NAME]
+
+
+    @pytest.mark.parametrize("sub", SUBPROJ_SECTION.keys())
+    def test_sp_act_overwrites_existing_config_entries(self, tmpdir, sub):
+        """ An activated subproject's values are favored over preexisting. """
+        prj = self.make_proj(tmpdir.strpath, incl_subs=True)
+        prj[self.MARK_NAME] = "temp-mark"
+        assert "temp-mark" == prj[self.MARK_NAME]
+        prj.activate_subproject(sub)
+        expected = self.SUBPROJ_SECTION[sub][self.MARK_NAME]
+        assert expected == prj[self.MARK_NAME]
+
+
+    def test_activate_unknown_subproj(self, tmpdir):
+        """ With subprojects, attempt to activate undefined one is an error. """
+        prj = self.make_proj(tmpdir.strpath, incl_subs=True)
+        with pytest.raises(Exception):
+            prj.activate_subproject("DNE-subproject")
+
+
+    @pytest.mark.parametrize("sub", SUBPROJ_SECTION.keys())
+    def test_subproj_activation_when_none_exist(self, tmpdir, sub):
+        """ Without subprojects, activation attempt produces warning. """
+        prj = self.make_proj(tmpdir.strpath, incl_subs=False)
+        logfile = tmpdir.join("project-error-messages.log").strpath
+        logview = TempLogFileHandler(logfile, level=logging.WARN)
+        with logview:
+            # Call that should produce a warning message
+            prj.activate_subproject(sub)
+        # At least one captured line must carry the expected warning text.
+        logged_lines = logview.messages
+        assert any("no subprojects are defined" in msg
+                   for msg in logged_lines), \
+            "Did not find expected message among lines: " \
+            "{}".format(logged_lines)
+
+
+    @classmethod
+    def make_proj(cls, folder, incl_subs):
+        """
+        Write temp config and create Project with subproject option.
+
+        :param str folder: path to folder in which to write the config file
+        :param bool incl_subs: whether to include the subprojects section
+            in the written config
+        :return Project: instance built from the written config file
+        """
+        conf_file_path = os.path.join(folder, "conf.yaml")
+        conf_data = {"metadata": {}}
+        if incl_subs:
+            conf_data["subprojects"] = cls.SUBPROJ_SECTION
+        with open(conf_file_path, 'w') as f:
+            yaml.safe_dump(conf_data, f)
+        return Project(conf_file_path)
+
+
+
+@pytest.mark.usefixtures("write_project_files")
+class ProjectWarningTests:
+    """ Tests for warning messages related to projects """
+
+    @pytest.mark.parametrize(
+        "ideally_implied_mappings",
+        [{}, {GENOMES_KEY: _GENOMES}, {TRANSCRIPTOMES_KEY: _TRASCRIPTOMES},
+         {GENOMES_KEY: _GENOMES, TRANSCRIPTOMES_KEY: _TRASCRIPTOMES}])
+    def test_suggests_implied_attributes(
+            self, recwarn, tmpdir, path_sample_anns,
+            project_config_data, ideally_implied_mappings):
+        """
+        Assemblies directly in proj conf (not implied) is deprecated.
+
+        Each top-level assemblies section added to the config should yield
+        exactly one DeprecationWarning whose message mentions both the
+        section's key and the implications declaration.
+        """
+        # NOTE(review): path_sample_anns is requested but never referenced
+        # here; presumably it ensures the annotations file exists -- confirm.
+
+        # Add the mappings parameterization to the config data.
+        conf_data = copy.deepcopy(project_config_data)
+        conf_data.update(ideally_implied_mappings)
+
+        # Write the config file.
+        conf_file = tmpdir.join("proj_conf.yaml").strpath
+        assert not os.path.isfile(conf_file), \
+            "Test project temp config file already exists: {}".format(conf_file)
+        with open(conf_file, 'w') as cf:
+            yaml.safe_dump(conf_data, cf)
+
+        # (Hopefully) generate the warnings.
+        assert 0 == len(recwarn)           # Ensure a fresh start.
+        warnings.simplefilter('always')    # Allow DeprecationWarning capture.
+        Project(conf_file)                 # Generate the warning(s).
+        msgs = [str(w.message) for w in recwarn    # Grab deprecation messages.
+                if isinstance(w.message, DeprecationWarning)]
+        assert len(ideally_implied_mappings) == len(msgs)    # 1:1 warnings
+        for k in ideally_implied_mappings:
+            # Each section that should be implied should generate exactly 1
+            # warning; check message for content then remove it from the pool.
+            matched = [m for m in msgs if k in m and
+                       IMPLICATIONS_DECLARATION in m]
+            assert 1 == len(matched)
+            msgs.remove(matched[0])
+
+    @pytest.mark.parametrize("assembly_implications",
+        [{"genome": {"organism": _GENOMES}},
+         {"transcriptome": {"organism": _TRASCRIPTOMES}},
+         {"genome": {"organism": _GENOMES},
+          "transcriptome": {"organism": _TRASCRIPTOMES}}])
+    def test_no_warning_if_assemblies_are_implied(
+            self, recwarn, tmpdir, path_sample_anns,
+            project_config_data, assembly_implications):
+        """ Assemblies declaration within implied attributes is not deprecated. """
+
+        # Place the assemblies under the implications declaration rather
+        # than at the top level of the config data.
+        conf_data = copy.deepcopy(project_config_data)
+        conf_data[IMPLICATIONS_DECLARATION] = assembly_implications
+
+        # Write the config file.
+        conf_file = tmpdir.join("proj_conf.yaml").strpath
+        assert not os.path.isfile(conf_file), \
+            "Test project temp config file already exists: {}".format(conf_file)
+        with open(conf_file, 'w') as cf:
+            yaml.safe_dump(conf_data, cf)
+
+        # Check that there are no warnings before or after test.
+        assert 0 == len(recwarn)
+        # Record every warning, so that an empty recorder is meaningful.
+        warnings.simplefilter('always')
+        Project(conf_file)
+        assert 0 == len(recwarn)
+
+
+
+@pytest.mark.usefixtures("write_project_files")
+class SampleSubannotationTests:
+    """ Tests for a Project's sample subannotation attribute. """
+
+    @pytest.mark.parametrize("defer", [False, True])
+    def test_sample_subannotation_construction(self, defer,
+            subannotation_filepath, path_project_conf, path_sample_anns):
+        """ Subannotation is null exactly when sample construction is deferred. """
+        prj = Project(path_project_conf, defer_sample_construction=defer)
+        subannotation_missing = prj.sample_subannotation is None
+        assert subannotation_missing == defer
+
+
+
def _write_project_config(config_data, dirpath, filename="proj-conf.yaml"):
"""
Write the configuration file for a Project.
diff --git a/tests/models/independent/test_ProjectContext.py b/tests/models/independent/test_ProjectContext.py
index 6e4c3f9a..d57f5b9c 100644
--- a/tests/models/independent/test_ProjectContext.py
+++ b/tests/models/independent/test_ProjectContext.py
@@ -14,13 +14,12 @@
RNA_NAME = "rna_PE"
WGBS_NAME = "wgbs-hs"
RRBS_NAME = "rrbs_mm"
+RRBS_NAME = "rrbs_mm"
ADD_PROJECT_DATA = {
- "genome": {"organism": {
- "mouse": "mm10", "human": "hg38", "rat": "rn6"}},
"data_sources": {"src": "{sample}-{flowcell}.bam"},
- "derived_columns": ["data_source"],
+ "derived_attributes": ["data_source"],
"pipeline_args": {"--epilog": None},
- "implied_columns": {"organism": "assembly"},
+ "implied_attributes": {"organism": "assembly"},
"user": "test-user",
"email": "tester@domain.org",
}
@@ -112,8 +111,7 @@ def test_no_filtration(self, samples, project):
argnames=["inclusion", "expected_names"],
argvalues=[("ATAC", {"atac-PE"}),
(("WGBS", "RRBS"), {WGBS_NAME, RRBS_NAME}),
- ({"RNA", "CHIP"}, {RNA_NAME, CHIP_NAME})],
- ids=lambda incl_exp_pair: "{}-{}".format(*incl_exp_pair))
+ ({"RNA", "CHIP"}, {RNA_NAME, CHIP_NAME})])
def test_inclusion(self, samples, project, inclusion, expected_names):
""" Sample objects can be selected for by protocol. """
_assert_samples(samples, project.samples)
diff --git a/tests/models/independent/test_Sample.py b/tests/models/independent/test_Sample.py
index 0921dd1c..db0ed957 100644
--- a/tests/models/independent/test_Sample.py
+++ b/tests/models/independent/test_Sample.py
@@ -36,7 +36,7 @@ class ParseSampleImplicationsTests:
def test_project_no_implications(self, sample, implications):
""" With no implications mapping, sample is unmodified. """
before_inference = sample.__dict__
- sample.infer_columns(implications)
+ sample.infer_attributes(implications)
after_inference = sample.__dict__
assert before_inference == after_inference
@@ -44,16 +44,13 @@ def test_project_no_implications(self, sample, implications):
def test_null_intersection_between_sample_and_implications(self, sample):
""" Sample with none of implications' fields --> no change. """
before_inference = sample.__dict__
- sample.infer_columns(self.IMPLICATIONS_MAP)
+ sample.infer_attributes(self.IMPLICATIONS_MAP)
assert before_inference == sample.__dict__
@pytest.mark.parametrize(
argnames=["implier_value", "implications"],
- argvalues=IMPLICATIONS.items(),
- ids=lambda implier_and_implications:
- "implier='{}', implications={}".format(
- implier_and_implications[0], str(implier_and_implications[1])))
+ argvalues=IMPLICATIONS.items())
def test_intersection_between_sample_and_implications(
self, sample, implier_value, implications):
""" Intersection between implications and sample fields --> append. """
@@ -64,7 +61,7 @@ def test_intersection_between_sample_and_implications(
# Set the parameterized value for the implications source field.
setattr(sample, self.IMPLIER_NAME, implier_value)
- sample.infer_columns(self.IMPLICATIONS_MAP)
+ sample.infer_attributes(self.IMPLICATIONS_MAP)
# Validate updates to sample based on column implications & inference.
for implied_name, implied_value in implications.items():
@@ -85,7 +82,7 @@ def no_implied_values():
no_implied_values()
setattr(sample, self.IMPLIER_NAME, unmapped_implier_value)
- sample.infer_columns(self.IMPLICATIONS_MAP)
+ sample.infer_attributes(self.IMPLICATIONS_MAP)
no_implied_values()
@@ -246,7 +243,7 @@ def prj_data(self, request):
"results_subdir": "results_pipeline",
"submission_subdir": "submission"},
DATA_SOURCES_SECTION: self.DATA_SOURCES,
- "derived_columns": [data_src]}
+ "derived_attributes": [data_src]}
@named_param(
@@ -261,7 +258,7 @@ def test_equivalence_between_implicit_and_explicit_prj(
# Explicitly-passed object needs to at least be an AttributeDict.
sample_data = AttributeDict(
{SAMPLE_NAME_COLNAME: "arbitrary_sample", "prj": prj_data,
- data_src_attr: src_key, "derived_columns": [data_src_attr]})
+ data_src_attr: src_key, "derived_attributes": [data_src_attr]})
# Create the samples and make the calls under test.
s = Sample(sample_data)
@@ -291,8 +288,8 @@ def test_prefers_explicit_project_context(self, prj_data):
assert new_src_val == getattr(s, DATA_SOURCE_COLNAME)
- @named_param(argnames="exclude_derived_columns", argvalues=[False, True])
- def test_no_derived_columns(self, prj_data, exclude_derived_columns):
+ @named_param(argnames="exclude_derived_attributes", argvalues=[False, True])
+ def test_no_derived_attributes(self, prj_data, exclude_derived_attributes):
""" Passing Sample's project is equivalent to its inference. """
# Here we're disinterested in parameterization w.r.t. data source key,
@@ -300,8 +297,8 @@ def test_no_derived_columns(self, prj_data, exclude_derived_columns):
src_key = self.SOURCE_KEYS[0]
# Explicitly-passed object needs to at least be an AttributeDict.
- if exclude_derived_columns:
- prj_data.pop("derived_columns")
+ if exclude_derived_attributes:
+ prj_data.pop("derived_attributes")
sample_data = {
SAMPLE_NAME_COLNAME: "arbitrary_sample", "prj": prj_data,
DATA_SOURCE_COLNAME: src_key}
@@ -317,7 +314,7 @@ def test_no_derived_columns(self, prj_data, exclude_derived_columns):
# Check results.
putative_new_attr = self.DATA_SOURCES[src_key]
- if exclude_derived_columns:
+ if exclude_derived_attributes:
# The value to which the source key maps won't have been added.
assert not hasattr(s, putative_new_attr)
assert putative_new_attr not in s
diff --git a/tests/models/integration/test_Project_Sample_interaction.py b/tests/models/integration/test_Project_Sample_interaction.py
index 9727d2a3..1a834937 100644
--- a/tests/models/integration/test_Project_Sample_interaction.py
+++ b/tests/models/integration/test_Project_Sample_interaction.py
@@ -31,21 +31,22 @@
NAME_ANNOTATIONS_FILE = "annotations.csv"
SAMPLE_NAMES = ["WGBS_mm10", "ATAC_mm10", "WGBS_rn6", "ATAC_rn6"]
-COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"]
+PROTOCOL_COLNAME = "protocol"
+COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", PROTOCOL_COLNAME]
VALUES1 = [random.randint(-5, 5) for _ in range(len(SAMPLE_NAMES))]
VALUES2 = [random.randint(-5, 5) for _ in range(len(SAMPLE_NAMES))]
-LIBRARIES = ["WGBS", "ATAC", "WGBS", "ATAC"]
-DATA = list(zip(SAMPLE_NAMES, VALUES1, VALUES2, LIBRARIES))
+PROTOCOLS = ["WGBS", "ATAC", "WGBS", "ATAC"]
+DATA = list(zip(SAMPLE_NAMES, VALUES1, VALUES2, PROTOCOLS))
DATA_FOR_SAMPLES = [
{SAMPLE_NAME_COLNAME: SAMPLE_NAMES},
- {"val1": VALUES1}, {"val2": VALUES2}, {"library": LIBRARIES}]
+ {"val1": VALUES1}, {"val2": VALUES2}, {PROTOCOL_COLNAME: PROTOCOLS}]
PROJECT_CONFIG_DATA = {"metadata": {"sample_annotation": NAME_ANNOTATIONS_FILE}}
-PROTOCOLS = ["WGBS", "ATAC"]
def pytest_generate_tests(metafunc):
""" Customization of test cases within this module. """
+ protos = ["WGBS", "ATAC"]
if metafunc.cls == BuildSheetTests:
if "protocols" in metafunc.fixturenames:
# Apply the test case to each of the possible combinations of
@@ -53,10 +54,9 @@ def pytest_generate_tests(metafunc):
metafunc.parametrize(
argnames="protocols",
argvalues=list(itertools.chain.from_iterable(
- itertools.combinations(PROTOCOLS, x)
- for x in range(1 + len(PROTOCOLS)))),
- ids=lambda protos:
- " protocols = {} ".format(",".join(protos)))
+ itertools.combinations(protos, x)
+ for x in range(1 + len(protos)))),
+ ids=lambda ps: " protocols = {} ".format(",".join(ps)))
if "delimiter" in metafunc.fixturenames:
metafunc.parametrize(argnames="delimiter", argvalues=[",", "\t"])
@@ -102,7 +102,7 @@ def samples_rawdata():
@pytest.fixture(scope="function")
def sample_sheet(samples_rawdata):
df = pd.DataFrame(samples_rawdata)
- df.columns = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"]
+ df.columns = [SAMPLE_NAME_COLNAME, "val1", "val2", PROTOCOL_COLNAME]
return df
@@ -195,7 +195,7 @@ def test_multiple_samples(
# But the sheet permits filtering to specific protocol(s).
exp_num_samples = len(SAMPLE_NAMES) if not protocols else \
- sum(sum(1 for l in LIBRARIES if l == p) for p in protocols)
+ sum(sum(1 for p2 in PROTOCOLS if p2 == p1) for p1 in protocols)
sheet = p.build_sheet(*protocols)
assert exp_num_samples == len(sheet)
if protocols:
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0456cf70..e9d539d9 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,13 @@
""" Tests for utility functions """
import copy
+import random
+import string
+import sys
+if sys.version_info < (3, 3):
+ from collections import Mapping
+else:
+ from collections.abc import Mapping
import mock
import pytest
@@ -8,7 +15,8 @@
from peppy import AttributeDict, Project, Sample
from peppy.const import SAMPLE_INDEPENDENT_PROJECT_SECTIONS, SAMPLE_NAME_COLNAME
from peppy.utils import \
- add_project_sample_constants, copy as pepcopy, grab_project_data
+ add_project_sample_constants, coll_like, copy as pepcopy, \
+ grab_project_data, has_null_value, non_null_value
from tests.helpers import named_param, nonempty_powerset
@@ -38,8 +46,8 @@ def basic_project_data():
"output_dir": "outdir",
"results_subdir": "results_pipeline",
"submission_subdir": "submission"},
- "derived_columns": ["data_source"],
- "implied_columns": {"organism": {"genomes": {
+ "derived_attributes": ["data_source"],
+ "implied_attributes": {"organism": {"genomes": {
"mouse": "mm10", "rat": "rn6", "human": "hg38"}}},
"trackhubs": []
}
@@ -171,6 +179,92 @@ def test_name_collision(self, basic_sample, collision, old_val, new_val):
+def _randcoll(pool, dt):
+    """
+    Generate random collection of the requested type from 1-10 random draws.
+
+    :param Iterable pool: elements from which to choose
+    :param type dt: type of collection to create (tuple, list, set, or dict)
+    :return Iterable[object]: collection, of type dt, of randomly chosen
+        elements; a set may hold fewer elements than were drawn, and a dict
+        maps draw index to the drawn element
+    :raise TypeError: if dt is not one of the supported collection types
+    """
+    valid_types = [tuple, list, set, dict]
+    if dt not in valid_types:
+        raise TypeError("{} is an invalid type; choose from {}".
+                        format(str(dt), ", ".join(str(t) for t in valid_types)))
+    rs = [random.choice(pool) for _ in range(random.randint(1, 10))]
+    # Convert to the requested type; returning the raw list for tuple/set
+    # requests would mean callers never actually get those types.
+    return dict(enumerate(rs)) if dt is dict else dt(rs)
+
+
+
+@pytest.mark.parametrize(
+    ["arg", "exp"],
+    [(random.randint(-sys.maxsize - 1, sys.maxsize), False),
+     (random.random(), False),
+     (random.choice(string.ascii_letters), False),
+     ([], True), (set(), True), (dict(), True), (tuple(), True),
+     (_randcoll(string.ascii_letters, list), True),
+     (_randcoll(string.ascii_letters, dict), True),
+     (_randcoll(list(range(10)), tuple), True),
+     (_randcoll(list(range(10)), set), True)]
+)
+def test_coll_like(arg, exp):
+    """ Atomic values are not collection-like; builtin collections are. """
+    observed = coll_like(arg)
+    assert exp == observed
+
+
+def _get_empty_attrdict(data):
+    """ Create a fresh AttributeDict, add the given entries, and return it. """
+    attrs = AttributeDict()
+    attrs.add_entries(data)
+    return attrs
+
+
+class NullValueHelperTests:
+    """ Tests of accuracy of null value arbiter. """
+
+    # Baseline non-null data: one atomic value and one nonempty collection.
+    _DATA = {"a": 1, "b": [2]}
+
+    # No skip mark on the fixture: pytest marks have no effect on fixture
+    # functions, and newer pytest versions reject them.
+    # The AttributeDict case goes through the module's helper so that the
+    # fixture yields the populated instance itself, without relying on
+    # whatever add_entries happens to return.
+    @pytest.fixture(
+        params=[lambda d: dict(d),
+                _get_empty_attrdict,
+                lambda d: _DummyProject(d)],
+        ids=["dict", AttributeDict.__name__, _DummyProject.__name__])
+    def kvs(self, request):
+        """ For test cases provide KV pair map of parameterized type."""
+        return request.param(self._DATA)
+
+    def test_missing_key_neither_null_nor_non_null(self, kvs):
+        """ A key not in a mapping has neither null nor non-null value. """
+        k = "new_key"
+        assert k not in kvs
+        assert not has_null_value(k, kvs)
+        assert not non_null_value(k, kvs)
+
+    @pytest.mark.parametrize("coll", [list(), set(), tuple(), dict()])
+    def test_empty_collection_is_null(self, coll, kvs):
+        """ A key with an empty collection instance as its value is null. """
+        ck = "empty"
+        assert ck not in kvs
+        kvs[ck] = coll
+        assert has_null_value(ck, kvs)
+        assert not non_null_value(ck, kvs)
+
+    def test_None_is_null(self, kvs):
+        """ A key with None as value is null. """
+        bad_key = "nv"
+        assert bad_key not in kvs
+        kvs[bad_key] = None
+        assert has_null_value(bad_key, kvs)
+        assert not non_null_value(bad_key, kvs)
+
+    @pytest.mark.parametrize("k", _DATA.keys())
+    def test_non_nulls(self, k, kvs):
+        """ Keys with non-None atomic or nonempty collection are non-null. """
+        assert k in kvs
+        assert non_null_value(k, kvs)
+
+
+
def test_copy():
""" Test reference and equivalence comparison operators. """
class ExampleObject: