Skip to content

Commit

Permalink
Merge pull request #588 from molpopgen/update_tskit_metadata
Browse files Browse the repository at this point in the history
Use new tskit metadata schema.
  • Loading branch information
molpopgen committed Oct 29, 2020
2 parents af250cd + 0460351 commit ce47163
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 116 deletions.
28 changes: 7 additions & 21 deletions doc/pages/advancedtopics.rst
Expand Up @@ -40,19 +40,16 @@ A common set of idioms used by these examples are:
Decoding metadata from :class:`tskit.TreeSequence`
---------------------------------------------------------------------

`tskit` and `fwdpy11` treat metadata quite differently. The former is much more general,
while the latter gives you direct access to the data objects on the C++ side.
The `tskit` approach is based on binary strings. What `fwdpy11` does is encode strings that
can be converted back to Python dictionaries. For example, here is how one may process the
individual metadata after dumping the tables to `tskit`:
As of version 0.10.0, we make use of the metadata schema introduced in `tskit` 0.3.2.
See `here <https://tskit.readthedocs.io/en/latest/tutorial.html#working-with-metadata>`_ for details.

.. code-block:: python
import tskit
individuals = ts.tables.individuals
md = tskit.unpack_bytes(individuals.metadata, individuals.metadata_offset)
first_individual = eval(md[0])
first_individual = ts.tables.individuals[0].metadata
The data stored in `first_individual` will be a :class:`dict`.

In order to distinguish "alive" from "dead" individuals (*e.g.*, those
preserved as ancient samples), we need to make use of flags found in
Expand Down Expand Up @@ -94,20 +91,9 @@ The mutation metadata follow the same general recipe:

.. code-block:: python
mutations = ts.tables.mutations
sites = ts.tables.sites
md = tskit.unpack_bytes(mutations.metadata, mutations.metadata_offset)
Here, ``md`` is a :class:`dict` whose key names are the same as the attributes of
:class`fwdpy11.Mutation`, with one exception. The `tskit` representation
of the mutation record's the allele's *age* in generations while
:attr:`fwdpy11.Mutation.g` is the generation when the mutation arose.
The reason for this discrepancy is because ``fwdpy11`` thinks forward in time
while ``tskit`` thinks backwards. The conversion to and from is trivial:

.. code-block:: python
first_mutations = ts.tables.mutations[0].metadata
print(f"Origin to age = {pop.generation - m.g + 1}")
The mutation's data will be represented as a :class:`dict`.

.. _howlongtorun:

Expand Down
108 changes: 43 additions & 65 deletions fwdpy11/_monkeypatch/_diploid_population.py
Expand Up @@ -18,11 +18,13 @@
#

import json
import typing

import numpy as np
import tskit

import fwdpy11.tskit_tools
import fwdpy11.tskit_tools.metadata_schema


def _alive_nodes(self):
Expand Down Expand Up @@ -77,65 +79,32 @@ def _traverse_sample_timepoints(self, include_alive=True):
yield self.generation, nodes, md


# NOTE: mutation origin times are recorded forwards in time,
# So we convert them into mutation ages.
def _generate_mutation_metadata(self):
muts = []
for mr in self.tables.mutations:
m = self.mutations[mr.key]
d = {
"s": m.s,
"h": m.h,
"age": self.generation - m.g + 1,
"label": m.label,
"esizes": list(m.esizes),
"heffects": list(m.heffects),
"neutral": m.neutral,
"key": mr.key,
}
muts.append(str(d).encode("utf-8"))
return tskit.pack_bytes(muts)


def _initializePopulationTable(node_view, tc):
population_metadata = []
tc.populations.metadata_schema = (
fwdpy11.tskit_tools.metadata_schema.PopulationMetadata
)
for i in sorted(np.unique(node_view["deme"])):
md = "deme" + str(i)
population_metadata.append(md.encode("utf-8"))

pmd, pmdo = tskit.pack_bytes(population_metadata)
tc.populations.set_columns(metadata=pmd, metadata_offset=pmdo)


def _generate_individual_metadata(dmd, tc):
strings = []
for i in dmd:
d = {
"g": i.g,
"e": i.e,
"w": i.w,
"geography": i.geography,
"parents": i.parents,
"sex": i.sex,
"deme": i.deme,
"label": i.label,
}
strings.append(str(d).encode("utf-8"))
return strings
tc.populations.add_row(metadata={"name": "deme" + str(i)})


def _initializeIndividualTable(self, tc):
"""
Returns node ID -> individual map
"""
tc.individuals.metadata_schema = (
fwdpy11.tskit_tools.metadata_schema.IndividualDiploidMetadata
)
# First, alive individuals:
individal_nodes = {}
flags = []
for i in range(self.N):
for i, d in enumerate(self.diploid_metadata):
individal_nodes[2 * i] = i
individal_nodes[2 * i + 1] = i
flags.append(fwdpy11.tskit_tools.INDIVIDUAL_IS_ALIVE)
metadata_strings = _generate_individual_metadata(self.diploid_metadata, tc)
tc.individuals.add_row(
flags=fwdpy11.tskit_tools.INDIVIDUAL_IS_ALIVE,
metadata=fwdpy11.tskit_tools.metadata_schema.generate_individual_metadata(
d
),
)

# Now, preserved nodes
num_ind_nodes = self.N
Expand All @@ -150,19 +119,17 @@ def _initializeIndividualTable(self, tc):
and self.tables.nodes[i.nodes[1]].time == 0.0
):
flag |= fwdpy11.tskit_tools.INDIVIDUAL_IS_FIRST_GENERATION
flags.append(flag)

metadata_strings.extend(
_generate_individual_metadata(self.ancient_sample_metadata, tc)
)
tc.individuals.add_row(
flags=flag,
metadata=fwdpy11.tskit_tools.metadata_schema.generate_individual_metadata(
i
),
)

md, mdo = tskit.pack_bytes(metadata_strings)
# flags = [0 for i in range(self.N + len(self.ancient_sample_metadata))]
tc.individuals.set_columns(flags=flags, metadata=md, metadata_offset=mdo)
return individal_nodes


def _dump_tables_to_tskit(self, parameters=None):
def _dump_tables_to_tskit(self, parameters: typing.Optional[typing.Dict] = None):
"""
Dump the population's TableCollection into
an tskit TreeSequence
Expand All @@ -179,6 +146,12 @@ def _dump_tables_to_tskit(self, parameters=None):
The provenance information is validated using
:func:`tskit.validate_provenance`, which may
raise an exception.
.. versionchanged:: 0.10.0
Use tskit metadata schema.
Mutation time is now stored in the tskit.MutationTable column.
Origin time of mutations is part of the metadata.
"""
from fwdpy11 import pybind11_version, gsl_version

Expand Down Expand Up @@ -247,16 +220,21 @@ def _dump_tables_to_tskit(self, parameters=None):
ancestral_state_offset=ancestral_state_offset,
)

derived_state = np.zeros(len(mut_view), dtype=np.int8) + ord("1")
md, mdo = _generate_mutation_metadata(self)
tc.mutations.set_columns(
site=np.arange(len(mpos), dtype=np.int32),
node=mut_view["node"],
derived_state=derived_state,
derived_state_offset=ancestral_state_offset,
metadata=md,
metadata_offset=mdo,
tc.mutations.metadata_schema = (
fwdpy11.tskit_tools.metadata_schema.determine_mutation_metadata_schema(
self.mutations
)
)
for m in self.tables.mutations:
tc.mutations.add_row(
site=m.site,
node=m.node,
derived_state="1",
time=self.generation - self.mutations[m.key].g,
metadata=fwdpy11.tskit_tools.metadata_schema.generate_mutation_metadata(
m, self.mutations
),
)
tc.provenances.add_row(json.dumps(provenance))
return tc.tree_sequence()

Expand Down
148 changes: 148 additions & 0 deletions fwdpy11/tskit_tools/metadata_schema.py
@@ -0,0 +1,148 @@
#
# Copyright (C) 2020 Kevin Thornton <krthornt@uci.edu>
#
# This file is part of fwdpy11.
#
# fwdpy11 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# fwdpy11 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with fwdpy11. If not, see <http://www.gnu.org/licenses/>.
#
import copy
import typing

import tskit

import fwdpy11

IndividualDiploidMetadata = tskit.metadata.MetadataSchema(
{
"codec": "struct",
"type": "object",
"properties": {
"g": {"type": "number", "binaryFormat": "d"},
"e": {"type": "number", "binaryFormat": "d"},
"w": {"type": "number", "binaryFormat": "d"},
"sex": {"type": "number", "binaryFormat": "i"},
"deme": {"type": "number", "binaryFormat": "i"},
"label": {"type": "number", "binaryFormat": "Q"},
"parents": {
"type": "array",
"items": {"type": "number", "binaryFormat": "Q"},
"arrayLengthFormat": "H",
},
"geography": {
"type": "array",
"items": {"type": "number", "binaryFormat": "d"},
"arrayLengthFormat": "H",
},
"nodes": {
"type": "array",
"items": {"type": "number", "binaryFormat": "i"},
"arrayLengthFormat": "H",
},
},
}
)

PopulationMetadata = tskit.metadata.MetadataSchema(
{
"codec": "json",
"type": "object",
"name": "Population metadata",
"properties": {"name": {"type": "string"}},
}
)

# NOTE: we could use JSON schema here and
# have the array fields not included in "required".
# However:
# 1. That is much slower.
# 2. This metadata contains floating point values.
# We don't want to lose precision in case anyone
# wants to try to recalculate diploid phenotypes/fitnesses

MutationMetadata = tskit.metadata.MetadataSchema(
{
"codec": "struct",
"type": "object",
"name": "Mutation metadata",
"properties": {
"s": {"type": "number", "binaryFormat": "d"},
"h": {"type": "number", "binaryFormat": "d"},
"origin": {"type": "number", "binaryFormat": "I"},
"neutral": {"type": "number", "binaryFormat": "?"},
"label": {"type": "number", "binaryFormat": "H"},
"key": {"type": "number", "binaryFormat": "Q"},
},
"additionalProperties": False,
}
)

_MutationMetaWithVectorsDict = copy.deepcopy(MutationMetadata.schema)

_MutationMetaWithVectorsDict["name"] = "Mutation metadata with vectors"
_MutationMetaWithVectorsDict["properties"]["esizes"] = {
"type": "array",
"items": {"type": "number", "binaryFormat": "d"},
}
_MutationMetaWithVectorsDict["properties"]["heffects"] = {
"type": "array",
"items": {"type": "number", "binaryFormat": "d"},
}

MutationMetadataWithVectors = tskit.metadata.MetadataSchema(
_MutationMetaWithVectorsDict
)


def generate_individual_metadata(
metadata: fwdpy11._fwdpy11.DiploidMetadata,
) -> typing.Dict:
d = {
"g": metadata.g,
"e": metadata.e,
"w": metadata.w,
"geography": [i for i in metadata.geography],
"parents": [i for i in metadata.parents],
"nodes": [i for i in metadata.nodes],
"sex": metadata.sex,
"deme": metadata.deme,
"label": metadata.label,
}
return d


def determine_mutation_metadata_schema(
mutations: fwdpy11._fwdpy11.MutationVector,
) -> tskit.metadata.MetadataSchema:
if len(mutations) == 0 or len(mutations[0].esizes) == 0:
return MutationMetadata

return MutationMetadataWithVectors


def generate_mutation_metadata(
mr: fwdpy11._fwdpy11.MutationRecord, mutations: fwdpy11._fwdpy11.MutationVector
) -> typing.Dict:
m = mutations[mr.key]
d = {
"s": m.s,
"h": m.h,
"origin": m.g,
"label": m.label,
"neutral": int(m.neutral),
"key": mr.key,
}
if len(m.esizes) > 0:
d["esizes"] = list(m.esizes)
d["heffects"] = list(m.heffects)
return d
2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -16,7 +16,7 @@ ipython
matplotlib
seaborn
msprime
tskit==0.2.3
tskit>=0.3.2
setuptools_scm==1.11.1

#below are for formatting, linting,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -11,7 +11,7 @@
from setuptools.command.build_ext import build_ext

PYBIND11_MIN_VERSION = "2.4.3"
TSKIT_MIN_VERSION = "0.2.3"
TSKIT_MIN_VERSION = "0.3.2"

if sys.version_info[0] < 3:
raise ValueError("Python 3 is required!")
Expand Down

0 comments on commit ce47163

Please sign in to comment.