In [1]:
%load_ext Cython

# Integrating fwdpy with fwdpp and libsequence

## Run some simulations

In [2]:
import fwdpy as fp
import fwdpy.fwdpyio as fpio
import multiprocessing as mp
import numpy as np

In [3]:
#We will simulate 100 replicate populations using
#Python's multiprocessing functionality.

def simpop(seed):
    """
    Evolve a single population from the given seed
    and hand it back in serialized form.

    :param seed: Value used to construct the fp.GSLrng for this replicate.
    :return: The result of fpio.serialize applied to the one evolved population.
    """
    popsize=1000
    theta=100.
    #theta/(4N): the same value is passed twice to evolve_regions below.
    #NOTE(review): presumably these are the neutral mutation rate and the
    #recombination rate, with the 0. between them being the selected
    #mutation rate -- confirm against the evolve_regions signature.
    rate=theta/(4.*float(popsize))
    #10*N entries, each equal to N
    #(presumably one population size per generation -- confirm)
    sizes=np.full(10*popsize,popsize,dtype=np.uint32)
    generator = fp.GSLrng(seed)
    regions=[fp.Region(0,1,1)]
    no_selected_regions=[]
    #Simulate 1 population (second argument).  The same region list
    #is supplied both as nregions and as recregions.
    evolved = fp.evolve_regions(generator,1,popsize,sizes,
                                rate,0.,rate,
                                regions,no_selected_regions,regions)
    #return the population as a binary string
    return fpio.serialize(evolved[0])

#Run 100 sims via a multiprocessing pool.
#The per-replicate seeds are drawn from a dedicated, explicitly seeded
#generator so that a fresh kernel reproduces the same seeds, and so that
#numpy's global random state is left untouched.
NREPLICATES=100
MASTER_SEED=101
seeds=[int(i) for i in np.random.RandomState(MASTER_SEED).randint(0,42000000,NREPLICATES)]
#Auto-determine degree of parallelism
P=mp.Pool()
try:
    #map returns one serialized population per seed, in seed order
    serialized_pops = P.map(simpop,seeds)
finally:
    #Always shut the worker processes down, even if a simulation raised
    P.close()
    P.join()
pops = fpio.deserialize_singlepops(serialized_pops)

## Calculate nucleotide diversity for a sample from each simulated replicate

The next block of code is pure Python.  The function `pi_using_pylibseq` uses [pylibseq](https://molpopgen.github.io/pylibseq) to calculate nucleotide diversity in a sample of size $n=100$.  The Python package pylibseq is based on the C++11 library [libsequence](https://molpopgen.github.io/libsequence).

In [4]:
import libsequence.polytable as pt
import libsequence.summstats as st

def pi_using_pylibseq(rng,x,nsam):
    """
    Draw a sample from a population and summarize it with pylibseq.

    :param rng: Random number generator handed to fp.ms_sample.
    :param x: The population to sample from.
    :param nsam: The sample size requested from fp.ms_sample.
    :return: The value of PolySIM.thetapi() for the sample.
    """
    sample=fp.ms_sample(rng,x,nsam)
    #Load the sample into a libsequence SimData container
    simdata=pt.SimData()
    simdata.assign(sample)
    return st.PolySIM(simdata).thetapi()

Behind the scenes, [Cython](http://www.cython.org) is used to pass the simulated population's data back to functions from both the [fwdpp](http://molpopgen.github.io) and [libsequence](https://molpopgen.github.io/libsequence) libraries.

fwdpy exposes a lot of the [fwdpp](http://molpopgen.github.io) API to you via Cython, and [pylibseq](https://molpopgen.github.io/pylibseq) gives you access to what you need from [libsequence](https://molpopgen.github.io/libsequence) to analyze simulation output.

Let's re-implement `pi_using_pylibseq` ourselves, so we can see some of what we can do:

### Calculate nucleotide diversity using fwdpy's Cython APIs

In [5]:
# Include paths reported by fwdpy.  The %%cython cell magics in this
# notebook interpolate these two names ($fwdpy_includes, $fwdpp_includes)
# after their -I flags, so the names must not change.
fwdpy_includes = fp.get_includes()
fwdpp_includes = fp.get_fwdpp_includes()

In [6]:
%%cython --cplus --compile-args=-std=c++11 -I $fwdpy_includes -I $fwdpp_includes -l sequence -l gsl -l gslcblas

from fwdpy.fwdpy cimport *
from cython.operator cimport dereference as deref
cdef pi_from_singlepop_cpp(const gsl_rng * r,const singlepop_t * pop, const unsigned nsam):
    # Nucleotide diversity for a sample of size nsam taken from pop:
    #   (sum over sampled entries of 2*k*(nsam-k)) / (nsam*(nsam-1)),
    # where k is the number of b'1' characters in the second element
    # of each entry returned by sample_single.
    # NOTE(review): the trailing True passed to sample_single is presumably
    # the "remove fixed variants" switch -- confirm against fwdpy's declarations.
    # NOTE(review): for nsam < 2 the denominator below evaluates to zero.
    sample = sample_single[singlepop_t](r,deref(pop),nsam,True)
    total = 0.
    for site in sample:
        ncopies = float(site[1].count(b'1'))
        total += 2.0*ncopies*(float(nsam)-ncopies)
    return total/(float(nsam)*float(nsam-1))

# Python-callable entry point for pi_from_singlepop_cpp.
# Pulls the raw pointers out of the GSLrng and Spop wrappers
# (rng.thisptr.get() and pop.pop.get()) and forwards them, together
# with the sample size nsam, to the cdef function, returning its result.
def pi_using_cython(GSLrng rng,Spop pop,nsam):
    return pi_from_singlepop_cpp(rng.thisptr.get(),pop.pop.get(),nsam)

In [7]:
# Time the pylibseq-based calculation over every population in pops,
# starting from a generator freshly seeded with 100.
rng=fp.GSLrng(100)
%timeit -n10 pi = [pi_using_pylibseq(rng,i,100) for i in pops]

1 loop, best of 3: 787 ms per loop


In [8]:
# Time the hand-written Cython calculation over every population in pops,
# starting from a generator freshly seeded with 100.
rng=fp.GSLrng(100)
%timeit -n10 pi = [pi_using_cython(rng,i,100) for i in pops]

1 loop, best of 3: 461 ms per loop


In [9]:
# Compare the two implementations on the first population.
# The generator is re-created with seed 100 before each call so that
# both functions start from an identically seeded generator.
rng=fp.GSLrng(100)
print(pi_using_pylibseq(rng,pops[0],100))
rng=fp.GSLrng(100)
print(pi_using_cython(rng,pops[0],100))

85.70808080808064
85.70808080808081


In [28]:
%%cython --cplus --compile-args=-std=c++11 -I $fwdpy_includes -I $fwdpp_includes -l sequence -l gsl -l gslcblas

from fwdpy.fwdpy cimport *
from libcpp.memory cimport unique_ptr
from libsequence.polytable cimport CppSimData
from libsequence.summstats cimport CppPolySIM
from cython.operator cimport dereference as deref

cdef pi_using_pylibseq_cpp(const gsl_rng * r, const singlepop_t * pop, const unsigned nsam):
    # Sample nsam from pop, copy the sample into a libsequence CppSimData,
    # and return CppPolySIM.ThetaPi() computed on it, all at the C++ level.
    # NOTE(review): the trailing True passed to sample_single is presumably
    # the "remove fixed variants" switch -- confirm against fwdpy's declarations.
    cdef sample_t sample = sample_single[singlepop_t](r,deref(pop),nsam,True)
    cdef CppSimData simdata = CppSimData(sample.begin(),sample.end())
    # The calculator is heap-allocated and owned by a unique_ptr, which
    # releases it when this function returns.  It is given the address of
    # simdata, so simdata must stay alive for as long as it is used.
    cdef unique_ptr[CppPolySIM] calculator
    calculator.reset(new CppPolySIM(&simdata))
    return calculator.get().ThetaPi()

# Python-callable entry point for pi_using_pylibseq_cpp.
# Pulls the raw pointers out of the GSLrng and Spop wrappers
# (r.thisptr.get() and pop.pop.get()) and forwards them, together
# with the sample size nsam, to the cdef function, returning its result.
def pi_using_pylibseq_cython(GSLrng r,Spop pop,nsam):
    return pi_using_pylibseq_cpp(r.thisptr.get(),pop.pop.get(),nsam)

In [30]:
# Time the Cython wrapper around libsequence's C++ classes over every
# population in pops, starting from a generator freshly seeded with 100.
rng=fp.GSLrng(100)
%timeit -n10 pi = [pi_using_pylibseq_cython(rng,i,100) for i in pops]

10 loops, best of 3: 759 ms per loop
