In [1]:
import ipyrad.analysis as ipa
import toytree
import toyplot

In [2]:
print('ipyrad', ipa.__version__)
print('toytree', toytree.__version__)
! treemix --version | grep 'TreeMix v. '

ipyrad 0.9.43
toytree 1.1.2
TreeMix v. 1.13


In [3]:
# Path to the HDF5-formatted SNPs file; it is passed as `data=` to every
# ipa.treemix() call in this notebook. The path is relative, so it is
# resolved against the notebook's working directory.
data = "P_ni_6rm_v9.snps.hdf5"

In [4]:
# Population map: group name -> list of sample names assigned to that group.
# It is passed as `imap=` to every ipa.treemix() call, and the "ref" group
# is the one used as `root=` in those calls.
# Group sizes: ref 1, Inam 3, Puru 14, JiArTa 48, Para 11 (77 samples total).
# NOTE(review): "Puru" includes P_ni_T22153_jigu and P_ni_T6243_In, whose
# name suffixes match samples placed in other groups -- confirm this
# assignment is intentional.
imap = {
"ref": ["reference"],
"Inam": ["P_ni_A7862_In", "P_ni_A7911_In", "P_ni_A7928_In"],
"Puru": ["P_ni_T22153_jigu","P_ni_T6243_In","P_ni_T5850_pu", "P_ni_T5940_pu", "P_ni_T5974_pu", "P_ni_T15938_pu","P_ni_80034_pu", "P_ni_T3609_pu", "P_ni_T3611_pu",  "P_ni_T3817_pu", "P_ni_T4043_pu", "P_ni_T4051_pu", "P_ni_T4313_pu", "P_ni_T4404_pu"],
"JiArTa": [ "P_ni_T3261_jigu","P_ni_T15863_jigu", "P_ni_T15868_jigu", "P_ni_T15871_jigu", "P_ni_A3255_jigu","P_ni_T443_ma", "P_ni_T467_ma","P_ni_T369_ma","P_ni_J434_ma", "P_ni_J461_ma", "P_ni_J462_ma", "P_ni_J485_ma", "P_ni_J210_ma", "P_ni_J227_ma", "P_ni_J260_ma", "P_ni_A2418_ma",  "P_ni_A542_ma","P_ni_J684_roar", "P_ni_J724_roar","P_ni_J361_roar", "P_ni_J363_roar", "P_ni_J371_roar", "P_ni_J373_roar", "P_ni_J381_roar", "P_ni_J385_roar", "P_ni_J389_roar", "P_ni_J417_roar","P_ni_J551_arsu", "P_ni_J602_arsu", "P_ni_J603_arsu", "P_ni_J614_arsu", "P_ni_J617_arsu","P_ni_80555_arsu","P_ni_86072_arsu", "P_ni_80684_arsu", "P_ni_80802_arsu", "P_ni_80874_arsu", "P_ni_85430_arsu","P_ni_T14543_suta",  "P_ni_T9076_suta","P_ni_T16698_suta","P_ni_T10967_suta",  "P_ni_T11888_suta","P_ni_T10204_suta","P_ni_A15120_suta", "P_ni_77876_suta", "P_ni_78155_suta","P_ni_85721_suta"],
"Para": ["P_ni_T1642_pa", "P_ni_T18703_pa","P_ni_T12345_pa", "P_ni_T12854_pa","P_ni_T11193_pa", "P_ni_T11222_pa","P_ni_T10673_pa", "P_ni_T10940_pa","P_ni_A7066_pa", "P_ni_A14342_pa", "P_ni_A15277_pa",]
}

# Per-group coverage threshold: the minimum fraction of a group's samples
# (0.75 = 75%) that must be present at a SNP. The same value is applied
# to every group in imap.
minmap = dict.fromkeys(imap, 0.75)

In [5]:
# Single exploratory TreeMix analysis object: rooted on the "ref" group,
# m=2 admixture edges, and a fixed seed so this particular fit can be
# repeated.
tmx1 = ipa.treemix(
    data=data,
    imap=imap,
    minmap=minmap,
    root="ref",
    m=2,
    seed=123456,
)

Samples: 77
Sites before filtering: 1247688
Filtered (indels): 0
Filtered (bi-allel): 27379
Filtered (mincov): 0
Filtered (minmap): 1075782
Filtered (combined): 1079600
Sites after filtering: 168088
Sites containing missing values: 148813 (88.53%)
Missing values in SNP matrix: 651284 (5.03%)
subsampled 24349 unlinked SNPs


In [6]:
# Show the exact TreeMix command line that the tmx1 object will execute
# (input/output paths, -m, -seed and -root flags), then run the analysis.
print(tmx1.command)
tmx1.run()

/home/lmusher/array1/miniconda3/envs/treemix/bin/treemix -i /array1/lmusher/rio_roosevelt_outfiles/P_ni_6rm_v9_outfiles/analysis-treemix/test.treemix.in.gz -o /array1/lmusher/rio_roosevelt_outfiles/P_ni_6rm_v9_outfiles/analysis-treemix/test -m 2 -seed 123456 -root ref


In [7]:
# Draw the tree produced by the tmx1 run; the trailing ';' suppresses
# the cell's text repr of the returned value.
tmx1.draw_tree();

In [8]:
# Draw the covariance matrix from the tmx1 run; the trailing ';'
# suppresses the cell's text repr of the returned value.
tmx1.draw_cov();

#We now randomly sample 1 SNP per locus over 250 replicates (see `boots`) to see how the likelihood changes.
#Each replicate also uses a randomly drawn completeness threshold (the minimum fraction of samples with data in every group), ranging from 50% to 94%.
#We do this for m = 0 to 3 admixture edges.

In [9]:
import random

# Seed Python's RNG so the coverage thresholds drawn with
# random.randrange() in the replicate loops below come out the same on a
# fresh Restart & Run All. The ipa.treemix() calls in those loops pass no
# seed= argument, so this only fixes the thresholds, not the fits.
RANDOM_SEED = 123456
random.seed(RANDOM_SEED)

# Replicate indices: one TreeMix fit per entry, for each value of m.
boots = range(250)

In [10]:
# Replicate fits with m=0 admixture edges.
# tests0: replicate index -> log-likelihood of the fit
# samp0:  replicate index -> coverage threshold used for that fit
tests0, samp0 = {}, {}
for rep in boots:
    # randrange(50, 95) yields an integer percent in 50..94; convert it
    # to a fraction and apply it to every group
    min_cov = random.randrange(50, 95) / 100

    # no seed= is passed here (unlike tmx1 above)
    tmx = ipa.treemix(
        data=data,
        imap=imap,
        minmap=dict.fromkeys(imap, min_cov),
        root="ref",
        global_=True,
        m=0,
        quiet=True,
    )

    # fit the model, then record its likelihood and threshold
    tmx.run()
    tests0[rep] = tmx.results.llik
    samp0[rep] = min_cov

In [11]:
# Replicate fits with m=1 admixture edge.
# tests1: replicate index -> log-likelihood of the fit
# samp1:  replicate index -> coverage threshold used for that fit
tests1, samp1 = {}, {}
for rep in boots:
    # randrange(50, 95) yields an integer percent in 50..94; convert it
    # to a fraction and apply it to every group
    min_cov = random.randrange(50, 95) / 100

    # no seed= is passed here (unlike tmx1 above)
    tmx = ipa.treemix(
        data=data,
        imap=imap,
        minmap=dict.fromkeys(imap, min_cov),
        root="ref",
        global_=True,
        m=1,
        quiet=True,
    )

    # fit the model, then record its likelihood and threshold
    tmx.run()
    tests1[rep] = tmx.results.llik
    samp1[rep] = min_cov

In [12]:
# Replicate fits with m=2 admixture edges.
# tests2: replicate index -> log-likelihood of the fit
# samp2:  replicate index -> coverage threshold used for that fit
tests2, samp2 = {}, {}
for rep in boots:
    # randrange(50, 95) yields an integer percent in 50..94; convert it
    # to a fraction and apply it to every group
    min_cov = random.randrange(50, 95) / 100

    # no seed= is passed here (unlike tmx1 above)
    tmx = ipa.treemix(
        data=data,
        imap=imap,
        minmap=dict.fromkeys(imap, min_cov),
        root="ref",
        global_=True,
        m=2,
        quiet=True,
    )

    # fit the model, then record its likelihood and threshold
    tmx.run()
    tests2[rep] = tmx.results.llik
    samp2[rep] = min_cov

In [13]:
# Replicate fits with m=3 admixture edges.
# tests3: replicate index -> log-likelihood of the fit
# samp3:  replicate index -> coverage threshold used for that fit
tests3, samp3 = {}, {}
for rep in boots:
    # randrange(50, 95) yields an integer percent in 50..94; convert it
    # to a fraction and apply it to every group
    min_cov = random.randrange(50, 95) / 100

    # no seed= is passed here (unlike tmx1 above)
    tmx = ipa.treemix(
        data=data,
        imap=imap,
        minmap=dict.fromkeys(imap, min_cov),
        root="ref",
        global_=True,
        m=3,
        quiet=True,
    )

    # fit the model, then record its likelihood and threshold
    tmx.run()
    tests3[rep] = tmx.results.llik
    samp3[rep] = min_cov

In [14]:
import numpy

# Mean log-likelihood across replicates for each number of admixture
# edges; position k in `tests` corresponds to m=k.
tests = [
    numpy.mean([lliks[rep] for rep in boots])
    for lliks in (tests0, tests1, tests2, tests3)
]

# keep the individual means available under their per-m names
test0, test1, test2, test3 = tests

In [15]:
# Line plot of the mean log-likelihood (y) against the number of
# admixture edges m (x); x is simply the position in `tests`.
toyplot.plot(
    range(len(tests)),
    list(tests),
    xlabel="# admixture edges",
    ylabel="ln(likelihood)",
    color="steelblue",
    stroke_width=4,
    width=350,
    height=275,
);

In [16]:
# import pandas as pd 
import pandas as pd 

# Flatten the per-replicate dicts into lists ordered by replicate index:
# testK  = log-likelihoods of the m=K fits
# sampsK = coverage thresholds used for the m=K fits
test0, test1, test2, test3 = (
    [lliks[rep] for rep in boots]
    for lliks in (tests0, tests1, tests2, tests3)
)
samps0, samps1, samps2, samps3 = (
    [covs[rep] for rep in boots]
    for covs in (samp0, samp1, samp2, samp3)
)

# One row per replicate; 'mK' holds the likelihood and 'sK' the threshold
# for K admixture edges.
df = pd.DataFrame({
    'm0': test0, 's0': samps0,
    'm1': test1, 's1': samps1,
    'm2': test2, 's2': samps2,
    'm3': test3, 's3': samps3,
})

# NOTE(review): the file name starts with 'G_cy' while the dataset used in
# this notebook is 'P_ni' -- confirm this is the intended output name.
df.to_csv(r'./analysis-treemix/G_cy_nadmix_tests.csv')

In [17]:
# Preview the first rows of the replicate table: 'mK' columns hold the
# log-likelihoods and 'sK' columns the thresholds for K admixture edges.
df.head()

Unnamed: 0,m0,s0,m1,s1,m2,s2,m3,s3
0,92.019,0.74,105.821,0.76,110.373,0.53,106.377,0.74
1,91.925,0.85,106.571,0.69,103.631,0.9,110.147,0.53
2,84.274,0.53,106.345,0.73,103.847,0.88,106.392,0.78
3,81.375,0.63,106.22,0.76,109.765,0.61,104.792,0.83
4,95.731,0.89,105.673,0.81,106.308,0.78,110.247,0.52


In [18]:
# Coverage thresholds to compare; each one gets its own tree panel.
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 0.99]

# A wide canvas holding a single row of panels, one per threshold.
canvas = toyplot.Canvas(width=1200, height=200)

for panel, min_cov in enumerate(thresholds):
    # m=1 fit using this threshold for every group; no seed= is passed
    tmx = ipa.treemix(
        data=data,
        imap=imap,
        minmap=dict.fromkeys(imap, min_cov),
        root="ref",
        global_=True,
        m=1,
        quiet=True,
    )
    tmx.run()

    # place the resulting tree in the next cell of the 1-row grid
    axes = canvas.cartesian(grid=(1, len(thresholds), panel))
    tmx.draw_tree(axes)