
# STRUCTURE

In [46]:
import ipyrad.analysis as ipa
import toyplot
import ipyparallel as ipp

In [47]:
## look for running ipcluster instance, and create load-balancer
ipyclient = ipp.Client()
print("{} engines found".format(len(ipyclient)))

24 engines found


In [48]:
# the path to your HDF5 formatted snps file
data = "./M_ru_3rm_v9.snps.hdf5"

In [72]:
imap = {
#"ref": ["reference"],
#"Inam": ['M_ru_A7875_In','M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'],
#"Puru": ['M_ru_A10311_pu','M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'],
#"JiGu": ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu'],
"Mach": ['M_ru_T368_ma', 'M_ru_T381_ma', 'M_ru_T476_ma', 'M_ru_T494_ma'],#'M_ru_T13253_ma','M_ru_A474_ma', 'M_ru_T3164_ma', 'M_ru_J265_ma', 
"Roar": ['M_ru_J640_roar','M_ru_J676_roar'],
"ArSu": ['M_ru_80819_arsu', 'M_ru_81347_arsu', 'M_ru_85426_arsu'],
"SuTa": [ 'M_ru_85919_suta','M_ru_77750_suta', 'M_ru_78182_suta', 'M_ru_A11834_suta', 'M_ru_A15176_suta', 'M_ru_A5487_suta', 'M_ru_T10184_suta', 'M_ru_T11780_suta', 'M_ru_T14532_suta', 'M_ru_T14622_suta', 'M_ru_T753_suta', 'M_ru_A9903_pa'],
"Para": ['M_ru_A16195_pa','M_ru_A9235_pa', 'M_ru_T11079_pa', 'M_ru_T11238_pa', 'M_ru_T12541_pa', 'M_ru_T1649_pa', 'M_ru_T16553_pa', 'M_ru_T19782_pa', 'M_ru_T6500_pa', 'M_ru_T6577_pa']
}
 
# minimum % of samples that must be present in each SNP from each group
minmap = {i: 0.5 for i in imap}

In [5]:
# init analysis object with input data and (optional) parameter options
struct = ipa.structure(
    name="M_ru_east_str",
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.95,
)

Samples: 31
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 9343
Filtered (mincov): 726024
Filtered (minmap): 770178
Filtered (combined): 771026
Sites after filtering: 48908
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)


In [6]:
struct.mainparams.burnin = 25000
struct.mainparams.numreps = 100000
struct.write_structure_files(abs)

('/array1/lmusher/rio_roosevelt_outfiles/M_ru_3rm_v9_outfiles/analysis-structure/tmp-M_ru_east_str-<built-in function abs>-1.mainparams.txt',
 '/array1/lmusher/rio_roosevelt_outfiles/M_ru_3rm_v9_outfiles/analysis-structure/tmp-M_ru_east_str-<built-in function abs>-1.extraparams.txt',
 '/array1/lmusher/rio_roosevelt_outfiles/M_ru_3rm_v9_outfiles/analysis-structure/tmp-M_ru_east_str-<built-in function abs>-1.strfile.txt')

In [None]:
struct.run(nreps=10, kpop=[2, 3, 4, 5, 6, 7, 8], ipyclient=ipyclient, force=True)

Parallel connection | amnh-gen-001.internal.amnh.org: 24 cores
[######              ]  34% 2:02:26 | running 70 structure jobs 

In [50]:
struct = ipa.structure(
    data=data, 
    name="M_ru_east_str", 
    workdir="analysis-structure",
    imap=imap,
    load_only=True,
)

70 previous results loaded for run [M_ru_east_str]


In [51]:
etable = struct.get_evanno_table([2, 3, 4, 5, 6, 7, 8])
etable

Unnamed: 0,Nreps,deltaK,estLnProbMean,estLnProbStdev,lnPK,lnPPK
2,10,0.0,-14427.48,343.098,0.0,0.0
3,10,3.095,-15400.72,317.891,-973.24,983.89
4,10,8.775,-15390.07,297.86,10.65,2613.8
5,10,0.108,-17993.22,8583.985,-2603.15,925.26
6,10,0.366,-19671.11,6135.162,-1677.89,2246.33
7,10,0.017,-23595.33,12732.423,-3924.22,217.88
8,10,0.0,-27737.43,10549.358,-4142.1,0.0


In [52]:
# get canvas object and set size
canvas = toyplot.Canvas(width=400, height=300)

# plot the mean log probability of the models in red
axes = canvas.cartesian(ylabel="estLnProbMean")
axes.plot(etable.estLnProbMean * -1, color="darkred", marker="o")
axes.y.spine.style = {"stroke": "darkred"}

# plot delta K with its own scale bar of left side and in blue
axes = axes.share("x", ylabel="deltaK", ymax=etable.deltaK.max() + etable.deltaK.max() * .25)
axes.plot(etable.deltaK, color="steelblue", marker="o");
axes.y.spine.style = {"stroke": "steelblue"}

# set x labels
axes.x.ticks.locator = toyplot.locator.Explicit(range(len(etable.index)), etable.index)
axes.x.label.text = "K (N ancestral populations)"

In [53]:
k = 2
table = struct.get_clumpp_table(k)

[K2] 10/10 results permuted across replicates (max_var=0).


In [54]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [55]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [56]:
k = 3
table = struct.get_clumpp_table(k)

[K3] 10/10 results permuted across replicates (max_var=0).


In [57]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [58]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [73]:
k = 4
table = struct.get_clumpp_table(k)

[K4] 10/10 results permuted across replicates (max_var=0).


In [74]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [71]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [75]:
k = 5
table = struct.get_clumpp_table(k)

[K5] 10/10 results permuted across replicates (max_var=0).


In [76]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [77]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [78]:
k = 6
table = struct.get_clumpp_table(k)

[K6] 10/10 results permuted across replicates (max_var=0).


In [79]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [80]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

# PCA

In [81]:
#!conda install scikit-learn -c conda-forge -y

In [82]:
import ipyrad.analysis as ipa
import pandas as pd
import toyplot

In [83]:
# init pca object with input data and (optional) parameter options
pca = ipa.pca(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
)

Samples: 31
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 9343
Filtered (mincov): 500244
Filtered (minmap): 770178
Filtered (combined): 771026
Sites after filtering: 48908
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)


In [84]:
# run the PCA analysis
pca.run()

Subsampling SNPs: 6264/48908


In [85]:
pca.draw()

(<toyplot.canvas.Canvas at 0x2b31d85c7110>,
 <toyplot.coordinates.Cartesian at 0x2b31d85c7710>,
 <toyplot.mark.Point at 0x2b31d8dcf050>)

In [86]:
# store the PC axes as a dataframe
df = pd.DataFrame(pca.pcaxes[0], index=pca.names)

# write the PC axes to a CSV file
df.to_csv("M_ru_pca_analysis.csv")

# show the first ten samples and the first 10 PC axes
df.iloc[:10, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
M_ru_77750_suta,-6.37,-1.73,0.68,2.87,1.76,-0.95,-0.01,-1.15,-0.11,-1.61
M_ru_78182_suta,-7.2,-1.71,-0.69,1.97,5.16,-0.97,-1.74,-0.03,0.07,5.36
M_ru_80819_arsu,-6.82,-1.61,-0.64,1.12,0.81,3.33,3.64,5.16,-1.3,-1.04
M_ru_81347_arsu,-6.73,-1.11,-0.16,2.8,-0.76,1.4,2.51,3.62,-1.19,2.15
M_ru_85426_arsu,-6.18,-2.16,1.9,3.69,-2.25,6.3,-2.51,-1.79,-3.4,-0.96
M_ru_85919_suta,-7.01,-2.25,0.52,-0.58,6.1,-0.61,-0.82,0.29,0.55,-1.67
M_ru_A11834_suta,-6.77,-2.6,-2.9,-0.12,1.43,0.25,-2.5,-5.11,-1.27,-1.89
M_ru_A15176_suta,-5.97,-3.33,0.67,-4.56,-0.13,1.01,-1.4,0.7,3.48,-2.8
M_ru_A16195_pa,13.94,0.28,-0.15,0.01,0.03,-0.16,0.25,-1.04,-0.23,0.19
M_ru_A5487_suta,-6.83,-1.78,-1.89,-1.28,-4.29,-3.63,2.48,-3.23,-2.48,1.93


# Subsampling with replication

In [87]:
# plot PC axes 0 and 2 with many replicate subsamples
pca.run(nreplicates=25, seed=12345)
pca.draw(0, 1);

Subsampling SNPs: 6264/48908


# Kmeans imputation (integer)

In [88]:
# kmeans imputation 
pca3 = ipa.pca(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method=7,
)

# run and draw results for kmeans clustering into 7 groups
pca3.run(nreplicates=25, seed=123)
pca3.draw(0, 1);

Subsampling SNPs: 6264/48908


In [89]:
import toyplot.pdf

# save returned plot objects as variables
canvas, axes, mark = pca3.draw(0, 2)

# pass the canvas object to toyplot render function
toyplot.pdf.render(canvas, "M_ru_PCA-kmeans-7.pdf")

# T-SNE (ASSESSING COMPONENT LOADINGS)

In [90]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

In [91]:
pca.run_tsne(subsample=True, perplexity=3.0, n_iter=100000, seed=623)

Subsampling SNPs: 6264/48908


In [92]:
pca.draw(size=8);

In [93]:
import toyplot.pdf

# save returned plot objects as variables
canvas, axes, mark = pca.draw(size=8)

# pass the canvas object to toyplot render function
toyplot.pdf.render(canvas, "M_ru_TSNE_perp3.pdf")

# GENETIC DISTNANCES

In [94]:
import ipyrad.analysis as ipa
import toyplot

In [95]:
# load the snp data into distance tool with arguments
from ipyrad.analysis.distance import Distance
dist = Distance(
    data=data, 
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
    subsample_snps=False,
)
dist.run()

Samples: 31
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 9343
Filtered (mincov): 500244
Filtered (minmap): 770178
Filtered (combined): 771026
Sites after filtering: 48908
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


In [96]:
# save to a CSV file
dist.dists.to_csv("M_ru_distances.csv")

In [97]:
# save to a CSV file with no labels (eems style)
dist.dists.to_csv(
    "M_ru_distances_eems.csv",
    header=None,
    index=False,
    sep=" ",
)

In [101]:
# get list of concatenated names from each group
ordered_names = []
for group in dist.imap.values():
    ordered_names += group

# reorder matrix to match name order    
ordered_matrix = dist.dists[ordered_names].T[ordered_names]

In [102]:
toyplot.matrix(
    ordered_matrix,
    bshow=False,
    tshow=False,
    rlocator=toyplot.locator.Explicit(
        range(len(ordered_names)),
        ordered_names,
));