
# STRUCTURE

In [1]:
import ipyrad.analysis as ipa
import toyplot
import ipyparallel as ipp

In [2]:
## look for running ipcluster instance, and create load-balancer
ipyclient = ipp.Client()
print("{} engines found".format(len(ipyclient)))

24 engines found


In [3]:
# the path to your HDF5 formatted snps file
data = "./G_cy_5rm_v9.snps.hdf5"

In [12]:
imap = {
#"ref": ["reference"],
"Inam": ["G_cy_T12385_In","G_cy_T310_In","G_cy_T23196_In","G_cy_T7636_In"],
"Puru": ["G_cy_82508_pu","G_cy_T12275_pu","G_cy_T12279_pu","G_cy_T12392_pu","G_cy_T13184_pu","G_cy_T26228_pu","G_cy_T26229_pu","G_cy_T26252_pu"],
"JiGu": ["G_cy_T3343_jigu","G_cy_T3384_jigu","G_cy_T3385_jigu"],
"Mach": ["G_cy_J296_ma","G_cy_J477_ma","G_cy_J773_ma","G_cy_T13251_ma","G_cy_T363_ma","G_cy_T364_ma"],
"Roar": ["G_cy_J691_roar","G_cy_J694_roar"],
"ArSu": ["G_cy_80582_arsu","G_cy_85678_arsu","G_cy_80701_arsu","G_cy_80801_arsu","G_cy_80826_arsu","G_cy_81108_arsu","G_cy_81118_arsu","G_cy_85499_arsu"],
"SuTa": ["G_cy_85356_suta","G_cy_86297_suta","G_cy_86321_suta","G_cy_86458_suta","G_cy_86478_suta","G_cy_T14558_suta","G_cy_T16693_suta","G_cy_T18563_suta","G_cy_T18620_suta"],
"Para": ["G_cy_T10897_pa","G_cy_T11062_pa","G_cy_T16771_pa","G_cy_T1705_pa","G_cy_T18744_pa","G_cy_T19429_pa","G_cy_T19520_pa","G_cy_T19765_pa","G_cy_T2497_pa","G_cy_T6579_pa","G_cy_T9133_pa"]
}

# minimum % of samples that must be present in each SNP from each group
minmap = {i: 0.5 for i in imap}

In [12]:
# init analysis object with input data and (optional) parameter options
struct = ipa.structure(
    name="G_cy_str",
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.95,
)

Samples: 52
Sites before filtering: 1130537
Filtered (indels): 0
Filtered (bi-allel): 21079
Filtered (mincov): 1079809
Filtered (minmap): 820258
Filtered (combined): 1081015
Sites after filtering: 49522
Sites containing missing values: 42350 (85.52%)
Missing values in SNP matrix: 67609 (2.63%)


In [14]:
struct.mainparams.burnin = 50000
struct.mainparams.numreps = 250000
struct.write_structure_files(abs)

('/array1/lmusher/rio_roosevelt_outfiles/G_cy_5rm_v9_outfiles/analysis-structure/tmp-G_cy_str2-<built-in function abs>-1.mainparams.txt',
 '/array1/lmusher/rio_roosevelt_outfiles/G_cy_5rm_v9_outfiles/analysis-structure/tmp-G_cy_str2-<built-in function abs>-1.extraparams.txt',
 '/array1/lmusher/rio_roosevelt_outfiles/G_cy_5rm_v9_outfiles/analysis-structure/tmp-G_cy_str2-<built-in function abs>-1.strfile.txt')

In [None]:
struct.run(nreps=10, kpop=[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], ipyclient=ipyclient, force=True)

Parallel connection | amnh-gen-001.internal.amnh.org: 20 cores
[############        ]  63% 1 day, 2:48:18 | running 110 structure jobs 

In [48]:
rerun = ipa.structure(
    data=data, 
    name="G_cy_str", 
    workdir="analysis-structure",
    imap=imap,
    load_only=True,
)

110 previous results loaded for run [G_cy_str]


In [49]:
etable = struct.get_evanno_table([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
etable

Unnamed: 0,Nreps,lnPK,lnPPK,deltaK,estLnProbMean,estLnProbStdev
2,10,0.0,0.0,0.0,-63099.88,690.81376
3,10,6839.85,8521.36,10.627458,-56260.03,801.82483
4,10,-1681.51,2813.03,0.876252,-57941.54,3210.297857
5,10,1131.52,270548.89,90.331776,-56810.02,2995.057803
6,10,-269417.37,466603.32,0.813844,-326227.39,573332.704979
7,10,197185.95,385838.61,1.791553,-129041.44,215365.417458
8,10,-188652.66,82862.37,0.240171,-317694.1,345013.837754
9,10,-105790.29,136660.62,0.405655,-423484.39,336888.57021
10,10,30870.33,132281.79,0.427659,-392614.06,309316.003239
11,10,-101411.46,263784.5,0.552141,-494025.52,477748.659139


In [50]:
# get canvas object and set size
canvas = toyplot.Canvas(width=400, height=300)

# plot the mean log probability of the models in red
axes = canvas.cartesian(ylabel="estLnProbMean")
axes.plot(etable.estLnProbMean * -1, color="darkred", marker="o")
axes.y.spine.style = {"stroke": "darkred"}

# plot delta K with its own scale bar of left side and in blue
axes = axes.share("x", ylabel="deltaK", ymax=etable.deltaK.max() + etable.deltaK.max() * .25)
axes.plot(etable.deltaK, color="steelblue", marker="o");
axes.y.spine.style = {"stroke": "steelblue"}

# set x labels
axes.x.ticks.locator = toyplot.locator.Explicit(range(len(etable.index)), etable.index)
axes.x.label.text = "K (N ancestral populations)"

In [51]:
k = 2
table = struct.get_clumpp_table(k)

[K2] 10/10 results permuted across replicates (max_var=0).


In [52]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [53]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [54]:
k = 3
table = struct.get_clumpp_table(k)

[K3] 10/10 results permuted across replicates (max_var=0).


In [55]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [56]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [57]:
k = 4
table = struct.get_clumpp_table(k)

[K4] 10/10 results permuted across replicates (max_var=0).


In [58]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [59]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [60]:
k = 5
table = struct.get_clumpp_table(k)

[K5] 10/10 results permuted across replicates (max_var=0).


In [61]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [62]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [63]:
k = 6
table = struct.get_clumpp_table(k)

[K6] 10/10 results permuted across replicates (max_var=0).


In [64]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [65]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

# PCA

In [66]:
#!conda install scikit-learn -c conda-forge -y

In [13]:
import ipyrad.analysis as ipa
import pandas as pd
import toyplot

In [14]:
# init pca object with input data and (optional) parameter options
pca = ipa.pca(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
)

Samples: 51
Sites before filtering: 1130537
Filtered (indels): 0
Filtered (bi-allel): 21079
Filtered (mincov): 625064
Filtered (minmap): 1123199
Filtered (combined): 1123365
Sites after filtering: 7172
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)


In [91]:
# run the PCA analysis
pca.run()

Subsampling SNPs: 30391/302918


In [92]:
pca.draw()

(<toyplot.canvas.Canvas at 0x2b0f07661da0>,
 <toyplot.coordinates.Cartesian at 0x2b0f07661d30>,
 <toyplot.mark.Point at 0x2b0f6fe8a940>)

In [93]:
# store the PC axes as a dataframe
df = pd.DataFrame(pca.pcaxes[0], index=pca.names)

# write the PC axes to a CSV file
df.to_csv("G_cy_pca_analysis.csv")

# show the first ten samples and the first 10 PC axes
df.iloc[:10, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
G_cy_80582_arsu,30.82,0.79,-0.93,0.55,-0.12,-0.79,-1.49,-7.54,-0.5,3.28
G_cy_80701_arsu,30.42,0.15,-0.59,0.29,-0.46,0.29,0.18,-7.67,-4.21,-2.83
G_cy_80801_arsu,29.65,0.23,-1.14,0.51,-1.07,1.15,-2.55,-9.2,-1.55,2.52
G_cy_80826_arsu,29.73,0.4,-0.99,0.99,-0.73,-2.1,-0.22,-12.32,-3.9,-3.19
G_cy_81108_arsu,29.63,0.88,-0.59,0.47,-1.84,-0.25,-1.93,-10.94,-2.99,-1.52
G_cy_81118_arsu,30.21,1.39,-0.68,0.54,0.5,-0.12,1.4,-11.5,-0.98,0.78
G_cy_82508_pu,-17.85,21.21,2.56,25.6,24.68,0.58,0.41,-0.95,1.01,-0.29
G_cy_85356_suta,30.41,1.09,-1.22,1.66,1.3,0.32,1.04,1.91,-1.53,-2.47
G_cy_85499_arsu,30.84,0.96,-0.58,0.93,0.52,-0.87,-1.04,-14.48,-4.43,0.32
G_cy_85678_arsu,30.79,0.57,-0.73,0.49,1.62,1.02,-0.08,-10.19,-2.62,0.83


# Subsampling with replication

In [72]:
# plot PC axes 0 and 2 with many replicate subsamples
pca.run(nreplicates=25, seed=12345)
pca.draw(0, 2);

Subsampling SNPs: 30391/302918


# Kmeans imputation (integer)

In [73]:
# kmeans imputation 
pca3 = ipa.pca(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method=7,
)

# run and draw results for kmeans clustering into 7 groups
pca3.run(nreplicates=25, seed=123)
pca3.draw(0, 2);

Kmeans clustering: iter=0, K=7, mincov=0.9, minmap={'global': 0.5}
Samples: 51
Sites before filtering: 1130537
Filtered (indels): 0
Filtered (bi-allel): 21079
Filtered (mincov): 990529
Filtered (minmap): 625064
Filtered (combined): 993791
Sites after filtering: 136746
Sites containing missing values: 129574 (94.76%)
Missing values in SNP matrix: 416173 (5.97%)
Imputation: 'sampled'; (0, 1, 2) = 58.1%, 6.1%, 35.8%
{0: ['G_cy_J296_ma', 'G_cy_J477_ma', 'G_cy_J691_roar', 'G_cy_J694_roar', 'G_cy_J773_ma', 'G_cy_T13251_ma', 'G_cy_T363_ma', 'G_cy_T364_ma'], 1: ['G_cy_T12275_pu', 'G_cy_T12279_pu', 'G_cy_T12392_pu', 'G_cy_T13184_pu', 'G_cy_T26228_pu'], 2: ['G_cy_80582_arsu', 'G_cy_80701_arsu', 'G_cy_80801_arsu', 'G_cy_80826_arsu', 'G_cy_81108_arsu', 'G_cy_81118_arsu', 'G_cy_85356_suta', 'G_cy_85499_arsu', 'G_cy_85678_arsu', 'G_cy_86297_suta', 'G_cy_86321_suta', 'G_cy_86458_suta', 'G_cy_86478_suta', 'G_cy_T14558_suta', 'G_cy_T16693_suta', 'G_cy_T18563_suta', 'G_cy_T18620_suta'], 3: ['G_cy_T10897

In [74]:
import toyplot.pdf

# save returned plot objects as variables
canvas, axes, mark = pca3.draw(0, 2)

# pass the canvas object to toyplot render function
toyplot.pdf.render(canvas, "G_cy_PCA-kmeans-7.pdf")

# T-SNE (ASSESSING COMPONENT LOADINGS)

In [7]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

In [21]:
pca.run_tsne(subsample=True, perplexity=8.0, n_iter=1000000, seed=123)

Subsampling SNPs: 1079/7172


In [22]:
pca.draw(size=8);

In [23]:
# store the PC axes as a dataframe
df = pd.DataFrame(pca.pcaxes[0], index=pca.names)

# write the PC axes to a CSV file
df.to_csv("G_cy_TSNE.csv")

# show the first ten samples and the first 10 PC axes
df.head()

Unnamed: 0,0,1
G_cy_80582_arsu,-29.968,-93.493
G_cy_80701_arsu,-35.568,-77.026
G_cy_80801_arsu,-32.888,-100.457
G_cy_80826_arsu,-37.213,-94.466
G_cy_81108_arsu,-42.121,-79.816


In [97]:
import toyplot.pdf

# save returned plot objects as variables
canvas, axes, mark = pca.draw(size=8)

# pass the canvas object to toyplot render function
toyplot.pdf.render(canvas, "G_cy_TSNE_perp8.pdf")

# GENETIC DISTNANCES

In [79]:
import ipyrad.analysis as ipa
import toyplot

In [80]:
# load the snp data into distance tool with arguments
from ipyrad.analysis.distance import Distance
dist = Distance(
    data=data, 
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
    subsample_snps=False,
)
dist.run()

Samples: 51
Sites before filtering: 1130537
Filtered (indels): 0
Filtered (bi-allel): 21079
Filtered (mincov): 625064
Filtered (minmap): 820258
Filtered (combined): 827619
Sites after filtering: 302918
Sites containing missing values: 295746 (97.63%)
Missing values in SNP matrix: 2062023 (13.35%)
Imputation: 'sampled'; (0, 1, 2) = 56.5%, 4.5%, 39.0%


In [81]:
# save to a CSV file
dist.dists.to_csv("G_cy_distances.csv")

In [82]:
# save to a CSV file with no labels (eems style)
dist.dists.to_csv(
    "G_cy_distances_eems.csv",
    header=None,
    index=False,
    sep=" ",
)

In [83]:
# get list of concatenated names from each group
ordered_names = []
for group in dist.imap.values():
    ordered_names += group

# reorder matrix to match name order    
ordered_matrix = dist.dists[ordered_names].T[ordered_names]

In [84]:
toyplot.matrix(
    ordered_matrix,
    bshow=False,
    tshow=False,
    rlocator=toyplot.locator.Explicit(
        range(len(ordered_names)),
        ordered_names,
));