
# STRUCTURE

In [3]:
import ipyrad.analysis as ipa
import toyplot
import ipyparallel as ipp

In [4]:
## look for running ipcluster instance, and create load-balancer
ipyclient = ipp.Client()
print("{} engines found".format(len(ipyclient)))

24 engines found


In [5]:
# the path to your HDF5 formatted snps file
data = "./G_cy_5rm_v9.snps.hdf5"

In [6]:
imap = {
#"ref": ["reference"],
#"Inam": ["G_cy_T12385_In","G_cy_T310_In","G_cy_T23196_In","G_cy_T7636_In"],
#"Puru": ["G_cy_82508_pu","G_cy_T12275_pu","G_cy_T12279_pu","G_cy_T12392_pu","G_cy_T13184_pu","G_cy_T26228_pu","G_cy_T26229_pu","G_cy_T26252_pu"],
#"JiGu": ["G_cy_T3343_jigu","G_cy_T3384_jigu","G_cy_T3385_jigu"],
"Mach": ["G_cy_J296_ma","G_cy_J477_ma","G_cy_J773_ma","G_cy_T13251_ma","G_cy_T363_ma","G_cy_T364_ma"],
"Roar": ["G_cy_J691_roar","G_cy_J694_roar"],
"ArSu": ["G_cy_80582_arsu","G_cy_85678_arsu","G_cy_80701_arsu","G_cy_80801_arsu","G_cy_80826_arsu","G_cy_81108_arsu","G_cy_81118_arsu","G_cy_85499_arsu"],
"SuTa": ["G_cy_85356_suta","G_cy_86297_suta","G_cy_86321_suta","G_cy_86458_suta","G_cy_86478_suta","G_cy_T14558_suta","G_cy_T16693_suta","G_cy_T18563_suta","G_cy_T18620_suta"],
"Para": ["G_cy_T10897_pa","G_cy_T11062_pa","G_cy_T16771_pa","G_cy_T1705_pa","G_cy_T18744_pa","G_cy_T19429_pa","G_cy_T19520_pa","G_cy_T19765_pa","G_cy_T2497_pa","G_cy_T6579_pa","G_cy_T9133_pa"]
}

# minimum % of samples that must be present in each SNP from each group
minmap = {i: 0.5 for i in imap}

In [5]:
# init analysis object with input data and (optional) parameter options
struct = ipa.structure(
    name="G_cy_str_east",
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.95,
)

Samples: 36
Sites before filtering: 1130537
Filtered (indels): 0
Filtered (bi-allel): 15257
Filtered (mincov): 1054810
Filtered (minmap): 1100681
Filtered (combined): 1101162
Sites after filtering: 29375
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)


In [6]:
struct.mainparams.burnin = 25000
struct.mainparams.numreps = 100000
#struct.write_structure_files(abs)

In [7]:
struct.run(nreps=10, kpop=[2, 3, 4, 5, 6, 7, 8], ipyclient=ipyclient, force=True)

Parallel connection | amnh-gen-001.internal.amnh.org: 24 cores
[####################] 100% 3:44:04 | running 70 structure jobs 


In [8]:
struct = ipa.structure(
    data=data, 
    name="G_cy_str_east", 
    workdir="analysis-structure",
    imap=imap,
    load_only=True,
)

70 previous results loaded for run [G_cy_str_east]


In [10]:
etable = struct.get_evanno_table([2, 3, 4, 5, 6, 7, 8])
etable

Unnamed: 0,Nreps,deltaK,estLnProbMean,estLnProbStdev,lnPK,lnPPK
2,10,0.0,-22199.22,328.164,0.0,0.0
3,10,1.272,-20854.4,1466.701,1344.82,1865.14
4,10,1.566,-21374.72,454.242,-520.32,711.49
5,10,78.932,-21183.55,370.581,191.17,29250.8
6,10,0.623,-50243.18,91503.878,-29059.63,56975.39
7,10,27.05,-22327.42,1073.957,27915.76,29050.78
8,10,0.0,-23462.44,2710.024,-1135.02,0.0


In [11]:
# get canvas object and set size
canvas = toyplot.Canvas(width=400, height=300)

# plot the mean log probability of the models in red
axes = canvas.cartesian(ylabel="estLnProbMean")
axes.plot(etable.estLnProbMean * -1, color="darkred", marker="o")
axes.y.spine.style = {"stroke": "darkred"}

# plot delta K with its own scale bar of left side and in blue
axes = axes.share("x", ylabel="deltaK", ymax=etable.deltaK.max() + etable.deltaK.max() * .25)
axes.plot(etable.deltaK, color="steelblue", marker="o");
axes.y.spine.style = {"stroke": "steelblue"}

# set x labels
axes.x.ticks.locator = toyplot.locator.Explicit(range(len(etable.index)), etable.index)
axes.x.label.text = "K (N ancestral populations)"

In [12]:
k = 2
table = struct.get_clumpp_table(k)

[K2] 10/10 results permuted across replicates (max_var=0).


In [13]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [14]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [15]:
k = 3
table = struct.get_clumpp_table(k)

[K3] 10/10 results permuted across replicates (max_var=0).


In [16]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [17]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [18]:
k = 4
table = struct.get_clumpp_table(k)

[K4] 10/10 results permuted across replicates (max_var=0).


In [19]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [20]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [34]:
k = 5
table = struct.get_clumpp_table(k)

[K5] 10/10 results permuted across replicates (max_var=0).


In [35]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [36]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [30]:
k = 6
table = struct.get_clumpp_table(k)

[K6] 10/10 results permuted across replicates (max_var=0).


In [31]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [32]:
# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [33]:
k = 7
table = struct.get_clumpp_table(k)
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

# build barplot
canvas = toyplot.Canvas(width=500, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

[K7] 10/10 results permuted across replicates (max_var=0).


# PCA

In [66]:
#!conda install scikit-learn -c conda-forge -y

In [7]:
import ipyrad.analysis as ipa
import pandas as pd
import toyplot

In [8]:
# init pca object with input data and (optional) parameter options
pca = ipa.pca(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
)

Samples: 36
Sites before filtering: 1130537
Filtered (indels): 0
Filtered (bi-allel): 15257
Filtered (mincov): 588913
Filtered (minmap): 1100681
Filtered (combined): 1101162
Sites after filtering: 29375
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)


In [9]:
# run the PCA analysis
pca.run()

Subsampling SNPs: 4084/29375


In [10]:
pca.draw()

(<toyplot.canvas.Canvas at 0x2b728ebdae90>,
 <toyplot.coordinates.Cartesian at 0x2b728e0d0350>,
 <toyplot.mark.Point at 0x2b728e044f10>)

In [11]:
# store the PC axes as a dataframe
df = pd.DataFrame(pca.pcaxes[0], index=pca.names)

# write the PC axes to a CSV file
df.to_csv("G_cy_pca_analysis.csv")

# show the first ten samples and the first 10 PC axes
df.iloc[:10, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
G_cy_80582_arsu,-8.37,-1.03,-1.88,7.74,5.23,-1.16,4.13,-0.77,2.05,-5.24
G_cy_80701_arsu,-8.05,-1.56,1.18,1.19,0.57,0.97,-2.27,3.91,-2.12,-1.58
G_cy_80801_arsu,-8.32,-1.46,5.09,1.08,-2.6,1.62,-0.33,1.09,0.54,0.17
G_cy_80826_arsu,-8.57,-1.96,0.12,-1.2,1.87,1.55,-1.12,0.23,-1.0,1.27
G_cy_81108_arsu,-8.2,-1.32,4.44,0.94,-0.69,2.59,5.25,-3.41,-1.09,-2.48
G_cy_81118_arsu,-8.05,-1.12,-2.46,-0.98,2.69,-4.94,-3.74,-4.66,-1.64,0.86
G_cy_85356_suta,-8.27,-0.75,0.27,-2.58,-4.82,-0.46,-2.47,-2.81,-4.0,-3.24
G_cy_85499_arsu,-7.7,-1.74,-0.6,0.92,1.93,-0.82,-2.36,0.28,-1.75,-1.98
G_cy_85678_arsu,-8.24,-0.97,-1.17,-2.19,-4.09,-4.41,2.48,-4.03,6.23,-0.85
G_cy_86297_suta,-8.09,-1.78,-0.18,-0.31,0.49,1.44,-0.82,2.03,-3.05,0.23


# Subsampling with replication

In [12]:
# plot PC axes 0 and 2 with many replicate subsamples
pca.run(nreplicates=25, seed=12345)
pca.draw(0, 2);

Subsampling SNPs: 4084/29375


# Kmeans imputation (integer)

In [13]:
# kmeans imputation 
pca3 = ipa.pca(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method=7,
)

# run and draw results for kmeans clustering into 7 groups
pca3.run(nreplicates=25, seed=123)
pca3.draw(0, 2);

Subsampling SNPs: 4084/29375


In [14]:
import toyplot.pdf

# save returned plot objects as variables
canvas, axes, mark = pca3.draw(0, 2)

# pass the canvas object to toyplot render function
toyplot.pdf.render(canvas, "G_cy_PCA-kmeans-7.pdf")

# T-SNE (ASSESSING COMPONENT LOADINGS)

In [15]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

In [16]:
pca.run_tsne(subsample=True, perplexity=8.0, n_iter=100000, seed=123)

Subsampling SNPs: 4084/29375


In [17]:
pca.draw(size=10);

In [18]:
import toyplot.pdf

# save returned plot objects as variables
canvas, axes, mark = pca.draw(size=8)

# pass the canvas object to toyplot render function
toyplot.pdf.render(canvas, "G_cy_east_TSNE_perp8.pdf")

# GENETIC DISTNANCES

In [19]:
import ipyrad.analysis as ipa
import toyplot

In [20]:
# load the snp data into distance tool with arguments
from ipyrad.analysis.distance import Distance
dist = Distance(
    data=data, 
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
    subsample_snps=False,
)
dist.run()

Samples: 36
Sites before filtering: 1130537
Filtered (indels): 0
Filtered (bi-allel): 15257
Filtered (mincov): 588913
Filtered (minmap): 1100681
Filtered (combined): 1101162
Sites after filtering: 29375
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


In [21]:
# save to a CSV file
dist.dists.to_csv("G_cy_distances.csv")

In [22]:
# save to a CSV file with no labels (eems style)
dist.dists.to_csv(
    "G_cy_distances_eems.csv",
    header=None,
    index=False,
    sep=" ",
)

In [23]:
# get list of concatenated names from each group
ordered_names = []
for group in dist.imap.values():
    ordered_names += group

# reorder matrix to match name order    
ordered_matrix = dist.dists[ordered_names].T[ordered_names]

In [24]:
toyplot.matrix(
    ordered_matrix,
    bshow=False,
    tshow=False,
    rlocator=toyplot.locator.Explicit(
        range(len(ordered_names)),
        ordered_names,
));