# evaluating effects of missing data threshold and population assignment on PCA, t-SNE, and dxy 

In [5]:
import ipyrad.analysis as ipa
import toyplot
import ipyparallel as ipp
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

In [6]:
# Location of the HDF5-formatted SNP file that the analyses below read.
data = (
    "../plate1/M_ru_3rm_v9_outfiles/"
    "M_ru_3rm_v9.snps.hdf5"
)

Let's define populations Western Inambari (Inam), Purus-Madeira (Puru), Jiparana-Guapore (JiGu), Jiparaná-Roosevelt (Mach), Roosevelt-Aripuana (Roar), Aripuana-Sucunduri (ArSu), Sucunduri-Tapajos (SuTa), and Western Para (Para). 

These assignments are made only to color points in the PCA analysis below.

In [7]:
# A priori geographic populations: population label -> list of sample names.
# (A "ref" entry holding the "reference" sample is disabled here.)
# NOTE(review): "M_ru_A9903_pa" carries the "_pa" suffix but is listed under
# SuTa -- confirm that this assignment is intentional.
imap = dict(
    Inam=[
        "M_ru_A7875_In", "M_ru_T14456_In", "M_ru_T23245_In",
        "M_ru_T23246_In", "M_ru_T23416_In", "M_ru_T23478_In",
    ],
    Puru=[
        "M_ru_A10311_pu", "M_ru_A10329_pu", "M_ru_A1380_pu",
        "M_ru_A2741_pu", "M_ru_A436_pu", "M_ru_A440_pu",
    ],
    JiGu=["M_ru_A207_jigu", "M_ru_T3228_jigu", "M_ru_T7634_jigu"],
    Mach=[
        "M_ru_T13253_ma", "M_ru_A474_ma", "M_ru_T3164_ma", "M_ru_J265_ma",
        "M_ru_T368_ma", "M_ru_T381_ma", "M_ru_T476_ma", "M_ru_T494_ma",
    ],
    Roar=["M_ru_J640_roar", "M_ru_J676_roar"],
    ArSu=["M_ru_80819_arsu", "M_ru_81347_arsu", "M_ru_85426_arsu"],
    SuTa=[
        "M_ru_85919_suta", "M_ru_77750_suta", "M_ru_78182_suta",
        "M_ru_A11834_suta", "M_ru_A15176_suta", "M_ru_A5487_suta",
        "M_ru_T10184_suta", "M_ru_T11780_suta", "M_ru_T14532_suta",
        "M_ru_T14622_suta", "M_ru_T753_suta", "M_ru_A9903_pa",
    ],
    Para=[
        "M_ru_A16195_pa", "M_ru_A9235_pa", "M_ru_T11079_pa",
        "M_ru_T11238_pa", "M_ru_T12541_pa", "M_ru_T1649_pa",
        "M_ru_T16553_pa", "M_ru_T19782_pa", "M_ru_T6500_pa",
        "M_ru_T6577_pa",
    ],
)

# Minimum fraction of samples that must be present at each SNP within each
# population.  Five thresholds are defined to check how robust the results
# are to imputation: there is a lot of missing data, so we want to make sure
# the results are not biased by imputing too many haplotypes.  If the data
# look more organized by population when more missing data is allowed, that
# could indicate bias in the imputation from small sample sizes.
minmap1 = dict.fromkeys(imap, 0.5)
minmap2 = dict.fromkeys(imap, 0.65)
minmap3 = dict.fromkeys(imap, 0.75)
minmap4 = dict.fromkeys(imap, 0.85)
minmap5 = dict.fromkeys(imap, 0.95)

let's run PCA for different values of minmap to see how varying missing data affects our results

note we are assuming K=8 which is likely high, but k-means clustering will cluster samples independent of a priori geographic assignment

Here we are using k-means clustering to assign individuals to populations independently of our a priori geographic assignments in order to circumscribe populations from which to impute missing haplotypes.

The PCA plots can then be colored based on our geographic assignments above

In [8]:
# Build one PCA analysis object per missing-data threshold (minmap1..minmap5),
# in that order.  All five share the same SNP file, population map and
# mincov; impute_method=8 requests k-means imputation with K=8.
# NOTE(review): the log printed by these calls reports
# minmap={'global': 0.85} for the k-means step -- confirm whether the
# per-population minmap passed here is applied when impute_method is an
# integer.
pca, pca2, pca3, pca4, pca5 = (
    ipa.pca(
        data=data,
        imap=imap,
        minmap=population_thresholds,
        mincov=0.85,
        impute_method=8,
    )
    for population_thresholds in (minmap1, minmap2, minmap3, minmap4, minmap5)
)

Kmeans clustering: iter=0, K=8, mincov=0.9, minmap={'global': 0.85}
Samples: 50
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 14244
Filtered (mincov): 691215
Filtered (minmap): 666808
Filtered (combined): 694543
Sites after filtering: 125391
Sites containing missing values: 92803 (74.01%)
Missing values in SNP matrix: 243340 (3.88%)
Imputation: 'sampled'; (0, 1, 2) = 35.8%, 3.4%, 60.8%
{0: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 1: ['M_ru_77750_suta', 'M_ru_78182_suta', 'M_ru_80819_arsu', 'M_ru_81347_arsu', 'M_ru_85426_arsu', 'M_ru_85919_suta', 'M_ru_A11834_suta', 'M_ru_A15176_suta', 'M_ru_A5487_suta', 'M_ru_A9903_pa', 'M_ru_J640_roar', 'M_ru_T10184_suta', 'M_ru_T11780_suta', 'M_ru_T14532_suta', 'M_ru_T14622_suta', 'M_ru_T753_suta'], 2: ['M_ru_A16195_pa', 'M_ru_A9235_pa', 'M_ru_T11079_pa', 'M_ru_T11238_pa', 'M_ru_T12541_pa', 'M_ru_T1649_pa', 'M_ru_T16553_pa', 'M_ru_T19782_pa', 'M_ru_T6500_pa', 'M

In [9]:
# Run the PCA for each of the five analyses, lowest minmap threshold first.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.run()

Subsampling SNPs: 6711/48914
Subsampling SNPs: 6780/49618
Subsampling SNPs: 6681/48637
Subsampling SNPs: 6873/50442
Subsampling SNPs: 4482/32588


In [10]:
# Plot the PCA built with minmap1; the trailing ';' keeps the
# (canvas, axes) tuple repr out of the cell output.
pca.draw();

(<toyplot.canvas.Canvas at 0x2b4ea43a1850>,
 <toyplot.coordinates.Cartesian at 0x2b4ea43a1450>)

In [11]:
# Plot the PCA built with minmap2; the trailing ';' keeps the
# (canvas, axes) tuple repr out of the cell output.
pca2.draw();

(<toyplot.canvas.Canvas at 0x2b4ea31f4c50>,
 <toyplot.coordinates.Cartesian at 0x2b4ea31f4650>)

In [12]:
# Plot the PCA built with minmap3; the trailing ';' keeps the
# (canvas, axes) tuple repr out of the cell output.
pca3.draw();

(<toyplot.canvas.Canvas at 0x2b4ea354f750>,
 <toyplot.coordinates.Cartesian at 0x2b4ea354f890>)

In [13]:
# Plot the PCA built with minmap4; the trailing ';' keeps the
# (canvas, axes) tuple repr out of the cell output.
pca4.draw();

(<toyplot.canvas.Canvas at 0x2b4ea3561350>,
 <toyplot.coordinates.Cartesian at 0x2b4ea35613d0>)

In [14]:
# Plot the PCA built with minmap5; the trailing ';' keeps the
# (canvas, axes) tuple repr out of the cell output.
pca5.draw();

(<toyplot.canvas.Canvas at 0x2b4ea355f150>,
 <toyplot.coordinates.Cartesian at 0x2b4ea355fcd0>)

As you can see, for varying degrees of missing data, we generally get the same or similar results.

there are three to five clusters of points here that correspond pretty clearly to many river barriers and these are consistent among runs.

Now we can write the PCA results to a file

In [15]:
# Tabulate entry 0 of pca4.pcaxes (the analysis built with minmap4), one row
# per sample name.
df4 = pd.DataFrame(data=pca4.pcaxes[0], index=pca4.names)

# Persist the complete table as CSV next to the notebook.
df4.to_csv(path_or_buf="M_ru_pca_85minmap_12Jan2022.csv")

# Preview: first ten samples, first ten axes, rounded to two decimals.
df4.head(10).iloc[:, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
M_ru_77750_suta,-7.83,-6.14,0.75,-0.07,-0.43,-1.63,-0.09,-2.1,-0.72,-0.17
M_ru_78182_suta,-8.18,-6.12,0.44,-0.54,-0.5,-2.0,-2.12,-3.96,-5.1,4.03
M_ru_80819_arsu,-7.65,-5.85,0.7,-0.28,-1.13,-2.84,-1.65,-2.65,5.15,-3.56
M_ru_81347_arsu,-8.33,-6.14,0.43,-0.19,0.08,-1.3,-1.2,-1.75,-1.22,3.33
M_ru_85426_arsu,-7.77,-5.5,0.36,-0.54,-0.53,-1.9,0.47,-0.28,-0.08,-0.43
M_ru_85919_suta,-7.63,-5.35,0.13,-0.33,-0.61,-1.78,-0.96,-0.72,-2.58,0.08
M_ru_A10311_pu,11.39,-0.85,-7.66,-6.94,-0.7,-0.09,-0.3,-0.29,1.32,1.22
M_ru_A10329_pu,11.58,-0.68,-7.18,-7.26,-0.71,-0.48,0.11,-2.17,2.92,1.64
M_ru_A11834_suta,-8.0,-6.31,0.46,0.27,-0.17,-2.94,0.24,0.74,0.68,0.52
M_ru_A1380_pu,11.71,-0.67,-6.95,-6.64,-0.22,0.31,-0.4,1.67,-0.72,1.44


Now let's look at if and how t-SNE differs from PCA

In [16]:
# Run t-SNE on each of the five analyses with the same perplexity and
# iteration budget, so the plots can be compared across minmap thresholds.
# Fix: pca4 previously ran with n_iter=10000000, ten times the budget of the
# sibling runs (a stray extra zero); it now uses 1000000 like the others.
# NOTE(review): pca5 uses seed=223 while the other four use seed=123.  The
# seeds are kept exactly as they were -- confirm the difference is deliberate.
for analysis, tsne_seed in zip((pca, pca2, pca3, pca4, pca5), (123, 123, 123, 123, 223)):
    analysis.run_tsne(subsample=True, perplexity=5.0, n_iter=1000000, seed=tsne_seed)

Subsampling SNPs: 6711/48914
Subsampling SNPs: 6780/49618
Subsampling SNPs: 6681/48637
Subsampling SNPs: 6873/50442
Subsampling SNPs: 4482/32588


In [17]:
# Draw all five analyses in threshold order; return values are discarded so
# no tuple repr is printed.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.draw()

amazingly with t-sne we see significantly more clusters

note that these results are sensitive to values of perplexity and starting seed, so you can play with those parameters to get more interpretable results

Then we can again write the t-sne results to a file

In [18]:
# Tabulate entry 0 of pca4.pcaxes, one row per sample name.  This cell runs
# after run_tsne, and the table written here is the t-SNE result (the output
# below shows two coordinate columns), not the PC axes.
df4 = pd.DataFrame(data=pca4.pcaxes[0], index=pca4.names)

# Persist the t-SNE coordinates as CSV next to the notebook.
df4.to_csv(path_or_buf="M_ru_TSNE_85minmap_12Jan2022.csv")

# Preview the first ten samples (up to ten columns), rounded to two decimals.
df4.head(10).iloc[:, :10].round(2)

Unnamed: 0,0,1
M_ru_77750_suta,-155.21,81.63
M_ru_78182_suta,-123.9,124.63
M_ru_80819_arsu,-142.68,92.18
M_ru_81347_arsu,-158.49,119.06
M_ru_85426_arsu,-124.72,98.65
M_ru_85919_suta,-92.67,102.26
M_ru_A10311_pu,137.12,-20.8
M_ru_A10329_pu,133.07,-42.8
M_ru_A11834_suta,-94.39,59.35
M_ru_A1380_pu,176.64,-20.26


Let's skip this for now, but the next section of code does 10,000 t-SNE replicates for downstream analysis using randomly generated values for the starting seed and "perplexity".

In [19]:
# Disabled on purpose: 10,000 t-SNE replicates of pca3 with a random
# perplexity in [3, 8) and a random seed in [100, 9999) per replicate, each
# written to ./M_ru_TSNE/.
# NOTE(review): if re-enabled, `!rm -r M_ru_TSNE` deletes any earlier
# replicates, and `random` is never seeded, so the replicate set would not be
# reproducible.  The index below was corrected to pca3.names so that the row
# labels come from the same object as the axes.
# !rm -r M_ru_TSNE
# !mkdir M_ru_TSNE
# import random
# for i in range(10000):
#     pca3.run_tsne(subsample=True, perplexity=random.randrange(3,8), n_iter=100000, seed=random.randrange(100,9999))
#     df4 = pd.DataFrame(pca3.pcaxes[0], index=pca3.names)
#     df4.to_csv("./M_ru_TSNE/M_ru_TSNE_rep"+str(i)+".csv")

#### Now let's look at just the western clade, and here we assume K=4; again clustering is independent of geography

In [20]:
# Western populations only: Inam, Puru, JiGu and four of the Mach samples
# (T13253, A474, T3164, J265).  The other Mach samples (T368, T381, T476,
# T494) and the Roar, ArSu, SuTa and Para populations are left out, as is the
# "ref" entry.
imap = dict(
    Inam=[
        "M_ru_A7875_In", "M_ru_T14456_In", "M_ru_T23245_In",
        "M_ru_T23246_In", "M_ru_T23416_In", "M_ru_T23478_In",
    ],
    Puru=[
        "M_ru_A10311_pu", "M_ru_A10329_pu", "M_ru_A1380_pu",
        "M_ru_A2741_pu", "M_ru_A436_pu", "M_ru_A440_pu",
    ],
    JiGu=["M_ru_A207_jigu", "M_ru_T3228_jigu", "M_ru_T7634_jigu"],
    Mach=["M_ru_T13253_ma", "M_ru_A474_ma", "M_ru_T3164_ma", "M_ru_J265_ma"],
)

# Minimum fraction of samples that must be present at each SNP in each group.
minmap1 = dict.fromkeys(imap, 0.55)
minmap2 = dict.fromkeys(imap, 0.65)
minmap3 = dict.fromkeys(imap, 0.75)
minmap4 = dict.fromkeys(imap, 0.85)
minmap5 = dict.fromkeys(imap, 0.95)

In [21]:
# Build one PCA analysis object per missing-data threshold (minmap1..minmap5),
# in that order, for the western samples.  impute_method=4 requests k-means
# imputation with K=4.
# NOTE(review): the log printed by these calls reports minmap values of 0.85
# for every k-means iteration -- confirm whether the per-population minmap
# passed here is applied when impute_method is an integer.
pca, pca2, pca3, pca4, pca5 = (
    ipa.pca(
        data=data,
        imap=imap,
        minmap=population_thresholds,
        mincov=0.85,
        impute_method=4,
    )
    for population_thresholds in (minmap1, minmap2, minmap3, minmap4, minmap5)
)

Kmeans clustering: iter=0, K=4, mincov=0.9, minmap={'global': 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 710465
Filtered (minmap): 675930
Filtered (combined): 711703
Sites after filtering: 108231
Sites containing missing values: 42638 (39.40%)
Missing values in SNP matrix: 42638 (2.07%)
Imputation: 'sampled'; (0, 1, 2) = 35.2%, 2.5%, 62.2%
{0: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 1: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 2: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma'], 3: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu']}

Kmeans clustering: iter=1, K=4, mincov=0.8875, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598

  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu'], 1: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 2: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 3: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma']}

Kmeans clustering: iter=2, K=4, mincov=0.875, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma'], 1: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 2: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 3: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu']}

Kmeans clustering: iter=3, K=4, mincov=0.8625, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 1: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 2: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma'], 3: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu']}

Kmeans clustering: iter=4, K=4, mincov=0.85, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


Kmeans clustering: iter=0, K=4, mincov=0.9, minmap={'global': 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 710465
Filtered (minmap): 675930
Filtered (combined): 711703
Sites after filtering: 108231
Sites containing missing values: 42638 (39.40%)
Missing values in SNP matrix: 42638 (2.07%)
Imputation: 'sampled'; (0, 1, 2) = 35.3%, 2.5%, 62.2%
{0: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu'], 1: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 2: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 3: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma']}

Kmeans clustering: iter=1, K=4, mincov=0.8875, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598

  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma'], 1: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 2: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 3: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu']}

Kmeans clustering: iter=2, K=4, mincov=0.875, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma'], 1: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 2: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 3: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu']}

Kmeans clustering: iter=3, K=4, mincov=0.8625, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 1: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu'], 2: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 3: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma']}

Kmeans clustering: iter=4, K=4, mincov=0.85, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


Kmeans clustering: iter=0, K=4, mincov=0.9, minmap={'global': 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 710465
Filtered (minmap): 675930
Filtered (combined): 711703
Sites after filtering: 108231
Sites containing missing values: 42638 (39.40%)
Missing values in SNP matrix: 42638 (2.07%)
Imputation: 'sampled'; (0, 1, 2) = 35.1%, 2.7%, 62.2%
{0: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 1: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 2: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma'], 3: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu']}

Kmeans clustering: iter=1, K=4, mincov=0.8875, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598

  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 1: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu'], 2: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 3: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma']}

Kmeans clustering: iter=2, K=4, mincov=0.875, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 1: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 2: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma'], 3: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu']}

Kmeans clustering: iter=3, K=4, mincov=0.8625, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


{0: ['M_ru_A7875_In', 'M_ru_T14456_In', 'M_ru_T23245_In', 'M_ru_T23246_In', 'M_ru_T23416_In', 'M_ru_T23478_In'], 1: ['M_ru_A10311_pu', 'M_ru_A10329_pu', 'M_ru_A1380_pu', 'M_ru_A2741_pu', 'M_ru_A436_pu', 'M_ru_A440_pu'], 2: ['M_ru_A474_ma', 'M_ru_J265_ma', 'M_ru_T13253_ma', 'M_ru_T3164_ma'], 3: ['M_ru_A207_jigu', 'M_ru_T3228_jigu', 'M_ru_T7634_jigu']}

Kmeans clustering: iter=4, K=4, mincov=0.85, minmap={0: 0.85, 1: 0.85, 2: 0.85, 3: 0.85}
Samples: 19
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 6174
Filtered (mincov): 675930
Filtered (minmap): 753598
Filtered (combined): 754341
Sites after filtering: 65593
Sites containing missing values: 0 (0.00%)
Missing values in SNP matrix: 0 (0.00%)
Imputation: 'sampled'; (0, 1, 2) = nan%, nan%, nan%


  100 * np.sum(imputed == 0) / imputed.size,
  100 * np.sum(imputed == 1) / imputed.size,
  100 * np.sum(imputed == 2) / imputed.size,


In [22]:
# Run the PCA for each of the five western-clade analyses, in threshold order.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.run()

Subsampling SNPs: 8454/65593
Subsampling SNPs: 8454/65593
Subsampling SNPs: 8454/65593
Subsampling SNPs: 8454/65593
Subsampling SNPs: 8454/65593


In [23]:
# Draw all five western-clade PCA analyses in threshold order.
# Fix: the original final statement had no trailing ';', so the cell printed a
# stray (Canvas, Cartesian) tuple repr; the loop discards the return values,
# matching the other multi-plot cells in this notebook.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.draw()

(<toyplot.canvas.Canvas at 0x2b4ea70ddd10>,
 <toyplot.coordinates.Cartesian at 0x2b4ea70dd950>)

When we zoom in on one clade, we can see that there is even more structure than we initially thought

In [24]:
# Run t-SNE on each western-clade analysis with identical settings
# (perplexity 5.0, 1000000 iterations, seed 123).
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.run_tsne(subsample=True, perplexity=5.0, n_iter=1000000, seed=123)

Subsampling SNPs: 8454/65593
Subsampling SNPs: 8454/65593
Subsampling SNPs: 8454/65593
Subsampling SNPs: 8454/65593
Subsampling SNPs: 8454/65593


In [25]:
# Draw the t-SNE result of every western-clade analysis, in threshold order;
# return values are discarded so no tuple repr is printed.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.draw()

Now let's look at just the eastern clade, and here we assume K=5; again clustering is independent of geography

In [26]:
# Eastern populations only: four of the Mach samples (T368, T381, T476, T494)
# plus Roar, ArSu, SuTa and Para.  The Inam, Puru and JiGu populations and
# the "ref" entry are left out.
imap = dict(
    Mach=["M_ru_T368_ma", "M_ru_T381_ma", "M_ru_T476_ma", "M_ru_T494_ma"],
    Roar=["M_ru_J640_roar", "M_ru_J676_roar"],
    ArSu=["M_ru_80819_arsu", "M_ru_81347_arsu", "M_ru_85426_arsu"],
    SuTa=[
        "M_ru_85919_suta", "M_ru_77750_suta", "M_ru_78182_suta",
        "M_ru_A11834_suta", "M_ru_A15176_suta", "M_ru_A5487_suta",
        "M_ru_T10184_suta", "M_ru_T11780_suta", "M_ru_T14532_suta",
        "M_ru_T14622_suta", "M_ru_T753_suta", "M_ru_A9903_pa",
    ],
    Para=[
        "M_ru_A16195_pa", "M_ru_A9235_pa", "M_ru_T11079_pa",
        "M_ru_T11238_pa", "M_ru_T12541_pa", "M_ru_T1649_pa",
        "M_ru_T16553_pa", "M_ru_T19782_pa", "M_ru_T6500_pa",
        "M_ru_T6577_pa",
    ],
)

# Minimum fraction of samples that must be present at each SNP in each group.
minmap1 = dict.fromkeys(imap, 0.55)
minmap2 = dict.fromkeys(imap, 0.65)
minmap3 = dict.fromkeys(imap, 0.75)
minmap4 = dict.fromkeys(imap, 0.85)
minmap5 = dict.fromkeys(imap, 0.95)

In [27]:
# Build one PCA analysis object per missing-data threshold (minmap1..minmap5),
# in that order, for the eastern samples.  impute_method=5 requests k-means
# imputation with K=5.
# NOTE(review): the log printed by these calls reports
# minmap={'global': 0.85} for the k-means step -- confirm whether the
# per-population minmap passed here is applied when impute_method is an
# integer.
pca, pca2, pca3, pca4, pca5 = (
    ipa.pca(
        data=data,
        imap=imap,
        minmap=population_thresholds,
        mincov=0.85,
        impute_method=5,
    )
    for population_thresholds in (minmap1, minmap2, minmap3, minmap4, minmap5)
)

Kmeans clustering: iter=0, K=5, mincov=0.9, minmap={'global': 0.85}
Samples: 31
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 9343
Filtered (mincov): 675574
Filtered (minmap): 656833
Filtered (combined): 678027
Sites after filtering: 141907
Sites containing missing values: 92999 (65.54%)
Missing values in SNP matrix: 163650 (3.72%)
Imputation: 'sampled'; (0, 1, 2) = 35.3%, 2.8%, 61.9%
{0: ['M_ru_A16195_pa', 'M_ru_A9235_pa', 'M_ru_T11238_pa', 'M_ru_T12541_pa', 'M_ru_T1649_pa', 'M_ru_T16553_pa', 'M_ru_T19782_pa', 'M_ru_T6500_pa', 'M_ru_T6577_pa'], 1: ['M_ru_85426_arsu', 'M_ru_J676_roar', 'M_ru_T10184_suta', 'M_ru_T11780_suta', 'M_ru_T14622_suta'], 2: ['M_ru_77750_suta', 'M_ru_78182_suta', 'M_ru_80819_arsu', 'M_ru_81347_arsu', 'M_ru_85919_suta', 'M_ru_A11834_suta', 'M_ru_A15176_suta', 'M_ru_A5487_suta', 'M_ru_A9903_pa', 'M_ru_J640_roar', 'M_ru_T14532_suta', 'M_ru_T753_suta'], 3: ['M_ru_T368_ma', 'M_ru_T381_ma', 'M_ru_T476_ma', 'M_ru_T494_ma'], 4: ['M_ru_T11079_p

In [28]:
# Run the PCA for each of the five eastern-clade analyses, in threshold order.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.run()

Subsampling SNPs: 13732/114554
Subsampling SNPs: 11711/92842
Subsampling SNPs: 12743/103830
Subsampling SNPs: 13723/114754
Subsampling SNPs: 6264/48908


In [29]:
# Draw all five eastern-clade PCA analyses in threshold order.
# Fix: the original final statement had no trailing ';', so the cell printed a
# stray (Canvas, Cartesian) tuple repr; the loop discards the return values,
# matching the other multi-plot cells in this notebook.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.draw()

(<toyplot.canvas.Canvas at 0x2b4eaaa52a50>,
 <toyplot.coordinates.Cartesian at 0x2b4eaaa52b10>)

In [30]:
# Run t-SNE on each eastern-clade analysis with identical settings
# (perplexity 5.0, 1000000 iterations, seed 123).
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.run_tsne(subsample=True, perplexity=5.0, n_iter=1000000, seed=123)

Subsampling SNPs: 13732/114554
Subsampling SNPs: 11711/92842
Subsampling SNPs: 12743/103830
Subsampling SNPs: 13723/114754
Subsampling SNPs: 6264/48908


In [31]:
# Draw the t-SNE result of every eastern-clade analysis, in threshold order;
# return values are discarded so no tuple repr is printed.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.draw()

Now let's see if the results hold up to a different clustering algorithm so I will group samples into "populations" based on results from the structure analysis to see if PCA and t-SNE look the same. Here we use "sample" to impute from a priori defined populations

In [38]:
# Population map with five groups: Inam, Puru and Para as before, JiGu merged
# with four Mach samples, and the other Mach samples merged with Roar, ArSu
# and SuTa.  (The "ref" entry is disabled.)
imap = dict(
    Inam=[
        "M_ru_A7875_In", "M_ru_T14456_In", "M_ru_T23245_In",
        "M_ru_T23246_In", "M_ru_T23416_In", "M_ru_T23478_In",
    ],
    Puru=[
        "M_ru_A10311_pu", "M_ru_A10329_pu", "M_ru_A1380_pu",
        "M_ru_A2741_pu", "M_ru_A436_pu", "M_ru_A440_pu",
    ],
    JiGuMach=[
        "M_ru_A207_jigu", "M_ru_T3228_jigu", "M_ru_T7634_jigu",
        "M_ru_T13253_ma", "M_ru_A474_ma", "M_ru_T3164_ma", "M_ru_J265_ma",
    ],
    MachRoArSuTa=[
        "M_ru_T368_ma", "M_ru_T381_ma", "M_ru_T476_ma", "M_ru_T494_ma",
        "M_ru_J640_roar", "M_ru_J676_roar",
        "M_ru_80819_arsu", "M_ru_81347_arsu", "M_ru_85426_arsu",
        "M_ru_85919_suta", "M_ru_77750_suta", "M_ru_78182_suta",
        "M_ru_A11834_suta", "M_ru_A15176_suta", "M_ru_A5487_suta",
        "M_ru_T10184_suta", "M_ru_T11780_suta", "M_ru_T14532_suta",
        "M_ru_T14622_suta", "M_ru_T753_suta", "M_ru_A9903_pa",
    ],
    Para=[
        "M_ru_A16195_pa", "M_ru_A9235_pa", "M_ru_T11079_pa",
        "M_ru_T11238_pa", "M_ru_T12541_pa", "M_ru_T1649_pa",
        "M_ru_T16553_pa", "M_ru_T19782_pa", "M_ru_T6500_pa",
        "M_ru_T6577_pa",
    ],
)

# Minimum fraction of samples that must be present at each SNP in each group.
minmap1 = dict.fromkeys(imap, 0.55)
minmap2 = dict.fromkeys(imap, 0.65)
minmap3 = dict.fromkeys(imap, 0.75)
minmap4 = dict.fromkeys(imap, 0.85)
minmap5 = dict.fromkeys(imap, 0.95)

In [39]:
# Build one PCA analysis object per missing-data threshold (minmap1..minmap5),
# in that order.  impute_method="sample" is used here instead of an integer
# K, so imputation draws on the populations defined in imap.
pca, pca2, pca3, pca4, pca5 = (
    ipa.pca(
        data=data,
        imap=imap,
        minmap=population_thresholds,
        mincov=0.85,
        impute_method="sample",
    )
    for population_thresholds in (minmap1, minmap2, minmap3, minmap4, minmap5)
)

Samples: 50
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 14244
Filtered (mincov): 666808
Filtered (minmap): 623187
Filtered (combined): 675486
Sites after filtering: 144448
Sites containing missing values: 111860 (77.44%)
Missing values in SNP matrix: 369541 (5.12%)
Imputation: 'sampled'; (0, 1, 2) = 36.4%, 2.1%, 61.5%
Samples: 50
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 14244
Filtered (mincov): 666808
Filtered (minmap): 643984
Filtered (combined): 678585
Sites after filtering: 141349
Sites containing missing values: 108761 (76.95%)
Missing values in SNP matrix: 352383 (4.99%)
Imputation: 'sampled'; (0, 1, 2) = 36.5%, 2.1%, 61.5%
Samples: 50
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 14244
Filtered (mincov): 666808
Filtered (minmap): 700135
Filtered (combined): 705183
Sites after filtering: 114751
Sites containing missing values: 82163 (71.60%)
Missing values in SNP matrix: 217012 (3.78%)
Imputati

In [40]:
# Run the PCA for each of the five "sample"-imputed analyses, in threshold
# order.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.run()

Subsampling SNPs: 16483/144448
Subsampling SNPs: 16134/141349
Subsampling SNPs: 13694/114751
Subsampling SNPs: 9807/78774
Subsampling SNPs: 5224/39369


In [41]:
# Draw the PCA of every "sample"-imputed analysis, in threshold order; return
# values are discarded so no tuple repr is printed.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.draw()

In [44]:
# Run t-SNE on each "sample"-imputed analysis with perplexity 4.0 and
# 1000000 iterations.  The seed alternates 223 / 123 across the analyses.
for analysis, tsne_seed in zip((pca, pca2, pca3, pca4, pca5), (223, 123, 223, 123, 223)):
    analysis.run_tsne(subsample=True, perplexity=4.0, n_iter=1000000, seed=tsne_seed)

Subsampling SNPs: 16483/144448
Subsampling SNPs: 16134/141349
Subsampling SNPs: 13694/114751
Subsampling SNPs: 9807/78774
Subsampling SNPs: 5224/39369


In [45]:
# Draw the t-SNE result of every "sample"-imputed analysis, in threshold
# order; return values are discarded so no tuple repr is printed.
for analysis in (pca, pca2, pca3, pca4, pca5):
    analysis.draw()

Amazingly, the results look rather similar to the initial results of imputation by populations defined by the k-means clustering analysis

(You can place your cursor on a point to see the label)

# Genetic Distances (dxy)

Now let's generate distance matrices for EEMS and see if imputation affects measures of distance. First look at imputation results based on structure assignments, and then look at imputation results based on a priori assignments

In [46]:
# Population map with five groups: Inam, Puru and Para, JiGu merged with
# four Mach samples, and the other Mach samples merged with Roar, ArSu and
# SuTa.  (The "ref" entry is disabled.)
imap = dict(
    Inam=[
        "M_ru_A7875_In", "M_ru_T14456_In", "M_ru_T23245_In",
        "M_ru_T23246_In", "M_ru_T23416_In", "M_ru_T23478_In",
    ],
    Puru=[
        "M_ru_A10311_pu", "M_ru_A10329_pu", "M_ru_A1380_pu",
        "M_ru_A2741_pu", "M_ru_A436_pu", "M_ru_A440_pu",
    ],
    JiGuMach=[
        "M_ru_A207_jigu", "M_ru_T3228_jigu", "M_ru_T7634_jigu",
        "M_ru_T13253_ma", "M_ru_A474_ma", "M_ru_T3164_ma", "M_ru_J265_ma",
    ],
    MachRoArSuTa=[
        "M_ru_T368_ma", "M_ru_T381_ma", "M_ru_T476_ma", "M_ru_T494_ma",
        "M_ru_J640_roar", "M_ru_J676_roar",
        "M_ru_80819_arsu", "M_ru_81347_arsu", "M_ru_85426_arsu",
        "M_ru_85919_suta", "M_ru_77750_suta", "M_ru_78182_suta",
        "M_ru_A11834_suta", "M_ru_A15176_suta", "M_ru_A5487_suta",
        "M_ru_T10184_suta", "M_ru_T11780_suta", "M_ru_T14532_suta",
        "M_ru_T14622_suta", "M_ru_T753_suta", "M_ru_A9903_pa",
    ],
    Para=[
        "M_ru_A16195_pa", "M_ru_A9235_pa", "M_ru_T11079_pa",
        "M_ru_T11238_pa", "M_ru_T12541_pa", "M_ru_T1649_pa",
        "M_ru_T16553_pa", "M_ru_T19782_pa", "M_ru_T6500_pa",
        "M_ru_T6577_pa",
    ],
)

# Every group must have data for at least 85% of its samples at each SNP.
minmap4 = dict.fromkeys(imap, 0.85)


In [47]:
# Configure the ipyrad distance tool on the SNP file, using the population
# map and the 0.85 per-group threshold (minmap4), "sample" imputation, and
# no SNP subsampling; then compute the distances.
from ipyrad.analysis.distance import Distance

dist = Distance(
    data=data, imap=imap, minmap=minmap4, mincov=0.85,
    impute_method="sample", subsample_snps=False,
)
dist.run()

Samples: 50
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 14244
Filtered (mincov): 666808
Filtered (minmap): 739034
Filtered (combined): 741160
Sites after filtering: 78774
Sites containing missing values: 46186 (58.63%)
Missing values in SNP matrix: 82019 (2.08%)
Imputation: 'sampled'; (0, 1, 2) = 37.1%, 2.1%, 60.8%


In [48]:
# Export the distance table held in `dist.dists` in two layouts.
labelled_csv = "M_ru_distances_12Jan2022.csv"
eems_csv = "M_ru_distances_eems_12Jan2022.csv"

# 1) default CSV output of the table
dist.dists.to_csv(labelled_csv)

# 2) EEMS-style file: no header, no index, space-separated values
dist.dists.to_csv(eems_csv, header=None, index=False, sep=" ")

In [49]:
# Population map with eight groups (group name -> list of sample names).
# The "ref": ["reference"] entry is intentionally left disabled.
# Group order and within-group sample order are kept exactly as before.
imap = {
    "Inam": [
        "M_ru_A7875_In", "M_ru_T14456_In", "M_ru_T23245_In",
        "M_ru_T23246_In", "M_ru_T23416_In", "M_ru_T23478_In",
    ],
    "Puru": [
        "M_ru_A10311_pu", "M_ru_A10329_pu", "M_ru_A1380_pu",
        "M_ru_A2741_pu", "M_ru_A436_pu", "M_ru_A440_pu",
    ],
    "JiGu": [
        "M_ru_A207_jigu", "M_ru_T3228_jigu", "M_ru_T7634_jigu",
    ],
    "Mach": [
        "M_ru_T13253_ma", "M_ru_A474_ma", "M_ru_T3164_ma", "M_ru_J265_ma",
        "M_ru_T368_ma", "M_ru_T381_ma", "M_ru_T476_ma", "M_ru_T494_ma",
    ],
    "Roar": [
        "M_ru_J640_roar", "M_ru_J676_roar",
    ],
    "ArSu": [
        "M_ru_80819_arsu", "M_ru_81347_arsu", "M_ru_85426_arsu",
    ],
    "SuTa": [
        "M_ru_85919_suta", "M_ru_77750_suta", "M_ru_78182_suta",
        "M_ru_A11834_suta", "M_ru_A15176_suta", "M_ru_A5487_suta",
        "M_ru_T10184_suta", "M_ru_T11780_suta", "M_ru_T14532_suta",
        "M_ru_T14622_suta", "M_ru_T753_suta", "M_ru_A9903_pa",
    ],
    "Para": [
        "M_ru_A16195_pa", "M_ru_A9235_pa", "M_ru_T11079_pa",
        "M_ru_T11238_pa", "M_ru_T12541_pa", "M_ru_T1649_pa",
        "M_ru_T16553_pa", "M_ru_T19782_pa", "M_ru_T6500_pa",
        "M_ru_T6577_pa",
    ],
}

# Same 0.85 threshold for every group defined in imap.
minmap4 = dict.fromkeys(imap, 0.85)

In [50]:
# Configure a second ipyrad distance tool, using the population map and
# thresholds that are current at this point in the notebook, and run it.
from ipyrad.analysis.distance import Distance

# Options are collected in one mapping and unpacked as keyword arguments.
dist2_settings = {
    "data": data,              # path to the HDF5 SNP file
    "imap": imap,              # current population map
    "minmap": minmap4,         # per-group threshold (0.85 for every group)
    "mincov": 0.85,
    "impute_method": "sample",
    "subsample_snps": False,
}
dist2 = Distance(**dist2_settings)
dist2.run()

Samples: 50
Sites before filtering: 819934
Filtered (indels): 0
Filtered (bi-allel): 14244
Filtered (mincov): 666808
Filtered (minmap): 760928
Filtered (combined): 762475
Sites after filtering: 57459
Sites containing missing values: 24871 (43.28%)
Missing values in SNP matrix: 32039 (1.12%)
Imputation: 'sampled'; (0, 1, 2) = 37.3%, 2.0%, 60.6%


In [51]:
def draw_grouped_distance_matrix(distance_tool, label=None):
    """Plot a distance matrix with samples ordered by imap group.

    Parameters
    ----------
    distance_tool
        Object exposing ``imap`` (dict of group name -> list of sample names)
        and ``dists`` (a labelled square table indexable by sample name),
        such as ``dist`` or ``dist2``.
    label : str, optional
        Title drawn with the matrix so the figure can be read on its own.

    Returns
    -------
    tuple
        ``(ordered_names, ordered_matrix)``: the concatenated sample names and
        the distance table reordered to match them.
    """
    # concatenate the sample names of every group, in imap order
    names = [name for group in distance_tool.imap.values() for name in group]

    # reorder both axes of the matrix to match the name order
    matrix = distance_tool.dists[names].T[names]

    toyplot.matrix(
        matrix,
        label=label,
        bshow=False,
        tshow=False,
        rlocator=toyplot.locator.Explicit(range(len(names)), names),
    )
    return names, matrix


# first figure: distances computed with the structure-based groups (dist)
draw_grouped_distance_matrix(dist, label="Structure-based assignments")

# second figure: distances computed with the a priori groups (dist2);
# ordered_names / ordered_matrix keep referring to this last matrix
ordered_names, ordered_matrix = draw_grouped_distance_matrix(
    dist2, label="A priori assignments"
)

Top: structure-based assignments; bottom: a priori assignments. As you can see, the two distance matrices are nearly identical.