# PCA analysis
PCA stands for Principal Component Analysis. It is a dimensionality reduction method used to transform and project data points onto fewer orthogonal axes that can explain the greatest amount of variance in the data.

In [1]:
# the path to your .snps.hdf5 database file 
data = "/home/marianna/Documents/Phacelia/Phac_Assembly_ingroup/min12_clust90_outfiles/min12_clust90.snps.hdf5"

In [2]:
## imports
import ipyrad as ip
import ipyrad.analysis as ipa
import ipyparallel as ipp
import toyplot
import pandas as pd
import numpy as np   # numerical library

In [3]:
# After the cluster is running attach to it with ipyparallel
ipyclient = ipp.Client()
print(ip.cluster_info(ipyclient))

Parallel connection | Cryptantha: 50 cores
None


### Grouping individuals into populations based on the topology and the 'structure analysis' 

<b>imap dictionary</b> : assigns individual samples to populations for visualization purposes

<b>minimap dictionary</b> : states the required percentage coverage in each population; here it's 50%

In [4]:
# group individuals into populations, according to my first structure analysis.
imap ={
    "sin": ["W6368","W6376"],
    "pinn_peru": ["W5599","W6027","W6028","W6037","W6078","W6024","W5637", "W5636"],
    "nana": ["W6373"],
    "argent": ["W5610","W6374", "W6080","W6369"],
    "set_arg": ["W6375", "W6031","W6370"],
    "set_ch": ["W5145", "W5612"],
    "set_pe": ["W6001", "W6079","W6029"],
   
}

## require that 50% of samples have data in each group
minmap = {i: 0.5 for i in imap} 

In [5]:
# init pca object with input data and (optional) parameter options
pca = ipa.pca(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.25,   
    impute_method="sample",
)


Samples: 23
Sites before filtering: 19359
Filtered (indels): 1020
Filtered (bi-allel): 210
Filtered (mincov): 51
Filtered (minmap): 5685
Filtered (subsample invariant): 6133
Filtered (minor allele frequency): 0
Filtered (combined): 10771
Sites after filtering: 8587
Sites containing missing values: 5548 (64.61%)
Missing values in SNP matrix: 13587 (6.88%)
SNPs (total): 8587
SNPs (unlinked): 4200
Imputation: 'sampled'; (0, 1, 2) = 90.0%, 6.7%, 3.3%


In [6]:
# run the PCA analysis
pca.run()


Subsampling SNPs: 4200/8587


In [7]:
# store the PC axes as a dataframe
df = pd.DataFrame(pca.pcaxes[0], index=pca.names)

# write the PC axes to a CSV file
df.to_csv("pca_analysis20221121.csv")

# show the first ten samples and the first 10 PC axes
df.iloc[:10, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
W5145,-7.73,5.25,-12.05,19.16,3.22,1.41,-0.99,-0.91,-0.18,0.03
W5599,5.67,2.8,0.03,-1.67,-1.19,-0.56,-0.95,16.43,2.68,17.91
W5610,-3.82,-1.39,-4.05,-3.74,-4.19,-5.64,-2.12,1.67,-1.23,4.46
W5612,-7.93,4.83,-12.32,19.48,2.89,1.72,-0.89,-0.72,-0.62,0.56
W5636,16.92,1.45,1.89,0.92,1.18,-1.12,-0.26,-8.24,0.78,2.94
W5637,18.22,1.55,2.25,1.19,1.17,-1.09,-0.22,-10.34,0.77,2.83
W6001,-10.3,14.65,8.16,-3.1,1.59,0.81,-0.05,-3.4,-0.21,1.69
W6024,15.66,1.71,1.51,0.96,1.02,-1.26,-0.05,-8.94,0.41,4.87
W6027,11.36,0.59,0.51,0.4,0.83,0.38,0.1,10.24,-2.1,-12.66
W6028,8.73,0.99,0.11,-0.44,0.31,0.6,0.03,9.04,-0.58,-5.12


In [8]:
# plot PC axes 0 and 2
pca.draw(0, 2);
# plot PC axes 0 and 1
pca.draw(0, 1);

In [9]:
# plot PC axes 0 and 2 with many replicate subsamples
pca.run(nreplicates=25, seed=12345)
pca.draw(0, 2);
# plot PC axes 0 and 1 with many replicate subsamples
pca.run(nreplicates=25, seed=12345)
pca.draw(0, 1);

Subsampling SNPs: 4200/8587
Subsampling SNPs: 4200/8587


In [10]:
# plot PC axes 0 and 2 with many replicate subsamples
pca.run(nreplicates=25, seed=12345)
pca.draw(0, 3);
# plot PC axes 0 and 1 with many replicate subsamples
pca.run(nreplicates=25, seed=12345)
pca.draw(0, 4);

Subsampling SNPs: 4200/8587
Subsampling SNPs: 4200/8587


In [11]:
pca.variances

{0: array([1.31947704e-01, 9.05674847e-02, 8.23719216e-02, 8.02147488e-02,
        6.91838922e-02, 6.52040275e-02, 5.61090408e-02, 4.75830478e-02,
        4.08214420e-02, 3.94493415e-02, 3.64779473e-02, 3.54958006e-02,
        3.10642819e-02, 3.05274876e-02, 2.78847857e-02, 2.71492781e-02,
        2.64430577e-02, 2.50538310e-02, 2.45298237e-02, 1.86638471e-02,
        1.31394263e-02, 1.17781446e-04, 1.12914849e-31]),
 1: array([1.27693646e-01, 9.42508229e-02, 8.13233310e-02, 7.98370565e-02,
        7.05450150e-02, 6.41789055e-02, 5.52506922e-02, 4.66280420e-02,
        4.08743566e-02, 4.04462032e-02, 3.52945733e-02, 3.47873597e-02,
        3.25378429e-02, 3.11216824e-02, 2.85571474e-02, 2.77325003e-02,
        2.69422873e-02, 2.60522497e-02, 2.48209533e-02, 1.75012011e-02,
        1.35048105e-02, 1.19320883e-04, 2.73991422e-31]),
 2: array([1.32484667e-01, 9.19734449e-02, 8.42850304e-02, 8.10782839e-02,
        7.00356388e-02, 6.40366348e-02, 5.39031583e-02, 4.62093969e-02,
        4.0

In [12]:
# store the variances as a dataframe
df_var = pd.DataFrame(pca.variances, index=pca.names)

# write the PC axes to a CSV file
df_var.to_csv("pca_variances.csv")

# show the first ten samples and the first 10 PC axes
df_var.iloc[:10, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
W5145,0.13,0.13,0.13,0.12,0.13,0.12,0.13,0.12,0.13,0.13
W5599,0.09,0.09,0.09,0.09,0.1,0.09,0.09,0.1,0.09,0.1
W5610,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08
W5612,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08
W5636,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07
W5637,0.07,0.06,0.06,0.07,0.06,0.06,0.06,0.07,0.07,0.06
W6001,0.06,0.06,0.05,0.06,0.05,0.06,0.06,0.06,0.06,0.06
W6024,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.04,0.05
W6027,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04
W6028,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04


In [24]:
# The outfile end in `.pdf` or `.svg`
pca.draw(0, 1, outfile="/home/marianna/Documents/Phacelia/Figures/PCA_01_20240117.pdf")

pca.draw(0, 2, outfile="/home/marianna/Documents/Phacelia/Figures/PCA_02_20240117.pdf")

pca.draw(0, 3, outfile="/home/marianna/Documents/Phacelia/Figures/PCA_03_20240117.pdf")

pca.draw(0, 4, outfile="/home/marianna/Documents/Phacelia/Figures/PCA_04_220240117.pdf")



(<toyplot.canvas.Canvas at 0x7f1cbdfccc70>,
 <toyplot.coordinates.Cartesian at 0x7f1cbe4ae1a0>)

### Grouping individuals into populations based on their geographical distribution

In [13]:
imap2 = {
"Peru": ["W5599", "W6078", "W5636","W6037", "W6024", "W5637", "W6079", "W6001", "W6028", "W6029", "W6027"],
"Argentina": ["W6368", "W6376", "W6373","W6374", "W6080", "W6369", "W5610","W6375","W6031","W6370"],
"Chile": ["W5612","W5145"],
}
## require that 50% of samples have data in each group
minmap = {i: 0.5 for i in imap2}

In [14]:
# init pca object with input data and (optional) parameter options
pca2 = ipa.pca(
    data=data,
    imap=imap2,
    minmap=minmap,
    mincov=0.25,   
    impute_method="sample",
)

Samples: 23
Sites before filtering: 19359
Filtered (indels): 1020
Filtered (bi-allel): 210
Filtered (mincov): 51
Filtered (minmap): 2298
Filtered (subsample invariant): 6133
Filtered (minor allele frequency): 0
Filtered (combined): 8381
Sites after filtering: 10977
Sites containing missing values: 7938 (72.31%)
Missing values in SNP matrix: 26176 (10.37%)
SNPs (total): 10977
SNPs (unlinked): 5189
Imputation: 'sampled'; (0, 1, 2) = 87.7%, 10.4%, 1.9%


In [15]:
# run the PCA analysis
pca2.run()

Subsampling SNPs: 5189/10977


In [16]:
# store the PC axes as a dataframe
df = pd.DataFrame(pca2.pcaxes[0], index=pca2.names)

# write the PC axes to a CSV file
df.to_csv("pca2_analysis_20221121.csv")

# show the first ten samples and the first 10 PC axes
df.iloc[:10, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
W5145,-7.52,24.55,6.91,7.06,-3.75,-1.64,-0.55,-0.22,-0.39,-0.06
W5599,6.27,0.25,-3.05,-4.22,0.96,-0.68,6.98,21.02,-14.83,-4.32
W5610,-5.03,-0.27,-4.13,-1.81,-1.23,1.05,9.82,2.4,-6.4,2.33
W5612,-8.11,25.67,7.24,7.66,-4.43,-1.53,-0.75,0.4,-0.93,0.24
W5636,17.01,-2.87,2.93,3.66,-1.7,0.24,-2.37,-8.55,-7.09,0.55
W5637,17.72,-2.89,3.23,4.11,-2.29,0.27,-2.71,-9.33,-7.27,0.62
W6001,-2.62,7.03,-1.15,-18.03,11.47,-1.53,-5.54,-5.26,1.87,-1.4
W6024,16.95,-2.4,2.82,3.44,-1.56,0.14,-2.08,-8.56,-7.22,0.96
W6027,12.12,-1.29,0.71,2.51,-1.18,-0.28,1.97,7.06,17.22,0.32
W6028,10.82,-1.59,-0.46,1.14,-0.34,-0.37,2.04,7.56,5.21,-1.59


In [17]:
# plot PC axes 0 and 2
pca2.draw(0, 2);

In [18]:
# plot PC axes 0 and 1
pca2.draw(0, 1);

In [19]:
# plot PC axes 0 and 2 with many replicate subsamples
pca2.run(nreplicates=25, seed=12345)
pca2.draw(0, 2);
# plot PC axes 0 and 1 with many replicate subsamples
pca2.run(nreplicates=25, seed=12345)
pca2.draw(0, 1);

Subsampling SNPs: 5189/10977
Subsampling SNPs: 5189/10977


In [20]:
pca2.variances 

{0: array([1.25825733e-01, 8.87161600e-02, 7.63419781e-02, 6.96243201e-02,
        6.74637168e-02, 5.87793367e-02, 5.26333016e-02, 4.83279415e-02,
        4.29763346e-02, 4.24124349e-02, 4.02531350e-02, 3.77926240e-02,
        3.54486784e-02, 3.35798265e-02, 3.26558335e-02, 2.95169036e-02,
        2.82360857e-02, 2.80259766e-02, 2.59887821e-02, 1.83873273e-02,
        1.33818550e-02, 3.63171505e-03, 3.61942267e-32]),
 1: array([1.29719591e-01, 8.71629665e-02, 7.63077098e-02, 6.90741069e-02,
        6.72411965e-02, 5.99083114e-02, 5.21258545e-02, 4.83724842e-02,
        4.39198241e-02, 4.22438067e-02, 3.96177120e-02, 3.61602857e-02,
        3.46513447e-02, 3.39360449e-02, 3.31497587e-02, 2.86620837e-02,
        2.83357654e-02, 2.79296195e-02, 2.61142722e-02, 1.78869970e-02,
        1.39130157e-02, 3.56724877e-03, 2.22164262e-31]),
 2: array([1.30121297e-01, 8.74759284e-02, 7.74924822e-02, 6.86268133e-02,
        6.67647383e-02, 5.88329598e-02, 5.23652140e-02, 4.63260591e-02,
        4.4

In [21]:
# store the variances as a dataframe
df_var = pd.DataFrame(pca2.variances, index=pca2.names)

# write the PC axes to a CSV file
df_var.to_csv("pca2_variances.csv")

# show the first ten samples and the first 10 PC axes
df_var.iloc[:10, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
W5145,0.13,0.13,0.13,0.13,0.13,0.13,0.13,0.13,0.13,0.13
W5599,0.09,0.09,0.09,0.09,0.09,0.08,0.09,0.09,0.09,0.09
W5610,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08
W5612,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07
W5636,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07
W5637,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06
W6001,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
W6024,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
W6027,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04
W6028,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04


In [22]:
# plot PC axes 0 and 2 with many replicate subsamples
pca2.run(nreplicates=25, seed=12345)
pca2.draw(0, 3);
# plot PC axes 0 and 1 with many replicate subsamples
pca2.run(nreplicates=25, seed=12345)
pca2.draw(0, 4);

Subsampling SNPs: 5189/10977
Subsampling SNPs: 5189/10977


In [25]:
# The outfile end in `.pdf` or `.svg`
pca2.draw(0, 1, outfile="/home/marianna/Documents/Phacelia/Figures/PCA_imap2_01_20240117.pdf")

pca2.draw(0, 2, outfile="/home/marianna/Documents/Phacelia/Figures/PCA_imap2_02_20240117.pdf")

pca2.draw(0, 3, outfile="/home/marianna/Documents/Phacelia/Figures/PCA_imap2_03_20240117.pdf")

pca2.draw(0, 4, outfile="/home/marianna/Documents/Phacelia/Figures/PCA_imap2_04_220240117.pdf")

(<toyplot.canvas.Canvas at 0x7f1cbe0f0f70>,
 <toyplot.coordinates.Cartesian at 0x7f1cbe405c00>)

In [23]:
# plotting all the PCA plots together
# get canvas object and set size
c = toyplot.Canvas(width=1000, height=900)

## built & dissect canvas into multiple cartesian areas 
##    y1
## x1    x2
##    y2
a0 = c.cartesian(bounds=('11%','38%', '6%',  '35%'))## x1, x2, y1, y2
a1 = c.cartesian(bounds=('59%', '85%', '6%', '35%'))
a2 = c.cartesian(bounds=('11%','38%', '50%', '80%'))
a3 = c.cartesian(bounds=('59%', '85%', '50%', '80%'))


pca.draw(0, 1, axes= a0);
pca.draw(0, 2, axes= a1);
pca2.draw(0, 1, axes= a2);
pca2.draw(0, 2, axes= a3);


In [21]:
# The outfile end in `.pdf` or `.svg`
import toyplot.pdf
toyplot.pdf.render(c,"/home/marianna/Documents/Phacelia/Figures/PCA_all_20221121.pdf")

import toyplot.svg
toyplot.pdf.render(c, "/home/marianna/Documents/Phacelia/Phac_Analyses/PCA_all_20221121.svg")