# PCA plotting

### Setup

In [None]:
import malariagen_data

In [None]:
# Ag3 client pointed at gs://vo_agam_release via the simplecache protocol,
# with cache_storage "../gcs_cache" and results_cache "results_cache".
ag3_simplecache_options = {"cache_storage": "../gcs_cache"}
ag3 = malariagen_data.Ag3(
    "simplecache::gs://vo_agam_release",
    simplecache=ag3_simplecache_options,
    results_cache="results_cache",
)
# Last expression of the cell: display the client.
ag3

In [None]:
# Af1 client pointed at gs://vo_afun_release via the simplecache protocol,
# with cache_storage "../gcs_cache" and results_cache "results_cache".
af1_simplecache_options = {"cache_storage": "../gcs_cache"}
af1 = malariagen_data.Af1(
    "simplecache::gs://vo_afun_release",
    simplecache=af1_simplecache_options,
    results_cache="results_cache",
)
# Last expression of the cell: display the client.
af1

In [None]:
# Shell escape: recursively delete the local "results_cache" directory, i.e.
# the same relative path passed as results_cache to both clients above.
# NOTE(review): this runs after the clients are constructed — presumably to
# force the PCA cells below to recompute instead of reusing old results; confirm.
!rm -rf results_cache

## Mayotte

In [None]:
# PCA for sample set AG1000G-FR over 3L:15,000,000-16,000,000 with
# n_snps=10_000; the call returns a pair unpacked into df_pca and evr.
mayotte_pca_kwargs = {
    "region": "3L:15,000,000-16,000,000",
    "sample_sets": "AG1000G-FR",
    "n_snps": 10_000,
}
df_pca, evr = ag3.pca(**mayotte_pca_kwargs)
# Display the first returned value.
df_pca

In [None]:
# Display the second value returned by ag3.pca() (later passed to
# plot_pca_variance, so presumably the explained variance ratio — confirm).
evr

In [None]:
# Plot the PCA coordinates with every plotting option left at its default.
ag3.plot_pca_coords(df_pca)

In [None]:
# Same plot, this time with color="location".
ag3.plot_pca_coords(df_pca, color="location")

In [None]:
# Repeats the Mayotte PCA request with exactly the same arguments as the
# first call in this section, without displaying the frame.
# NOTE(review): presumably repeated to exercise the results cache — confirm.
mayotte_pca_result = ag3.pca(
    region="3L:15,000,000-16,000,000",
    sample_sets="AG1000G-FR",
    n_snps=10_000,
)
df_pca, evr = mayotte_pca_result

In [None]:
# Pass the second ag3.pca() return value to the variance plot.
ag3.plot_pca_variance(evr)

In [None]:
# Plot the recomputed Mayotte coordinates with color="location".
mayotte_plot_options = {"color": "location"}
ag3.plot_pca_coords(df_pca, **mayotte_plot_options)

## Burkina Faso

In [None]:
# PCA for sample set AG1000G-BF-A over the same 3L window, additionally
# passing max_cohort_size=50.
burkina_pca_kwargs = {
    "region": "3L:15,000,000-16,000,000",
    "sample_sets": "AG1000G-BF-A",
    "n_snps": 10_000,
    "max_cohort_size": 50,
}
df_pca, evr = ag3.pca(**burkina_pca_kwargs)

In [None]:
# Plot with color="taxon", passing an explicit category order for taxon:
# coluzzii first, then gambiae.
burkina_taxon_order = {"taxon": ["coluzzii", "gambiae"]}
ag3.plot_pca_coords(df_pca, color="taxon", category_orders=burkina_taxon_order)

In [None]:
# Plot with color="admin1_year" and symbol="taxon".
ag3.plot_pca_coords(df_pca, color="admin1_year", symbol="taxon")

## Ag3.0

In [None]:
# PCA for sample_sets "3.0" over the same 3L window, with a sample query
# that drops rows whose country is 'Lab Cross'.
ag3_release_pca_kwargs = {
    "region": "3L:15,000,000-16,000,000",
    "sample_sets": "3.0",
    "sample_query": "country != 'Lab Cross'",
    "n_snps": 10_000,
}
df_pca, evr = ag3.pca(**ag3_release_pca_kwargs)

In [None]:
# Repeats the Ag3.0 PCA request with exactly the same arguments as the
# preceding cell.
# NOTE(review): presumably repeated to exercise the results cache — confirm.
ag3_release_pca_result = ag3.pca(
    region="3L:15,000,000-16,000,000",
    sample_sets="3.0",
    sample_query="country != 'Lab Cross'",
    n_snps=10_000,
)
df_pca, evr = ag3_release_pca_result

In [None]:
# Pass the second return value of the Ag3.0 PCA call to the variance plot.
ag3.plot_pca_variance(evr)

In [None]:
# Plot with color="taxon" and marker_size=5.
ag3.plot_pca_coords(df_pca, color="taxon", marker_size=5)

In [None]:
# Plot PC3 on x and PC4 on y, with color="country" and marker_size=5.
pc3_pc4_axes = {"x": "PC3", "y": "PC4"}
ag3.plot_pca_coords(df_pca, **pc3_pc4_axes, color="country", marker_size=5)

In [None]:
# 3D variant of the coordinates plot, with color="taxon" and marker_size=2.
ag3.plot_pca_coords_3d(df_pca, color="taxon", marker_size=2)

In [None]:
# Same 3D plot, now passing an explicit category order for the taxon values.
taxon_order = ["coluzzii", "gambiae", "arabiensis", "gcx1", "gcx2", "gcx3"]
ag3.plot_pca_coords_3d(
    df_pca,
    color="taxon",
    category_orders={"taxon": taxon_order},
    marker_size=2,
)

In [None]:
# Mapping of label -> query string over the `country` column; passed as the
# `color` argument to the plotting calls in the following cells.
new_cohorts = dict(
    East="country in ['Malawi', 'Tanzania', 'Kenya', 'Uganda']",
    West="country in ['Mali', 'Burkina Faso', 'Cameroon']",
)

In [None]:
# Plot passing the new_cohorts mapping (label -> query) as `color`,
# with marker_size=5.
ag3.plot_pca_coords(df_pca, color=new_cohorts, marker_size=5)

In [None]:
# 3D plot passing the new_cohorts mapping as `color`, with marker_size=2.
ag3.plot_pca_coords_3d(df_pca, color=new_cohorts, marker_size=2)

## Af1.0

In [None]:
# PCA through the Af1 client: sample_sets "1.0" over 3RL:15,000,000-16,000,000,
# with a sample query that drops rows whose country is 'Ghana'.
af1_release_pca_kwargs = {
    "region": "3RL:15,000,000-16,000,000",
    "sample_sets": "1.0",
    "sample_query": "country != 'Ghana'",
    "n_snps": 10_000,
}
df_pca, evr = af1.pca(**af1_release_pca_kwargs)

In [None]:
# Pass the second return value of the Af1 PCA call to the variance plot.
af1.plot_pca_variance(evr)

In [None]:
# Plot the Af1 PCA coordinates with color="country".
af1.plot_pca_coords(df_pca, color="country")

In [None]:
# Plot PC3 on x and PC4 on y for the Af1 result, with color="country".
af1.plot_pca_coords(df_pca, x="PC3", y="PC4", color="country")

## Excluding samples

In [None]:
# Baseline PCA for AG1000G-BF-A (no samples excluded), with the same region,
# n_snps and max_cohort_size used by the exclusion runs later in the notebook.
baseline_pca_kwargs = {
    "region": "3L:15,000,000-16,000,000",
    "sample_sets": "AG1000G-BF-A",
    "n_snps": 10_000,
    "max_cohort_size": 50,
}
df_pca, evr = ag3.pca(**baseline_pca_kwargs)

In [None]:
# Preview the baseline PCA frame via head().
df_pca.head()

In [None]:
# Pass the second return value of the baseline PCA call to the variance plot.
ag3.plot_pca_variance(evr)

In [None]:
# Plot the baseline coordinates with color="taxon".
ag3.plot_pca_coords(df_pca, color="taxon")

In [None]:
# Sample IDs handed to ag3.pca() through its exclude_samples argument.
exclude_samples = [
    "AB0096-C",
    "AB0241-C",
    "AB0275-C",
    "AB0197-C",
]

# Same request as the baseline run, plus exclude_samples; results are bound
# to separate names so the baseline df_pca / evr stay available.
pca_excluded_result = ag3.pca(
    region="3L:15,000,000-16,000,000",
    sample_sets="AG1000G-BF-A",
    n_snps=10_000,
    max_cohort_size=50,
    exclude_samples=exclude_samples,
)
df_pca_ex, evr_ex = pca_excluded_result

In [None]:
# Preview the frame returned by the exclude_samples run via head().
df_pca_ex.head()

In [None]:
# Select rows whose sample_id is one of the excluded IDs.
# NOTE(review): presumably expected to come back empty — confirm.
df_pca_ex.query(f"sample_id in {exclude_samples}")

In [None]:
# Pass the second return value of the exclude_samples run to the variance plot.
ag3.plot_pca_variance(evr_ex)

In [None]:
# Plot the exclude_samples result with color="taxon".
ag3.plot_pca_coords(df_pca_ex, color="taxon")

## Excluding samples during fit

In [None]:
# Sample IDs handed to ag3.pca() through its fit_exclude_samples argument.
fit_exclude_samples = [
    "AB0096-C",
    "AB0241-C",
    "AB0275-C",
    "AB0197-C",
]

# Same request as the baseline run, plus fit_exclude_samples; results are
# bound to their own names (df_pca_fex / evr_fex).
pca_fit_excluded_result = ag3.pca(
    region="3L:15,000,000-16,000,000",
    sample_sets="AG1000G-BF-A",
    n_snps=10_000,
    max_cohort_size=50,
    fit_exclude_samples=fit_exclude_samples,
)
df_pca_fex, evr_fex = pca_fit_excluded_result

In [None]:
# Preview the frame returned by the fit_exclude_samples run via head().
df_pca_fex.head()

In [None]:
# Select rows whose sample_id is one of the fit-excluded IDs.
# NOTE(review): presumably these rows are still present, since the samples
# were only left out of the fit — confirm.
df_pca_fex.query(f"sample_id in {fit_exclude_samples}")

In [None]:
# Pass the second return value of the fit_exclude_samples run to the variance plot.
ag3.plot_pca_variance(evr_fex)

In [None]:
# Plot the fit_exclude_samples result with color="taxon".
ag3.plot_pca_coords(df_pca_fex, color="taxon")

In [None]:
# Plot the same result with color="pca_fit".
# NOTE(review): presumably pca_fit flags which samples were used to fit the
# model — confirm against the frame's columns.
ag3.plot_pca_coords(df_pca_fex, color="pca_fit")