In [None]:
%cd ..

import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from src.dataset import make_dataset
from src.models import train_model
from src.preprocessing import build_features
from src.preprocessing import data_cleaning
from src.visualizations import visualize

In [None]:
# Parse the JSON run configuration once; `config` is read as a module-level
# name by the analysis function defined further down.
with open('config/default-cancer-stage.json', "r") as config_file:
    config = json.loads(config_file.read())

In [None]:
def create_pca(data, targets, target_column):
    """Run a two-component PCA on ``data``, plot it, and rank feature loadings.

    The features are standardized with ``StandardScaler`` and reduced with
    ``PCA(n_components=2)``. The projected samples are drawn as a scatterplot
    colored by ``target_column`` and the figure is saved to
    ``figures/pca/PCA_<target_column>.png`` (the directory is created when it
    does not exist yet).

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table, one row per sample. Its index is reused for the
        projected coordinates and its columns label the loading scores.
    targets : pandas.DataFrame
        Frame containing ``target_column``; it is merged onto the projected
        coordinates with ``on="sampleid"``.
    target_column : str
        Column of ``targets`` used for the point hue, the legend title and the
        output filename.

    Returns
    -------
    tuple of pandas.Series
        ``(pc1_loading_scores, pc2_loading_scores)``: the absolute loading of
        every feature on PC1 and PC2, sorted from largest to smallest.
    """
    # Standardize first so that no feature dominates purely because of scale.
    scaled_data = StandardScaler().fit_transform(data)
    pca_model = PCA(n_components=2)
    projected = pca_model.fit_transform(scaled_data)

    # Back to a DataFrame (columns are the integers 0 and 1) so the targets
    # can be re-attached.
    # NOTE(review): the merge key assumes "sampleid" is the index name (or a
    # column) in both frames -- confirm against the data loader.
    pca_data = pd.DataFrame(projected, index=data.index)
    pca_data = pd.merge(pca_data, targets, on="sampleid", how="inner")

    # Draw on an explicit figure/axes so repeated calls never paint over a
    # previous plot held in pyplot's implicit state.
    fig, ax = plt.subplots()
    sns.scatterplot(x=0, y=1, data=pca_data,
                    hue=target_column,
                    alpha=0.3,
                    ax=ax)
    # This is a PCA, not a PCoA (principal coordinates analysis).
    ax.set_title("PCA")
    ax.legend(bbox_to_anchor=(1.01, 1),
              loc=2,
              borderaxespad=0.,
              title=target_column)

    # explained_variance_ratio_ is the *fraction* of variance per component;
    # explained_variance_ holds raw eigenvalues and is not a percentage.
    pc1_pct, pc2_pct = pca_model.explained_variance_ratio_[:2] * 100
    ax.set_xlabel(f'PC1 ({pc1_pct:.2f}%)')
    ax.set_ylabel(f'PC2 ({pc2_pct:.2f}%)')

    # Build the path portably; the previous backslash literal only resolved to
    # a nested path on Windows and relied on invalid escape sequences.
    output_dir = os.path.join("figures", "pca")
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, "PCA_" + target_column + ".png"),
                bbox_inches='tight')

    # Loading scores: one weight per original feature for each component,
    # ranked by magnitude (the sign is discarded by abs()).
    pc1_loading_scores = pd.Series(pca_model.components_[0], index=data.columns)
    pc2_loading_scores = pd.Series(pca_model.components_[1], index=data.columns)
    pc1_loading_scores = pc1_loading_scores.abs().sort_values(ascending=False)
    pc2_loading_scores = pc2_loading_scores.abs().sort_values(ascending=False)
    return pc1_loading_scores, pc2_loading_scores

In [None]:
def pca(target_column):
    """Run the metadata PCA colored by ``target_column`` and print top loadings.

    Loads the sample metadata, keeps WGS samples with a Stage I-IV
    ``pathologic_stage_label``, removes ``target_column`` from the feature
    table, preprocesses the remainder with
    ``build_features.preprocess_metadata`` (using the module-level ``config``),
    and hands the result to ``create_pca``. The five largest PC1 and PC2
    loading scores are printed.

    Parameters
    ----------
    target_column : str
        Metadata column used to color the PCA scatterplot. It is excluded from
        the features the PCA is fitted on.

    Returns
    -------
    None
    """
    metadata_filename = 'data/metadata_species_WIS_overlapping_fungi_bacteria_12773samples.tsv'
    stages = ['Stage I', 'Stage II', 'Stage III', 'Stage IV']
    n_top = 5  # number of loading scores printed per component

    # NOTE(review): the original cell also loaded the counts table and
    # left-merged it onto the metadata, but the merged frame was never used --
    # the PCA has always been fitted on the metadata features alone. That dead
    # load/merge was removed; if counts were meant to be included, merge them
    # here and pass the merged frame to create_pca instead.
    raw_metadata = make_dataset.read_data_file(metadata_filename)

    metadata = raw_metadata.replace('Not available', np.nan)
    for col in ['pathologic_t_label', 'pathologic_n_label', 'pathologic_stage_label']:
        metadata[col] = data_cleaning.reduce_stages(metadata[col])

    # Keep only samples with a known stage from a single experimental strategy.
    # .copy() makes the later column assignment act on an independent frame
    # rather than on a slice of the unfiltered one.
    keep = (metadata.pathologic_stage_label.isin(stages)
            & (metadata.experimental_strategy == 'WGS'))
    metadata = metadata[keep].copy()

    # Order rows by cancer stage.
    metadata['pathologic_stage_label'] = pd.Categorical(
        metadata['pathologic_stage_label'], categories=stages)
    metadata = metadata.sort_values(by='pathologic_stage_label')

    # Set the target aside for coloring and remove it from the PCA features.
    # DataFrame.drop returns a new frame, so the result must be assigned; the
    # previous bare call left the target column in place.
    # NOTE(review): confirm preprocess_metadata does not require this column.
    targets = metadata[[target_column]]
    metadata = metadata.drop(columns=[target_column])

    metadata = build_features.preprocess_metadata(config, metadata)
    # Discard the last seven columns of the preprocessed frame.
    # NOTE(review): which columns these are depends on preprocess_metadata's
    # output order -- confirm and prefer dropping them by name.
    metadata = metadata.iloc[:, :-7]

    pc1_scores, pc2_scores = create_pca(metadata, targets, target_column)

    print('Target Column: ' + target_column)
    print()
    print("----  PC1 Loading Scores:  ----")
    print(pc1_scores[:n_top])
    print()
    print("----  PC2 Loading Scores:  ----")
    # Previously this printed the PC1 scores a second time.
    print(pc2_scores[:n_top])

In [None]:
# Color the PCA by the reference genome each sample was processed against.
pca(target_column='reference_genome')