In [None]:
from os import path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from factorizer_wrappers import PCA_Factorizer
from factorizer_wrappers import test_Factorizer

In [None]:
# Exercise the PCA wrapper through the shared test_Factorizer helper at two
# component counts, each with its own absolute tolerance (atol).
test_Factorizer(
    PCA_Factorizer(n_components=4),
    atol=0.5,
)
test_Factorizer(
    PCA_Factorizer(n_components=5),
    atol=0.001,
)

In [None]:
# Read in the AOCS spreadsheet (tab-separated, despite the .csv extension).
expression_df = pd.read_csv('../Data/HGSOC_Protein_Expression.csv', sep='\t')

# Sanity-check the layout the rest of the notebook relies on:
# 19730 data rows, one identifier column followed by 80 value columns.
assert len(expression_df) == 19730, \
    "expected 19730 rows, got %d" % len(expression_df)
# len() must wrap only the column index: len(columns == 81) would be the length
# of an elementwise boolean array, which is truthy for any non-empty frame.
assert len(expression_df.columns) == 80 + 1, \
    "expected 81 columns, got %d" % len(expression_df.columns)
assert expression_df.columns[0] == 'GeneENSG'
assert expression_df.columns[-1] == 'AOCS_171'

# Drop the leading identifier column; keep the remaining values as a 2-D array.
expression_matrix = np.asarray(expression_df.iloc[:, 1:])

print(expression_matrix.shape[0], "genes")
print(expression_matrix.shape[1], "patients")

In [None]:
# V is the short alias for the expression matrix used by the cells below.
V = expression_matrix

# Fit a PCA_Factorizer built with its default arguments on V.
pca = PCA_Factorizer()
pca.fit(V)

In [None]:
# Inspect the per-component explained variances stored on the fitted factorizer.
pca.explained_variance_

In [None]:
# Total variance of V, taken as the sum of np.var over each column V[:, i]
# (iterating V.T yields exactly those column slices).
# NOTE(review): the printed label says "rows", but the slices being summed are
# columns of V -- confirm which orientation PCA_Factorizer fits on.
total_var = np.sum([np.var(column) for column in V.T])
print("Sum of variance of all rows: %6.2f" % total_var)

# Compare against the total reported by the fitted factorizer's components.
explained = pca.explained_variance_
print("Sum of the %d explained variances: %6.2f" % (len(explained), sum(explained)))

In [None]:
# Scree plot: percentage of total variance explained by components 2..30.
# The first component (index 0) is left out of the plotted range.
component_ranks = range(2, 31)
pct_explained = pca.explained_variance_[1:30] * 100 / total_var

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_ranks, pct_explained, 'o-')
ax.set_xlabel('Components (K)')
ax.set_ylabel('% Explained variance')
# Disabled highlight of K=6 and K=12; note it uses raw explained_variance_
# values, not the percentage scale plotted above.
# plt.plot([6, 12], pca.explained_variance_[[5, 11]], 'o')
ax.set_title("Scree plot for PCA variance explained")
plt.show()