In [None]:
from os import path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pickle

In [None]:
import warnings
# Ignore every FutureWarning raised from here on so they do not clutter cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
def example_V(n_genes=100):
    """Generate a small synthetic expression matrix, useful in tests.

    Five periodic signals (each offset by +1.1) are sampled at ``n_genes``
    evenly spaced points on [0, 8], perturbed with Gaussian noise, scaled to
    unit standard deviation per column and mixed with a fixed 5x5 matrix.

    Parameters
    ----------
    n_genes : int, default 100
        Number of rows (sample points) to generate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_genes, 5)``. The result is deterministic for a
        given ``n_genes``.
    """
    # A private generator seeded with 0 produces the same draws as
    # np.random.seed(0) followed by np.random.normal, but does not reset the
    # global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(0)
    time = np.linspace(0, 8, n_genes)

    s1 = np.sin(time) + 1.1  # Signal 1: sinusoid
    s2 = np.sign(np.sin(3 * time)) + 1.1  # Signal 2: square wave
    s3 = np.sin(2 * np.pi * time) + 1.1  # Signal 3: faster sinusoid (period 1)
    s4 = np.cos(0.5 * np.pi * time) + 1.1  # Signal 4: cosine (period 4)
    s5 = np.sin(0.2 * np.pi * time) + 1.1  # Signal 5: slow sinusoid (period 10)

    W = np.c_[s1, s2, s3, s4, s5]
    W += 0.1 * rng.normal(size=W.shape)  # Add noise

    W /= W.std(axis=0)  # Scale each column to unit standard deviation (no centring)

    # Mixing matrix.
    # NOTE(review): the second entry of the second row was originally written
    # as ``0/6``, which evaluates to 0.0. It may have been meant as 0.6; the
    # value is kept at 0.0 so the generated data does not change - confirm.
    H = np.array([[1, 1, 1, 1, 1],
                  [0.5, 0.0, 1, 1.2, 1],
                  [1.5, 1, 2, 1, 1.1],
                  [1, 0.4, 1, 1.1, 0.1],
                  [1, 0.2, 0.8, 1, 1.5]])
    V = np.dot(W, H.T)  # Generate observations
    return V

def test_example_V():
    """Sanity-check example_V: expected shape and no negative entries."""
    gene_count = 10
    matrix = example_V(gene_count)
    assert matrix.shape == (10, 5)
    assert (matrix >= 0).all()
    print("test_example_V() passed.")


test_example_V()

In [None]:
# Read in AOCS spreadsheet (the file is tab-separated despite its .csv suffix)
expression_df = pd.read_csv('../Data/HGSOC_Protein_Expression.csv', sep='\t')

# Sanity checks on the layout this notebook expects.
assert len(expression_df) == 19730
# The closing parenthesis used to sit after `80 + 1`, so the check took the
# length of a boolean array and passed for any non-empty frame. It now really
# compares the column count: one identifier column plus 80 data columns.
assert len(expression_df.columns) == 80 + 1
assert expression_df.columns[0] == 'GeneENSG'
assert expression_df.columns[-1] == 'AOCS_171'

# Drop the leading 'GeneENSG' identifier column and keep the rest as an array.
expression_matrix = np.asarray(expression_df.iloc[:,1:])

print(expression_matrix.shape[0], "genes")
print(expression_matrix.shape[1], "patients")

In [None]:
def test_Factorizer(facto, atol):
    """Smoke-test a factorizer object on the synthetic matrix from example_V(10).

    `facto` must expose `n_components`, `fit(V)`, `get_W()`, `get_H()` and
    `get_recovered_V()`. `atol` is the absolute tolerance handed to
    np.allclose when comparing the reconstruction against the input.
    """
    print(facto)
    data = example_V(10)
    n_rows, n_cols = data.shape
    k = facto.n_components

    facto.fit(data)

    # W must have one row per input row and one column per component.
    basis = facto.get_W()
    assert basis.shape == (n_rows, k)

    # H must have one row per component and one column per input column.
    weights = facto.get_H()
    assert weights.shape == (k, n_cols)

    # The reconstruction must match the input within the requested tolerance.
    rebuilt = facto.get_recovered_V()
    assert np.allclose(data, rebuilt, atol=atol)

    print("test_Factorizer (%s) passed" % type(facto).__name__)
    

In [None]:
class PCA_Factorizer(PCA):
    """PCA exposed through the W/H factorizer interface used by test_Factorizer.

    After ``fit(V)``:
      * ``get_W()`` returns the result of ``fit_transform(V)``,
      * ``get_H()`` returns ``components_``,
      * ``get_recovered_V()`` returns ``np.dot(W, H) + mean_``.

    Parameters
    ----------
    n_components : forwarded to ``PCA``.
    max_iter : stored but not used by this class.
        NOTE(review): presumably accepted for signature parity with other
        factorizers - confirm.
    random_state : forwarded to ``PCA``.
    """

    def __init__(self, n_components=None, max_iter=None, random_state=None):
        # random_state used to be accepted and then silently dropped; forward it.
        PCA.__init__(self, n_components=n_components, random_state=random_state)
        # scikit-learn's get_params()/repr() read every __init__ argument back
        # as an attribute, so max_iter must be stored even though it is unused.
        self.max_iter = max_iter
        self.V = None
        self.W = None
        self.H = None
        self.recovered_V = None

    def fit(self, V):
        """Fit the PCA on ``V``, cache ``W`` and return ``self``."""
        self.V = V
        # Drop results cached from a previous fit so the getters cannot
        # return H or a reconstruction belonging to older data.
        self.H = None
        self.recovered_V = None
        self.W = self.fit_transform(V)
        return self

    def get_W(self):
        """Return the transformed data; requires ``fit`` to have been called."""
        assert self.V is not None, "fit() must be called before get_W()"
        if self.W is None:
            self.W = self.fit_transform(self.V)
        return self.W

    def get_H(self):
        """Return ``components_``; requires ``fit`` to have been called."""
        assert self.V is not None, "fit() must be called before get_H()"
        if self.H is None:
            self.H = self.components_
        return self.H

    def get_recovered_V(self):
        """Return the reconstruction ``np.dot(W, H) + mean_`` (cached)."""
        assert self.V is not None, "fit() must be called before get_recovered_V()"
        if self.recovered_V is None:
            W = self.get_W()
            H = self.get_H()
            # Add mean_ back onto the product of W and H.
            self.recovered_V = np.dot(W, H) + self.mean_
        return self.recovered_V


# Check the factorizer with 4 components (loose tolerance) and with all
# 5 components (tight tolerance).
for n_comp, tol in ((4, 1), (5, 0.001)):
    test_Factorizer(PCA_Factorizer(n_components=n_comp), atol=tol)

In [None]:
# Fit a PCA with n_components left at its default on the expression matrix.
V = expression_matrix
pca = PCA_Factorizer()
pca.fit(V)

In [None]:
# Display the fitted PCA's explained_variance_ array as this cell's output.
pca.explained_variance_

In [None]:
# Total variance is the sum of the variances of each column of V (the code
# always iterated over columns; the old comment and label said "rows").
# ddof=1 matches the n_samples - 1 normalisation scikit-learn documents for
# explained_variance_, so the two printed sums are directly comparable.
total_var = np.var(V, axis=0, ddof=1).sum()
print("Sum of variance of all columns: %6.2f" % total_var)
print("Sum of the %d explained variances: %6.2f" % (len(pca.explained_variance_), sum(pca.explained_variance_)))

In [None]:
# Scree plot for components 2..30 (the first component is not plotted),
# expressed as a percentage of total_var.
first_k, last_k = 2, 30
component_numbers = range(first_k, last_k + 1)
pct_explained = pca.explained_variance_[first_k - 1:last_k] * 100 / total_var

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_numbers, pct_explained, 'o-')
ax.set_xlabel('Components (K)')
ax.set_ylabel('% Explained variance')
# Optional overlay, currently disabled:
# plt.plot([6, 12], pca.explained_variance_[[5, 11]], 'o')
ax.set_title("Scree plot for PCA variance explained")
plt.show()