# Compare factors delivered by the three methods

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
% matplotlib inline
from sklearn.manifold import TSNE 
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

## Factorizer classes
These have now been hived off into ``factorizer_wrappers.py``.  Import and test them.

In [None]:
from factorizer_wrappers import ICA_Factorizer, NMF_Factorizer, PCA_Factorizer
from factorizer_wrappers import example_V, test_example_V, test_Factorizer

In [None]:
# Run the self-tests imported from factorizer_wrappers: first the example
# matrix helper, then each factorizer wrapper in turn.
test_example_V()

# Each method is exercised with 4 and with 5 components. The 4-component runs
# are given a much looser `atol` than the 5-component runs.
# NOTE(review): `atol` is presumably an absolute reconstruction tolerance —
# confirm against test_Factorizer in factorizer_wrappers.py.
test_Factorizer(ICA_Factorizer(n_components=4, fun='cube', algorithm='parallel'), atol=0.5) 
test_Factorizer(ICA_Factorizer(n_components=5), atol=0.001) 

test_Factorizer(NMF_Factorizer(n_components=4, max_iter=1000), atol=0.5) 
test_Factorizer(NMF_Factorizer(n_components=5, max_iter=1000), atol=0.1) 

test_Factorizer(PCA_Factorizer(n_components=4), atol=0.5) 
test_Factorizer(PCA_Factorizer(n_components=5), atol=0.001) 

## Read the expression matrix
This is repeated code and should be factored out...

In [None]:
# Read in the expression spreadsheet, which has been processed (see end of notebook) to include only protein-coding genes
expression_df = pd.read_csv('../Data/HGSOC_Protein_Expression.csv', sep='\t')
expression_df.set_index('GeneENSG', inplace=True)
# Guard against an unexpected input file: fixed row count, column count and last column name.
assert len(expression_df) == 19730   # Only protein-coding genes are expected to remain
assert len(expression_df.columns) == 80
assert expression_df.columns[-1] == 'AOCS_171'
# NOTE(review): sklearn's normalize() is called with its defaults, which presumably
# scale each row (here: each gene) to unit L2 norm — confirm that is the intent.
expression_matrix = normalize(np.asarray(expression_df))

# Rows are reported as genes and columns as patients.
print(expression_matrix.shape[0], "genes")
print(expression_matrix.shape[1], "patients")

In [None]:
def l2_norm_diff(m1, m2):
    """Return the root-mean-square of the element-wise difference ``m1 - m2``.

    Despite the name, the squared differences are averaged (not summed)
    over all elements before the square root is taken.
    """
    residual = m1 - m2
    mean_square = np.mean(residual ** 2)
    return np.sqrt(mean_square)

In [None]:
def distance(v1, v2):
    """Return the unsigned angle, in degrees, between vectors v1 and v2.

    The sign of the cosine is discarded, so a vector and its negation are
    treated as pointing the same way and the result lies in [0, 90].

    Parameters
    ----------
    v1, v2 : 1-D numpy arrays of equal length.

    Returns
    -------
    The angle in degrees. The result is NaN if either vector has zero
    magnitude (the cosine is then 0/0).
    """
    dotp = np.dot(v1, v2)
    v1_mag = np.sqrt(np.sum(v1 * v1))
    v2_mag = np.sqrt(np.sum(v2 * v2))
    costheta = dotp / (v1_mag * v2_mag)

    # Take the magnitude first and clamp afterwards: floating-point rounding
    # can leave the cosine marginally below -1 as well as above +1, and
    # arccos() of anything greater than 1 is NaN.
    angleRad = np.arccos(min(abs(costheta), 1.0))
    angleDeg = angleRad * (180 / np.pi)
    return angleDeg

# Quick check: identical vectors, an intermediate case, and orthogonal vectors.
(distance(np.array([1,0,0,1]), np.array([1,0,0,1])), 
 distance(np.array([1,0,0,1]), np.array([0,0,0,1])),
 distance(np.array([1,0,0,0]), np.array([0,0,0,1])))

In [None]:
# Angle calculation

def calc_angle(v1, v2, show=False):
    """Return the angle, in degrees, between vectors v1 and v2.

    Unlike ``distance``, the sign of the cosine is kept, so the result
    lies in [0, 180].

    Parameters
    ----------
    v1, v2 : 1-D numpy arrays of equal length.
    show : if True, print the vectors, their magnitudes, the dot product,
        the cosine and the resulting angle.

    Returns
    -------
    The angle in degrees. The result is NaN if either vector has zero
    magnitude (the cosine is then 0/0).
    """
    dotp = np.dot(v1, v2)
    v1_mag = np.sqrt(np.sum(v1 * v1))
    v2_mag = np.sqrt(np.sum(v2 * v2))
    costheta = dotp / (v1_mag * v2_mag)

    # Rounding can push the cosine marginally outside [-1, 1] in either
    # direction; clamp both ends so arccos() never returns NaN for
    # (anti-)parallel vectors.
    angleRad = np.arccos(np.clip(costheta, -1.0, 1.0))
    angleDeg = angleRad * (180 / np.pi)

    if show:
        print ("v1:\n")
        print (v1)
        print ("\nv2:")
        print (v2)
        print ("\nv1 Mag.:%6.4f" % v1_mag)
        print ("\nv2 Mag.:%6.4f" % v2_mag)
        print ("v1 . v2 = %6.4f" % dotp)
        # The unclamped cosine (this line used undefined names m1_mag / m2_mag).
        print(costheta)
        print ("Angle between v1 and v2 = %5.1f degrees." % angleDeg)
    return angleDeg

# Quick check: these two vectors share no non-zero coordinate, so they are orthogonal.
calc_angle(v1= np.array([0,0,0,1]), v2=np.array([1,1,1,0]))

## Angle of vectors in a high-dimensional space
Demonstrating that in a 20,000-dimensional space, any two random vectors will be very close to 90 degrees apart!


In [None]:
# Draw n standard-normal vectors of length dims and measure the angle
# between each vector and the one that follows it.
dims = 20000
n = 5000
rvs = np.random.randn(n, dims)

alist = []
for i in range(n - 1):
    v1, v2 = rvs[i], rvs[i + 1]
    a = calc_angle(v1, v2)
    alist.append(a)

# Histogram of the n-1 angles, titled with their mean and standard deviation.
plt.hist(alist, bins=50)
plt.title("Mean=%6.2f, SD=%6.2f degrees" % (np.mean(alist), np.std(alist)))
plt.show()

## Multiple cached runs of NMF and ICA
Run NMF and ICA for a range of components, with repeats, and save the results into .pkl files for later use.

In [None]:
def run(V, facto_class, n_components, n_repeats):
    """Fit ``facto_class`` on ``V`` ``n_repeats`` times and pickle the results.

    Each repeat builds a fresh ``facto_class(n_components=..., max_iter=5000,
    random_state=<random int below 10000>)``, fits it to ``V`` and collects
    the value of ``get_W()``. The list of collected values is written to
    ``../Cache/FactorizerExpt/<class name>_<n_components>_<n_repeats>.pkl``.

    Parameters
    ----------
    V : the matrix passed to ``fit``.
    facto_class : factorizer class (not an instance); it must accept the
        keyword arguments above and provide ``fit`` and ``get_W``.
    n_components : number of components requested from the factorizer.
    n_repeats : number of independent fits to perform.

    Returns
    -------
    None. The results are only written to disk.
    """
    import os  # local import so this cell does not depend on the notebook's import cell

    pickle_fname = "../Cache/FactorizerExpt/%s_%d_%d.pkl" % (facto_class.__name__, n_components, n_repeats)
    print(pickle_fname)
    # Make sure the cache directory exists *before* the expensive loop, so a
    # missing directory cannot throw away all the fits when the file is opened.
    os.makedirs(os.path.dirname(pickle_fname), exist_ok=True)

    metagene_list = []
    for i in range(n_repeats):
        facto = facto_class(n_components=n_components, max_iter=5000, random_state=np.random.randint(10000))
        facto.fit(V)
        metagene_list.append(facto.get_W())
        # Carriage return keeps the progress counter on a single line.
        print('\r%d/%d' % (i+1, n_repeats), end='')
    print()
    with open(pickle_fname, 'wb') as f:
        pickle.dump(metagene_list, f)
        
# Quick check of run(): two repeats of a 4-component ICA factorization.
run(expression_matrix, ICA_Factorizer, 4, 2)

In [None]:
# This will take several hours, if enabled! 
# Disabled by the constant `if False`; change the condition to (re)generate the caches.
if False:
    # 50 repeats of both NMF and ICA for every component count from 2 to 30 inclusive.
    for nc in range(2, 31):
        run(expression_matrix, NMF_Factorizer, nc, 50)
        run(expression_matrix, ICA_Factorizer, nc, 50)
        print()

    print("All Done.")
    

## t-SNE plots of NMF, ICA and PCA components
It's interesting to see the components generated by the three methods plotted in the same space.   However, we must beware of over-interpreting t-SNE plots...

In [None]:
def combined_factors_scatter(n_components, n_repeats):
    """Scatter-plot NMF, ICA and PCA components together in a 2-D t-SNE embedding.

    The cached NMF and ICA runs for this (n_components, n_repeats) pair are
    read from ``../Cache/FactorizerExpt``; PCA is fitted afresh on the
    notebook-level ``expression_matrix``. All components are stacked one per
    row, sign-flipped to a common orientation, reduced to 50 dimensions with
    PCA, embedded in 2-D with t-SNE and drawn on the current matplotlib axes.

    Parameters
    ----------
    n_components : number of components per factorization.
    n_repeats : number of cached repeats per method (part of the file name).
    """
    cache_pattern = "../Cache/FactorizerExpt/%s_%d_%d.pkl"

    # Cached NMF runs; a throw-away instance is used only to obtain the class name.
    nmf_name = type(NMF_Factorizer()).__name__
    with open(cache_pattern % (nmf_name, n_components, n_repeats), 'rb') as nmf_file:
        nmf_metagene_list = pickle.load(nmf_file)

    # Cached ICA runs, located the same way.
    ica_name = type(ICA_Factorizer()).__name__
    with open(cache_pattern % (ica_name, n_components, n_repeats), 'rb') as ica_file:
        ica_metagene_list = pickle.load(ica_file)

    # A single PCA factorization with the same number of components.
    pca_facto = PCA_Factorizer(n_components=n_components)
    pca_facto.fit(expression_matrix)
    pca_metagenes = pca_facto.get_W()

    # One row per component, ordered: all NMF runs, all ICA runs, then PCA.
    every_run = nmf_metagene_list + ica_metagene_list + [pca_metagenes]
    stacked_metagenes = np.hstack(every_run).T

    # The direction of a component is arbitrary (for ICA at least), which
    # would double the apparent number of components; orient each one so the
    # sum of its first ten entries is positive.
    flipped_metagenes = []
    for g in stacked_metagenes:
        flipped_metagenes.append(g if sum(g[:10])>0 else -g)

    # Reduce to a manageable number of dimensions before handing over to t-SNE.
    reduced = PCA(n_components=50).fit_transform(flipped_metagenes)
    Y = TSNE(n_components=2, init='pca', n_jobs=7).fit_transform(reduced)

    # Split the embedding back into its three groups so each gets its own colour.
    per_method = n_components * n_repeats
    assert Y.shape[0] == 2 * n_components * n_repeats + n_components
    nmf_Y = Y[:per_method, :]
    ica_Y = Y[per_method:2 * per_method, :]
    pca_Y = Y[2 * per_method:, :]

    plt.scatter(nmf_Y[:,0], nmf_Y[:,1], s=3, label='NMF')
    plt.scatter(ica_Y[:,0], ica_Y[:,1], s=3, label='ICA')
    plt.scatter(pca_Y[:,0], pca_Y[:,1], s=50, marker = '+', label='PCA')

    plt.xlabel("t-SNE dimension 1")
    plt.ylabel("t-SNE dimension 2")
    # Both axes are hidden after being labelled.
    ax = plt.gca()
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)

    plt.legend()
    plt.title("Components: %d" % n_components)
    
# Try the plot once on a small figure: 3 components, 50-repeat caches.
plt.figure(figsize=(4,4))
combined_factors_scatter(3, 50)
plt.suptitle("Just testing...")

In [None]:
def plot_multiple_combined_factors_scatter(start_comp, end_comp):
    """Draw one combined t-SNE scatter per component count on a 4x3 grid.

    Parameters
    ----------
    start_comp : first number of components to plot.
    end_comp : one past the last number of components to plot (exclusive,
        as in ``range``). The grid has 12 cells, so at most 12 component
        counts fit in one call.
    """
    plt.figure(figsize=(16,20))

    n_repeats = 50
    for panel, nc in enumerate(range(start_comp, end_comp), start=1):
        print('.', end='')
        plt.subplot(4, 3, panel)
        combined_factors_scatter(nc, n_repeats)
    # Interpolate the repeat count; without the "% n_repeats" the title
    # would display a literal "%d".
    plt.suptitle("t-SNE clustering for %d repeats of NMF and ICA" % n_repeats, size=14)
    plt.show()

In [None]:
# Two figures of 12 panels each: component counts 3-14 and 16-27.
# NOTE(review): the end argument is exclusive, so n_components=15 is not
# plotted by either call — confirm whether the second range should start at 15.
plot_multiple_combined_factors_scatter(3, 15)
plot_multiple_combined_factors_scatter(16, 28)

## Pick out the clusters with k-means
Although NMF seems to produce components in a repeatable order - so that centroids can be calculated
directly - this seems not to be the case for ICA.  So use k-means to sort them out.

In [None]:

def investigate_cluster_statistics(facto, doprint=False, n_repeats=50):
    """Check whether cached repeats yield components in a consistent cluster order.

    The given factorizer is not executed; only its class name and
    ``n_components`` are used to pick the cached .pkl file. The cached
    components are stacked one per row, sign-flipped to a common
    orientation, reduced to 10 dimensions with PCA and clustered with
    k-means into ``n_components`` clusters. The labels are then arranged as
    a table with one row per repeat and one column per component position.

    Parameters
    ----------
    facto : factorizer instance providing ``n_components``.
    doprint : if True, print the table of cluster labels row by row.
    n_repeats : number of repeats stored in the cache file (part of the file
        name and the number of rows in the label table). Defaults to 50.

    Returns
    -------
    True if every repeat has the same sequence of cluster labels as the
    first repeat, otherwise False.
    """
    n_components = facto.n_components
    pickle_fname = "../Cache/FactorizerExpt/%s_%d_%d.pkl" % (type(facto).__name__, n_components, n_repeats)
    with open(pickle_fname, 'rb') as f:
        metagene_list = pickle.load(f)
    stacked_metagenes = np.hstack(metagene_list).T
    # Component direction is arbitrary; orient each row so the sum of its
    # first ten entries is positive.
    flipped_metagenes = [g if sum(g[:10])>0 else -g for g in stacked_metagenes]

    pca = PCA(n_components=10)
    kmeans = KMeans(n_clusters=n_components, random_state=0).fit(pca.fit_transform(flipped_metagenes))
    cluster_table = np.reshape(kmeans.labels_, (n_repeats, n_components))
    # Broadcasting compares every row of the table against the first row.
    clusters_are_aligned = np.all(cluster_table == cluster_table[0, :])
    if doprint:
        for r, row in enumerate(cluster_table):
            print(r, row)
        print()
    return clusters_are_aligned
    

# Print the per-repeat cluster label tables for 4-component NMF and ICA.
investigate_cluster_statistics(NMF_Factorizer(n_components=4), True)
investigate_cluster_statistics(ICA_Factorizer(n_components=4), True)

In [None]:
# Let's see if clusters are assigned consistently for NMF and ICA across a range of n_components
# Header row, then one row per component count from 3 to 30 inclusive.
print("%6s %10s %10s" % ('', NMF_Factorizer.__name__, ICA_Factorizer.__name__))
for nc in range(3,31):
    # The factorizer instances only select the cached results; nothing is re-fitted here.
    nmf_consistent = investigate_cluster_statistics(NMF_Factorizer(n_components=nc))
    ica_consistent = investigate_cluster_statistics(ICA_Factorizer(n_components=nc))
    print("%6d%10s %10s" % (nc, nmf_consistent, ica_consistent))

In [None]:
# THIS IS BROKEN!
# Calculate angles between components
# NOTE(review): in the code visible in this notebook, `pca` and `kmeans` are
# only assigned as local variables inside investigate_cluster_statistics(),
# so at top level the next line presumably raises NameError — likely the
# reason this cell is marked as broken.
a = pca.inverse_transform(kmeans.cluster_centers_)
n = a.shape[0]
# Full n x n matrix of pairwise angles (degrees) between the rows of `a`.
angle_matrix=np.zeros((n,n))
for i1 in range(n):
    for i2 in range(n):
        v1, v2 = a[i1,:], a[i2,:]
        angle_matrix[i1, i2] = calc_angle(v1, v2)

# Print only the lower triangle (including the diagonal), one row per line.
for i1 in range(n):
    print("%2d: " % i1, end="")
    for i2 in range(n):
        if (i2 <= i1):
            print("%6.0f°" % angle_matrix[i1, i2], end="")
    print()
        
        