# Transcriptomics analysis by matrix factorization

In [None]:
import matplotlib.pyplot as plt
import sys
%matplotlib inline

In [None]:
# Add the relative directory '../Src' to the module search path
# (presumably where the project modules imported below live — confirm).
sys.path.append('../Src')

In [None]:
import importlib
import factor_clustering
importlib.reload(factor_clustering)
from factor_clustering import FactorClustering
from factorizer_wrappers import NMF_Factorizer, ICA_Factorizer, PCA_Factorizer

$V_t = W_t H_t$

## Perform multiple bootstrap repeat calculations of ICA, NMF and PCA and look at clustering

In [None]:
# Numbered menu of dataset names (keys 1..4, in the order listed).
possible_datasets = dict(enumerate(
    [
        'Mini_Expression',
        'AOCS_Protein_Expression',
        'TCGA_OV_VST',
        'Canon_Sample_n200',
    ],
    start=1,
))

# Build the clustering driver for dataset number 3 with 50 bootstrap repeats.
fc = FactorClustering(
    possible_datasets[3],
    n_repeats=50,
    method='bootstrap',
)
print("Starting analysis for", fc.basename)
print("Method:", fc.method)

# Load the expression matrix for the chosen dataset into `fc`.
fc.read_expression_matrix()

# The three factorizer classes used in this notebook.
all_factos = [NMF_Factorizer, ICA_Factorizer, PCA_Factorizer]

## Computation and caching of bootstrapped samples
For each of NMF, ICA and PCA, we compute and cache 50 repeat factorizations over a range of component numbers.  Each bootstrap sample has the same number of patients as the original dataset, sampled *with* replacement.

In [None]:
# Bounds of the component-number range handed to every analysis call below.
start_nc = 2
end_nc = 14

In [None]:
# Manual toggle: change True to False to skip this cell when re-running the notebook.
if True:
    # Beware - this could take hours (for the full size dataset)!
    # NOTE(review): force=False presumably reuses previously cached results
    # instead of recomputing them — confirm in factor_clustering.
    fc.compute_and_cache_multiple_factor_repeats(start_nc, end_nc, force=False)

## t-SNE plots per factorization method
These demonstrate visually how robust the factors are.  The median point for each cluster is indicated with a black '+'.

In [None]:
# Manual toggle: change True to False to skip this cell.
if True:
    # Scatter plots for the NMF factorizer, using start_nc and end_nc as the
    # component-number bounds.
    fc.plot_multiple_single_factors_scatter(NMF_Factorizer, start_nc, end_nc)

In [None]:
# Manual toggle: change True to False to skip this cell.
if True:
    # Scatter plots for the ICA factorizer, using start_nc and end_nc as the
    # component-number bounds.
    fc.plot_multiple_single_factors_scatter(ICA_Factorizer, start_nc, end_nc)

In [None]:
# Manual toggle: change True to False to skip this cell.
if True:
    # Scatter plots for the PCA factorizer, using start_nc and end_nc as the
    # component-number bounds.
    fc.plot_multiple_single_factors_scatter(PCA_Factorizer, start_nc, end_nc)

## t-SNE plots of all three methods together

In [None]:
# Manual toggle: change True to False to skip this cell.
if True:
    # Combined scatter plots; no factorizer class is passed to this call,
    # only the start_nc and end_nc bounds.
    fc.plot_multiple_combined_factors_scatter(start_nc, end_nc)

## Silhouette plots
These indicate robustness of clusters for each number of components.

In [None]:
# Manual toggle: change True to False to skip this cell.
if True:
    # NOTE(review): show=True presumably displays the figures inline rather
    # than only saving them — confirm in factor_clustering.
    fc.plot_silhouette_scores(start_nc, end_nc, show=True)
    

In [None]:
# Disabled: the guard is False, so this call never runs as written.
# Change False to True to run it.
if False:
    fc.investigate_multiple_cluster_statistics(start_nc, end_nc)