In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
% matplotlib inline

import nimfa

In [None]:
def example_V(n_genes=100, seed=0):
    """Generate a small synthetic expression matrix, useful in tests.

    Five periodic source signals are sampled at ``n_genes`` evenly spaced
    points on [0, 8], perturbed with Gaussian noise, scaled to unit standard
    deviation per column and linearly mixed into five observed columns.

    Parameters
    ----------
    n_genes : int, optional
        Number of rows to generate. Must be at least 2, otherwise the
        per-column standard deviation used for scaling is zero.
    seed : int, optional
        Seed for the private random generator. The default (0) reproduces
        the values this function has always returned.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_genes, 5)``.

    Raises
    ------
    ValueError
        If ``n_genes`` is smaller than 2.
    """
    if n_genes < 2:
        raise ValueError("n_genes must be at least 2, got %r" % (n_genes,))

    # Use a private generator instead of np.random.seed(): reseeding the
    # global generator here would silently reset the random state of every
    # other piece of code in the notebook. RandomState(seed) yields the same
    # stream as np.random.seed(seed) followed by np.random.normal(...).
    rng = np.random.RandomState(seed)
    time = np.linspace(0, 8, n_genes)

    # The +1.1 offset keeps every noise-free signal >= 0.1.
    s1 = np.sin(time) + 1.1  # Signal 1: sine, period 2*pi
    s2 = np.sign(np.sin(3 * time)) + 1.1  # Signal 2: square wave
    s3 = np.sin(2 * np.pi * time) + 1.1  # Signal 3: sine, period 1
    s4 = np.cos(0.5 * np.pi * time) + 1.1  # Signal 4: cosine, period 4
    s5 = np.sin(0.2 * np.pi * time) + 1.1  # Signal 5: sine, period 10

    W = np.c_[s1, s2, s3, s4, s5]
    # Add noise; note this can push individual entries of W below zero.
    W += 0.1 * rng.normal(size=W.shape)

    W /= W.std(axis=0)  # Scale each column to unit standard deviation

    # Mixing matrix.
    # NOTE(review): the second entry of row 2 was originally written `0/6`,
    # which evaluates to 0.0. It may have been a typo for 0.6; the value is
    # kept at 0.0 here so that the generated data does not change. Confirm.
    H = np.array([[1, 1, 1, 1, 1],
                  [0.5, 0.0, 1, 1.2, 1],
                  [1.5, 1, 2, 1, 1.1],
                  [1, 0.4, 1, 1.1, 0.1],
                  [1, 0.2, 0.8, 1, 1.5]])
    V = np.dot(W, H.T)  # Generate observations
    return V

def test_example_V():
    """Smoke-test example_V: expected shape and no negative entries."""
    n_rows = 10
    matrix = example_V(n_rows)
    assert matrix.shape == (n_rows, 5)
    assert (matrix >= 0).all()
    print("test_example_V() passed.")


test_example_V()

In [None]:
def l2_norm_diff(m1, m2):
    """Return the root-mean-square of the element-wise difference m1 - m2.

    Despite the name, this is sqrt(mean((m1 - m2)**2)) taken over all
    elements, i.e. an RMS value rather than an un-normalised L2 norm.
    """
    delta = m1 - m2
    mean_square = np.mean(delta * delta)
    return np.sqrt(mean_square)

def test_l2_norm_diff():
    """A uniform +0.5 offset must give an RMS difference of exactly 0.5."""
    base = example_V(10)
    shifted = base + 0.5
    assert np.isclose(l2_norm_diff(base, shifted), 0.5)


test_l2_norm_diff()

In [None]:
# Read in AOCS spreadsheet: one row per gene, first column is the gene ID,
# remaining columns are patients.
expression_df = pd.read_csv('AOCS_TPM_VST.csv')

# Sanity-check the layout of the file before using it.
assert len(expression_df) == 57914, "unexpected number of gene rows"
# Fixed: the closing parenthesis used to wrap the comparison
# (`len(columns == 81)`), so the assert measured the length of a boolean
# array and could never fail for a non-empty frame.
assert len(expression_df.columns) == 80 + 1, "expected 80 patient columns plus the gene ID column"
assert expression_df.columns[0] == 'GeneENSG'
assert expression_df.columns[-1] == 'AOCS_171'

# Drop the gene ID column and keep the remaining values as a plain array.
expression_matrix = np.asarray(expression_df.iloc[:,1:])

print(expression_matrix.shape[0], "genes")
print(expression_matrix.shape[1], "patients")

In [None]:

def run(V, nimfa_method, max_iter, n_run, pickle_name, ranks=None):
    """Factorize V at a sweep of ranks and pickle the per-rank summaries.

    Parameters
    ----------
    V : array-like
        Matrix handed unchanged to ``nimfa_method``.
    nimfa_method : callable
        Factorization constructor. It is called as
        ``nimfa_method(V, seed='random_vcol', max_iter=..., rank=...,
        n_run=..., track_factor=True)``; the returned model is then called
        with no arguments to obtain a fit object exposing ``summary()`` and
        ``distance(metric='kl')``.
    max_iter : int
        Forwarded as ``max_iter``.
    n_run : int
        Forwarded as ``n_run`` (number of repeated factorizations per rank).
    pickle_name : str
        Path of the pickle file the results dictionary is written to.
    ranks : iterable of int, optional
        Ranks to evaluate. Defaults to ``range(2, 31)``, the sweep this
        function originally hard-coded.

    Returns
    -------
    dict
        Mapping ``rank -> summary`` (the same object that is pickled).
    """
    if ranks is None:
        ranks = range(2, 31)

    print("Running for %s, to %s" % (str(nimfa_method), pickle_name))
    results_dict = {}
    for n_components in ranks:
        # The factorization is repeated n_run times and the factors are
        # tracked so that the cophenetic correlation can be computed.
        # Note increased time and space complexity.
        model = nimfa_method(V, seed='random_vcol', max_iter=max_iter, rank=n_components,
                             n_run=n_run, track_factor=True)
        model_fit = model()
        sm = model_fit.summary()
        print('Components: %d, Rss: %5.3f, Evar: %5.3f, Iterations: %d, Cophenetic correlation:%5.3f, K-L divergence: %5.3f' %
              (n_components, sm['rss'], sm['evar'], sm['n_iter'], sm['cophenetic'], model_fit.distance(metric='kl')))
        results_dict[n_components] = sm
    # Results are only written once the whole sweep has finished.
    with open(pickle_name, 'wb') as f:
        pickle.dump(results_dict, f)
    print(pickle_name, "done.")
    return results_dict

if False:
    # Disabled on purpose: flip the guard to regenerate the pickled result
    # files. Each entry is (nimfa method, max_iter, n_run, output pickle).
    for method, iterations, repeats, out_name in (
        (nimfa.Lsnmf, 20, 50, 'Lsnmf.pkl'),
        # (nimfa.Icm, 20, 50, 'Icm.pkl'),
        (nimfa.Bmf, 20, 50, 'Bmf.pkl'),
        (nimfa.Nmf, 200, 10, 'Nmf.pkl'),
    ):
        run(expression_matrix, method, iterations, repeats, out_name)


In [None]:
# Single exploratory factorization: rank-3 Lsnmf on the full expression
# matrix, 20 iterations, 'random_vcol' seeding, with error tracking enabled.
lsnmf = nimfa.Lsnmf(expression_matrix, seed='random_vcol', max_iter=20, rank=3, track_error=True)
lsnmf_fit = lsnmf()

# Basis factor of the fit (presumably genes x rank -- TODO confirm orientation).
W = lsnmf_fit.basis()
print('Basis matrix:\n%s' % W)

# Coefficient ("mixture") factor of the fit.
H = lsnmf_fit.coef()
print('Mixture matrix:\n%s' % H)

# Evaluate candidate ranks 2, 3 and 4, requesting the 'rss' and 'cophenetic'
# measures; only the 'rss' value per rank is printed below.
r = lsnmf.estimate_rank(rank_range=[2,3,4], what=['rss', 'cophenetic'])
pp_r = '\n'.join('%d: %5.3f' % (rank, vals['rss']) for rank, vals in r.items())
print('Rank estimate:\n%s' % pp_r)

In [None]:
def plt_results_scree(pickle_name, title):
    """Plot RMS residual error and cophenetic correlation against rank K.

    Parameters
    ----------
    pickle_name : str
        Path of a pickle file holding a dict that maps each rank to a summary
        dict with at least the keys ``'residuals'`` and ``'cophenetic'``.
    title : str
        Title of the figure.

    The figure has two y-axes sharing one x-axis: RMS error on the left
    (red) and cophenetic correlation on the right (blue). The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    # NOTE: pickle.load can execute arbitrary code; only open pickle files
    # that were produced by a trusted run of this notebook.
    with open(pickle_name, 'rb') as f:
        results_dict = pickle.load(f)

    # Sort the ranks so the connecting lines always run left to right,
    # whatever order the dictionary was filled in.
    xs = sorted(results_dict)

    def rms(r):
        # np.multiply (not `*`) keeps the product element-wise even if the
        # residuals are stored as np.matrix.
        return np.sqrt(np.mean(np.multiply(r, r)))

    # Only the keys that are actually plotted are read, so summaries lacking
    # other entries can still be plotted.
    rmss = [rms(results_dict[x]['residuals']) for x in xs]
    cophs = [results_dict[x]['cophenetic'] for x in xs]

    fig, ax1 = plt.subplots(figsize=(8,4))

    color = 'tab:red'
    ax1.set_xlabel('Components (K)')
    ax1.set_ylabel('RMS error', color=color)
    ax1.plot(xs, rmss, 'o-', color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = 'tab:blue'
    ax2.set_ylabel('Cophenetic', color=color)  # we already handled the x-label with ax1
    ax2.plot(xs, cophs, 'o-', color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # Set the title before tight_layout() so the layout leaves room for it.
    ax1.set_title(title)
    fig.tight_layout()

    plt.show()

plt_results_scree('Nmf.pkl', 'Second Scree plot for Nmf method')

In [None]:
# Compare the two saved Nmf sweeps, one scree plot per pickle file.
for pickle_path, plot_title in (
    ('std_nmf_results_dict.pkl', 'First Scree plot for Nmf method'),
    ('Nmf.pkl', 'Second Scree plot for Nmf method'),
):
    plt_results_scree(pickle_path, plot_title)

In [None]:
# Compare the two saved Lsnmf sweeps, one scree plot per pickle file.
for pickle_path, plot_title in (
    ('lsnmf_results_dict.pkl', 'First Scree plot for Lsnmf method'),
    ('Lsnmf.pkl', 'Second Scree plot for Lsnmf method'),
):
    plt_results_scree(pickle_path, plot_title)

In [None]:
# Compare the two saved Bmf sweeps, one scree plot per pickle file.
for pickle_path, plot_title in (
    ('bmf_results_dict.pkl', 'First Scree plot for Bmf method'),
    ('Bmf.pkl', 'Second Scree plot for Bmf method'),
):
    plt_results_scree(pickle_path, plot_title)

In [None]:
# Reload the summaries stored in 'Nmf.pkl' for the inspection cells below.
# NOTE: only unpickle files you trust; pickle.load can run arbitrary code.
with open('Nmf.pkl', 'rb') as pkl_file:
    nmf_results = pickle.load(pkl_file)

In [None]:
# Hard-coded rank whose stored summary is inspected in the following cells.
k=14
# List the entries available in the summary for that rank.
nmf_results[k].keys()

In [None]:
# Display the stored 'sparseness' entry of the summary for rank k.
nmf_results[k]['sparseness']

In [None]:
# Display the stored 'dispersion' entry of the summary for rank k.
nmf_results[k]['dispersion']

In [None]:
# Heat map of the stored 'consensus' entry for rank k, with a title so the
# figure can be identified on its own.
fig, ax = plt.subplots()
im = ax.imshow(nmf_results[k]['consensus'])
ax.set_title('Consensus matrix (K=%d)' % k)
fig.colorbar(im, ax=ax)
plt.show()

In [None]:
# Heat map of the stored 'connectivity' entry for rank k, with a title so the
# figure can be identified on its own.
fig, ax = plt.subplots()
im = ax.imshow(nmf_results[k]['connectivity'])
ax.set_title('Connectivity matrix (K=%d)' % k)
fig.colorbar(im, ax=ax)
plt.show()

In [None]:
# Histogram of the stored 'score_features' entry for rank k; the count axis
# is logarithmic.
fig, ax = plt.subplots()
ax.hist(nmf_results[k]['score_features'])
ax.set_yscale('log')
ax.set_xlabel('Feature score')
ax.set_ylabel('Count (log scale)')
ax.set_title('Feature scores (K=%d)' % k)
plt.show()

In [None]:
# Heat map of the first 3000 rows of the stored residuals for rank k. The
# small aspect value stretches the image horizontally so that the many rows
# and comparatively few columns fit in the figure.
fig, ax = plt.subplots(figsize=(8,10))
im = ax.imshow(nmf_results[k]['residuals'][:3000,:], aspect=0.03)
ax.set_title('Residuals, first 3000 rows (K=%d)' % k)
fig.colorbar(im, ax=ax)
plt.show()