In [1]:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# =============================================================================
# Created By  : Xiaojun Ma
# Created Date: Mar 18 10:54:00 PDT 2020
# =============================================================================
"""
This script contains the major class of SPaRTAN model and its dependencies.
This script requires numpy, Scipy, matplotlib to be installed within the Python
environment you are running this script in
This script requires Cython modules present in the current directory
This file contains the following classes and functions
   
    class Timer: a class to convert time period in seconds to the format of h:m:s
   
    class pySPaRTAN: The major class for SPaRTAN, establishing an interaction matrix between
    surface proteins (P) and TFs (D) that predict target gene expression (Y).
   
    function normalize_column(): perform column-wise l2 normalization of a given matrix
"""
import numpy as np
import cythKronPlus as krnP
import cythLeastR as leastR
import scipy.linalg
import functools
import time
import gc
# import matplotlib.pyplot as plt
from scipy import stats

class Timer:
    """Stopwatch reporting elapsed wall-clock time as an ``hh:mm:ss`` string.

    Attributes
    ----------
    start : float
        Reading of ``time.time()`` taken when the timer was created or last
        restarted.

    Methods
    -------
    restart():
        reset the reference point to the current time
    get_time_hhmmss():
        return the time elapsed since the reference point, formatted as
        "hh:mm:ss"
    """

    def __init__(self):
        # The clock starts as soon as the object exists.
        self.start = time.time()

    def restart(self):
        # Move the reference point to "now".
        self.start = time.time()

    def get_time_hhmmss(self):
        # Break the elapsed seconds down into hours / minutes / seconds;
        # the %d conversions drop the fractional part of each field.
        elapsed = time.time() - self.start
        minutes, seconds = divmod(elapsed, 60)
        hours, minutes = divmod(minutes, 60)
        return "%02d:%02d:%02d" % (hours, minutes, seconds)


def normalize_column(A, T=0):
    """ perform l2 normalization column-wise (or row-wise) of a given matrix

    Every column (T=0) or row (any other T) is divided by its Euclidean
    norm. A column/row consisting only of zeros has norm 0; it is returned
    unchanged (all zeros) instead of being turned into NaN by a 0/0 division.

    Parameters:
        A : the matrix to normalize (must support ``A**2``, e.g. a numpy array)
        T : switch between column-wise and row-wise.
            T=0: column-wise
            T=1: row-wise

    Returns:
        a new object shaped like A with unit-norm columns (or rows)

    """
    def _unit_columns(M):
        # l2 norm of every column of M
        norms = np.sqrt(np.sum(M**2, 0))
        # Divide zero-norm columns by 1 so they stay zero rather than NaN.
        safe_norms = np.where(norms == 0, 1, norms)
        return np.divide(M, safe_norms)

    if T == 0:
        return _unit_columns(A)
    # Row-wise: normalize the columns of the transpose, then transpose back.
    return np.transpose(_unit_columns(np.transpose(A)))


class pySPaRTAN:
    """
    The major class for SPaRTAN, establishing an interaction matrix between
    surface proteins (P) and TFs (D) that predicts target gene expression (Y).

    Methods
    -------
    fit(self, D, P, Y, lamda=0.001, rsL2=0.001, corrtype='spearman'):
        train a SPaRTAN model
    ar_model2w(self):
        converts a trained model to the intermediate variable W
    ar_reconstruction(self, pred_test=None):
        reconstruction function
    predict(self, P_test=None):
        predict target gene expression
    get_corr(self, Y_pred, Y_test):
        get the correlation between predicted Y_pred and Y_test
    get_W(self):
        get coefficient matrix
    get_projP(self, Y=None):
        get projected protein expression
    get_projD(self, P=None):
        get projected TF activity
    """

    def fit(self, D, P, Y, lamda=0.001, rsL2=0.001, corrtype='spearman'):

        """ trains a SPaRTAN model

        Parameters
        ----------
        D : array of shape (N, Q)
            The data matrix with N genes and Q TFs

        P : array of shape (M, S)
            The data matrix with M cells and S proteins

        Y : array of shape (N, M)
            The data matrix with N genes and M cells

        lamda : float > 0, default=0.001
            LASSO regularization for linear regression

        rsL2 : float > 0, default=0.001
            ridge regularization for linear regression

        corrtype: string, default='spearman'
            correlation type used to evaluate the performance;
            get_corr() uses Spearman for 'spearman' and Pearson for any
            other value

        Notes
        -----
        Stores the inputs (D, P, Y, corrtype, lamda), the fitted coefficient
        vector (beta) and the truncated SVD factors (Ua, Sa, Va, Ub, Sb, Vb)
        on the instance; nothing is returned.

        """

        # Share of the cumulative singular-value spectrum kept when truncating
        # the two SVDs below (fixed here, not exposed as parameters).
        spectrumA = 1
        spectrumP = 0.7

        self.D = D
        self.P = P
        self.Y = Y
        self.corrtype = corrtype

        # transformation
        A = self.Y.T @ self.D
        B = self.P.T
        YtY = self.Y.T @ self.Y

        # SVD(A) SVD(B)
        UA, SA, VhA = np.linalg.svd(A)
        VA = VhA.T
        UB, SB, VhB = np.linalg.svd(B)
        VB = VhB.T

        # NOTE: keep the builtin sum() here. It accumulates left to right like
        # np.cumsum, so the last ratio is exactly 1.0 and the ">= spectrumA"
        # search below always finds an index; np.sum may round differently.
        a_cum_spectrum = np.cumsum(SA) / sum(SA)
        b_cum_spectrum = np.cumsum(SB) / sum(SB)

        # number of leading singular components needed to reach each threshold
        da = np.nonzero(a_cum_spectrum >= spectrumA)[0][0] + 1
        db = np.nonzero(b_cum_spectrum >= spectrumP)[0][0] + 1

        Ua = UA[:, :da]
        Sa = SA[:da]
        Va = VA[:, :da]

        Ub = UB[:, :db]
        Sb = SB[:db]
        Vb = VB[:, :db]

        Yv = (YtY.T).flatten()

        # the Cython helpers are handed C-contiguous arrays
        Vb = Vb.copy(order='C')
        Ua = Ua.copy(order='C')
        L = krnP.kron(Vb, Ua)

        # Positions of the diagonal entries of the (M, M) matrix YtY inside
        # its flattened form: 0, M+1, 2(M+1), ... Same values as locating the
        # non-zeros of a flattened identity, without building that matrix.
        n_cells = YtY.shape[0]
        diag = np.arange(0, n_cells * n_cells, n_cells + 1, dtype=np.int32)

        # make it c-like contiguous array
        Yv = Yv.copy(order='C')
        diag = diag.copy(order='C')

        # NOTE(review): presumably drops the entries at the diagonal positions
        # from L and Yv -- behavior is defined in cythKronPlus; confirm there.
        L, Yv = krnP.removeDiagC(L, Yv, diag)

        opts = dict()
        opts['rsL2'] = rsL2

        # reshape Yv to 2darry
        Yv = Yv.reshape(Yv.shape[0], 1)
        beta, b = leastR.LeastR(L, Yv, lamda, opts)

        # L and Yv are the largest temporaries; release them right away
        del L, Yv
        gc.collect()

        self.beta = beta
        self.Ua = Ua
        self.Ub = Ub
        self.Sa = np.diag(Sa)
        self.Sb = np.diag(Sb)
        self.Va = Va
        self.Vb = Vb
        self.lamda = lamda

    def ar_model2w(self):
        """ converts a trained model to W

        Returns
        -------
        ww: array of shape (Q, S)
            Va @ pinv(Sa) @ beta_matrix @ pinv(Sb) @ Ub.T, where beta_matrix
            is the fitted coefficient vector reshaped column-major to
            (Va.shape[1], Ub.shape[1])

        """
        m1 = self.Va
        m2 = np.linalg.pinv(self.Sa)
        m3 = self.beta.reshape(self.Va.shape[1], self.Ub.shape[1], order="F")
        m4 = np.linalg.pinv(self.Sb)
        m5 = self.Ub.T
        ww = m1 @ m2 @ m3 @ m4 @ m5
        return ww

    def ar_reconstruction(self, pred_test=None):
        """ reconstruction function

        Parameters
        ----------
        pred_test: array with N rows (genes)
            prediction on test data. Required in practice: the default None
            cannot be multiplied with Y.T.

        Returns
        -------
        pred: array with the same shape as pred_test
            reconstruction expressed in an orthonormal basis of the column
            space of the training Y

        """
        A = self.Y.T @ pred_test
        # orthonormal basis of the column space of Y
        B = scipy.linalg.orth(self.Y)
        # coordinates of Y in that basis
        cm = scipy.linalg.lstsq(B, self.Y)[0]
        # least-squares solve of cm.T @ ct = A for the basis coefficients
        ct = scipy.linalg.lstsq(cm.T, A)[0]
        pred = B @ ct
        return pred

    def predict(self, P_test=None):
        """ predict target gene expression

        Parameters
        ----------
        P_test: array of shape (Mtest, S), optional
            Protein expression on test data. When given it is remembered on
            the instance. When omitted, the most recently supplied P_test is
            reused; if none was ever supplied, the training matrix P is used.

        Returns
        -------
        Y_pred: array of shape (N, Mtest)
                The predicted Y matrix on test data set which has N genes and Mtest cells

        """
        if P_test is not None:
            self.P_test = P_test

        # Fall back to the training proteins instead of failing with an
        # AttributeError when predict() is called before any test set is known.
        P_eval = getattr(self, "P_test", None)
        if P_eval is None:
            P_eval = self.P

        w = self.ar_model2w()
        pred = self.D @ (w @ P_eval.T)

        aff_rec = self.ar_reconstruction(pred)

        self.Y_pred = aff_rec
        return self.Y_pred

    def get_corr(self, Y_pred, Y_test):
        """ get the correlation between predicted Y_pred and Y_test

        Parameters
        ----------
        Y_pred: array of shape (N, Mtest)
                predicted gene expression with N genes and Mtest cells

        Y_test: array of shape (N, Mtest)
               gene expression test data with N genes and Mtest cells

        Returns
        -------
        corr: float, -1 <= value <= 1
              correlation between flattened Y_pred and Y_test: Spearman when
              the model was fitted with corrtype='spearman', Pearson otherwise

        """
        # both matrices are flattened column by column before comparing
        if self.corrtype == 'spearman':
            corr = stats.spearmanr(Y_test.ravel(order='F'), Y_pred.ravel(order='F'))[0]
        else:
            corr = stats.pearsonr(Y_test.ravel(order='F'), Y_pred.ravel(order='F'))[0]

        return corr

    def get_W(self):
        """ get coefficient matrix

        Returns
        -------
        W: array of shape (Q, S)
           the matrix computed by ar_model2w(); also stored as self.W

        """
        self.W = self.ar_model2w()
        return self.W

    def get_projP(self, Y=None):
        """ get projected protein expression

        Parameters
        ----------
        Y:  array of shape (N, M), optional
            input gene expression with N genes and M cells;
            defaults to the training Y

        Returns
        -------
        projP: array of shape (S, M)
               projected protein expression with S proteins and M cells

        """
        if Y is None:
            Y = self.Y
        W = self.ar_model2w()
        return (Y.T @ self.D @ W).T

    def get_projD(self, P=None):
        """ get projected TF activity

        Parameters
        ----------
        P: array of shape (M, S), optional
           input protein expression with M cells and S proteins;
           defaults to the training P

        Returns
        -------
        projD:  array of shape (Q, M)
            projected TF activities with Q TFs and M cells

        """
        if P is None:
            P = self.P
        W = self.ar_model2w()
        return W @ P.T

In [2]:

"""
This script intends to use pySPaRTAN module to generate predicted matrices used in the paper.
Input data
----------
D: dataframe of shape (N, Q)
   The data frame with N genes and Q TFs
   
P: dataframe of shape (M, S)
   The data frame with M cells and S proteins  
   
Y: array of shape (N, M)
   The data frame with N genes and M cells
   
location: default in the directory ../data/inputs
   
Output data
-----------
projD: dataframe of shape (Q, M) 
       projected TF activities with Q TFs and M cells 
       
projP: dataframe of shape (S, M)
      projected protein expression with S proteins and M cells    
     
location: default in the directory ../data/outputs
 
Hyperparameters
---------------
      
pySPaRTAN has 2 Hyperparameters that can be adjusted: lamda and rsL2
We can run pySPaRTAN by specifying some values to those parameters or using default ones in the script.
We can also use cross-validation at first to get the optimal values for those
hyperparameters, and then run pySPaRTAN to generate the projections.
Command lines
-------------
When running this script from command line, the following parameters can be added to the command:
    
    --input_dir : directory of input files, default="../data/inputs"
    
    --output_dir : directory of output files, default="../data/outputs"
    
    --dataset_D : dataframe of gene X TF 
                  Requires .csv format, only contains file name, not include ".csv" extension
                  
    --dataset_P : dataframe of cell X protein
                  Requires .csv format, only contains file name, not include ".csv" extension
        
    --dataset_Y : dataframe of gene X cell
                  Requires .csv format, only contains file name, not include ".csv" extension
                  
    --lamda :  float > 0.0, default=0.001
            LASSO regularization for linear regression 
            
    --rsL2 : float > 0.0, default=0.001
            ridge regularization for linear regression
            
    --normalization : string, default = "l2"
                     type of normalization performed on matrices,
                     if set to "", then no normalization
                     
    --fold : int >=0, default = 0
             how many folds to be used when doing cross-validation.
             if set to 0, it means using default/specified hyper-parameters, 
             do not conduct cross-validation
 
    --correlation : string, ("pearson" or "spearman") default = "pearson"
                    type of correlation coefficient
                     
System requirements
------------------                         
This script requires numpy, pandas, sklearn to be installed in the python running environment
"""
from argparse import ArgumentParser, RawTextHelpFormatter
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize

print("Read in datasets D, P, and Y ...")

# Location and base names (without the ".csv" extension) of the three inputs.
input_dir = "../data/inputs"
dataset_D = "Dpbmc"
dataset_P = "Ppbmc5kn_CD16"
dataset_Y = "Ypbmc5kn_CD16"

# The first CSV column is used as the row index of each frame.
# All three paths are built the same way, with os.path.join.
D_ori = pd.read_csv(os.path.join(input_dir, dataset_D + '.csv'), index_col=0)
P_ori = pd.read_csv(os.path.join(input_dir, dataset_P + '.csv'), index_col=0)
Y_ori = pd.read_csv(os.path.join(input_dir, dataset_Y + '.csv'), index_col=0)

# The model multiplies Y.T with D and pairs the rows of P with the columns of
# Y, so mismatched inputs are reported here rather than deep inside fit().
assert D_ori.shape[0] == Y_ori.shape[0], \
    "D and Y must have the same number of rows (genes)"
assert P_ori.shape[0] == Y_ori.shape[1], \
    "P must have one row per column (cell) of Y"

# Labels used later to annotate the projected matrices.
TF_name = list(D_ori.columns)
cell_name = list(Y_ori.columns)
gene_name = list(Y_ori.index)
protein_name = list(P_ori.columns)

# Plain numpy arrays for the model.
D_mat = D_ori.values
P_mat = P_ori.values
Y_mat = Y_ori.values



Read in datasets D, P, and Y ...


In [3]:
P_ori

Unnamed: 0,CD3,CD4,CD8a,CD11b,CD14,CD15,CD16,CD19,CD20,CD25,...,CD86,CD127,CD137,CD197,CD274,CD278,CD335,PD-1,HLA-DR,TIGIT
AACAAGATCCTGATAG,0.078428,0.570127,0.528620,2.012850,0.806625,0.690183,2.997771,0.162837,0.528082,0.503782,...,2.888803,0.543569,0.658155,0.455585,0.494959,0.148321,0.504211,0.869081,3.076374,0.754039
AACACACTCTAAGCCA,2.219136,2.511904,1.008495,3.713292,3.380943,1.264247,0.972861,0.633515,0.796351,1.191613,...,2.651441,1.456231,0.658155,1.230752,1.509037,1.313200,0.928186,1.098661,2.443243,0.878365
AAGATAGCATGCGTGC,1.664543,2.157629,0.656890,3.099701,2.918836,1.094658,0.662836,0.302833,0.475437,2.084542,...,2.893106,0.543569,0.658155,0.654070,1.072027,1.052684,0.837834,1.609497,1.618791,0.446469
AATCGTGGTATCATGC,0.139392,0.518372,0.381441,2.771357,1.318837,0.843909,2.508252,0.633515,0.796351,0.763696,...,3.025615,0.148830,0.270339,0.916470,0.673275,0.101284,0.628202,0.379513,2.537797,0.247947
ACAAAGACAACTTGCA,0.162773,0.906333,0.715341,2.649509,1.162033,0.633199,2.056311,0.633515,0.714671,0.503782,...,3.597135,0.215687,0.270339,0.455585,1.176039,0.148321,0.362640,0.268281,3.184503,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGGGATAGTGATTCC,0.292626,0.846356,1.089796,3.174509,3.306138,1.294968,0.879805,0.952275,1.098636,1.030034,...,3.433705,0.393007,0.658155,0.869212,0.494959,0.494417,1.287223,1.285251,3.542321,0.446469
TTGTGGAGTGAATATG,0.078428,0.097964,0.457735,1.492347,0.025512,0.365901,4.121449,0.162837,0.298502,0.837220,...,0.000000,0.215687,1.411983,0.526216,0.277795,0.526406,1.287223,0.379513,0.659549,2.712679
TTGTTGTCACGCGCTA,2.344031,2.305558,0.919994,3.749026,3.309920,1.324773,0.777194,0.952275,0.671186,0.837220,...,2.755213,1.100594,0.482931,1.046163,0.277795,1.484077,0.628202,1.046016,2.639111,0.000000
TTTACCACACAAGGTG,2.139160,2.392014,0.594810,3.656645,2.975785,1.056993,0.000000,0.881598,0.796351,0.684334,...,2.699475,0.589108,0.658155,0.712348,0.494959,2.208369,0.928186,2.553112,2.008012,1.409843


In [4]:
# L2-normalize the inputs before fitting:
# P along axis=1 (each row), Y and D along axis=0 (each column).
P_mat = normalize(P_mat, axis=1, norm="l2")
Y_mat = normalize(Y_mat, axis=0, norm="l2")
D_mat = normalize(D_mat, axis=0, norm="l2")

In [5]:
# Train SPaRTAN on the normalized matrices with LASSO and ridge penalties of
# 0.01 each; "pearson" selects the correlation reported later by get_corr().
print("Processing ...")
reg = pySPaRTAN()
reg.fit(D_mat, P_mat, Y_mat, lamda=0.01, rsL2=0.01, corrtype="pearson")

Processing ...


In [6]:
# Correlation between the expression predicted from the training proteins
# and the (normalized) expression the model was fitted on.
reg.get_corr(Y_pred=reg.predict(P_test=P_mat), Y_test=Y_mat)

0.8174180475118495

In [7]:
# Y: (genes, cells)
np.shape(Y_mat)

(2643, 140)

In [8]:
# P: (cells, proteins)
np.shape(P_mat)

(140, 29)

In [9]:
# D: (genes, TFs)
np.shape(D_mat)

(2643, 255)

In [10]:
# Projections from the fitted model, computed on the training data:
#   projP -> proteins x cells, projD -> TFs x cells
projP = reg.get_projP()
projD = reg.get_projD()

# Attach row/column labels so the matrices can be inspected or saved.
df_projD = pd.DataFrame(projD, index=TF_name, columns=cell_name)
df_projP = pd.DataFrame(projP, index=protein_name, columns=cell_name)