In [2]:
!date

Thu Nov  5 10:57:47 PST 2020


# Matrix Generation Clean Up

In [3]:
import anndata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as mpatches
import scanpy as scanp
from scipy.stats import ks_2samp, ttest_ind
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from matplotlib import cm

import warnings
warnings.filterwarnings('ignore')

fsize=20

plt.rcParams.update({'font.size': fsize})
%config InlineBackend.figure_format = 'retina'

In [4]:
# Headerless, tab-separated transcript table. The next cell reads columns
# 0, 1, 4, 5 and 6 as transcript id, gene id, transcript name, gene name and length.
txn = pd.read_csv("../../data/SMARTseq/out_gencode/transcripts_fixed.txt", sep="\t", header=None)

In [5]:
# Keep the path in its own variable so `adata` only ever refers to the AnnData object.
adata_path = "../../data/SMARTseq/out_gencode/adata.h5ad"
adata = anndata.read_h5ad(adata_path)

# Annotate the variables from the transcript table. Values are assigned
# positionally (.values), so this assumes txn rows are in the same order as
# adata.var -- TODO confirm.
# For the two id columns only the part before the first "." is kept.
adata.var["transcript_id"] = txn[0].apply(lambda x: x.split(".")[0]).values
adata.var["transcript_name"] = txn[4].values
adata.var["gene_id"] = txn[1].apply(lambda x: x.split(".")[0]).values
adata.var["gene_name"] = txn[5].values

adata.var["length"] = txn[6].values

In [6]:
# Append the id to each name (the same gene_name can occur with several gene_ids).
# The names are rebuilt from the txn table rather than from the current column,
# so re-running this cell does not append the id a second time.
gene_name_raw = pd.Series(txn[5].values, index=adata.var.index).astype(str)
transcript_name_raw = pd.Series(txn[4].values, index=adata.var.index).astype(str)
adata.var["gene_name"] = gene_name_raw + "_" + adata.var.gene_id.astype(str)
adata.var["transcript_name"] = transcript_name_raw + "_" + adata.var.transcript_id.astype(str)

In [7]:
def change(x):
    """Return "L5 PT" when x equals "L5 ET"; return any other value unchanged."""
    return "L5 PT" if x == "L5 ET" else x

In [8]:
# Densify the whole matrix so the per-column division in the next cell can
# broadcast. This allocates a full dense cells x transcripts array.
raw = adata.X.todense()

In [9]:
# Divide every column by the length recorded for that transcript
# (the length vector broadcasts across rows).
scaled = raw/adata.var.length.values

In [10]:
# Store the length-scaled values twice: as the "X" layer and as the main matrix.
# Each line builds its own sparse copy from the dense `scaled` array.
adata.layers["X"] = csr_matrix(scaled)
adata.X = csr_matrix(scaled)

In [11]:
# L1-normalise each row (axis=1) and rescale so that every row sums to 1,000,000.
adata.layers["norm"] = normalize(adata.X, norm='l1', axis=1)*1000000

In [12]:
adata.layers["norm"][0].sum()

1000000.0000000003

In [13]:
# log(1 + x) of the normalised values, kept as a separate layer.
adata.layers["log1p"] = np.log1p(adata.layers["norm"])

In [14]:
adata.layers["norm"][0].sum()

1000000.0000000003

In [15]:
# From here on adata.X holds the normalised values; the length-scaled values
# remain available only in adata.layers["X"].
adata.X = adata.layers["norm"]

In [16]:
adata.layers["norm"][0].sum()

1000000.0000000003

In [17]:
adata.layers["norm"][0].sum()

1000000.0000000003

In [18]:
# Drop the dense intermediates; their contents already live in adata.
del raw, scaled

In [19]:
adata.layers["norm"][0].sum()

1000000.0000000003

In [20]:
def group_mtx(mtx, components, features, s2t, source_id="transcript_id", target_id="gene_id", by="features"):
    """
    Sum the columns of `mtx` whose source labels map to the same target label.

    mtx: matrix with one row per component and one column per feature; may be
        a numpy ndarray, a numpy matrix or a scipy sparse matrix
    components: labels for rows of mtx
    features: labels for columns of mtx
    s2t: pandas dataframe mapping each source label (column `source_id`) to
        the target label (column `target_id`) it is grouped under
    source_id: column name in s2t holding the source labels
    target_id: column name in s2t to group by
    by: "features" or "components"; selects the label array in which the
        `source_id` values are looked up

    Returns -1 if `target_id` is not a column of s2t. Otherwise returns the
    tuple (X, components, unique), where X is a dense array with one row per
    component and one column per target, and `unique` holds the target labels
    as strings in order of first appearance in s2t.

    Raises ValueError if `by` is neither "features" nor "components", and
    KeyError if s2t contains a source label that is not in the chosen label
    array.
    """
    if target_id not in s2t.columns: return -1

    if by == "features":
        source = features
    elif by == "components":
        source = components
    else:
        raise ValueError('by must be "features" or "components", got %r' % (by,))

    # Map the source to an index
    source2idx = {label: idx for idx, label in enumerate(source)}
    # Map the target to a list of source indices
    target2idx = (s2t.groupby(target_id)[source_id].apply(lambda x: [source2idx[i] for i in x])).to_dict()

    # Unique targets in order of first appearance. The original values are kept
    # for the dictionary lookup (the groupby keys are not strings when the
    # target column is numeric); the returned labels are strings.
    targets = s2t[target_id].unique()
    unique = np.asarray(targets).astype(str)
    X = np.zeros((components.shape[0], unique.shape[0]))

    # NOTE(review): the summation below always runs over columns of mtx, even
    # when by == "components" -- confirm whether row grouping was intended.
    for tidx, t in enumerate(targets):
        source_indices = target2idx[t]
        summed = mtx[:, source_indices].sum(axis=1)
        # reshape(-1) handles both the 1-D result of an ndarray and the
        # (n, 1) result of a numpy matrix / sparse matrix.
        X[:, tidx] = np.asarray(summed).reshape(-1)

    # Return matrix that is grouped by
    return (X, components, unique)
    
def filter_mtx(mtx, components, features, **kwargs):
    """
    Keep the rows and columns of `mtx` that exceed the given thresholds.

    mtx: matrix with one row per component and one column per feature
    components: labels for rows of mtx
    features: labels for columns of mtx

    Keyword options (all thresholds are strict lower bounds, default 0):
    row_counts: a row is kept only if its sum is greater than this
    col_counts: a column is kept only if its sum is greater than this
    row_zeros: a row is kept only if it has more than this many entries > 0
    col_zeros: a column is kept only if it has more than this many entries > 0
    return_mask: if True, return (row_mask, col_mask) instead of filtering

    Returns (row_mask, col_mask) when return_mask is set, otherwise the tuple
    (filtered matrix, kept components, kept features).
    """
    min_row_sum = kwargs.get("row_counts", 0)
    min_col_sum = kwargs.get("col_counts", 0)
    min_row_nz  = kwargs.get("row_zeros", 0)
    min_col_nz  = kwargs.get("col_zeros", 0)
    only_masks  = kwargs.get("return_mask", False)

    def _flat_sum(m, axis):
        # Sum along one axis and flatten to a 1-D array
        return np.asarray(m.sum(axis=axis)).reshape(-1)

    positive = mtx > 0

    # A row/column survives only if it passes both its sum and its nonzero-count threshold
    keep_rows = np.logical_and(_flat_sum(mtx, 1) > min_row_sum,
                               _flat_sum(positive, 1) > min_row_nz)
    keep_cols = np.logical_and(_flat_sum(mtx, 0) > min_col_sum,
                               _flat_sum(positive, 0) > min_col_nz)

    if only_masks:
        return (keep_rows, keep_cols)

    return (mtx[keep_rows, :][:, keep_cols], components[keep_rows], features[keep_cols])

In [21]:
%%time

# Small hand-made example to exercise filter_mtx: 4 components x 3 features,
# with an all-zero last column.
mtx = np.array([[1,1,0],
                [0,1,0],
                [3,0,0],
                [0,2,0]])

components = np.array([1,2,3,4])
features = np.array([1, 2, 3])

# Thresholds are strict: rows need more than 1 positive entry, columns more than 3.
X, c, f = filter_mtx(mtx, components, features, row_zeros=1, col_zeros=3)
# With the default thresholds only the boolean keep-masks are returned.
rm, cmask = filter_mtx(mtx, components, features, return_mask=True)

CPU times: user 198 µs, sys: 49 µs, total: 247 µs
Wall time: 217 µs


In [22]:
cmask

array([ True,  True, False])

In [23]:
X

array([], shape=(1, 0), dtype=int64)

In [24]:
X==mtx

False

# Group isoforms into genes, and filter. 

TODO: go back and filter on isoforms, then apply the same filter to genes.

In [25]:
# Inputs for grouping: the length-scaled layer plus the labels of its rows
# (cell ids) and columns (transcript ids).
mtx        = adata.layers["X"]
components = adata.obs.cell_id.values
features   = adata.var.transcript_id.values

In [26]:
adata

AnnData object with n_obs × n_vars = 6580 × 142604
    obs: 'cell_id'
    var: 'transcript_id', 'gene_id', 'gene_name', 'transcript_name', 'length'
    layers: 'X', 'norm', 'log1p'

In [27]:
%%time

# Dense copy of the length-scaled layer, with its row (cell id) and column
# (transcript id) labels.
mtx        = adata.layers["X"].todense()
components = adata.obs.cell_id.values
features   = adata.var.transcript_id.values

# Columns of s2t that map each source label to the target it is grouped under.
source_id = "transcript_id"
target_id = "gene_id"

s2t = adata.var

# Data for gene matrix. The ids are passed explicitly so that editing the
# variables above actually changes the grouping.
X, c, f = group_mtx(mtx, components, features, s2t, source_id=source_id, target_id=target_id)

CPU times: user 36.6 s, sys: 5.51 s, total: 42.2 s
Wall time: 42.2 s


In [28]:
adata

AnnData object with n_obs × n_vars = 6580 × 142604
    obs: 'cell_id'
    var: 'transcript_id', 'gene_id', 'gene_name', 'transcript_name', 'length'
    layers: 'X', 'norm', 'log1p'

In [29]:
# generate isoform based on gene mask.
isoform = adata[:, adata.var.gene_id.isin(f)]

In [30]:
# Build the gene-level AnnData. drop_duplicates keeps the first var row seen for
# each (gene_id, gene_name) pair, so the transcript-level columns carried along
# in gene.var describe only that first transcript.
# Assumes the row order of `tmp` matches the column order of X (both follow
# first appearance in adata.var) -- TODO confirm.
tmp = adata.var.drop_duplicates(["gene_id", "gene_name"])
tmp = tmp[tmp.gene_id.isin(f)]
gene = anndata.AnnData(X=X, obs=adata.obs, var=tmp)

In [31]:
print(isoform)
print(gene)

View of AnnData object with n_obs × n_vars = 6580 × 142604
    obs: 'cell_id'
    var: 'transcript_id', 'gene_id', 'gene_name', 'transcript_name', 'length'
    layers: 'X', 'norm', 'log1p'
AnnData object with n_obs × n_vars = 6580 × 55401
    obs: 'cell_id'
    var: 'transcript_id', 'gene_id', 'gene_name', 'transcript_name', 'length'


In [32]:
gene.var.index = gene.var.gene_name.values
isoform.var.index = isoform.var.transcript_name.values

# Begin Check

In [94]:
# The gene_id is OK; the gene name needs fixing to reflect the fact that
# the same gene_name is used with multiple gene_ids.

In [33]:
adata.var.gene_id.nunique() == gene.var.gene_name.nunique()

True

In [34]:
adata.var.transcript_id.nunique() == isoform.var.transcript_name.nunique()

True

In [35]:
gene.X = csr_matrix(gene.X)

In [36]:
# Both objects should expose rho (the length-scaled values) as X and as the "X" layer.
gene.layers["X"] = gene.X.copy() # here, X is rho, the number of molecules
# isoform.X comes from adata.X, which was replaced by the "norm" layer earlier,
# so copying it into layers["X"] would overwrite rho with normalised values.
# The "X" layer already holds rho; restore the main matrix from it instead.
isoform.X = isoform.layers["X"].copy() # here X is rho, the number of molecules

# Save matrix

In [37]:
gene.write_h5ad("../../data/notebook/revision/gencode_gene.h5ad")
isoform.write_h5ad("../../data/notebook/revision/gencode_isoform.h5ad")

... storing 'gene_id' as categorical
... storing 'gene_name' as categorical
