In [1]:
from swan_vis import *

## Adding gene tpm to swangraph

In [39]:
def abundance_to_adata(sg, counts_file, how='iso'):
    # read in abundance file
    check_file_loc(counts_file, 'abundance matrix')
    try:
        df = pd.read_csv(counts_file, sep='\t')
    except:
        raise ValueError('Problem reading expression matrix {}'.format(counts_file))

    # check if abundance matrix is a talon abundance matrix
    cols = ['gene_ID', 'transcript_ID', 'annot_gene_id', 'annot_transcript_id',
        'annot_gene_name', 'annot_transcript_name', 'n_exons', 'length',
        'gene_novelty', 'transcript_novelty', 'ISM_subtype']
    if df.columns.tolist()[:11] == cols:
        df = reformat_talon_abundance(df, how=how)
        
    # rename id ID column
    col = df.columns[0]
    if how == 'gene':
        id_col = 'gid'
    elif how == 'iso':
        id_col = 'tid'

    df.rename({col: id_col}, axis=1, inplace=True)
    
    # sum for gene level
    if how == 'gene':
        df = df.groupby(id_col).sum().reset_index()
    
    # limit to just the transcripts already in the graph
    if how == 'iso':
        sg_tids = sg.t_df.tid.tolist()
        ab_tids = df.tid.tolist()
        tids = list(set(sg_tids)&set(ab_tids))
        df = df.loc[df.tid.isin(tids)]
        
    # transpose to get adata format
    df.set_index(id_col, inplace=True)
    df = df.T
    
    # get adata components - obs, var, and X
    var = df.columns.to_frame()
    var.columns = [id_col]
    obs = df.index.to_frame()
    obs.columns = ['dataset']
    X = sparse.csr_matrix(df.to_numpy())
    
    # create transcript-level adata object and filter out unexpressed transcripts
    adata = anndata.AnnData(var=var, obs=obs, X=X)
    genes, _  = sc.pp.filter_genes(adata, min_counts=1, inplace=False)
    adata = adata[:, genes]
    adata.layers['counts'] = adata.X
    
    return adata    

In [40]:
def merge_adata_abundance(sg, adata, how='iso'):
    
    if how == 'gene':
        dataset_list = sg.gene_datasets
        ab_bool = sg.has_gene_abundance()
        sg_adata = sg.gene_adata
    elif how == 'iso':
        dataset_list = sg.datasets
        ab_bool = sg.has_abundance()
        sg_adata = sg.adata
        
    print(adata)
    
    # add each dataset to list of "datasets", check if any are already there!
    datasets = adata.obs.dataset.tolist()
    for d in datasets:
        if d in dataset_list:
            raise ValueError('Dataset {} already present in the SwanGraph.'.format(d))
    dataset_list.extend(datasets)

    print()
    if len(datasets) <= 5:
        print('Adding abundance for datasets {} to SwanGraph.'.format(', '.join(datasets)))
    else:
        mini_datasets = datasets[:5]
        n = len(datasets) - len(mini_datasets)
        print('Adding abundance for datasets {}... (and {} more) to SwanGraph'.format(', '.join(mini_datasets), n))

    # if there is preexisting abundance data in the SwanGraph, concatenate
    # otherwise, adata is the new transcript level adata
    if not ab_bool:

        # create transcript-level adata object
        sg_adata = adata

        # add counts as layers
        sg_adata.layers['counts'] = sg_adata.X
        print('Calculating TPM...')
        sg_adata.layers['tpm'] = sparse.csr_matrix(calc_tpm(sg_adata, recalc=True).to_numpy())

        if not sg.sc and how == 'iso':
            print('Calculating PI...')
            sg_adata.layers['pi'] = sparse.csr_matrix(calc_pi(sg_adata, sg.t_df)[0].to_numpy())
    else:

        # first set current layer to be counts
        sg_adata.X = sg_adata.layers['counts']

        # concatenate existing adata with new one
        # outer join to add all new transcripts (that are from added
        # annotation or transcriptome) to the abundance
        uns = sg_adata.uns
        sg_adata = sg_adata.concatenate(adata, join='outer', index_unique=None)
        sg_adata.uns = uns

        # recalculate pi and tpm
        print('Calculating TPM...')
        sg_adata.layers['tpm'] = sparse.csr_matrix(calc_tpm(sg_adata, recalc=True).to_numpy())

        if not sg.sc and how == 'iso':
            print('Calculating PI...')
            sg_adata.layers['pi'] = sparse.csr_matrix(calc_pi(sg_adata, sg.t_df)[0].to_numpy())

    # add abundance for edges, TSS per gene, and TES per gene
    if how == 'iso':
        print('Calculating edge usage...')
        sg.create_edge_adata()
        print('Calculating TSS usage...')
        sg.create_end_adata(kind='tss')
        print('Calculating TES usage...')
        sg.create_end_adata(kind='tes')

    # set abundance flag to true
    # and make adata object
    if how == 'iso':
        sg.abundance = True  
        sg.adata = sg_adata
    elif how == 'gene':
        sg.gene_abundance = True
        sg.gene_adata = sg_adata

In [41]:
def add_abundance(sg, counts_file, how='iso'):
    adata = abundance_to_adata(sg, counts_file, how=how)
    merge_adata_abundance(sg, adata, how=how)
    
    return sg

In [43]:
fname = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/cerberus/swan.p'
sg = read(fname)

Read in graph from /Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/cerberus/swan.p


In [44]:
sg.gene_datasets = []
sg.gene_abundance = False
sg.gene_adata = anndata.AnnData()

In [45]:
counts_file = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_talon_abundance.tsv'
how = 'gene'

In [46]:
sg = add_abundance(sg, counts_file, how='gene')

AnnData object with n_obs × n_vars = 138 × 599976
    obs: 'dataset'
    var: 'gid'
    layers: 'counts'

Adding abundance for datasets gm12878_1_1, gm12878_1_2, gm12878_1_3, gm12878_1_4, gm12878_3_1... (and 133 more) to SwanGraph
Calculating TPM...


In [47]:
sg.gene_adata

AnnData object with n_obs × n_vars = 138 × 599976
    obs: 'dataset', 'total_counts'
    var: 'gid'
    layers: 'counts', 'tpm'

In [4]:
# def add_abundance(sg, counts_file, how='iso'):


In [None]:





# add each dataset to list of "datasets", check if any are already there!
datasets = adata.obs.dataset.tolist()
for d in datasets:
    if d in sg.datasets:
        raise ValueError('Dataset {} already present in the SwanGraph.'.format(d))
sg.datasets.extend(datasets)

print()
if len(datasets) <= 5:
    print('Adding abundance for datasets {} to SwanGraph.'.format(', '.join(datasets)))
else:
    mini_datasets = datasets[:5]
    n = len(datasets) - len(mini_datasets)
    print('Adding abundance for datasets {}... (and {} more) to SwanGraph'.format(', '.join(mini_datasets), n))

# if there is preexisting abundance data in the SwanGraph, concatenate
# otherwise, adata is the new transcript level adata
if not sg.has_abundance():

    # create transcript-level adata object
    sg.adata = adata

    # add counts as layers
    sg.adata.layers['counts'] = sg.adata.X
    print('Calculating transcript TPM...')
    sg.adata.layers['tpm'] = sparse.csr_matrix(calc_tpm(sg.adata, recalc=True).to_numpy())

    if not sg.sc:
        print('Calculating PI...')
        sg.adata.layers['pi'] = sparse.csr_matrix(calc_pi(sg.adata, sg.t_df)[0].to_numpy())
else:

    # first set current layer to be counts
    sg.adata.X = sg.adata.layers['counts']

    # concatenate existing adata with new one
    # outer join to add all new transcripts (that are from added
    # annotation or transcriptome) to the abundance
    uns = sg.adata.uns
    sg.adata = sg.adata.concatenate(adata, join='outer', index_unique=None)
    sg.adata.uns = uns

    # recalculate pi and tpm
    print('Calculating transcript TPM...')
    sg.adata.layers['tpm'] = sparse.csr_matrix(calc_tpm(sg.adata, recalc=True).to_numpy())

    if not sg.sc:
        print('Calculating PI...')
        sg.adata.layers['pi'] = sparse.csr_matrix(calc_pi(sg.adata, sg.t_df)[0].to_numpy())

# add abundance for edges, TSS per gene, and TES per gene
print('Calculating edge usage...')
sg.create_edge_adata()
print('Calculating TSS usage...')
sg.create_end_adata(kind='tss')
print('Calculating TES usage...')
sg.create_end_adata(kind='tes')

# set abundance flag to true
sg.abundance = True