In [394]:
import swan_vis as swan
import pandas as pd
import anndata
import scanpy as sc
from scipy import sparse
import numpy as np

In [422]:
# added 
# utils.py
def calc_total_counts(adata, obs_col='dataset', layer='counts'):
    """
    Sum the values of one layer of an AnnData-like object over groups of
    observations.

    Parameters:
        adata: Object exposing `var.index`, `obs[obs_col]` and
            `layers[layer]` (an anndata AnnData in this notebook)
        obs_col (str): Column in adata.obs whose values define the groups
            Default: 'dataset'
        layer (str): Key in adata.layers holding the values to add up
            Default: 'counts'

    Returns:
        df (pandas DataFrame): One row per unique value of obs_col (the
            index, named obs_col), one column per entry of adata.var.index,
            holding the summed layer values
    """

    # turn into a sparse dataframe
    cols = adata.var.index.tolist()
    inds = adata.obs[obs_col].tolist()
    # use the `sparse` submodule the notebook imports
    # (`from scipy import sparse`); the bare name `scipy` is never imported,
    # so `scipy.sparse` raises NameError on a fresh kernel
    data = sparse.csr_matrix(adata.layers[layer])
    df = pd.DataFrame.sparse.from_spmatrix(data, index=inds, columns=cols)
    df.index.name = obs_col

    # add up values on condition (row)
    df = df.groupby(level=0).sum()

    return df

In [423]:
# added
# utils.py
def calc_tpm(adata, obs_col='dataset'):
    """
    Normalize the 'counts' layer of an AnnData so that every observation
    sums to 1e6 (TPM), optionally averaging over groups of observations.

    Side effect: writes the per-observation normalization factor returned
    by scanpy to adata.obs['total_counts'].

    Parameters:
        adata (anndata AnnData): Must have a 'counts' layer and an
            obs column named obs_col
        obs_col (str): Column in adata.obs used to label the rows of the
            result. When it is anything other than 'dataset', rows sharing
            the same value are averaged
            Default: 'dataset'

    Returns:
        df (pandas DataFrame): TPM values; rows are obs_col values, columns
            are the entries of adata.var.index
    """

    # calculate tpm using scanpy
    d = sc.pp.normalize_total(adata,
                              layer='counts',
                              target_sum=1e6,
                              key_added='total_counts',
                              inplace=False)
    adata.obs['total_counts'] = d['norm_factor']

    # turn into a sparse dataframe
    cols = adata.var.index.tolist()
    inds = adata.obs[obs_col].tolist()
    # use the `sparse` submodule the notebook imports
    # (`from scipy import sparse`); the bare name `scipy` is never imported,
    # so `scipy.sparse` raises NameError on a fresh kernel
    data = sparse.csr_matrix(d['X'])
    df = pd.DataFrame.sparse.from_spmatrix(data, index=inds, columns=cols)
    df.index.name = obs_col

    # average across tpm
    if obs_col != 'dataset':
        df.reset_index(inplace=True)
        df = df.groupby(obs_col).mean()

    return df

In [526]:
# added
# swangraph.py
def add_abundance(sg, counts_file):
    """
    Adds abundance from a counts matrix to the SwanGraph. Transcripts in the
    SwanGraph but not in the counts matrix will be assigned 0 counts.
    Transcripts in the abundance matrix but not in the SwanGraph will not
    have expression added.

    Parameters:
        sg (SwanGraph): Graph to add abundance to; modified in place
        counts_file (str): Path to TSV expression file where first column is
            the transcript ID and following columns name the added datasets and
            their counts in each dataset, OR to a TALON abundance matrix.

    Returns:
        sg (SwanGraph): The same graph, with sg.adata (layers 'counts', 'tpm'
            and, unless sg.sc is set, 'pi') updated, the edge / TSS / TES
            adata objects rebuilt and sg.abundance set to True

    Raises:
        ValueError: If counts_file cannot be parsed, or if one of its
            datasets is already present in sg.datasets
    """

    # read in abundance file
    swan.check_file_loc(counts_file, 'abundance matrix')
    try:
        df = pd.read_csv(counts_file, sep='\t')
    except Exception as e:
        # only catch real errors (not KeyboardInterrupt / SystemExit) and
        # keep the underlying cause attached to the raised error
        raise ValueError('Problem reading expression matrix {}'.format(counts_file)) from e

    # check if abundance matrix is a talon abundance matrix
    cols = ['gene_ID', 'transcript_ID', 'annot_gene_id', 'annot_transcript_id',
        'annot_gene_name', 'annot_transcript_name', 'n_exons', 'length',
        'gene_novelty', 'transcript_novelty', 'ISM_subtype']
    if df.columns.tolist()[:len(cols)] == cols:
        df = swan.reformat_talon_abundance(counts_file)

    # rename transcript ID column
    col = df.columns[0]
    df = df.rename({col: 'tid'}, axis=1)

    # limit to just the transcripts already in the graph
    sg_tids = sg.t_df.tid.tolist()
    df = df.loc[df.tid.isin(sg_tids)]

    # transpose to get adata format (datasets as rows, transcripts as columns)
    # non-inplace set_index avoids mutating a slice of the original frame
    df = df.set_index('tid').T

    # get adata components - obs, var, and X
    var = df.columns.to_frame()
    var.columns = ['tid']
    obs = df.index.to_frame()
    obs.columns = ['dataset']
    X = sparse.csr_matrix(df.to_numpy())

    # create transcript-level adata object and filter out unexpressed transcripts
    adata = anndata.AnnData(var=var, obs=obs, X=X)
    genes, _ = sc.pp.filter_genes(adata, min_counts=1, inplace=False)
    adata = adata[:, genes]
    adata.layers['counts'] = adata.X

    # add each dataset to list of "datasets", check if any are already there!
    datasets = adata.obs.dataset.tolist()
    for d in datasets:
        if d in sg.datasets:
            raise ValueError('Dataset {} already present in the SwanGraph.'.format(d))
    sg.datasets.extend(datasets)

    print()
    if len(datasets) <= 5:
        print('Adding abundance for datasets {} to SwanGraph.'.format(', '.join(datasets)))
    else:
        mini_datasets = datasets[:5]
        n = len(datasets) - len(mini_datasets)
        print('Adding abundance for datasets {}... (and {} more) to SwanGraph'.format(', '.join(mini_datasets), n))

    # if there is preexisting abundance data in the SwanGraph, concatenate
    # otherwise, adata is the new transcript level adata
    if not sg.has_abundance():

        # create transcript-level adata object
        sg.adata = adata

        # add counts as layers
        sg.adata.layers['counts'] = sg.adata.X
    else:

        # first set current layer to be counts
        sg.adata.X = sg.adata.layers['counts']

        # concatenate existing adata with new one
        # outer join to add all new transcripts (that are from added
        # annotation or transcriptome) to the abundance
        # uns is saved and restored around the concatenation
        uns = sg.adata.uns
        sg.adata = sg.adata.concatenate(adata, join='outer', index_unique=None)
        sg.adata.uns = uns

    # (re)calculate tpm and pi over every dataset now in sg.adata
    print('Calculating transcript TPM...')
    sg.adata.layers['tpm'] = sparse.csr_matrix(calc_tpm(sg.adata).to_numpy())

    if not sg.sc:
        print('Calculating PI...')
        sg.adata.layers['pi'] = sparse.csr_matrix(calc_pi(sg.adata, sg.t_df)[0].to_numpy())

    # add abundance for edges, TSS per gene, and TES per gene
    sg = create_edge_adata(sg)
    print('Calculating TSS usage...')
    sg = create_end_adata(sg, kind='tss')
    print('Calculating TES usage...')
    sg = create_end_adata(sg, kind='tes')

    # set abundance flag to true
    sg.abundance = True

    return sg

In [527]:
# added
# swangraph.py
def create_end_adata(sg, kind):
    """
    Create a tss / tes-level adata object. Enables calculating tss / tes
    usage across samples.

    Counts of all transcripts sharing the same gene and end vertex are summed
    and stored, together with TPM (and PI unless sg.sc is set), in
    sg.tss_adata or sg.tes_adata.

    Parameters:
        sg (SwanGraph): Graph with transcript-level counts in
            sg.adata.layers['counts']; modified in place
        kind (str): Choose from 'tss' or 'tes'

    Returns:
        sg (SwanGraph): The same graph with the tss / tes adata assigned
    """

    # table of ends from swan.get_ends; it is merged on its index against
    # transcript-indexed counts below and must provide a 'vertex_id' column
    df = swan.get_ends(sg.t_df, kind)

    # get a mergeable transcript expression df
    tid = sg.adata.var.index.tolist()
    obs = sg.adata.obs.index.tolist()
    data = sg.adata.layers['counts'].transpose()
    t_exp_df = pd.DataFrame.sparse.from_spmatrix(columns=obs, data=data, index=tid)
    t_exp_df = t_exp_df.merge(sg.t_df, how='left',
        left_index=True, right_index=True)

    # merge counts per transcript with end expression
    df = df.merge(t_exp_df, how='left',
        left_index=True, right_index=True)

    # sort based on the merged index
    df.sort_index(inplace=True, ascending=True)

    # set index to gene ID, gene name, and vertex id
    df.reset_index(drop=True, inplace=True)
    df.set_index(['gid', 'gname', 'vertex_id'], inplace=True)
    df = df[sg.datasets]

    # groupby on gene and assign each unique TSS / gene combo an ID
    id_col = '{}_id'.format(kind)
    name_col = '{}_name'.format(kind)
    df.reset_index(inplace=True)
    df = df.groupby(['gid', 'gname', 'vertex_id']).sum().reset_index()
    # number the ends of each gene 1..n in order of vertex id
    df['end_gene_num'] = df.sort_values(['gid', 'vertex_id'],
                    ascending=[True, True])\
                    .groupby(['gid']) \
                    .cumcount() + 1
    df[id_col] = df['gid']+'_'+df['end_gene_num'].astype(str)
    df[name_col] = df['gname']+'_'+df['end_gene_num'].astype(str)
    df.drop('end_gene_num', axis=1, inplace=True)

    # obs, var, and X tables for new data
    var_cols = ['gid', 'gname', 'vertex_id', id_col, name_col]
    # non-inplace set_index: avoids mutating a column slice of df
    var = df[var_cols].set_index(id_col)
    df.drop(var_cols, axis=1, inplace=True)
    # put the dataset columns in the same order as the rows of sg.adata.obs
    df = df[sg.adata.obs.index.tolist()]
    X = sparse.csr_matrix(df.transpose().values)
    obs = sg.adata.obs

    # create anndata
    adata = anndata.AnnData(var=var, obs=obs, X=X)

    # add counts and tpm as layers
    adata.layers['counts'] = adata.X
    adata.layers['tpm'] = sparse.csr_matrix(calc_tpm(adata).to_numpy())
    if not sg.sc:
        adata.layers['pi'] = sparse.csr_matrix(calc_pi(adata,
                adata.var)[0].to_numpy())

    # assign adata and clean up unstructured data if needed
    if kind == 'tss':
        if sg.has_abundance():
            adata.uns = sg.tss_adata.uns
        sg.tss_adata = adata

    elif kind == 'tes':
        if sg.has_abundance():
            # carry over the TES object's own uns (previously this copied
            # the TSS object's uns by mistake)
            adata.uns = sg.tes_adata.uns
        sg.tes_adata = adata

    return sg


In [557]:
# added
# swangraph.py
def create_edge_adata(sg):
    """
    Build the edge-level adata object (sg.edge_adata) so that edge usage can
    be computed across samples.

    Transcript counts from sg.adata.layers['counts'] are summed over all
    transcripts containing each edge; edges with no counts in any dataset
    are left out.

    Parameters:
        sg (SwanGraph): Graph with transcript-level abundance in sg.adata

    Returns:
        sg (SwanGraph): The same graph with sg.edge_adata assigned
    """

    # which edges belong to which transcript
    edges_by_transcript = swan.pivot_path_list(sg.t_df, 'path')

    # transcript x dataset counts, indexed so they can be merged on the index
    transcript_counts = pd.DataFrame.sparse.from_spmatrix(
        data=sg.adata.layers['counts'].transpose(),
        index=sg.adata.var.index.tolist(),
        columns=sg.adata.obs.index.tolist())

    # attach the counts of each transcript to the edges it contains,
    # then add them up per edge and dataset
    edge_counts = edges_by_transcript.merge(transcript_counts,
                                            how='left',
                                            left_index=True,
                                            right_index=True)
    edge_counts = edge_counts.groupby('edge_id').sum()

    # order the edges by their (v1, v2) values from sg.edge_df
    edge_counts = edge_counts.merge(sg.edge_df[['v1', 'v2']],
                                    how='left',
                                    left_index=True,
                                    right_index=True)
    edge_counts = edge_counts.sort_values(by=['v1', 'v2'])
    edge_counts = edge_counts.drop(['v1', 'v2'], axis=1)

    # keep only edges that have counts in at least one dataset
    is_expressed = edge_counts.sum(1) > 0
    edge_counts = edge_counts.loc[is_expressed]

    # assemble the edge-level adata: datasets as rows, edges as columns
    edge_adata = anndata.AnnData(
        var=edge_counts.index.to_frame(),
        obs=sg.adata.obs,
        X=sparse.csr_matrix(edge_counts.transpose().values))

    # counts and tpm layers; no pi layer here, because pi for edges would
    # need one entry per (edge, gene) pair rather than one per edge
    edge_adata.layers['counts'] = edge_adata.X
    edge_adata.layers['tpm'] = sparse.csr_matrix(calc_tpm(edge_adata).to_numpy())

    # keep the unstructured data of a previously existing edge adata
    if sg.has_abundance():
        edge_adata.uns = sg.edge_adata.uns
    sg.edge_adata = edge_adata

    return sg

In [555]:
# added
# utils.py
def calc_pi(adata, t_df, obs_col='dataset'):
    """
    Calculate, for every entry of adata.var, the percentage of its gene's
    counts that it accounts for in each condition ("pi").

    Parameters:
        adata (anndata AnnData): Must have a 'counts' layer (read through
            calc_total_counts) and an obs column named obs_col
        t_df (pandas DataFrame): Table with a 'gid' column; it is joined
            index-on-index against the ids in adata.var.index
        obs_col (str): Column in adata.obs that defines the conditions
            Default: 'dataset'

    Returns:
        df (pandas DataFrame): Sparse table of pi values; one row per
            condition, one column per id, in the order of adata.var.index
        sums (pandas DataFrame): Summed counts per condition (rows) and
            id (columns)
    """

    # calculate cumulative counts across obs_col
    id_col = adata.var.index.name
    conditions = adata.obs[obs_col].unique().tolist()
    df = calc_total_counts(adata, obs_col=obs_col)
    # ids as rows, conditions as columns
    df = df.transpose()
    # we use ints to index edges and locs
    if id_col == 'vertex_id' or id_col == 'edge_id':
        df.index = df.index.astype('int')

    # keep an untouched copy of the per-condition totals to return
    sums = df.copy(deep=True)
    sums = sums[conditions]
    sums = sums.transpose()

    # add gid
    df = df.merge(t_df['gid'], how='left', left_index=True, right_index=True)
    # long format: one row per (id, condition) holding that id's counts
    t_counts = df.melt(id_vars=['gid'],
                       value_vars=conditions,
                       var_name=obs_col,
                       value_name='t_counts',
                       ignore_index=False)
    t_counts.index.name = id_col
    t_counts.reset_index(inplace=True)

    # calculate total number of reads per gene per condition
    temp = df.copy(deep=True)
    temp.reset_index(drop=True, inplace=True)
    totals = temp.groupby('gid').sum().reset_index()

    # merge back in
    df.reset_index(inplace=True)
    df.rename({'index':id_col}, axis=1, inplace=True)
    # per-id count columns get the '_t_counts' suffix, so the unsuffixed
    # condition columns now hold the per-gene totals
    df = df.merge(totals, on='gid', suffixes=('_t_counts', None))
    del totals

    # long format: one row per (gene, condition) holding the gene total
    df = df.melt(id_vars=['gid'], 
                 value_vars=conditions, 
                 var_name=obs_col,
                 value_name='gene_counts')
    # the same gene total appears once per id of the gene; keep one copy
    df = df.drop_duplicates()
    df = t_counts.merge(df, how='left', on=['gid', obs_col])


    # percentage of the gene's counts; a gene total of 0 is not special-cased
    df['pi'] = (df.t_counts/df.gene_counts)*100
    df = df.pivot(columns=obs_col, index=id_col, values='pi')

    # order based on order in adata
    ids = adata.var.index.tolist()
    df = df.loc[ids]
    cols = adata.obs[obs_col].unique().tolist()
    df = df[cols]

    # convert to sparse
    df = df.transpose()
    df = pd.DataFrame.sparse.from_spmatrix(data=sparse.csr_matrix(df.values),
                                           index=df.index.tolist(),
                                           columns=df.columns)
    return df, sums


In [558]:
# build a graph from the test annotation + transcriptome and add abundance
sg = swan.SwanGraph()
sg.add_annotation('../testing/files/test_full_annotation.gtf')
sg.add_transcriptome('../testing/files/test_full.gtf')
sg = add_abundance(sg, '../testing/files/test_ab_talon_1.tsv')

end_layers = ('counts', 'tpm', 'pi')

# TSS-level tables next to the transcript-level inputs they come from
print(sg.tss_adata.var.head())
print(sg.t_df.loc_path)
print(sg.adata.var.head())
print(sg.adata.layers['counts'].toarray())
print(sg.tss_adata.var.head())
for layer in end_layers:
    print(sg.tss_adata.layers[layer].toarray())

# same inspection for the TES-level tables
print(sg.tes_adata.var.head())
print(sg.t_df[['gid', 'tid', 'loc_path']])
print(sg.adata.var.head())
print(sg.adata.layers['counts'].toarray())
print(sg.tes_adata.var)
for layer in end_layers:
    print(sg.tes_adata.layers[layer].toarray())

# container type of every TES layer
for layer in end_layers:
    print(type(sg.tes_adata.layers[layer]))


Adding annotation to the SwanGraph

Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1, dataset2 to SwanGraph.
Calculating transcript TPM...
Calculating PI...
df
       dataset1  dataset2
test1       5.0       5.0
test2      10.0       0.0
test3       0.0      10.0
test4      10.0      10.0
test5       5.0       5.0
t_df
             tname        gid        gname               path    tid  \
tid                                                                    
test1  test1_tname  test1_gid  test1_gname    [0, 1, 2, 3, 4]  test1   
test2  test2_tname  test2_gid  test2_gname    [5, 6, 7, 8, 9]  test2   
test3  test3_tname  test2_gid  test2_gname  [5, 6, 14, 15, 9]  test3   
test4  test4_tname  test4_gid  test4_gname               [10]  test4   
test5  test5_tname  test2_gid  test2_gname        [5, 11, 12]  test5   

                    loc_path  annotation    novelty  
tid                                                  
test1     [0, 1, 2, 3, 4, 5]        



In [None]:
# test_add_abundance_3
# abundance read from a TALON-style abundance file
sg = swan.SwanGraph()
sg.add_annotation('../testing/files/test_full_annotation.gtf')
sg.add_transcriptome('../testing/files/test_full.gtf')

sg = add_abundance(sg, '../testing/files/test_ab_talon_1.tsv')

# transcript ids in the graph vs. in the adata, then every transcript layer
print(sg.t_df.index.tolist())
print(sg.adata.var.index.tolist())
for layer in ('counts', 'tpm', 'pi'):
    print(sg.adata.layers[layer].toarray())

In [379]:
# test_add_abundance_2
# abundance added in two separate calls, one dataset per file
sg = swan.SwanGraph()
sg.add_annotation('../testing/files/test_full_annotation.gtf')
sg.add_transcriptome('../testing/files/test_full.gtf')

for ab_file in ('../testing/files/test_ab_dataset1.tsv',
                '../testing/files/test_ab_dataset2.tsv'):
    sg = add_abundance(sg, ab_file)

# transcript ids in the graph vs. in the adata, then every transcript layer
print(sg.t_df.index.tolist())
print(sg.adata.var.index.tolist())
for layer in ('counts', 'tpm', 'pi'):
    print(sg.adata.layers[layer].toarray())


Adding annotation to the SwanGraph

Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1 to SwanGraph.
Calculating transcript TPM...
Calculating PI...

Adding abundance for datasets dataset2 to SwanGraph.
Calculating transcript TPM...
Calculating PI...
['test1', 'test2', 'test3', 'test4', 'test5', 'test6']
['test1', 'test2', 'test3', 'test4', 'test5']
[[ 5. 10.  0. 10.  5.]
 [ 5.  0. 10. 10.  5.]]
[[166666.69 333333.38      0.   333333.38 166666.69]
 [166666.69      0.   333333.38 333333.38 166666.69]]
[[100.        66.66667    0.       100.        33.333336]
 [100.         0.        66.66667  100.        33.333336]]


In [380]:
# test_add_abundance_1
# abundance for both datasets read from a single counts file
sg = swan.SwanGraph()
sg.add_annotation('../testing/files/test_full_annotation.gtf')
sg.add_transcriptome('../testing/files/test_full.gtf')
sg = add_abundance(sg, '../testing/files/test_ab_1.tsv')

# transcript ids in the graph vs. in the adata, then every transcript layer
print(sg.t_df.index.tolist())
print(sg.adata.var.index.tolist())
for layer in ('counts', 'tpm', 'pi'):
    print(sg.adata.layers[layer].toarray())


# output looks good, but the unit tests still need to be updated


Adding annotation to the SwanGraph

Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1, dataset2 to SwanGraph.
Calculating transcript TPM...
Calculating PI...
['test1', 'test2', 'test3', 'test4', 'test5', 'test6']
['test1', 'test2', 'test3', 'test4', 'test5']
[[ 5. 10.  0. 10.  5.]
 [ 5.  0. 10. 10.  5.]]
[[166666.69 333333.38      0.   333333.38 166666.69]
 [166666.69      0.   333333.38 333333.38 166666.69]]
[[100.        66.66667    0.       100.        33.333336]
 [100.         0.        66.66667  100.        33.333336]]


In [446]:
# test_calc_pi_2
# pi computed over a user-defined grouping instead of per dataset
sg = swan.SwanGraph()
sg.add_transcriptome('../testing/files/test_full.gtf')
sg = add_abundance(sg, '../testing/files/test_ab_1.tsv')
# put every dataset into the same cluster so their counts are pooled
sg.adata.obs['cluster'] = 'c1'
test_df, test_sums = calc_pi(sg.adata, sg.t_df, obs_col='cluster')

# display the pi table (one row per cluster)
test_df


Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1, dataset2 to SwanGraph.
Calculating transcript TPM...
Calculating PI...
Calculating TSS usage...
Calculating TES usage...


tid,test1,test2,test3,test4,test5
c1,100.0,33.333336,33.333336,100.0,33.333336


In [428]:
# test_calc_pi_1
# pi computed per dataset (the default grouping column)
sg = swan.SwanGraph()
sg.add_transcriptome('../testing/files/test_full.gtf')
sg = add_abundance(sg, '../testing/files/test_ab_1.tsv')
test_df, test_sums = calc_pi(sg.adata, sg.t_df, obs_col='dataset')

# display the pi table (one row per dataset)
test_df


Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1, dataset2 to SwanGraph.
Calculating transcript TPM...
Calculating PI...


tid,test1,test2,test3,test4,test5
dataset1,100.0,66.666672,0.0,100.0,33.333336
dataset2,100.0,0.0,66.666672,100.0,33.333336


In [233]:
# load a previously saved SwanGraph and set the abundance file to add to it
# NOTE(review): `ab` is an absolute path on one user's machine, so this cell
# only runs there; 'test_mousewg.p' must exist in the working directory
sg = swan.read('test_mousewg.p')
ab = '/Users/fairliereese/mortazavi_lab/data/mousewg/lr_bulk/talon/mouse_talon_abundance_filtered.tsv'

Read in graph from test_mousewg.p


In [234]:
# test adding de novo
# uses `sg` and `ab` defined in the cell above (a loaded graph and an
# abundance file path)
sg = add_abundance(sg, ab)


Adding abundance for datasets gastroc_14d_f_2, gastroc_14d_f_1, heart_18-20mo_m_1, heart_18-20mo_m_2, heart_18-20mo_f_1... (and 86 more) to SwanGraph
Calculating transcript TPM...


In [238]:
# test_calc_tpm_1
# TPM per dataset (calc_tpm's default obs_col)
sg = swan.SwanGraph()
sg.add_transcriptome('../testing/files/test_full.gtf')
sg = add_abundance(sg, '../testing/files/test_ab_1.tsv')
# the cluster column is added but not used by the default call below
sg.adata.obs['cluster'] = ['c1', 'c1']

df = calc_tpm(sg.adata)
# display the TPM table (one row per dataset)
df


Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1, dataset2 to SwanGraph.
Calculating transcript TPM...


Unnamed: 0_level_0,test1,test2,test3,test4,test5
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dataset1,166666.671875,333333.34375,0.0,333333.34375,166666.671875
dataset2,166666.671875,0.0,333333.34375,333333.34375,166666.671875


In [240]:
# dense view of the stored TPM layer, to compare with the calc_tpm table above
sg.adata.layers['tpm'].toarray()

array([[166666.67, 333333.34,      0.  , 333333.34, 166666.67],
       [166666.67,      0.  , 333333.34, 333333.34, 166666.67]],
      dtype=float32)

In [220]:
# test_calc_tpm_2
# TPM averaged over a user-defined grouping
sg = swan.SwanGraph()
sg.add_transcriptome('../testing/files/test_full.gtf')
sg = add_abundance(sg, '../testing/files/test_ab_1.tsv')
# both datasets go into one cluster, so calc_tpm averages their TPM values
sg.adata.obs['cluster'] = ['c1', 'c1']

df = calc_tpm(sg.adata, obs_col='cluster')
# display the TPM table (one row per cluster)
df


Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1, dataset2 to SwanGraph.


Unnamed: 0_level_0,test1,test2,test3,test4,test5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c1,166666.671875,166666.671875,166666.671875,333333.34375,166666.671875


In [226]:
# test_calc_total_counts_1
# summed counts per dataset (calc_total_counts' default obs_col)
sg = swan.SwanGraph()
sg.add_transcriptome('../testing/files/test_full.gtf')
sg = add_abundance(sg, '../testing/files/test_ab_1.tsv')
# the cluster column is added but not used by the default call below
sg.adata.obs['cluster'] = ['c1', 'c1']

df = calc_total_counts(sg.adata)
# display the summed counts (one row per dataset)
df


Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1, dataset2 to SwanGraph.


Unnamed: 0_level_0,test1,test2,test3,test4,test5
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dataset1,5.0,10.0,0.0,10.0,5.0
dataset2,5.0,0.0,10.0,10.0,5.0


In [227]:
# test_calc_total_counts_2
# counts summed over a user-defined grouping
sg = swan.SwanGraph()
sg.add_transcriptome('../testing/files/test_full.gtf')
sg = add_abundance(sg, '../testing/files/test_ab_1.tsv')
# both datasets go into one cluster, so their counts are added together
sg.adata.obs['cluster'] = ['c1', 'c1']

df = calc_total_counts(sg.adata, obs_col='cluster')
# display the summed counts (one row per cluster)
df


Adding transcriptome to the SwanGraph

Adding abundance for datasets dataset1, dataset2 to SwanGraph.


Unnamed: 0_level_0,test1,test2,test3,test4,test5
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c1,10.0,10.0,10.0,20.0,10.0


In [None]:
# test merging when incoming adata has duplicate dataset names

In [None]:
# test merging when adding new dataset adds new transcript id to the adata - already tested with test_add_abundance_2