In [8]:
import sqlite3
import pandas as pd

## ensure that dumb filtering results are working.
* Try joining the results from filter w/o includeAnnot and with and then
* with my hacky way

In [23]:
pass_list_annot = 'test_pass_list.csv'
pass_list = 'test_pass_list_2.csv'

annot_df = pd.read_csv(pass_list_annot, header=None, names=['gid', 'tid'])
df = pd.read_csv(pass_list, header=None, names=['gid', 'tid'])
print(len(annot_df))
print(len(df))

97852
20549


In [32]:
db = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human.db'
annot_name = 'gencode_v29'

In [33]:
with sqlite3.connect(db) as conn:
    query = """SELECT DISTINCT gene_ID, transcript_ID FROM observed
                       LEFT JOIN transcript_annotations AS ta
                           ON ta.ID = observed.transcript_ID
                       WHERE (ta.attribute = 'transcript_status'
                              AND ta.value = 'KNOWN'
                              AND ta.annot_name = '%s')""" % (annot_name)
    annot = pd.read_sql_query(query, conn)

In [35]:
len(annot)

96699

In [37]:
annot.head()

Unnamed: 0,gene_ID,transcript_ID
0,6,8
1,6,9
2,13,22
3,38109,132568
4,20,32


In [39]:
len(annot.merge(annot_df, how='inner',
                left_on=['gene_ID', 'transcript_ID'],
                right_on=['gid', 'tid']))

96699

## db to read annot

In [20]:
def get_gene_exp(df,
                 filter_novel=True,
                 filt_bcs=None):
    """
    Parameters:
        df (pandas DataFrame): talon ab or read annot
        filter_novel (bool): whether or not to filter out novel genes
        filt_bcs (str): path to *_filt_bcs.txt file to filter on from
            read_annot file
    """
    # read annot
    if 'dataset' in df.columns:

        if filter_novel:
            df = df.loc[df.gene_novelty == 'Known']

        if filt_bcs:
            bcs = pd.read_csv(filt_bcs, sep='\t', header=None)
            bcs.columns = ['bc']
            bcs = bcs.bc.tolist()
            df = df.loc[df.dataset.isin(bcs)]

        # limit to relevant columns
        cols = ['annot_gene_id', 'annot_gene_name', 'dataset', 'read_name']
        df = df[cols]

        # groupby gene name, id, and dataset and count occurrences
        gb_cols = ['annot_gene_id', 'annot_gene_name', 'dataset']
        df = df.groupby(gb_cols).count()
        df.rename({'read_name': 'counts'}, axis=1, inplace=True)

        df.reset_index(inplace=True)
        df = df.pivot(index=['annot_gene_id', 'annot_gene_name'],
                 columns=['dataset'],
                 values=['counts'])
        df.columns = df.columns.get_level_values(1)
        df.columns.name = ''
        df.reset_index(inplace=True)
        df.fillna(0, inplace=True)
        df.reset_index(inplace=True)
        df.drop('index', inplace=True, axis=1)
        gene_df = df

In [21]:
def make_adata(df,
               samp_df,
               sr_df,
               verbose=False,
               how='gene'):

    """
    Make an AnnData from long read data.

    Parameters:
        df (pandas DataFrame): DataFrame from get_gene_exp function
        samp_df (pandas DataFrame): DataFrame from get_sample_metadata function
        sr_df (pandas DataFrame): DataFrame from get_illumina_metadata function
        verbose (bool): Whether or not to display output messages
        how (str): Choose from 'gene' or 'transcript'
    """

    if how == 'gene':
        # print(df.head())
        var = df[['annot_gene_id', 'annot_gene_name']]
        df.drop(['annot_gene_name'], axis=1, inplace=True)
        df.set_index('annot_gene_id', inplace=True)
    elif how == 'transcript':
        var = df[['annot_transcript_id', 'annot_transcript_name', \
                'annot_gene_id', 'annot_gene_name', 'transcript_novelty']]
        df.drop(['annot_transcript_name', 'annot_gene_id', \
                 'annot_gene_name', 'transcript_novelty'], axis=1, inplace=True)
        df.set_index('annot_transcript_id', inplace=True)

    df = df.transpose()
    df.index.name = 'bc'
    X = scipy.sparse.csr_matrix(df.values)
    df.reset_index(inplace=True)
    obs = df.bc.to_frame()
    obs = df.bc.to_frame()
    obs['bc3_long'] = obs['bc'].str.slice(0,8)
    obs['bc2_long'] = obs['bc'].str.slice(8,16)
    temp = obs['bc'].values[0]
    if '-' in temp:
        obs['bc1_long'] = obs['bc'].str.slice(16,24)
        obs['bc_index'] = obs.bc
        obs['bc_back'] = obs.bc.str.split('-', expand=True)[0]
        obs['experiment'] = obs.bc.str.split('-', expand=True)[1]
        obs['library'] = obs.experiment.str.split('_', expand=True)[1]

        # fix library names for those with issues
        if '2ka' in obs.library.unique().tolist():
            obs.loc[obs.library == '2ka', 'library'] = 'a'

        obs['bc'] = obs.bc_back
        obs.drop('bc_back', axis=1, inplace=True)

    else:
        obs['bc1_long'] = obs['bc'].str.slice(16,-1)
        obs['bc_index'] = obs['bc']

    if verbose:
        print('Found {} unique bc3s'.format(len(obs.bc3_long.unique())))
        print('Found {} unique bc2s'.format(len(obs.bc2_long.unique())))
        print('Found {} unique bc1s'.format(len(obs.bc1_long.unique())))

    # merge with information from illumina runs
    # pdb.set_trace()

    if sr_df is not None:
        # print(len(obs.index))
        obs = obs.merge(sr_df, how='left', on=['bc', 'library'])
        # print(len(obs.index))

    # merge with sample information
    if samp_df is not None:
        # print()
        # print(len(obs.index))
        obs = obs.merge(samp_df, how='left', left_on=['bc1_long', 'library'], right_on=['bc1_dt', 'library'])
        # print(len(obs.index))

    # construct the actual anndata
    adata = anndata.AnnData(X=X, obs=obs, var=var)
    adata.obs.set_index('bc_index', inplace=True)

    if how == 'gene':
        adata.var.set_index('annot_gene_id', inplace=True)
    elif how == 'transcript':
        adata.var.set_index('annot_transcript_id', inplace=True)
    adata.layers['raw'] = adata.X.copy()

    # annotate the group of mitochondrial genes as 'mt'
    adata.var['mt'] = adata.var.annot_gene_name.str.startswith('mt-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    # format gene names
    if how == 'gene':
        adata.var.reset_index(inplace=True)
        adata.var.annot_gene_name = adata.var.annot_gene_name.astype(str)
        adata.var.index = adata.var.annot_gene_name
        adata.var_names_make_unique()
        adata.var.drop('annot_gene_name', axis=1, inplace=True)
    elif how == 'transcript':
        adata.var.reset_index(inplace=True)
        adata.var.annot_transcript_name = adata.var.annot_transcript_name.astype(str)
        adata.var.index = adata.var.annot_transcript_name
        adata.var_names_make_unique()
        adata.var.drop('annot_transcript_name', axis=1, inplace=True)

    return adata

In [28]:
import sqlite3
from src/talon/post/ import post_utils as putils


SyntaxError: invalid syntax (<ipython-input-28-fb33b9525cdb>, line 2)

In [22]:
pass_list = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/mousewg/hippocampus/lr_splitseq/talon/test_pass_list.csv'
db = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/mousewg/hippocampus/lr_splitseq/talon/hippocampus.db'
annot = 'gencode_vM21'

In [None]:
whitelist = putils.handle_filtering(db, 
                                    annot, 
                                    False, 
                                    pass_list, 
                                    None)