The goal of this analysis is to try and refine the 3' end calling of TALON by using the GENCODE strategy of longest TES per transcript.

In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
def get_longest_ends(annot, how='tes', novelty='novel'):
    df = pd.read_csv(annot, sep='\t')
    
    if novelty == 'novel':
        df = df.loc[df.transcript_novelty != 'Known']
    
    fwd = df.loc[df.strand == '+']
    rev = df.loc[df.strand == '-']
    
    # furthest downstream for tes
    # if + strand, max coord of read end
    # if - strand, min coord of read end
    if how == 'tes':
        fwd = fwd[['transcript_ID', 'read_end']]
        fwd = fwd.groupby('transcript_ID').max().reset_index()
        rev = rev[['transcript_ID', 'read_end']]
        rev = rev.groupby('transcript_ID').min().reset_index()
        
        
    # furthest upstream for tss:
    # if + strand, min coord of read start
    # if - strand, max coord of read start
    elif how == 'tss':
        fwd = fwd[['transcript_ID', 'read_start']]
        fwd = fwd.groupby('transcript_ID').min().reset_index()
        rev = rev[['transcript_ID', 'read_start']]
        rev = rev.groupby('transcript_ID').max().reset_index()
    
    # concat fwd and rev
    df = pd.concat([fwd, rev])
    
    return df

In [14]:
# replace tss or tes coords with those in a different df
# gtf: gtf file location 
# ends: df with transcript_ID, end coordinate
# how: 'tss' or 'tes'
# opref: output file prefix
def replace_gtf_end_coords(gtf, ends, opref, how='tes', test=False):
    
    # read preexisting GTF and 
    gtf_df = pd.read_csv(gtf, sep='\t', header=None, \
                names=['chr', 'source', 'entry_type', \
                       'start', 'stop', 'score', 'strand',\
                       'frame', 'fields'], comment='#')

    # get relevant values from fields
    gtf_df['transcript_id'] = np.nan
    gtf_df.loc[gtf_df.entry_type!='gene', 'transcript_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].fields.str.split(pat='talon_transcript "', n=1, expand=True)[1]
    gtf_df.loc[gtf_df.entry_type!='gene', 'transcript_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].transcript_id.str.split(pat='"', n=1, expand=True)[0]
    

    if how == 'tes':
        ends.columns = ['transcript_id', 'tes']
    elif how == 'tss':
        ends.columns = ['transcript_id', 'tss']
        
    # merge gtf_df with end information 
#     ends.transcript_id = ends.transcript_id.astype('str')
    df = gtf_df.loc[gtf_df.transcript_id.notnull()]
    ends.transcript_id = ends.transcript_id.astype('str')
    df.transcript_id = df.transcript_id.astype('str')
    gtf_df = gtf_df.merge(ends, how='left', on='transcript_id')
    
    if test:
        tids = ['142372', '142634']
        print('142372' in df.transcript_id)
        df = df.loc[df.transcript_id.isin(tids)]
        print(ends.loc[ends.transcript_id.isin(tids)])
        print()
        print('Before editing')
        print(gtf_df[['transcript_id', 'entry_type', 'strand', 'start', 'stop', how]])
    
    # swap out read starts or ends for the longest ones
    # fwd: swap out transcript "stop" and last exon "stop" 
    # rev: swap out transcript "start" and last exon "start"
    tids = df.transcript_id.unique()
    for t, tid in enumerate(tids):
        if t % 1000 == 0:
            print('Processing transcript {} of {}'.format(t, len(tids)))
        if how == 'tes':
            # transcript entry for fwd
            ind = gtf_df.loc[(gtf_df.strand=='+')&(gtf_df.transcript_id==tid)].index.tolist()
            if test:
                print('fwd ind')
                print(ind)
            if ind: 
                if test: 
                    print('fixing...')
                i = ind[0]
                gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tes']
                # stop of last exon for fwd
                i = ind[-1]
                gtf_df.loc[i, 'stop'] = gtf_df.loc[i, 'tes']

            # transcript for rev
            ind = gtf_df.loc[(gtf_df.strand=='-')&(gtf_df.transcript_id==tid)].index.tolist()
            if test: 
                print('rev ind')
                print(ind)
            if ind: 
                if test:
                    print('fixing...')
                i = ind[0]
                gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tes']
                # start of last exon for rev
                i = ind[-1]
                gtf_df.loc[i, 'start'] = gtf_df.loc[i, 'tes']

            # for exon can probably just take [1] or [-1] of temp to access
            # first or last exon respectively

        # fwd: swap out transcript "start" and first exon "start" 
        # rev: swap out transcript "stop" and first exon "stop"                 
#         elif how == 'tss':
#             # transcript entry for fwd
#             ind = df.loc[(df.strand=='+')&(df.transcript_id==tid)].index.tolist()
#             if ind: 
#                 df.loc[ind, 'start'] = df.loc[ind, 'tss']
#                 # start of first exon for fwd
#                 ind = ind[1]
#                 df.loc[ind].start = df.loc[ind].tss
#             # transcript entry for rev
#             ind = df.loc[(df.strand=='-')&(df.transcript_id==tid)].index.tolist()
#             if ind:
#                 df.loc[ind].stop = df.loc[ind].tss
#                 # stop of first exon for rev
#                 ind = ind[1]
#                 df.loc[ind].stop = df.loc[ind].tss

    if test:
        print()
        print('After editing')
        print(gtf_df[['transcript_id', 'entry_type', 'strand', 'start', 'stop', how]])
      
    cols=['chr', 'source', 'entry_type', \
          'start', 'stop', 'score', 'strand',\
           'frame', 'fields']
    gtf_df = gtf_df[cols]
    gtf_df['start'] = gtf_df['start'].astype('int')
    gtf_df['stop'] = gtf_df['stop'].astype('int')
    if test:
        fname = '{}_revised_tes_test.gtf'.format(opref)
    else:
        fname = '{}_revised_tes.gtf'.format(opref)
    gtf_df.to_csv(fname, sep='\t', header=None, index=False, quoting=csv.QUOTE_NONE)
    return gtf_df

In [12]:
annot = 'PacBio_Brain_talon_read_annot.tsv'
gtf = 'test.gtf'

tes = get_longest_ends(annot, how='tes', novelty='novel')

In [15]:
df = replace_gtf_end_coords(gtf, tes, 'beep', how='tes', test=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


False
    transcript_id      tes
229        142372  4899923
304        142634  3344252

Before editing
   transcript_id  entry_type strand    start     stop        tes
0            NaN        gene      +  4857814  4897905        NaN
1         142372  transcript      +  4857814  4897905  4899923.0
2         142372        exon      +  4857814  4857976  4899923.0
3         142372        exon      +  4867470  4867532  4899923.0
4         142372        exon      +  4878027  4878132  4899923.0
5         142372        exon      +  4886744  4886831  4899923.0
6         142372        exon      +  4889460  4889602  4899923.0
7         142372        exon      +  4890740  4890796  4899923.0
8         142372        exon      +  4891915  4892069  4899923.0
9         142372        exon      +  4893417  4893563  4899923.0
10        142372        exon      +  4894934  4895005  4899923.0
11        142372        exon      +  4896356  4897905  4899923.0
12           NaN        gene      -  3344256  340918

In [200]:
# annot = 'PacBio_Brain_talon_read_annot.tsv'
# gtf = 'Brain_talon.gtf'

# tes = get_longest_ends(annot, how='tes', novelty='novel')
# df = replace_gtf_end_coords(gtf, tes, 'beep', how='tes', test=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


True
     transcript_id      tes
229         142372  4899923
304         142634  3344252

Before editing
        transcript_id  entry_type strand    start     stop        tes
92             142372  transcript      +  4857814  4897905  4899923.0
93             142372        exon      +  4857814  4857976  4899923.0
94             142372        exon      +  4867470  4867532  4899923.0
95             142372        exon      +  4878027  4878132  4899923.0
96             142372        exon      +  4886744  4886831  4899923.0
97             142372        exon      +  4889460  4889602  4899923.0
98             142372        exon      +  4890740  4890796  4899923.0
99             142372        exon      +  4891915  4892069  4899923.0
100            142372        exon      +  4893417  4893563  4899923.0
101            142372        exon      +  4894934  4895005  4899923.0
102            142372        exon      +  4896356  4897905  4899923.0
228081         142634  transcript      -  3344256  3371

In [197]:
# cortex
annot = 'PacBio_Brain_talon_read_annot.tsv'
gtf = 'mouse_brain_cortex_talon.gtf'

tes = get_longest_ends(annot, how='tes', novelty='novel')
df = replace_gtf_end_coords(gtf, tes, 'mouse_brain_cortex', how='tes')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Processing transcript 0 of 37457
Processing transcript 1000 of 37457
Processing transcript 2000 of 37457
Processing transcript 3000 of 37457
Processing transcript 4000 of 37457
Processing transcript 5000 of 37457
Processing transcript 6000 of 37457
Processing transcript 7000 of 37457
Processing transcript 8000 of 37457
Processing transcript 9000 of 37457
Processing transcript 10000 of 37457
Processing transcript 11000 of 37457
Processing transcript 12000 of 37457
Processing transcript 13000 of 37457
Processing transcript 14000 of 37457
Processing transcript 15000 of 37457
Processing transcript 16000 of 37457
Processing transcript 17000 of 37457
Processing transcript 18000 of 37457
Processing transcript 19000 of 37457
Processing transcript 20000 of 37457
Processing transcript 21000 of 37457
Processing transcript 22000 of 37457
Processing transcript 23000 of 37457
Processing transcript 24000 of 37457
Processing transcript 25000 of 37457
Processing transcript 26000 of 37457
Processing tra

In [198]:
# hippocampus
annot = 'PacBio_Brain_talon_read_annot.tsv'
gtf = 'mouse_brain_hippocampus_talon.gtf'

tes = get_longest_ends(annot, how='tes', novelty='novel')
df = replace_gtf_end_coords(gtf, tes, 'mouse_brain_hippocampus', how='tes')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Processing transcript 0 of 37034
Processing transcript 1000 of 37034
Processing transcript 2000 of 37034
Processing transcript 3000 of 37034
Processing transcript 4000 of 37034
Processing transcript 5000 of 37034
Processing transcript 6000 of 37034
Processing transcript 7000 of 37034
Processing transcript 8000 of 37034
Processing transcript 9000 of 37034
Processing transcript 10000 of 37034
Processing transcript 11000 of 37034
Processing transcript 12000 of 37034
Processing transcript 13000 of 37034
Processing transcript 14000 of 37034
Processing transcript 15000 of 37034
Processing transcript 16000 of 37034
Processing transcript 17000 of 37034
Processing transcript 18000 of 37034
Processing transcript 19000 of 37034
Processing transcript 20000 of 37034
Processing transcript 21000 of 37034
Processing transcript 22000 of 37034
Processing transcript 23000 of 37034
Processing transcript 24000 of 37034
Processing transcript 25000 of 37034
Processing transcript 26000 of 37034
Processing tra

In [None]:
# fix stupid numbors
gtf = 'mouse_brain_cortex_revised_tes.gtf'
gtf_df = pd.read_csv(gtf, sep='\t', header=None, \
            names=['chr', 'source', 'entry_type', \
                   'start', 'stop', 'score', 'strand',\
                   'frame', 'fields'])
gtf_df