The goal of this analysis is to try and refine the 3' end calling of TALON by using the GENCODE strategy of longest TES per transcript.

In [175]:
import pandas as pd
import numpy as np
import csv

In [30]:
def get_longest_ends(annot, how='tes', novelty='novel'):
    df = pd.read_csv(annot, sep='\t')
    
    if novelty == 'novel':
        df = df.loc[df.transcript_novelty != 'Known']
    
    fwd = df.loc[df.strand == '+']
    rev = df.loc[df.strand == '-']
    
    # furthest downstream for tes
    # if + strand, max coord of read end
    # if - strand, min coord of read end
    if how == 'tes':
        fwd = fwd[['transcript_ID', 'read_end']]
        fwd = fwd.groupby('transcript_ID').max().reset_index()
        rev = rev[['transcript_ID', 'read_end']]
        rev = rev.groupby('transcript_ID').min().reset_index()
        
        
    # furthest upstream for tss:
    # if + strand, min coord of read start
    # if - strand, max coord of read start
    elif how == 'tss':
        fwd = fwd[['transcript_ID', 'read_start']]
        fwd = fwd.groupby('transcript_ID').min().reset_index()
        rev = rev[['transcript_ID', 'read_start']]
        rev = rev.groupby('transcript_ID').max().reset_index()
    
    # concat fwd and rev
    df = pd.concat([fwd, rev])
    
    return df

In [176]:
# replace tss or tes coords with those in a different df
# gtf: gtf file location 
# ends: df with transcript_ID, end coordinate
# how: 'tss' or 'tes'
# opref: output file prefix
def replace_gtf_end_coords(gtf, ends, opref, how='tes', test=False):
    
    # read preexisting GTF and 
    gtf_df = pd.read_csv(gtf, sep='\t', header=None, \
                names=['chr', 'source', 'entry_type', \
                       'start', 'stop', 'score', 'strand',\
                       'frame', 'fields'])

    # get relevant values from fields
    gtf_df['transcript_id'] = np.nan
    gtf_df.loc[gtf_df.entry_type!='gene', 'transcript_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].fields.str.split(pat='talon_transcript "', n=1, expand=True)[1]
    gtf_df.loc[gtf_df.entry_type!='gene', 'transcript_id'] = gtf_df.loc[gtf_df.entry_type!='gene'].transcript_id.str.split(pat='"', n=1, expand=True)[0]

    if how == 'tes':
        ends.columns = ['transcript_id', 'tes']
    elif how == 'tss':
        ends.columns = ['transcript_id', 'tss']
        
    # merge gtf_df with end information 
#     ends.transcript_id = ends.transcript_id.astype('str')
#     gtf_df.transcript_id = gtf_df.transcript_id.astype('str')
    df = gtf_df.loc[gtf_df.transcript_id.notnull()]
    ends.transcript_id = ends.transcript_id.astype('int')
    df.transcript_id = df.transcript_id.astype('int')
    df = df.merge(ends, how='left', on='transcript_id')
    
    if test:
        tids = [142372, 142634]
        print(142372 in df.transcript_id)
        df = df.loc[df.transcript_id.isin(tids)]
        print(ends.loc[ends.transcript_id.isin(tids)])
        print()
        print('Before editing')
        print(df[['transcript_id', 'entry_type', 'strand', 'start', 'stop', how]])
    
    # swap out read starts or ends for the longest ones
    # fwd: swap out transcript "stop" and last exon "stop" 
    # rev: swap out transcript "start" and last exon "start"
    for tid in df.transcript_id.unique():
        if how == 'tes':
            # transcript entry for fwd
            ind = df.loc[(df.strand=='+')&(df.transcript_id==tid)].index.tolist()
            if test:
                print('fwd ind')
                print(ind)
            if ind: 
                if test: 
                    print('fixing...')
                i = ind[0]
                df.loc[i, 'stop'] = df.loc[i, 'tes']
                # stop of last exon for fwd
                i = ind[-1]
                df.loc[i, 'stop'] = df.loc[i, 'tes']

            # transcript for rev
            ind = df.loc[(df.strand=='-')&(df.transcript_id==tid)].index.tolist()
            if test: 
                print('rev ind')
                print(ind)
            if ind: 
                if test:
                    print('%%fileixing...')
                i = ind[0]
                df.loc[i, 'start'] = df.loc[i, 'tes']
                # start of last exon for rev
                i = ind[-1]
                df.loc[i, 'start'] = df.loc[i, 'tes']

            # for exon can probably just take [1] or [-1] of temp to access
            # first or last exon respectively

        # fwd: swap out transcript "start" and first exon "start" 
        # rev: swap out transcript "stop" and first exon "stop"                 
#         elif how == 'tss':
#             # transcript entry for fwd
#             ind = df.loc[(df.strand=='+')&(df.transcript_id==tid)].index.tolist()
#             if ind: 
#                 df.loc[ind, 'start'] = df.loc[ind, 'tss']
#                 # start of first exon for fwd
#                 ind = ind[1]
#                 df.loc[ind].start = df.loc[ind].tss
#             # transcript entry for rev
#             ind = df.loc[(df.strand=='-')&(df.transcript_id==tid)].index.tolist()
#             if ind:
#                 df.loc[ind].stop = df.loc[ind].tss
#                 # stop of first exon for rev
#                 ind = ind[1]
#                 df.loc[ind].stop = df.loc[ind].tss

    if test:
        print()
        print('After editing')
        print(df[['transcript_id', 'entry_type', 'strand', 'start', 'stop', how]])
        
    cols=['chr', 'source', 'entry_type', \
          'start', 'stop', 'score', 'strand',\
           'frame', 'fields']
    df = df[cols]
    fname = '{}_revised_tes.gtf'.format(opref)
    df.to_csv(fname, sep='\t', header=None, index=False, quoting=csv.QUOTE_NONE)
    return df

In [177]:
# annot = 'PacBio_Brain_talon_read_annot.tsv'
# gtf = 'Brain_talon.gtf'

# tes = get_longest_ends(annot, how='tes', novelty='novel')
df = replace_gtf_end_coords(gtf, tes, 'beep', how='tes', test=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


True
     transcript_id      tes
229         142372  4899923
304         142634  3344252

Before editing
        transcript_id  entry_type strand    start     stop        tes
92             142372  transcript      +  4857814  4897905  4899923.0
93             142372        exon      +  4857814  4857976  4899923.0
94             142372        exon      +  4867470  4867532  4899923.0
95             142372        exon      +  4878027  4878132  4899923.0
96             142372        exon      +  4886744  4886831  4899923.0
97             142372        exon      +  4889460  4889602  4899923.0
98             142372        exon      +  4890740  4890796  4899923.0
99             142372        exon      +  4891915  4892069  4899923.0
100            142372        exon      +  4893417  4893563  4899923.0
101            142372        exon      +  4894934  4895005  4899923.0
102            142372        exon      +  4896356  4897905  4899923.0
228081         142634  transcript      -  3344256  3371

In [None]:
annot = 'PacBio_Brain_talon_read_annot.tsv'
gtf = 'Brain_talon.gtf'

tes = get_longest_ends(annot, how='tes', novelty='novel')
df = replace_gtf_end_coords(gtf, tes, 'beep', how='tes')

In [69]:
annot_df = pd.read_csv(annot, sep='\t')

In [71]:
annot_df.head()

Unnamed: 0,read_name,dataset,genome_build,chrom,read_start,read_end,strand,n_exons,read_length,gene_ID,...,annot_gene_name,annot_transcript_name,gene_novelty,transcript_novelty,ISM_subtype,fraction_As,custom_label,allelic_label,start_support,end_support
0,m54284U_191107_221739/45877683/ccs,PacBio_Cortex_Rep1,mm10_ERCC_SIRV,ERCC-00092,1,1105,+,1,1105,55504,...,,,Known,Known,,0.0,,,,
1,m54284U_191107_221739/29819866/ccs,PacBio_Cortex_Rep1,mm10_ERCC_SIRV,ERCC-00042,1,1007,+,1,1007,55475,...,,,Known,Known,,0.0,,,,
2,m54284U_191107_221739/110168264/ccs,PacBio_Cortex_Rep1,mm10_ERCC_SIRV,ERCC-00092,1,1104,+,1,1104,55504,...,,,Known,Known,,0.0,,,,
3,m54284U_191107_221739/14550884/ccs,PacBio_Cortex_Rep1,mm10_ERCC_SIRV,ERCC-00004,1,418,+,1,418,55453,...,,,Known,Known,,0.3,,,,
4,m54284U_191107_221739/104335118/ccs,PacBio_Cortex_Rep1,mm10_ERCC_SIRV,ERCC-00042,1,980,+,1,986,55475,...,,,Known,Known,,0.35,,,,


In [135]:
beep = annot_df[['read_name', 'strand', 'transcript_novelty', 'transcript_ID']]
beep = beep.groupby(['transcript_ID', 'strand', 'transcript_novelty']).count().reset_index()

In [133]:
beep.head()

Unnamed: 0,transcript_ID,transcript_novelty,read_name
0,1,Known,2
1,4,Known,3
2,7,Known,1
3,15,Known,1
4,26,Known,2


In [136]:
beep.loc[(beep.transcript_novelty!='Known')&(beep.read_name > 1)&(beep.strand == '+')]

Unnamed: 0,transcript_ID,strand,transcript_novelty,read_name
36945,141935,+,NNC,4
36947,141937,+,NNC,8
36949,141939,+,Intergenic,2
36950,141940,+,NIC,3
36951,141941,+,NNC,32
...,...,...,...,...
549329,654319,+,NIC,27
549332,654322,+,NNC,2
549333,654323,+,NNC,2
549334,654324,+,ISM,3


In [137]:
beep.loc[(beep.transcript_novelty!='Known')&(beep.read_name > 1)&(beep.strand == '-')]

Unnamed: 0,transcript_ID,strand,transcript_novelty,read_name
36948,141938,-,ISM,2
36953,141943,-,Intergenic,57
36970,141960,-,ISM,4
36979,141969,-,NNC,5
36982,141972,-,Genomic,5
...,...,...,...,...
549312,654302,-,NNC,3
549314,654304,-,ISM,3
549316,654306,-,NNC,2
549324,654314,-,Genomic,6


In [143]:
pass_list = pd.read_csv('mouse_brain_whitelist.csv', header=None)
pass_list.columns = ['gene_id', 'transcript_id']

In [144]:
pass_list.head()

Unnamed: 0,gene_id,transcript_id
0,55504,141821
1,55475,141792
2,55453,141770
3,55463,141780
4,55495,141812


In [145]:
beep = beep.loc[beep.transcript_ID.isin(pass_list.transcript_id.tolist())]

In [146]:
beep.loc[(beep.transcript_novelty!='Known')&(beep.read_name > 1)&(beep.strand == '+')]

Unnamed: 0,transcript_ID,strand,transcript_novelty,read_name
37382,142372,+,NNC,157
37446,142436,+,NIC,74
37478,142468,+,NIC,85
37479,142469,+,NIC,28
37598,142588,+,NIC,94
...,...,...,...,...
548038,653028,+,ISM,114
548042,653032,+,ISM,47
548050,653040,+,ISM,252
548594,653584,+,NNC,37


In [147]:
beep.loc[(beep.transcript_novelty!='Known')&(beep.read_name > 1)&(beep.strand == '-')]

Unnamed: 0,transcript_ID,strand,transcript_novelty,read_name
37272,142262,-,NNC,38
37644,142634,-,NIC,80
37645,142635,-,NIC,46
37846,142836,-,ISM,30
37847,142837,-,ISM,45
...,...,...,...,...
547976,652966,-,NIC,37
547981,652971,-,ISM,32
547985,652975,-,ISM,47
547986,652976,-,ISM,21
