In [1]:
import os
import sys
from collections import defaultdict
import gzip
import pandas as pd
import re
import csv

GTF_HEADER  = ['seqname', 'source', 'feature', 'start', 'end', 'score',
               'strand', 'frame']
R_SEMICOLON = re.compile(r'\s*;\s*')
R_COMMA     = re.compile(r'\s*,\s*')
R_KEYVALUE  = re.compile(r'(\s+|\s*=\s*)')


def dataframe(filename):
    """Open an optionally gzipped GTF file and return a pandas.DataFrame.
    """
    # Each column is a list stored as a value in this dict.
    result = defaultdict(list)

    for i, line in enumerate(lines(filename)):
        for key in line.keys():
            # This key has not been seen yet, so set it to None for all
            # previous lines.
            if key not in result:
                result[key] = [None] * i

        # Ensure this row has some value for each column.
        for key in result.keys():
            result[key].append(line.get(key, None))

    return pd.DataFrame(result)


def lines(filename):
    """Open an optionally gzipped GTF file and generate a dict for each line.
    """
    fn_open = gzip.open if filename.endswith('.gz') else open

    with fn_open(filename) as fh:
        for line in fh:
            if line.startswith('#'):
                continue
            else:
                yield parse(line)


def parse(line):
    """Parse a single GTF line and return a dict.
    """
    result = {}

    fields = line.rstrip().split('\t')

    for i, col in enumerate(GTF_HEADER):
        result[col] = _get_value(fields[i])

    # INFO field consists of "key1=value;key2=value;...".
    infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()]

    for i, info in enumerate(infos, 1):
        # It should be key="value".
        try:
            key, _, value = re.split(R_KEYVALUE, info, 1)
        # But sometimes it is just "value".
        except ValueError:
            key = 'INFO{}'.format(i)
            value = info
        # Ignore the field if there is no value.
        if value:
            result[key] = _get_value(value)

    return result


def _get_value(value):
    if not value:
        return None

    # Strip double and single quotes.
    value = value.strip('"\'')

    # Return a list if the value has a comma.
    if ',' in value:
        value = re.split(R_COMMA, value)
    # These values are equivalent to None.
    elif value in ['', '.', 'NA']:
        return None

    return value

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [10]:
common_name='Name'
common_id='gene_id'
transcript_or_gene='transcript'
gtf = dataframe('/wynton/group/ye/mtschmitz/refdata2/rhemac10/CAT_chang/Rhesus.gtf')
gtf = gtf.fillna('')
print(gtf)
print(list(gtf.columns))

In [12]:
tgtf=gtf.loc[gtf['feature']=='transcript',:]
tgtf.loc[:,['transcript_id',common_id]]
tgtf.drop_duplicates(subset = ['transcript_id'], keep = 'first', inplace = True) 

ggtf=gtf.loc[gtf['feature']=='transcript',:]
ggtf.drop_duplicates(subset = [common_id], keep = 'first', inplace = True) 
gdict=dict(zip(ggtf[common_id],ggtf[common_name]))
#ggtf.loc[:,[common_id,common_name]]
tgtf[common_name]=list(tgtf[common_id].replace(gdict))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


KeyboardInterrupt: 

In [16]:
tgtf[common_name]=list(tgtf[common_id].replace(gdict))

KeyboardInterrupt: 

In [13]:
gtf['feature'].unique()

array(['transcript', 'exon', 'CDS'], dtype=object)

In [14]:
ggtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,transcript_id,gene_id,...,Name,rna_support,reference_support,novel_5p_cap,novel_poly_a,alternative_source_transcripts,collapsed_gene_ids,collapsed_gene_names,ont,possible_split_gene_locations
0,chr1,CAT,transcript,7416,13554,9450,-,,Rhesus_T0000001,Rhesus_G0000001,...,PGBD2,True,True,,,,,,,
17,chr1,CAT,transcript,14267,14369,3520,+,,Rhesus_T0000005,Rhesus_G0000002,...,RNU6-1205P,True,True,,,,,,,
19,chr1,CAT,transcript,65269,71509,8050,-,,Rhesus_T0000006,Rhesus_G0000003,...,AL672291.1,True,True,,,,,,,
24,chr1,CAT,transcript,71366,80765,9000,+,,Rhesus_T0000007,Rhesus_G0000004,...,ZNF692,True,True,,,,,,,
321,chr1,CAT,transcript,81255,91705,9530,-,,Rhesus_T0000038,Rhesus_G0000005,...,ZNF672,True,True,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3322866,chrY,CAT,transcript,10104323,10117294,,+,,Rhesus_T0262195,Rhesus_G0066375,...,Rhesus_G0066375,True,True,True,True,,,,,
3322949,chrY,CAT,transcript,10529111,10529943,,-,,Rhesus_T0262198,Rhesus_G0066376,...,Rhesus_G0066376,True,True,True,True,,,,,
3322954,chrY,CAT,transcript,10630192,10630413,,+,,Rhesus_T0262199,Rhesus_G0066377,...,Rhesus_G0066377,True,True,True,True,,,,,
3322957,chrY,CAT,transcript,10630593,10690186,,+,,Rhesus_T0262201,Rhesus_G0066379,...,Rhesus_G0066379,True,True,True,True,,,,,


In [15]:
tgtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,transcript_id,gene_id,...,Name,rna_support,reference_support,novel_5p_cap,novel_poly_a,alternative_source_transcripts,collapsed_gene_ids,collapsed_gene_names,ont,possible_split_gene_locations
0,chr1,CAT,transcript,7416,13554,9450,-,,Rhesus_T0000001,Rhesus_G0000001,...,PGBD2,True,True,,,,,,,
4,chr1,CAT,transcript,8223,13554,9540,-,,Rhesus_T0000002,Rhesus_G0000001,...,PGBD2,True,True,,,,,,,
9,chr1,CAT,transcript,9002,13507,,-,,Rhesus_T0000003,Rhesus_G0000001,...,PGBD2,True,True,True,True,14339.t1,,,,
14,chr1,CAT,transcript,9964,13554,6640,-,,Rhesus_T0000004,Rhesus_G0000001,...,PGBD2,True,True,,,,,,,
17,chr1,CAT,transcript,14267,14369,3520,+,,Rhesus_T0000005,Rhesus_G0000002,...,RNU6-1205P,True,True,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3322954,chrY,CAT,transcript,10630192,10630413,,+,,Rhesus_T0262199,Rhesus_G0066377,...,Rhesus_G0066377,True,True,True,True,,,,,
3322957,chrY,CAT,transcript,10630593,10690186,,+,,Rhesus_T0262201,Rhesus_G0066379,...,Rhesus_G0066379,True,True,True,True,,,,,
3323022,chrY,CAT,transcript,10630593,10690186,,+,,Rhesus_T0262202,Rhesus_G0066379,...,Rhesus_G0066379,True,True,True,True,,,,,
3323091,chrY,CAT,transcript,10630593,10692903,,+,,Rhesus_T0262203,Rhesus_G0066379,...,Rhesus_G0066379,True,True,True,True,,,,,
