##### 08 November 2019
# Parse Ensembl GRCh37.87 GFF3 to a BED file of exonic regions for validation of RefSeqGFF3 exon bed file
##### by Pavlos Bousounis

***Last updated 2019-11-11***

### Import modules

In [6]:
from datetime import datetime
import gffutils
import gzip
import numpy as np
import os
import pandas as pd
import pprint
import shutil
import re

### Display current working directory and today's date

In [5]:
today = datetime.today().strftime('%Y-%m-%d')

print('Today is: {}'.format(today))

Today is: 2019-11-11


### Specify working directory

In [3]:
os.chdir('/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-08_EnsemblGFF3_GRCh37-parse2bed')

### Define function ***gff2bed()*** 
****Note: Returns ONLY exon features***

In [4]:
def gff2bed(gff_path_name):
    
    """ Given a bed file name/path, parse the contents of EXONIC records into a dataframe with attributes info separated into columns: 
        1. Read in bed as a tab-delimited file; add column names as per GFF3 spec 
        ??3. IMPORTANT: convert to 0-based positions (subtract 1 from start position); half open coordinates?
        4. Return a bed-like pandas DataFrame"""

    # define gff3 column names (https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)
    gff_cols = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

    # read in the gff3 file, skip header lines
    gff = pd.read_csv(gff_path_name, comment='#', sep='\t', header=None, names=gff_cols, low_memory=False)

    # select only exons
    exons = gff[gff.type == 'exon']

    # extract info from gff attributes 
    exons['transcript_id'] = exons['attributes'].str.extract(r'(transcript:)(\w+)')[1]
    exons['exon_id'] = exons['attributes'].str.extract(r'(exon_id={1})(\w+)')[1]

    exons.rename(columns={"seqid": "chrom"}, inplace=True)

    exons = exons[['chrom', 'start', 'end', 'strand', 'score', 'phase', 'transcript_id', 'exon_id', 'source']]
    
    exons['name'] = exons['exon_id'] + '_' + exons['transcript_id']
    
    return(exons[['chrom', 'start', 'end', 'name']])

### Decompress the GFF3 archive

In [9]:
# decompress the gunzipped file
gff_gz = 'data/Homo_sapiens.GRCh37.87.chr.gff3.gz'
gff_out = 'data/Homo_sapiens.GRCh37.87.chr.gff3'

with gzip.open(gff_gz, 'rb') as f_in:
    with open(gff_out, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

### Examine the structure of the GFF3 file

In [30]:
# examiner = GFFExaminer()
# in_handle = open(gff_out)
# pprint.pprint(examiner.available_limits(in_handle))
# in_handle.close()

### Convert the Ensembl GFF3 to BED 

In [5]:
# run gff2bed() on the RefSeq GRCh37 GFF3 file
gff_file = 'data/Homo_sapiens.GRCh37.87.chr.gff3'
ens_df = gff2bed(gff_file)
ens_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,chrom,start,end,name
18,1,11869,12227,ENSE00002234944_ENST00000456328
19,1,12613,12721,ENSE00003582793_ENST00000456328
20,1,13221,14409,ENSE00002312635_ENST00000456328
22,1,11872,12227,ENSE00002234632_ENST00000515242
23,1,12613,12721,ENSE00003608237_ENST00000515242


### Extract rows containing NAs and save them to file

In [9]:
# create bool series True for missing values  
bool_series = pd.isnull(ens_df["name"])  
    
# displaying data only with name = NaN  
ens_df_na = ens_df[bool_series] 

# save dataframe to file
ens_df_na_fileout = 'output/' + today + '_Ensembl-GRCh37_GFF3_NA-names.bed'
ens_df_na.to_csv(ens_df_na_fileout, sep='\t', index=None, header=None)

In [8]:
# remove rows with 'name' = NA
ens_df = ens_df.dropna()

### Save the parsed Ensembl bed file

In [11]:
# save the bed file
ens_df_fileout = today + '_Ensembl-GRCh37_GFF3.bed'
ens_df.to_csv(os.path.join('output', ens_df_fileout), sep='\t', index=False, header=False)

# Results:

1. Ensembl GRCh37 latest genomic GFF3 archive was decompressed
2. Decompressed GFF3 was parsed to a bed file containing only exonic regions
3. Rows containing NAs were removed for compatibility with pybedtools 
4. BED-formatted file saved to ***./output/2019-11-08_Ensembl-GRCh37_GFF3.bed***