In [12]:
import click
import progressbar
import pyfaidx
import pandas as pd

from gtfparse import read_gtf
from itertools import chain
from typing import Tuple, List
from copy import deepcopy

In [7]:
gtffile = "/Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.94.gtf.gz"
fastafile = "/Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
feature_type = None
outfile = "/Users/milessmith/workspace/merrycrispr/merrycrispr/excision_test.fa"
gene_name = "ZCCHC14"
boundary = 100

In [56]:
def match_seq(rec: pd.Series, 
              sequences: pyfaidx.Fasta) -> pyfaidx.Sequence:
    try:
        if rec['strand'] == "-":
            rev = True
        else:
            rev = False
        seq = pyfaidx.Sequence(name=f"{rec['gene_name']}_{rec['feature']}_"
                                    f"{rec['strand']}_{rec['start']}_{rec['end']}",
                               seq=sequences.get_seq(name=rec['seqname'],
                                                     start=rec['start'],
                                                     end=rec['end'],
                                                     rc=rev).seq)
    except ValueError:
        print(f"problem with {rec['gene_name']} {rec['start']} "
              f"{rec['end']} {rec['seqname']} {rec['strand']}")
    return seq

In [52]:
def split_record(rec: pd.DataFrame, 
                 padding: int,
                 rev: bool=False) -> pd.DataFrame:
    rec1 = deepcopy(rec)
    rec2 = deepcopy(rec)
    if rev == False:
        rec1["end"] = rec1["start"]
        rec1["start"] -= padding
        rec1["gene_name"] += "_upstream"
        rec2["start"] = rec2["end"]
        rec2["end"] += padding
        rec2["gene_name"] += "_downstream"
    elif rev == True:
        rec1["end"] = rec1["start"]
        rec1["start"] -= padding
        rec1["gene_name"] += "_downstream"
        rec2["start"] = rec2["end"]
        rec2["end"] += padding
        rec2["gene_name"] += "_upstream"
    new_rec = pd.concat([rec1, rec2])
    return new_rec

In [3]:
print("Parsing GTF/GFF file.")
records = read_gtf(filepath_or_buffer=gtffile, 
                   chunksize=8192 * 1024)
if gene_name is not None:
    records = records[records['gene_name'].isin(gene_name)]
if feature_type is not None:
    records = records[records['feature'] == feature_type]
    records = records[['seqname', 'feature', 'start', 'end', 'strand', 'frame', 'gene_name', f'{feature_type}_id']].drop_duplicates()
else:
    records = records[['seqname', 'feature', 'start', 'end', 'strand', 'frame', 'gene_name']].drop_duplicates()

# if "exon_number" in kwargs.keys():
#     records = records[records['exon_number'] == kwargs['exon_number']]


Parsing GTF/GFF file.


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_version', 'gene_name', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'ccds_id']


In [4]:
print(f"{len(records)} total records found.")

print(f"Loading the sequences in {fastafile}."
      f"Note: if this is the first time opening this file, "
      "it may take a few moments as an index is built.")
sequences = pyfaidx.Fasta(fastafile)
print(f"Finished loading {fastafile}")

49 total records found.
Loading the sequences in /Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.dna.primary_assembly.fa.Note: if this is the first time opening this file, it may take a few moments as an index is built.
Finished loading /Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.dna.primary_assembly.fa


In [5]:
seq_count = 1
seq_widgets = ['Matching features to sequences: ',
               progressbar.Counter(),
               f'/{seq_count}',
               progressbar.Percentage(),
               ' ',
               progressbar.Bar(),
               progressbar.Timer(),
               ' ',
               progressbar.ETA()]
seq_progress = progressbar.ProgressBar(widgets=seq_widgets,
                                       maxval=len(records.index)).start()

Matching features to sequences: 0/1  0% | |Elapsed Time: 0:00:00 ETA:  --:--:--

In [45]:
records.head()

Unnamed: 0,seqname,feature,start,end,strand,frame,gene_name
2181459,16,gene,87406246,87492045,-,0,ZCCHC14
2181460,16,transcript,87406246,87492045,-,0,ZCCHC14
2181461,16,exon,87491669,87492045,-,0,ZCCHC14
2181462,16,CDS,87491669,87491827,-,0,ZCCHC14
2181463,16,start_codon,87491825,87491827,-,0,ZCCHC14


In [47]:
records['gene_name'].head()

2181459    ZCCHC14
2181460    ZCCHC14
2181461    ZCCHC14
2181462    ZCCHC14
2181463    ZCCHC14
Name: gene_name, dtype: object

In [48]:
records['gene_name'].head() + "_upstream"

2181459    ZCCHC14_upstream
2181460    ZCCHC14_upstream
2181461    ZCCHC14_upstream
2181462    ZCCHC14_upstream
2181463    ZCCHC14_upstream
Name: gene_name, dtype: object

In [49]:
records['start'].head()

2181459    87406246
2181460    87406246
2181461    87491669
2181462    87491669
2181463    87491825
Name: start, dtype: int64

In [50]:
records['start'].head() + 100

2181459    87406346
2181460    87406346
2181461    87491769
2181462    87491769
2181463    87491925
Name: start, dtype: int64

In [65]:
beta = records['start']

In [66]:
type(beta)

pandas.core.series.Series

In [57]:
if boundary > 0:
    split_features = split_record(records, boundary)

In [58]:
split_features.shape

(98, 7)

In [59]:
records.shape

(49, 7)

In [60]:
final_list = [match_seq(_, sequences) for _ in records.itertuples()]

In [61]:
final_list

[>ZCCHC14_gene_-_87406246_87492045
 AGGCCAACAACCCGGCCGACCTGGGCAGCCTCACCAACCTGACGGACGAGGTGGTGCGCAGCAAGCTGCTGGTGTCGCTGGCGCTGCTGGGCTCGGAGCAGCGCGAGGCGGCGGGCGTGCTCTACCGCACGCTCACGCACATCGACTCCATCATCCACAACTACGGGCTGCAGCTTAACGAGGGCCGCACGGGCGATGAGTTCCTGCTGCTGTTCACCATGGCCTCCAACCACCCGGCCTTCAGCTTCCACCAGAAGCAGGTGCTGCGCCAGGAGCTCACGCAGATCCAGAGCAGCCTGAACGGCGGCGGGGGCCACGGCGGCAAGGGCGCGCCCGGGCCGGGCGGCGCGCTGCCCACTTGCCCAGCCTGCCACAAGGTGCGTGCCCGCCCCGAGCTCTGCCCTGTACCCCAAGTCTGCATCCCCAACTCTGCACTCCAAGCCTCCAGCCTGCACCGCGACCCCCCGACCCACGCCTCCAGCCTGCACTGAGAGCTCCCACCCAGGTCTCCAACCTGCACCACGAGCCCCCAGCCCGTATCCCAAGCCCCATCCTAAGCCTCTATGTCGCACCCCAAATGTGTGCCCCACCTCTCCAAGCCTCCACCGTACATCCCAAGCCTCCGCCGTACATCCCAAGCCTCCGCCGTACATCCCAAGCCCCCATCCTAAGCCCGCACTCCTCACCCTAAGTCTGCACTTTAAGCCCCAAGACCGCACCGCGGCCCTGAGCCCACACCCTCAGTCCCCTCTGCGTGCCCCAAAGCCTCCACACCTCCGCCTCGCGCCCCTTGGGCAGGAGCGGCTGCAGGGGCCCTGGGTCCGAGGATCCGCGGGAGCGGTGCGGGAGACGTCCGCGGTCAGAGCTCACAGTCCCAGGTGCCCTCCTTTCACTTAGCCGGCTGCAAACGCGATAAGGCCTTTGTCCCCTTAATGGGGCCCTTGGGTGACAGATAACACACATT

In [62]:
len(final_list)

49

In [63]:
split_list = [match_seq(_, sequences) for _ in split_features.itertuples()]

In [64]:
split_list

[>ZCCHC14_upstream_gene_-_87406146_87406246
 GTTTTATTTCACTAGGTCTTTGTCTCAGAATATGAAGCACACACCTTTTCACATCAGTCTGAATCTCCAAACTAGGGGTTCCATCCTTTCTAGCCTCTCAC,
 >ZCCHC14_upstream_transcript_-_87406146_87406246
 GTTTTATTTCACTAGGTCTTTGTCTCAGAATATGAAGCACACACCTTTTCACATCAGTCTGAATCTCCAAACTAGGGGTTCCATCCTTTCTAGCCTCTCAC,
 >ZCCHC14_upstream_exon_-_87491569_87491669
 GGTGCGTGCCCGCCCCGAGCTCTGCCCTGTACCCCAAGTCTGCATCCCCAACTCTGCACTCCAAGCCTCCAGCCTGCACCGCGACCCCCCGACCCACGCCT,
 >ZCCHC14_upstream_CDS_-_87491569_87491669
 GGTGCGTGCCCGCCCCGAGCTCTGCCCTGTACCCCAAGTCTGCATCCCCAACTCTGCACTCCAAGCCTCCAGCCTGCACCGCGACCCCCCGACCCACGCCT,
 >ZCCHC14_upstream_start_codon_-_87491725_87491825
 GGCCTCCAACCACCCGGCCTTCAGCTTCCACCAGAAGCAGGTGCTGCGCCAGGAGCTCACGCAGATCCAGAGCAGCCTGAACGGCGGCGGGGGCCACGGCG,
 >ZCCHC14_upstream_exon_-_87459908_87460008
 GGTGAGTGCACGCACGGGTTTCAGGAGGACGTCTGACGGACGGAGGATGGGCACACCCCAGATCCCCGTGAGGGCATTGTTAAATGCTTTCAGATCTTTCA,
 >ZCCHC14_upstream_CDS_-_87459908_87460008
 GGTGAGTGCACGCACGGGTTTCAGGAGGACGTCTGACGGACGGAGGATGGGCACAC

In [None]:
with open(outfile, 'w') as o_file:
    for entry in final_list:
        o_file.writelines(f"> {entry.fancy_name}\n{entry.seq}\n")