In [1]:
import sys

import click
import progressbar
import pyfaidx
from gtfparse import read_gtf

In [5]:
gtffile = "/Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.94.gtf.gz"
fastafile = "/Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
feature_type = "gene"
outfile = "/Users/milessmith/workspace/merrycrispr/merrycrispr/data/test.fasta"
gene_name = ["PML"]
boundary = 0

In [3]:
records = read_gtf(gtffile)
if gene_name:
    records = records[records['gene_name'].isin(gene_name)]
if feature_type:
    records = records[records['feature'] == feature_type]
    records = records[['seqname', 'feature', 'start', 'end', 'strand', 'frame', 'gene_name', f'{feature_type}_id']].drop_duplicates()
else:
    records = records[['seqname', 'feature', 'start', 'end', 'strand', 'frame', 'gene_name']].drop_duplicates()

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_version', 'gene_name', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'ccds_id']


In [7]:
print(f"{len(records)} total records found.")

print(f"Loading the sequences in {fastafile}.  "
      f"Note: if this is the first time opening this file, "
      "it may take a few moments as an index is built.")
sequences = pyfaidx.Fasta(fastafile)
print(f"Finished loading {fastafile}")

1 total records found.
Loading the sequences in /Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz.  Note: if this is the first time opening this file, it may take a few moments as an index is built.
Finished loading /Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz


In [8]:
seq_count = 1
seq_widgets = ['Matching features to sequences: ', 
               progressbar.Counter(),
               f'/{records.index}',
               progressbar.Percentage(),
               ' ',
               progressbar.Bar(),
               progressbar.Timer(),
               ' ',
               progressbar.ETA()]
seq_progress = progressbar.ProgressBar(widgets=seq_widgets,
                                       maxval=len(records.index)).start()

Matching features to sequences: 0/Int64Index([2027769], dtype='int64')  0% ||Elapsed Time: 0:00:00 ETA:  --:--:--

In [9]:
records.gene_name

2027769    PML
Name: gene_name, dtype: object

In [10]:
records.start

2027769    73994673
Name: start, dtype: int64

In [11]:
type(boundary)

int

In [12]:
alpha = [rec for rec in records.itertuples()]

In [13]:
alpha[0]

Pandas(Index=2027769, seqname='15', feature='gene', start=73994673, end=74047812, strand='+', frame='0', gene_name='PML', gene_id='ENSG00000140464')

In [14]:
seq = pyfaidx.Sequence(name=f"{alpha[0].gene_name}_"
                            f"{getattr(alpha[0], f'{feature_type}_id')}_"
                            f"{alpha[0].strand}_{alpha[0].start}_{alpha[0].end}_"
                            f"{misc},
                       seq=sequences[alpha[0].seqname][alpha[0].start:alpha[0].end].seq)

In [15]:
seq

>PML_ENSG00000140464_+_73994673_74047812
TCTCCAGAGGCGGGCCCTGAGCCGGCACCTCCCCTTTCGGACAGCTCAAGGGACTCAGCCAACTGGCTCACGCCTCCCCTTCAGCTTCTCTTCACGCACTCCAAGATCTAAACCGAGAATCGAAACTAAGCTGGGGTCCATGGAGCCTGCACCCGCCCGATCTCCGAGGCCCCAGCAGGACCCCGCCCGGCCCCAGGAGCCCACCATGCCTCCCCCCGAGACCCCCTCTGAAGGCCGCCAGCCCAGCCCCAGCCCCAGCCCTACAGAGGTACTATTGGGTTAGGGGATGATGGGGTTAAGCTTTGTTGGTTTGCTGTGGTGGGGAGAGGCGGGAAGAGAGGGTCTAACGGAGGATTTGGTCAAGTACCCTAGAGAGTGACACAAAGCGGGAAGTCCAGACACCAGGGTCCTGACCGTCTCGGGTGGGGCAGGGAAGGGAGGGTAGGATAGAGTAGAAAAGAGGACACGGAGGAGTTGGGGCGGCCTCGCTGGGCTGCGGTTTCTCCACTGAGCAGTTGGGCAAGGTGAGAAGGGTCAGTGGCCTCCGGGCCTGGGCCCTTCCGCCCACCCTCGAGCCCTGCCTCAACTTTGCCTCAGATGCAGGACTTCAGATTAGGGAGGATGGAGGTAGTACCCCTGTTGCGCTGGCCTGGAGCCAGGGGCATGTCCCAGGCACGGCAAAACTAAAACCAACTTCCCAGATCCGAGGTGAGAAACTGGCTCAGACTGAAGAGGTATCTTTGCCAAGGCCTCCCAGCTCATGTGGTTTCTGCTTAAGGAAGCCTCCCCAACGAACCCTTCTCTTGCCACACCTTTCCTGCCCACCTCCCACCTCCCCCGACAAAGGAACTACTTGGGTTTCTTGCTCTGCTGCCTTTCAGGCCCTTTTACTCCCCTTCATGAAAGTACAGAGGACACCGTATTACAGTAACTTTTATAAACATTATTACAACTAAGAA

In [16]:
seq.name

'PML_ENSG00000140464_+_73994673_74047812'

In [17]:
from copy import deepcopy

In [18]:
seq2 = deepcopy(seq)

In [19]:
seq2.name

'PML_ENSG00000140464_+_73994673_74047812'

In [20]:
seq2.name = seq2.name + "_upstream"

In [21]:
seq2.name

'PML_ENSG00000140464_+_73994673_74047812_upstream'

In [23]:
final_list = []
for rec in records.itertuples():
    seq_progress.update(seq_count)
    seq_count += 1

    if rec.start != rec.end: # you'd be surprised
        if rec.strand == "+":
            # for a normal, say, exonic sequence
            if boundary == 0:
                try:
                    seq = pyfaidx.Sequence(name=f"{rec.gene_name}_{getattr(rec, f'{feature_type}_id')}_"
                                           f"{rec.strand}_{rec.start}_{rec.end}",
                                           seq=sequences[rec.seqname][rec.start:rec.end].seq)
                    final_list.append(seq)
                except ValueError:
                    print(f"problem with {rec.gene_name} {rec.start} "
                          f"{rec.end} {rec.seqname} {rec.strand}")
            # for excising sequences
            else:
                try:
                    upstream = pyfaidx.Sequence(name=f"{rec.gene_name} "
                                                f"{getattr(rec, f'{feature_type}_id')} upstream",
                                                seq=sequences[rec.seqname][(rec.start - boundary):rec.start].seq)
                    final_list.append(upstream)
                    downstream = pyfaidx.Sequence(name=f"{rec.gene_name} "
                                                  f"{getattr(rec,f'{feature_type}_id')} downstream",
                                                  seq=sequences[rec.seqname][(rec.end):rec.end + boundary].seq)
                    final_list.append(downstream)
                except ValueError:
                    print(f"problem with {rec.gene_name} {rec.start} {rec.end} {rec.seqname} {rec.strand}")
        if rec.strand == "-":
            # for a normal, say, exonic sequence
            if boundary == 0:
                try:
                    seq = pyfaidx.Sequence(name=f"{rec.gene_name}_"
                                           f"{getattr(rec, f'{feature_type}_id')}_{rec.strand}_"
                                           f"{rec.start}_{rec.end}",
                                           seq=sequences[rec.seqname][rec.start:rec.end].reverse.complement.seq)
                    final_list.append(seq)
                except ValueError:
                    print(f"problem with {rec.gene_name} {rec.start} "
                          f"{rec.end} {rec.seqname} {rec.strand}")
            # for excising sequences
            else:
                try:
                    downstream = pyfaidx.Sequence(name=f"{rec.gene_name} "
                                                  f"{getattr(rec, f'{feature_type}_id')} downstream",
                                                  seq=sequences[rec.seqname][(rec.start - boundary):rec.start].reverse.complement.seq)
                    final_list.append(downstream)
                    upstream = pyfaidx.Sequence(name=f"{rec.gene_name} "
                                                f"{getattr(rec, f'{feature_type}_id')} upstream",
                                                seq=sequences[rec.seqname][(rec.end):rec.end + boundary].reverse.complement.seq)
                    final_list.append(upstream)
                except ValueError:
                    print(f"problem with {rec.gene_name} {rec.start} "
                          f"{rec.end} {rec.seqname} {rec.strand}")

    seq_progress.finish()

Matching features to sequences: 1/Int64Index([2027769], dtype='int64')100% ||Elapsed Time: 0:00:18 ETA:  0:00:00Matching features to sequences: 1/Int64Index([2027769], dtype='int64')100% ||Elapsed Time: 0:00:18 Time: 0:00:18


In [24]:
with open(outfile, 'w') as o_file:
    for entry in final_list:
        o_file.writelines(f"> {entry.fancy_name}\n{entry.seq}\n")