In [1]:
import sys

import click
import progressbar
import pyfaidx
from gtfparse import read_gtf

In [2]:
gtffile = "/Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.94.gtf.gz"
fastafile = "/Users/milessmith/workspace/ensembl/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
feature_type = "gene"
outfile = "/Users/milessmith/workspace/merrycrispr/merrycrispr/data/test.fasta"
gene_name = None
boundary = None

In [3]:
records = read_gtf(gtffile)
if gene_name:
    records = records[records['gene_name'].isin(gene_name)]
if feature_type:
    records = records[records['feature'] == feature_type]
    records = records[['seqname', 'feature', 'start', 'end', 'strand', 'frame', 'gene_name', f'{feature_type}_id']].drop_duplicates()
else:
    records = records[['seqname', 'feature', 'start', 'end', 'strand', 'frame', 'gene_name']].drop_duplicates()

In [None]:
print(f"{len(records)} total records found.")

print(f"Loading the sequences in {fastafile}.  "
      f"Note: if this is the first time opening this file, "
      "it may take a few moments as an index is built.")
sequences = pyfaidx.Fasta(fastafile)
print(f"Finished loading {fastafile}")

In [None]:
seq_count = 1
seq_widgets = ['Matching features to sequences: ', progressbar.Counter(),
               f'/{records.index}', progressbar.Percentage(),
               ' ', progressbar.Bar(), progressbar.Timer(), ' ', progressbar.ETA()]
seq_progress = progressbar.ProgressBar(widgets=seq_widgets,
                                       maxval=len(records.index)).start()

In [5]:
final_list = []
    for rec in records.itertuples():
        seq_progress.update(seq_count)
        seq_count += 1

        if rec.start != rec.end: # you'd be surprised
            if rec.strand == "+":
                # for a normal, say, exonic sequence
                if boundary == 0:
                    try:
                        seq = pyfaidx.Sequence(name=f"{rec.gene_name}_{getattr(rec, f'{feature_type}_id')}_{rec.strand}_{rec.start}_{rec.end}",
                                               seq=sequences[rec.seqname][rec.start:rec.end].seq)
                        final_list.append(seq)
                    except ValueError:
                        print(f"problem with {rec.gene_name} {rec.start} {rec.end} {rec.seqname} {rec.strand}")
                # for excising sequences
                else:
                    try:
                        upstream = pyfaidx.Sequence(name=f"{rec.gene_name} {getattr(rec, f'{feature_type}_id')} upstream",
                                                    seq=sequences[rec.seqname][(rec.start - boundary):rec.start].seq)
                        final_list.append(upstream)
                        downstream = pyfaidx.Sequence(name=f"{rec.gene_name} {getattr(rec,f'{feature_type}_id')} downstream",
                                                      seq=sequences[rec.seqname][(rec.end):rec.end + boundary].seq)
                        final_list.append(downstream)
                    except ValueError:
                        print(f"problem with {rec.gene_name} {rec.start} {rec.end} {rec.seqname} {rec.strand}")
            if rec.strand == "-":
                # for a normal, say, exonic sequence
                if boundary == 0:
                    try:
                        seq = pyfaidx.Sequence(name=f"{rec.gene_name}_{getattr(rec, f'{feature_type}_id')}_{rec.strand}_{rec.start}_{rec.end}",
                                               seq=sequences[rec.seqname][rec.start:rec.end].reverse.complement.seq)
                        final_list.append(seq)
                    except ValueError:
                        print(f"problem with {rec.gene_name} {rec.start} {rec.end} {rec.seqname} {rec.strand}")
                # for excising sequences
                else:
                    try:
                        downstream = pyfaidx.Sequence(name=f"{rec.gene_name} {getattr(rec, f'{feature_type}_id')} downstream",
                                                      seq=sequences[rec.seqname][(rec.start - boundary):rec.start].reverse.complement.seq)
                        final_list.append(downstream)
                        upstream = pyfaidx.Sequence(name=f"{rec.gene_name} {getattr(rec, f'{feature_type}_id')} upstream",
                                                    seq=sequences[rec.seqname][(rec.end):rec.end + boundary].reverse.complement.seq)
                        final_list.append(upstream)
                    except ValueError:
                        print(f"problem with {rec.gene_name} {rec.start} {rec.end} {rec.seqname} {rec.strand}")

    seq_progress.finish()

IndentationError: unexpected indent (<ipython-input-5-0aff0f983b9a>, line 2)

In [None]:
with open(outfile, 'w') as o_file:
    for entry in final_list:
        o_file.writelines(f"> {entry.fancy_name}\n{entry.seq}\n")