# Load Modules

In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from tqdm import tqdm
import datetime
import os

pd.options.mode.chained_assignment = None

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Data

In [None]:
def seqReader(fn):
    """
    Iterate through the records of a sequence file and yield them one by one.

    Each record is yielded as a list of consecutive lines with the trailing
    newline removed: 4 lines per record when the file name contains 'fastq'
    or 'fq', otherwise 2 lines (header + sequence on a single line).

    A file name containing 'gz' is opened as gzip-compressed text; any other
    file is opened as plain text. Both checks are substring matches on the
    whole path, not extension checks.

    A trailing incomplete record (fewer lines than expected) is discarded.
    """
    # Imported here so the function works even though the notebook's import
    # cell does not import gzip (the original raised NameError on .gz input).
    import gzip

    def openSeq(fn):
        if 'gz' in fn:
            return gzip.open(fn, 'rt')
        return open(fn, 'r')

    def num_iter(fn):
        if 'fastq' in fn or 'fq' in fn:
            return 4
        return 2

    n = num_iter(fn)

    with openSeq(fn) as f:
        while True:
            # Keep the try body limited to the reads: running out of lines
            # (even part-way through a record) ends the iteration.
            try:
                record = [next(f).strip('\n') for _ in range(n)]
            except StopIteration:
                break
            yield record

def strip_primers(seq):
    """
    Return `seq` with the primer-length flanks removed from both ends.

    The trim is purely positional: as many characters as the left primer is
    long are dropped from the start, and as many as the right primer is long
    are dropped from the end. The flanks are not compared to the primers.
    """
    left_primer = "GTGGTTGGTGCTGTAGGAGCA"
    right_primer = "TGATAAGCATATGCCATGGCCTC"

    insert = slice(len(left_primer), -len(right_primer))
    return seq[insert]

def translate(seq):
    """
    Translate a nucleotide string into a one-letter amino-acid string.

    Leading/trailing newlines are stripped, then the sequence is read in
    frame from its first base, three bases at a time. Stop codons are
    written as '_'. A trailing remainder of one or two bases is ignored.

    Raises KeyError for any codon not in the table (e.g. lowercase bases
    or ambiguity codes).
    """
    codon_table = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
    }

    nucleotides = seq.strip('\n')

    # Stop at the last full codon so a dangling 1-2 base tail is skipped.
    last_full = len(nucleotides) - len(nucleotides) % 3

    residues = []
    for start in range(0, last_full, 3):
        residues.append(codon_table[nucleotides[start:start + 3]])

    return ''.join(residues)

def process_reference(fasta_fn):
    """
    Yield one processed entry per record of a reference FASTA file.

    Records are read with `seqReader` (which expects two-line records for
    FASTA input) and wrapped in a tqdm progress counter. For each record
    the generator yields a tuple:

        (idx, header, trunc_seq, aa_seq)

    idx       -- 0-based position of the record in the file
    header    -- header line without its leading '>'
    trunc_seq -- sequence with the primer-length flanks removed
    aa_seq    -- translation of `trunc_seq`
    """
    for idx, (header, seq) in tqdm(enumerate(seqReader(fasta_fn))):
        # lstrip, not strip: only the leading FASTA marker should go; a '>'
        # at the end of a header is part of the name and must be preserved.
        header = header.lstrip(">")

        # Reuse the shared helper rather than repeating the primer constants.
        trunc_seq = strip_primers(seq)
        aa_seq = translate(trunc_seq)

        yield (idx, header, trunc_seq, aa_seq)
        

# Define Input Fasta
fa_fn = "../data/reference_sequences/plasmodium_peanut_T7_display_seqs.fasta"

# Output files: index -> header table, truncated nucleotide FASTA,
# amino-acid FASTA, and a small log describing this run.
idx_header_map = "../data/meta/peptide_meta/idx_header_map.tab"
idx_truncated_sequence = "../data/meta/peptide_meta/target_truncated_seq.fa"
idx_amino_acid = "../data/meta/peptide_meta/target_aa.fa"
log_fn = "../data/meta/peptide_meta/log.txt"

# Context managers make sure every handle is flushed and closed even if
# reading or translating the reference fails part-way through.
with open(idx_header_map, "w+") as ih, \
        open(idx_truncated_sequence, "w+") as it, \
        open(idx_amino_acid, "w+") as ia, \
        open(log_fn, "w+") as log:

    # Each reference record gets the identifier "t<idx>", used consistently
    # across the three output files.
    for idx, header, trunc_seq, aa_seq in process_reference(fa_fn):
        ih.write(
            "t{}\t{}\n".format(idx, header)
        )
        it.write(
            ">t{}\n{}\n".format(idx, trunc_seq)
        )
        ia.write(
            ">t{}\n{}\n".format(idx, aa_seq)
        )

    # The log is written once, after all records have been processed.
    log.write(
        "# Meta Generation\n\nGenerated : {}\nInput Fasta : {}\n".format(
            datetime.datetime.now(),
            fa_fn
        )
    )


101248it [00:03, 27853.19it/s]