In [1]:
import numpy as np
import pandas as pd

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Base-complement lookup for str.translate (A<->T, C<->G); characters not listed are left unchanged.
transtable = str.maketrans("ACGT", "TGCA")

# Doc string

In [3]:
"""
Construct a database of all possible reads from the different combinations of oligo A, oligo B, primers, and samples
"""

'\nConstruct a database of all possible reads from the different combinations of oligo A, oligo B, primers, and samples\n'

# Args
TSO, PRIMER_R4 and B_OLIGO are in the same sense as the IonTorrent sequencing data.


ANNEAL, A_OLIGO and PRIMER_P1 are in the negative sense (reverse complement) relative to the sequencing data.

In [None]:
# Experiment name; it is inserted into the ../data/<EXP>/blast/... output paths defined further down.
# NOTE(review): the input paths and BARCODE_TEMPLATES are hard-coded to "exp3_MHC" and do not use EXP — confirm this is intended.
EXP = "exp2_MHC_ILLUMINA"

In [4]:
# Fixed sequence elements used to assemble the templates.
# ANNEAL and the P1 primers are stored reverse-complemented: the literal is
# reversed and then every base is complemented via `transtable`.
N6 = "N" * 6
TSO = "TTTCTTATATGGG"
PRIMER_R4 = "CGAGTACCATGGGCGTAAC"
ANNEAL = "GTGTGACCTTCCCCAAAAGGCGTAG"[::-1].translate(transtable)
PRIMER_P1_MHC = "GAAGTTCCAGCCAGCGTC"[::-1].translate(transtable)  # Used only for MHC barcodes
PRIMER_P1_CD8 = "GTAAAAGATCCCAGGTTTCATC"[::-1].translate(transtable)  # Used only for CD8 barcodes
# Oligo A records whose id ends with this suffix are paired with PRIMER_P1_CD8.
CD8_BARCODE = "A4000"

In [5]:
# Sanity check: show the reverse-complemented anneal and P1 primer sequences, one per line.
print(ANNEAL, PRIMER_P1_CD8, PRIMER_P1_MHC, sep="\n")

CTACGCCTTTTGGGGAAGGTCACAC
GATGAAACCTGGGATCTTTTAC
GACGCTGGCTGGAACTTC


In [6]:
# CD8 oligo A, displayed as its reverse complement (reverse first, then complement each base)
"CTCTCGGCCTTAGCGCCATTTTTTTGGAAACCTCT"[::-1].translate(transtable)

'AGAGGTTTCCAAAAAAATGGCGCTAAGGCCGAGAG'

# Load data

In [7]:
# Input files: the three barcode-library FASTA files share one directory;
# the overrepresented-sequences report lives under the fastqc directory.
_BARCODE_LIBRARY_DIR = "../data/exp3_MHC/barcode_library/"
OLIGO_A_SEQS = _BARCODE_LIBRARY_DIR + "oligo_a.fa"
OLIGO_B_SEQS = _BARCODE_LIBRARY_DIR + "oligo_b.fa"
SAMPLE_SEQS = _BARCODE_LIBRARY_DIR + "sample.fa"
OVERREP_SEQS = (
    "../data/exp3_MHC/fastqc/"
    "IONTORRENT.R1.gems.no_umi.no_adapters_fastqc/"
    "overrepresented_sequences.txt"
)

## Output files

In [8]:
# FASTA file that receives the constructed templates in the "Construct templates" section (written with SeqIO.write).
BARCODE_TEMPLATES = "../data/exp3_MHC/barcode_library/barcode_templates.fa"

In [9]:
# Output FASTA files, one per template set, all located under ../data/<EXP>/blast/<set>/templates.fa.
_BLAST_TEMPLATE_FASTA = "../data/{}/blast/{}/templates.fa"
EXPECTED_TEMPLATES = _BLAST_TEMPLATE_FASTA.format(EXP, "expected_templates")
REVERSED_TEMPLATES = _BLAST_TEMPLATE_FASTA.format(EXP, "reversed_templates")
REV_TEMPLATES_START = _BLAST_TEMPLATE_FASTA.format(EXP, "rev_templates_start")
OVERREP_SEQ_TEMPLATES = _BLAST_TEMPLATE_FASTA.format(EXP, "overrep_seq_templates")

# Import data

In [None]:
# Load the oligo A barcodes and flip each one to its reverse complement in place.
oligo_a_records = list(SeqIO.parse(OLIGO_A_SEQS, "fasta"))
for record in oligo_a_records:
    record.seq = record.seq.reverse_complement()

In [None]:
# Load the oligo B barcodes exactly as stored in the FASTA file (no reverse complement).
oligo_b_records = list(SeqIO.parse(OLIGO_B_SEQS, "fasta"))

In [None]:
# Load the sample barcodes and flip each one to its reverse complement in place.
sample_records = list(SeqIO.parse(SAMPLE_SEQS, "fasta"))
for record in sample_records:
    record.seq = record.seq.reverse_complement()

# Construct templates

In [None]:
# Build one template per (oligo A, oligo B, sample) combination, laid out as
#   TSO + PRIMER_R4 + N6 + oligo B + ANNEAL + oligo A + N6 + primer P1 + sample
# Loop-invariant pieces are created once instead of once per combination.
read_prefix = Seq(TSO + PRIMER_R4 + N6)
anneal_seq = Seq(ANNEAL)

template_records = list()
for oligo_a_record in oligo_a_records:
    # Oligo A ids ending with CD8_BARCODE use the CD8 primer; all others use the MHC primer.
    if oligo_a_record.id.endswith(CD8_BARCODE):
        primer_p1 = PRIMER_P1_CD8
    else:
        primer_p1 = PRIMER_P1_MHC
    oligo_a_label = oligo_a_record.id.split("-")[-1]
    oligo_a_part = anneal_seq + oligo_a_record.seq + Seq(N6 + primer_p1)

    for oligo_b_record in oligo_b_records:
        oligo_b_label = oligo_b_record.id.split("-")[-1]
        for sample_record in sample_records:
            template_seq = read_prefix + oligo_b_record.seq + oligo_a_part + sample_record.seq
            template_id = oligo_a_label + oligo_b_label + "_" + sample_record.id
            # description="" keeps the FASTA header as ">id"; the SeqRecord default
            # would otherwise be written out as ">id <unknown description>".
            template_records.append(SeqRecord(template_seq, id=template_id, description=""))

# Show the record count and a single example instead of dumping every template.
print(len(template_records), "templates constructed")
if template_records:
    print(template_records[0].format("fasta"))

In [None]:
# Save the current template_records (in notebook order: the expected templates) as FASTA.
# SeqIO.write returns the number of records written, which becomes the cell output.
SeqIO.write(template_records, BARCODE_TEMPLATES, "fasta")

In [None]:
# Write the same template_records a second time, to the EXP-specific expected-templates FASTA file.
SeqIO.write(template_records, EXPECTED_TEMPLATES, "fasta")

# Import erroneous data

In [10]:
# Reload oligo A in file orientation (no reverse complement).
# NOTE: this rebinds oligo_a_records, replacing the reverse-complemented list from the "Import data" section.
oligo_a_records = list(SeqIO.parse(OLIGO_A_SEQS, "fasta"))

In [11]:
# Reload oligo B and flip each barcode to its reverse complement in place.
# NOTE: this rebinds oligo_b_records, replacing the file-orientation list from the "Import data" section.
oligo_b_records = list(SeqIO.parse(OLIGO_B_SEQS, "fasta"))
for record in oligo_b_records:
    record.seq = record.seq.reverse_complement()

In [12]:
# Reload the sample barcodes in file orientation (no reverse complement).
# NOTE: this rebinds sample_records, replacing the reverse-complemented list from the "Import data" section.
sample_records = list(SeqIO.parse(SAMPLE_SEQS, "fasta"))

In [None]:
# acc2seq maps "<accession>_<occurrence number>" to the sequence in the first column;
# overrep_acc counts how many rows have been seen for each accession so far.
acc2seq = {}
overrep_acc = {}

with open(OVERREP_SEQS) as handle:
    for raw_line in handle:
        # Lines starting with ">>" or "#" are not data rows (presumably report markers/header — verify against the file).
        if raw_line.startswith((">>", "#")):
            continue

        columns = raw_line.strip().split("\t")
        # Rows with fewer than four tab-separated columns are skipped.
        if len(columns) < 4:
            continue

        # Accession = last space-separated token of the fourth column,
        # with its first and last character removed.
        accession = columns[3].split(" ")[-1][1:-1]

        overrep_acc[accession] = overrep_acc.get(accession, 0) + 1
        acc2seq["{}_{}".format(accession, overrep_acc[accession])] = columns[0]

acc2seq

# Construct erroneous templates

## Reversed template start

In [None]:
# Templates for reads that start in reversed orientation: for each P1 primer,
# one "TSO + primer" record plus one "TSO + sample + primer" record per sample.
# The primer label is appended to every id: previously the MHC and CD8 records
# shared identical ids, so different sequences were indistinguishable in the
# FASTA file and in anything keyed on the record id.
template_records = list()
for primer_label, primer_p1 in [("MHC", PRIMER_P1_MHC), ("CD8", PRIMER_P1_CD8)]:
    # primer_p1 is stored reverse-complemented; complementing and reversing it
    # again yields the primer in its original orientation.
    template_seq = Seq(TSO + primer_p1.translate(transtable)[::-1])
    template_id = "TSO+PRIMER_A_" + primer_label
    # description="" keeps the FASTA header as ">id" (the SeqRecord default is
    # written out as "<unknown description>").
    template_records.append(SeqRecord(template_seq, id=template_id, description=""))

    for sample_record in sample_records:
        template_seq = Seq(TSO) + sample_record.seq + Seq(primer_p1)
        template_id = "TSO_" + sample_record.id + "_PRIMER_A_" + primer_label
        template_records.append(SeqRecord(template_seq, id=template_id, description=""))

In [None]:
# Save the current template_records (in notebook order: the reversed-start templates) as FASTA.
SeqIO.write(template_records, REV_TEMPLATES_START, "fasta")

## Reversed template (whole)

In [13]:
# Build one fully reversed template per (oligo A, oligo B, sample) combination:
#   TSO + sample + primer P1 + N6 + oligo A + reverse-complemented ANNEAL + oligo B + N6 + PRIMER_R4
# Loop-invariant pieces are created once instead of once per combination.
read_start = Seq(TSO)
anneal_revcomp = Seq(ANNEAL.translate(transtable)[::-1])
read_end = Seq(N6 + PRIMER_R4)

template_records = list()
for oligo_a_record in oligo_a_records:
    # Oligo A ids ending with CD8_BARCODE use the CD8 primer; all others use the MHC primer.
    if oligo_a_record.id.endswith(CD8_BARCODE):
        primer_p1 = PRIMER_P1_CD8
    else:
        primer_p1 = PRIMER_P1_MHC
    oligo_a_label = oligo_a_record.id.split("-")[-1]
    oligo_a_part = Seq(primer_p1 + N6) + oligo_a_record.seq + anneal_revcomp

    for oligo_b_record in oligo_b_records:
        oligo_b_label = oligo_b_record.id.split("-")[-1]
        oligo_b_part = oligo_b_record.seq + read_end
        for sample_record in sample_records:
            template_seq = read_start + sample_record.seq + oligo_a_part + oligo_b_part
            template_id = oligo_a_label + oligo_b_label + "_" + sample_record.id
            # description="" keeps the FASTA header as ">id"; the SeqRecord default
            # would otherwise be written out as ">id <unknown description>".
            template_records.append(SeqRecord(template_seq, id=template_id, description=""))

In [14]:
# Save the reversed templates as FASTA; SeqIO.write returns the number of records
# written, which is the value shown as this cell's output.
SeqIO.write(template_records, REVERSED_TEMPLATES, "fasta")

675

## Overrepresented sequences as templates

In [None]:
# Turn every overrepresented sequence into a template record, using its
# acc2seq key as the record id. description="" keeps the FASTA header as ">id"
# (the SeqRecord default is written out as "<unknown description>").
template_records = [
    SeqRecord(Seq(seq), id=acc, description="")
    for acc, seq in acc2seq.items()
]

In [None]:
# Save the current template_records (in notebook order: the overrepresented-sequence templates) as FASTA.
SeqIO.write(template_records, OVERREP_SEQ_TEMPLATES, "fasta")