In [1]:
import numpy as np
import pandas as pd

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

ModuleNotFoundError: No module named 'Bio'

In [124]:
# Translation table that maps each DNA base to its complement (A<->T, C<->G).
# Characters without an entry (e.g. N) are left unchanged by str.translate.
transtable = str.maketrans("ACGT", "TGCA")

# Doc string

In [125]:
"""
Construct a database of all possible reads from the different combinations of oligo A, oligo B, primers, and samples
"""

'\nConstruct a database of all possible reads from the different combinations of oligo A, oligo B, primers, and samples\n'

# Args
TSO, PRIMER_R4, and B_OLIGO are in the same sense as the Ion Torrent sequencing data.


ANNEAL, A_OLIGO, and PRIMER_P1 are in the negative (antisense) orientation relative to the sequencing data.

In [169]:
# Experiment identifier; it is interpolated into the data paths built further down.
EXP = "exp5_MHC_ILLUMINA"

## Amalie's design

In [170]:
# Sequence building blocks of the barcode construct in this design.
N6 = "N" * 6  # run of six N placeholders
TSO = "TTTCTTATATGGG"
PRIMER_R4 = "CGAGTACCATGGGCGTAAC"
# The next three are stored as the reverse complement of the literal shown
# (reversing first and complementing second gives the same string as the other order).
ANNEAL = "GTGTGACCTTCCCCAAAAGGCGTAG"[::-1].translate(transtable)
PRIMER_P1_MHC = "GAAGTTCCAGCCAGCGTC"[::-1].translate(transtable)  # Used only for MHC barcodes
PRIMER_P1_CD8 = "GTAAAAGATCCCAGGTTTCATC"[::-1].translate(transtable)  # Used only for CD8 barcodes
CD8_BARCODE = "A4000"  # oligo A id that selects the CD8 primer when templates are built

## 10x design # CITEseq
https://citeseq.files.wordpress.com/2018/11/eccite_schemes.pdf
Reads are as illustrated by the top sequences in the scheme linked above.

In [171]:
# Sequence building blocks of the 10x (CITE-seq style) construct.
TSO = "TTTCTTATATGGG"
FREE_END_MHC = "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC"  # ADT cDNA additive primer + extra anneal seq
# HTO cDNA additive primer, stored as its reverse complement
FREE_END_CD8 = "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT"[::-1].translate(transtable)
P7 = "ATCTCGTATGCCGTCTTCTGCTTG"
# i7 index sequences, stored as reverse complements of the literals shown
i7_MHC = "CGTGAT"[::-1].translate(transtable)
i7_CD8 = "CGAGTAAT"[::-1].translate(transtable)
CDB_BARCODE_ID_10x = "HASH"  # id prefix that selects the CD8 free end / i7 when templates are built

In [172]:
# i7_MHC = "CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA".translate(transtable)[::-1] # Almost full seq of the end of final read given by Amalie 
# i7_CD8 = "CAAGCAGAAGACGGCATACGAGATCGAGTAATGTGACTGGAGTTCAGACGTGTGC".translate(transtable)[::-1] # Almost full seq of the end of final read given by Amalie 

In [173]:
# Barcodes: print the reverse complement of each 12-mer, one per line
print(
    *(
        barcode[::-1].translate(transtable)
        for barcode in ("GTTGAGGCGAGG", "TTGTCTCGGATC", "ATGTCGCGTAAC")
    ),
    sep="\n",
)

CCTCGCCTCAAC
GATCCGAGACAA
GTTACGCGACAT


# Load data

In [174]:
# File listing the oligo IDs used in this experiment (loaded into `oligos` below).
# Resolved relative to the notebook, like every other data path here, rather than
# through a machine-specific absolute mount point.
# NOTE(review): assumes "../data" is the project data directory that the former
# absolute path pointed into -- confirm before relying on it.
BARCODE_SETUP = "../data/" + EXP + "/barcode_library/oligo.lst"

In [175]:
# Input FASTA files of the barcode library for experiment EXP.
OLIGO_10x = f"../data/{EXP}/barcode_library/oligo_10x.fa"
OLIGO_A_SEQS = f"../data/{EXP}/barcode_library/oligo_a.fa"
OLIGO_B_SEQS = f"../data/{EXP}/barcode_library/oligo_b.fa"
SAMPLE_SEQS = f"../data/{EXP}/barcode_library/sample.fa"
#OVERREP_SEQS = "../data/" + EXP + "/fastqc/IONTORRENT.R1.gems.no_umi.no_adapters_fastqc/overrepresented_sequences.txt"

## Output files

In [176]:
# Destination FASTA file for the expected barcode templates built in this notebook.
BARCODE_TEMPLATES = f"../data/{EXP}/barcode_library/barcode_templates.fa"

In [177]:
#EXPECTED_TEMPLATES = "../data/" + EXP + "/blast/expected_templates/templates.fa"
#REVERSED_TEMPLATES = "../data/" + EXP + "/blast/reversed_templates/templates.fa"
#REV_TEMPLATES_START = "../data/" + EXP + "/blast/rev_templates_start/templates.fa"
#OVERREP_SEQ_TEMPLATES = "../data/" + EXP + "/blast/overrep_seq_templates/templates.fa"

# Import data

In [178]:
# IDs of the oligos used in this experiment, as a set for membership tests in the
# filtering loops below.
# ndmin=1 keeps the loaded array one-dimensional even when the list file holds a
# single ID; without it np.loadtxt returns a 0-d array, which set() cannot iterate.
oligos = set(np.loadtxt(BARCODE_SETUP, dtype='str', ndmin=1))

In [179]:
oligos

{'A1064',
 'A1065',
 'A1066',
 'A4000',
 'ADT_5000',
 'ADT_5001',
 'ADT_5002',
 'B288',
 'B290',
 'HASH_5003',
 'HASH_5004'}

In [180]:
# Oligo A records whose ID suffix (the text after the last '-') is listed in `oligos`;
# each kept sequence is replaced by its reverse complement.
oligo_a_records = []
for record in SeqIO.parse(OLIGO_A_SEQS, "fasta"):
    if record.id.rsplit('-', 1)[-1] not in oligos:
        continue
    record.seq = record.seq.reverse_complement()
    oligo_a_records.append(record)

In [181]:
# Oligo B records whose ID suffix (the text after the last '-') is listed in `oligos`.
# Sequences are kept as read from the FASTA file (no reverse complement).
oligo_b_records = [
    record
    for record in SeqIO.parse(OLIGO_B_SEQS, "fasta")
    if record.id.rsplit('-', 1)[-1] in oligos
]

In [182]:
# Sample records whose ID suffix (the text after the last '-') is listed in `oligos`.
sample_records = [
    record
    for record in SeqIO.parse(SAMPLE_SEQS, "fasta")
    if record.id.rsplit('-', 1)[-1] in oligos
]
# Store every kept sequence as its reverse complement.
for record in sample_records:
    record.seq = record.seq.reverse_complement()

In [183]:
# 10x-design oligo records whose ID suffix (the text after the last '-') is listed in
# `oligos`; each kept sequence is replaced by its reverse complement.
oligo_10x_records = []
for record in SeqIO.parse(OLIGO_10x, "fasta"):
    suffix = record.id.rsplit('-', 1)[-1]
    if suffix in oligos:
        record.seq = record.seq.reverse_complement()
        oligo_10x_records.append(record)

# Construct templates
We exclude sample IDs from the templates because they pollute the annotations.

In [186]:
# Accumulator for every expected template; the two construction loops below append to it.
template_records = []

In [187]:
# Build one expected-read template per allowed (oligo A, oligo B) pair:
#   TSO + PRIMER_R4 + N6 + oligo B + ANNEAL + oligo A + N6 + primer P1
# Sample sequences are deliberately left out of the templates and their ids.
for oligo_a_record in oligo_a_records:
    a_id = oligo_a_record.id.split("-")[-1]
    # Oligo A ids ending in CD8_BARCODE get the CD8 P1 primer; all others get the MHC one.
    if oligo_a_record.id.endswith(CD8_BARCODE):
        primer_p1 = PRIMER_P1_CD8
    else:
        primer_p1 = PRIMER_P1_MHC
    for oligo_b_record in oligo_b_records:
        b_id = oligo_b_record.id.split("-")[-1]
        # B290 is only paired with the CD8 barcode; skip every other combination.
        # Uses CD8_BARCODE rather than a second hard-coded copy of its value.
        if b_id == 'B290' and a_id != CD8_BARCODE:
            continue
        template_seq = Seq(TSO + PRIMER_R4 + N6) + oligo_b_record.seq + Seq(ANNEAL) + oligo_a_record.seq + Seq(N6 + primer_p1)
        template_id = a_id + b_id
        # Create the record once and use it both for storage and for the preview print.
        template_record = SeqRecord(template_seq, id=template_id)
        template_records.append(template_record)
        print(template_record.format("fasta"))

>A1064B288 <unknown description>
TTTCTTATATGGGCGAGTACCATGGGCGTAACNNNNNNCTTGGCAATCCATGCTCCCATT
TGGCTACGCCTTTTGGGGAAGGTCACACTTGTCATAAGGAGATAGCTACTACGNNNNNNG
ACGCTGGCTGGAACTTC

>A1065B288 <unknown description>
TTTCTTATATGGGCGAGTACCATGGGCGTAACNNNNNNCTTGGCAATCCATGCTCCCATT
TGGCTACGCCTTTTGGGGAAGGTCACACCGGTAGTTACTTGCACTTTGCGGTCNNNNNNG
ACGCTGGCTGGAACTTC

>A1066B288 <unknown description>
TTTCTTATATGGGCGAGTACCATGGGCGTAACNNNNNNCTTGGCAATCCATGCTCCCATT
TGGCTACGCCTTTTGGGGAAGGTCACACGCTTGCAGGCAGATAATAACAAGCGNNNNNNG
ACGCTGGCTGGAACTTC

>A4000B288 <unknown description>
TTTCTTATATGGGCGAGTACCATGGGCGTAACNNNNNNCTTGGCAATCCATGCTCCCATT
TGGCTACGCCTTTTGGGGAAGGTCACACGGTTATCAAGGTCCTATACCCGCAGNNNNNNG
ATGAAACCTGGGATCTTTTAC

>A4000B290 <unknown description>
TTTCTTATATGGGCGAGTACCATGGGCGTAACNNNNNNGCCTGTAGTCCCACGCGATCTA
ACACTACGCCTTTTGGGGAAGGTCACACGGTTATCAAGGTCCTATACCCGCAGNNNNNNG
ATGAAACCTGGGATCTTTTAC



In [188]:
# Build the 10x-design templates: TSO + barcode oligo + free end + i7 index + P7.
for oligo in oligo_10x_records:
    # Ids starting with CDB_BARCODE_ID_10x use the CD8 free end and index;
    # every other id uses the MHC ones.
    uses_cd8_parts = oligo.id.startswith(CDB_BARCODE_ID_10x)
    free_end, i7 = (FREE_END_CD8, i7_CD8) if uses_cd8_parts else (FREE_END_MHC, i7_MHC)

    tail = Seq(free_end + i7 + P7)
    record_10x = SeqRecord(Seq(TSO) + oligo.seq + tail, id=oligo.id)
    template_records.append(record_10x)
    print(record_10x.format("fasta"))

>ADT_5000 <unknown description>
TTTCTTATATGGGCCTCGCCTCAACTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAT
CACGATCTCGTATGCCGTCTTCTGCTTG

>ADT_5001 <unknown description>
TTTCTTATATGGGGATCCGAGACAATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAT
CACGATCTCGTATGCCGTCTTCTGCTTG

>ADT_5002 <unknown description>
TTTCTTATATGGGGTTACGCGACATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAT
CACGATCTCGTATGCCGTCTTCTGCTTG

>HASH_5003 <unknown description>
TTTCTTATATGGGAGTAGTTCCTGTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACA
TTACTCGATCTCGTATGCCGTCTTCTGCTTG

>HASH_5004 <unknown description>
TTTCTTATATGGGATTAGGTCTCGCAGATCGGAAGAGCACACGTCTGAACTCCAGTCACA
TTACTCGATCTCGTATGCCGTCTTCTGCTTG



In [189]:
# Write every template collected in template_records to BARCODE_TEMPLATES in FASTA format.
# The value displayed by this cell is SeqIO.write's return value.
SeqIO.write(template_records, BARCODE_TEMPLATES, "fasta")

10

In [16]:
#SeqIO.write(template_records, EXPECTED_TEMPLATES, "fasta")

135

# Import erroneous data

In [10]:
# Reload ALL oligo A records, unfiltered and in their original orientation.
# This rebinds oligo_a_records, replacing the filtered list built earlier.
oligo_a_records = list(SeqIO.parse(OLIGO_A_SEQS, "fasta"))

In [11]:
# Reload ALL oligo B records (unfiltered) and store each sequence as its reverse complement.
# This rebinds oligo_b_records, replacing the filtered list built earlier.
oligo_b_records = list(SeqIO.parse(OLIGO_B_SEQS, "fasta"))
for record in oligo_b_records:
    record.seq = record.seq.reverse_complement()

In [12]:
# Reload ALL sample records, unfiltered and in their original orientation.
# This rebinds sample_records, replacing the filtered list built earlier.
sample_records = list(SeqIO.parse(SAMPLE_SEQS, "fasta"))

In [None]:
# Collect sequences from the tab-separated report at OVERREP_SEQS, keyed by
# "<accession>_<running count>".
# NOTE(review): OVERREP_SEQS is only defined in a commented-out line of the
# "Load data" section, so this cell raises NameError on a fresh kernel.
acc2seq = {}      # "<acc>_<n>" -> sequence (column 1 of the report)
overrep_acc = {}  # acc -> number of rows seen so far with that accession

with open(OVERREP_SEQS) as fh:
    for line in fh:
        # Ignore lines starting with ">>" or "#".
        if line.startswith((">>", "#")):
            continue

        fields = line.strip().split("\t")

        # Rows with fewer than four columns carry no accession column.
        if len(fields) < 4:
            continue

        seq, source = fields[0], fields[3]
        # Accession = last space-separated token of column 4, minus its first
        # and last character.
        acc = source.split(" ")[-1][1:-1]

        overrep_acc[acc] = overrep_acc.get(acc, 0) + 1
        acc2seq[acc + "_" + str(overrep_acc[acc])] = seq

acc2seq

# Construct erroneous templates

## Reversed template start

In [None]:
# Templates for reads that start in the reversed orientation. For each P1 primer:
#   - TSO + reverse complement of the primer
#   - TSO + sample sequence + primer, once per sample record
# NOTE(review): both primers produce records with the same ids ("TSO+PRIMER_A" and
# "TSO_<sample>_PRIMER_A"), so ids in this list are not unique -- confirm that the
# downstream consumer tolerates duplicates.
template_records = []
for primer_p1 in (PRIMER_P1_MHC, PRIMER_P1_CD8):
    primer_revcomp = primer_p1[::-1].translate(transtable)
    template_records.append(SeqRecord(Seq(TSO + primer_revcomp), id="TSO+PRIMER_A"))

    template_records.extend(
        SeqRecord(
            Seq(TSO) + sample_record.seq + Seq(primer_p1),
            id="TSO_" + sample_record.id + "_PRIMER_A",
        )
        for sample_record in sample_records
    )

In [None]:
# Write the reversed-start templates to REV_TEMPLATES_START in FASTA format.
# NOTE(review): REV_TEMPLATES_START is only defined in a commented-out line of the
# "Output files" section, so this cell raises NameError on a fresh kernel.
SeqIO.write(template_records, REV_TEMPLATES_START, "fasta")

## Reversed template (whole)

In [13]:
# Templates for the fully reversed construct, one per (oligo A, oligo B, sample) triple:
#   TSO + sample + primer P1 + N6 + oligo A + revcomp(ANNEAL) + oligo B + N6 + PRIMER_R4
anneal_revcomp = ANNEAL[::-1].translate(transtable)  # same for every template

template_records = []
for oligo_a_record in oligo_a_records:
    a_id = oligo_a_record.id.split("-")[-1]
    # Oligo A ids ending in CD8_BARCODE get the CD8 P1 primer; all others get the MHC one.
    primer_p1 = PRIMER_P1_CD8 if oligo_a_record.id.endswith(CD8_BARCODE) else PRIMER_P1_MHC
    for oligo_b_record in oligo_b_records:
        pair_id = a_id + oligo_b_record.id.split("-")[-1]
        for sample_record in sample_records:
            template_seq = (
                Seq(TSO)
                + sample_record.seq
                + Seq(primer_p1 + N6)
                + oligo_a_record.seq
                + Seq(anneal_revcomp)
                + oligo_b_record.seq
                + Seq(N6 + PRIMER_R4)
            )
            template_records.append(
                SeqRecord(template_seq, id=pair_id + "_" + sample_record.id)
            )

In [14]:
# Write the fully reversed templates to REVERSED_TEMPLATES in FASTA format.
# NOTE(review): REVERSED_TEMPLATES is only defined in a commented-out line of the
# "Output files" section, so this cell raises NameError on a fresh kernel.
SeqIO.write(template_records, REVERSED_TEMPLATES, "fasta")

675

## Overrepresented sequences as templates

In [None]:
# One template per collected overrepresented sequence, using its acc2seq key as the record id.
template_records = [
    SeqRecord(Seq(seq), id=acc)
    for acc, seq in acc2seq.items()
]

In [None]:
# Write the overrepresented-sequence templates to OVERREP_SEQ_TEMPLATES in FASTA format.
# NOTE(review): OVERREP_SEQ_TEMPLATES is only defined in a commented-out line of the
# "Output files" section, so this cell raises NameError on a fresh kernel.
SeqIO.write(template_records, OVERREP_SEQ_TEMPLATES, "fasta")