In [1]:
import numpy as np
import pandas as pd

from Bio import SeqIO
from Bio import pairwise2

In [2]:
# Translation table mapping each DNA base to its complement (A<->T, C<->G).
# Only upper-case A/C/G/T are mapped; any other character passes through
# str.translate unchanged.
transtable = {ord(base): ord(partner) for base, partner in zip("ACGT", "TGCA")}

# Doc string

In [3]:
"""
New barracoda!

Procedure for each read:
Detect b_oligo and a_oligo
"""

'\nNew barracoda!\n\nProcedure for each read:\nDetect b_oligo and a_oligo\n'

# Args
TSO, PRIMER_R4 and B_OLIGO are in the same sense as the Ion Torrent sequencing reads.

ANNEAL, A_OLIGO and PRIMER_P1 are in the negative (reverse-complement) sense of the sequencing reads.

In [4]:
# Primer / adapter sequences used when searching the reads.
# Literals assigned as-is are kept in the orientation they are written in;
# the ones passed through .translate(transtable)[::-1] are reverse-complemented.
TSO = "TTTCTTATATGGG"
PRIMER_R4 = "CGAGTACCATGGGCGTAAC"
ANNEAL = "GTGTGACCTTCCCCAAAAGGCGTAG".translate(transtable)[::-1]

# Two experiment-specific P1 primers. They are kept under separate names so
# that neither silently overwrites the other.
PRIMER_P1_MHC = "GAAGTTCCAGCCAGCGTC".translate(transtable)[::-1]  # Used only for MHC barcodes
PRIMER_P1_CD8 = "GTAAAAGATCCCAGGTTTCATC".translate(transtable)[::-1]  # Used only for CD8 barcodes

# NOTE(review): this cell used to assign PRIMER_P1 twice, so the CD8 primer was
# the value actually in effect. That choice is preserved here, but the input
# paths in this notebook point at exp3_MHC -- confirm which primer is intended.
PRIMER_P1 = PRIMER_P1_CD8

# Concatenation of the template-switch oligo and the R4 primer.
PRIMER = TSO + PRIMER_R4

In [5]:
# Echo the reverse-complemented anneal sequence as cell output for a visual check.
ANNEAL

'CTACGCCTTTTGGGGAAGGTCACAC'

In [6]:
# CD8 oligo A: reverse complement of the literal below, echoed as cell output.
# Scratch check only -- the result is not assigned to any variable.
"CTGCGGGTATAGGACCTTGATAACC".translate(transtable)[::-1]

'GGTTATCAAGGTCCTATACCCGCAG'

In [7]:
"CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA".translate(transtable)[::-1]

'TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG'

# Load data

In [5]:
# Input files, all parsed as FASTA with SeqIO.parse(..., "fasta") further down.
# Paths are relative to the notebook's working directory.

# Reads that are aligned against the three query libraries below.
BARCODE_READS = "../data/exp3_MHC/processed/longranger_clean/tmp/grep-anneal-reads-both-ends.fa"
# Oligo A library; its records are reverse-complemented when loaded.
OLIGO_A_SEQS = "../data/exp3_MHC/barcode_library/oligo_a.fa"
# Oligo B library; loaded as-is.
OLIGO_B_SEQS = "../data/exp3_MHC/barcode_library/oligo_b.fa"
# Sample barcode library; loaded as-is.
SAMPLE_SEQS = "../data/exp3_MHC/barcode_library/sample.fa"

# Import data

In [6]:
def _flip_to_read_sense(fasta_record):
    """Replace the record's sequence with its reverse complement and return the record."""
    fasta_record.seq = fasta_record.seq.reverse_complement()
    return fasta_record


# Oligo A entries are reverse-complemented on load so they can be aligned
# directly against the reads.
oligo_a_records = [_flip_to_read_sense(entry) for entry in SeqIO.parse(OLIGO_A_SEQS, "fasta")]

In [7]:
# Oligo B entries are loaded unchanged, in file order.
oligo_b_records = list(SeqIO.parse(OLIGO_B_SEQS, "fasta"))

In [8]:
# Sample barcode entries are loaded unchanged, in file order.
sample_records = list(SeqIO.parse(SAMPLE_SEQS, "fasta"))

In [None]:
# Print id, sequence repr and length of every oligo A entry exactly as stored
# in the file (this re-parses the FASTA, so no reverse complement is applied).
for seq_record in SeqIO.parse(OLIGO_A_SEQS, "fasta"):
    print(seq_record.id, repr(seq_record.seq), len(seq_record), sep="\n")

# Find oligos and positions

In [9]:
def get_best_alignment(barcode_record, query_records, query_name):
    """
    Pairwise alignments to identify which query_record is source of the barcode.
    The best score desides which record. The score, the alignment length, the start and end positions are annotated
    """
    
    best_alignment = [('',  '', 0, 0, 0)]
    for record in query_records:
        alignment_score = pairwise2.align.localxd(barcode_record.seq, record.seq, -0.4, -0.1, -4, -1, score_only=True)
        if alignment_score > best_alignment[0][2]:
            best_alignment = pairwise2.align.localxd(barcode_record.seq, record.seq, -0.4, -0.1, -4, -1, score_only=False)
            query_len = len(record.seq)

    assert best_alignment[0][4] != 0
    assert len(best_alignment[0][0]) == len(best_alignment[0][1])
    
    return pd.Series([barcode_record.id,
                      record.id,
                      query_name,
                      query_len,
                      best_alignment[0][2],
                      best_alignment[0][3],
                      best_alignment[0][4],
                      len(best_alignment[0][0])],
                      index = ("barcode_id", "query_id", "query", "query_len", "align_score", "align_start", "align_end", "align_len"))

In [None]:
chunk_size = 10

# Column name -> dtype for the alignment table; the key order is also the
# column order of the table.
column_dtypes = dict(
    barcode_id="object",
    query_id="object",
    query="object",
    query_len="int64",
    align_score="float64",
    align_start="int64",
    align_end="int64",
    align_len="int64",
)

# Empty table with the alignment columns and no rows yet.
align_df = pd.DataFrame(columns=tuple(column_dtypes))


In [None]:
# Display the alignment table as cell output.
align_df

In [14]:
chunk_size = 10
# Development limit: only this many reads are aligned. The original cell
# stopped after the first read; set to None to process every read in the file.
max_reads = 1

align_columns = ("barcode_id", "query_id", "query", "query_len", "align_score", "align_start", "align_end", "align_len")

# Collect one row per (read, query library) and build the frame once at the
# end; DataFrame.append was removed in pandas 2.0 and growing a frame row by
# row is quadratic.
alignment_rows = []
i = 0
for barcode_record in SeqIO.parse(BARCODE_READS, "fasta"):
    oligo_a_alignment = get_best_alignment(barcode_record, oligo_a_records, "oligo_a")
    oligo_b_alignment = get_best_alignment(barcode_record, oligo_b_records, "oligo_b")
    sample_alignment = get_best_alignment(barcode_record, sample_records, "sample")

    alignment_rows.extend([oligo_a_alignment, oligo_b_alignment, sample_alignment])
    i += 1
    if max_reads is not None and i >= max_reads:
        break

align_df = pd.DataFrame(alignment_rows, columns=align_columns)
align_df

Unnamed: 0,barcode_id,query_id,query,query_len,align_score,align_start,align_end,align_len
0,HBBAF:00130:00791,2OS-AB1-A4000,oligo_a,25,25.0,87,112,124
1,HBBAF:00130:00791,2OS-4-long-Oligo-B302,oligo_b,25,25.0,37,62,124
2,HBBAF:00130:00791,bc25mer_51469,sample,8,5.6,51,59,125


In [15]:
# Preview of align_df with the last read's three alignment rows added again.
# The result is only displayed; align_df itself is not reassigned.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat(
    [align_df, pd.DataFrame([oligo_a_alignment, oligo_b_alignment, sample_alignment])],
    ignore_index=True,
)

Unnamed: 0,barcode_id,query_id,query,query_len,align_score,align_start,align_end,align_len
0,HBBAF:00130:00791,2OS-AB1-A4000,oligo_a,25,25.0,87,112,124
1,HBBAF:00130:00791,2OS-4-long-Oligo-B302,oligo_b,25,25.0,37,62,124
2,HBBAF:00130:00791,bc25mer_51469,sample,8,5.6,51,59,125
3,HBBAF:00130:00791,2OS-AB1-A4000,oligo_a,25,25.0,87,112,124
4,HBBAF:00130:00791,2OS-4-long-Oligo-B302,oligo_b,25,25.0,37,62,124
5,HBBAF:00130:00791,bc25mer_51469,sample,8,5.6,51,59,125


# Find reads containing 2 of 3: Primer_R4, Anneal, Primer_P1

# Identify the two N6 UMIs

# Tabularize no. reads per GEM