## This notebook produces a six-frame translation of a given transcriptome and finds antibody cross-reactivity, given a list of epitopes recognised by epitope-mapped monoclonal antibodies.

In [None]:
# Import necessary stuff
import pandas as pd 
from pandas import DataFrame
#from Bio.Seq import SeqIO
#import matplotlib.pyplot as plt

import Bio
from Bio import SeqIO


In [None]:
import os
import pandas as pd 
from pandas import DataFrame
from Bio import SeqIO

# Genetic code, grouped by residue: each one-letter amino-acid code lists the
# DNA codons that encode it. '_' stands for the three stop codons.
_CODONS_BY_RESIDUE = {
    'A': ('GCA', 'GCC', 'GCG', 'GCT'),
    'C': ('TGC', 'TGT'),
    'D': ('GAC', 'GAT'),
    'E': ('GAA', 'GAG'),
    'F': ('TTC', 'TTT'),
    'G': ('GGA', 'GGC', 'GGG', 'GGT'),
    'H': ('CAC', 'CAT'),
    'I': ('ATA', 'ATC', 'ATT'),
    'K': ('AAA', 'AAG'),
    'L': ('CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'),
    'M': ('ATG',),
    'N': ('AAC', 'AAT'),
    'P': ('CCA', 'CCC', 'CCG', 'CCT'),
    'Q': ('CAA', 'CAG'),
    'R': ('AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'),
    'S': ('AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'),
    'T': ('ACA', 'ACC', 'ACG', 'ACT'),
    'V': ('GTA', 'GTC', 'GTG', 'GTT'),
    'W': ('TGG',),
    'Y': ('TAC', 'TAT'),
    '_': ('TAA', 'TAG', 'TGA'),
}

# Genetic code dictionary: codon -> residue lookup used for translation.
gencode = {
    codon: residue
    for residue, codons in _CODONS_BY_RESIDUE.items()
    for codon in codons
}

# Base pair complements (A<->T, C<->G)
basepairs = dict(zip('ACGT', 'TGCA'))

def translate_frameshifted(sequence):
    """Translate a nucleotide sequence codon by codon, starting at its first base.

    To translate a different reading frame, pass an already-shifted sequence
    (for example ``sequence[1:]`` for the second frame).

    Args:
        sequence: DNA sequence, either a ``str`` or any object whose ``str()``
            is the sequence text (e.g. a Biopython ``Seq``). Case is ignored.

    Returns:
        str: One character per complete codon, looked up in ``gencode``
        ('_' for a stop codon). Codons missing from ``gencode`` (ambiguous
        bases such as N) become 'X'. A trailing partial codon is dropped.
    """
    # Normalise once to a plain upper-case string: lower-case (soft-masked)
    # bases would otherwise miss every key in gencode and come out as 'X',
    # and the codon lookup no longer depends on how a sequence object
    # slices or hashes.
    bases = str(sequence).upper()
    whole_codons_end = len(bases) - len(bases) % 3
    return ''.join(
        gencode.get(bases[start:start + 3], 'X')
        for start in range(0, whole_codons_end, 3)
    )

def reverse_complement(sequence):
    """Generate the reverse complement of a DNA sequence.

    Args:
        sequence: DNA sequence, either a ``str`` or any object whose ``str()``
            is the sequence text (e.g. a Biopython ``Seq``). Case is ignored.

    Returns:
        str: The complemented sequence read in the opposite direction, in
        upper case. Any character without an entry in ``basepairs``
        (e.g. N) is emitted as 'X'.
    """
    # Upper-case first so lower-case (soft-masked) bases are complemented
    # instead of all collapsing to 'X'.
    bases = str(sequence).upper()
    return ''.join(basepairs.get(base, 'X') for base in reversed(bases))

def append_new_line(file_name, text_to_append):
    """Add ``text_to_append`` to the end of ``file_name`` on its own line.

    The file is created if it does not exist. A newline is written *before*
    the text whenever the file already has content, so the file never ends
    with a trailing newline.
    """
    with open(file_name, "a+") as handle:
        # "a+" positions at the end; rewind to find out whether anything
        # has been written to the file yet.
        handle.seek(0)
        file_is_empty = not handle.read(100)
        if not file_is_empty:
            handle.write("\n")
        handle.write(text_to_append)



In [None]:
# Define the output directory (edit this variable to change the location)
output_folder = "/path/to/your/output/folder"

# Ensure the directory exists
os.makedirs(output_folder, exist_ok=True)

# Define the output file path
output_file_path = os.path.join(output_folder, "output6frame.txt")

# Process fasta sequences
# SPECIFY THE INPUT TRANSCRIPTOME HERE!!!!
input_fasta_path = "/Users/marco.grillo/Desktop/ncbi-blast-2.12.0+/bin/Paroedura_picta_denovo_transcriptome.fasta"

# The output file is opened once for the whole run instead of once per written
# line. NOTE: it is opened in append mode, so re-running this cell adds the
# translations to whatever output6frame.txt already contains -- delete the
# file first if you want a fresh translation.
with open(output_file_path, "a+") as output_handle:
    # Lines are separated by a newline written *before* each line (no trailing
    # newline at the end of the file), so we need to know whether the file
    # already has content.
    output_handle.seek(0)
    needs_separator = bool(output_handle.read(1))

    for seq_record in SeqIO.parse(input_fasta_path, "fasta"):
        forward = str(seq_record.seq)
        # Dropping the last k bases before reverse-complementing is the same
        # as dropping the first k bases of the reverse complement, so the
        # reverse complement only has to be computed once per record.
        reverse = reverse_complement(forward)
        reading_frames = (
            forward,      # First frame
            forward[1:],  # Second frame
            forward[2:],  # Third frame
            reverse,      # Negative first frame
            reverse[1:],  # Negative second frame
            reverse[2:],  # Negative third frame
        )

        # All six translations of a record share the same FASTA header.
        header1 = ">" + seq_record.name
        for frame in reading_frames:
            for line in (header1, translate_frameshifted(frame)):
                if needs_separator:
                    output_handle.write("\n")
                output_handle.write(line)
                needs_separator = True

print(f"Output saved in: {output_file_path}")

### Read the epitope list

In [None]:
# Load the epitope table (a semicolon-separated CSV file)
bcf = pd.read_csv(
    "/Users/marco.grillo/Desktop/mab_antigen_epitope_data_20210909_2.csv",
    sep=";",
)
# Keep only the rows that have a value in the 'epitope_specificity' column
filtered_bcf = bcf[bcf['epitope_specificity'].notna()]

### You will need to read the 6-frame translation again here as input_file

### Alternatively, if you have a pre-computed proteome fasta, you can just input it here and skip the 6-frame translation above

In [None]:
from Bio import SeqIO
input_file = "/Users/marco.grillo/Dropbox/Work@Nilsson/Useful Jupyter notebook/output6frame.txt"
#output_file = "hits.txt"
column_names = ["epitope", "target"]

# Parse and upper-case the protein sequences once, instead of re-reading the
# whole FASTA file for every epitope. This keeps all sequences in memory; for
# a file too large for that, go back to streaming SeqIO.parse per epitope.
targets = [
    (record.id, str(record.seq).upper())
    for record in SeqIO.parse(input_file, "fasta")
]

# Hits are collected in a plain list and turned into a DataFrame at the end:
# DataFrame.append was removed in pandas 2.0, and growing a DataFrame row by
# row copies it on every hit.
hits = []
for e, epitope in enumerate(filtered_bcf['epitope_specificity'], start=1):
    print('checking epitope number:'+ str(e))
    y = 0  # number of sequences containing this epitope so far
    for target_id, target_sequence in targets:
        # Exact substring match of the epitope within the translated sequence
        if epitope in target_sequence:
            y += 1
            print('found' + str(y))
            hits.append({"epitope": epitope, "target": target_id})

# One row per (epitope, matching sequence id) pair, in epitope order
functional_epitopes = pd.DataFrame(hits, columns=column_names)

### Save your epitope list, alongside their predicted hits

In [None]:
# For every hit, pull the matching row(s) of the epitope table. An epitope
# found in several sequences appears once per hit, so its rows are repeated.
# NOTE(review): the matched target id is not written to the CSV -- confirm
# whether it should be.
matching_rows = [
    filtered_bcf.loc[filtered_bcf['epitope_specificity'] == fep]
    for fep in functional_epitopes['epitope']
]
# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# pd.concat raises on an empty list, so fall back to an empty frame when
# there were no hits.
output = pd.concat(matching_rows) if matching_rows else pd.DataFrame()
output.to_csv('/Users/marco.grillo/Dropbox/Work@Nilsson/Useful Jupyter notebook/gecko_Atlas.csv')
#print(output.gene_name.unique())
#len(output.gene_name.unique())