<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_10/Supp_Fig_10d/seq_distances_after_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Plot Hamming distances between PalmDB sequences before and after reverse translation

In [None]:
!pip install -q biopython
from Bio import SeqIO
from scipy.spatial.distance import hamming
import itertools
from random import sample
import matplotlib.pyplot as plt
from tqdm import tqdm

# Progress-bar layout passed to tqdm through its `bar_format` argument:
# the bar itself, completed/total counts, and elapsed/remaining time.
TQDM_BAR_FORMAT = (
    "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]"
)

# Load the blackcellmagic IPython extension.
# NOTE(review): blackcellmagic is not installed by the pip command in this cell -
# confirm it is available in the target environment (e.g. Colab).
%load_ext blackcellmagic
# Set the inline backend's figure format to 'retina'
%config InlineBackend.figure_format='retina'

Define the codon tables used for reverse translation (comma-free code and standard genetic code):

In [None]:
# Comma-free code: amino acid (one-letter code) -> codon
# The 20 canonical amino acids are listed first, four per row
cfcode = {
    "F": "ACC", "L": "ACA", "I": "ATA", "M": "ATC",
    "V": "ATT", "S": "CTA", "P": "CTC", "T": "CTT",
    "A": "AGA", "Y": "AGC", "H": "AGT", "Q": "AGG",
    "N": "CGA", "K": "CGC", "D": "CGT", "E": "CGG",
    "C": "TGA", "W": "TGC", "R": "TGT", "G": "TGG",
}

# Unknown amino acid
cfcode["X"] = "NNN"

# Ambiguity codes reuse the codon of one of the two residues they stand for
# (the codon of the other residue differs by a single base)
cfcode["B"] = cfcode["D"]  # N or D -> translated as D
cfcode["J"] = cfcode["L"]  # L or I -> translated as L
cfcode["Z"] = cfcode["E"]  # E or Q -> translated as E

In [None]:
# Standard genetic code used for reverse translation: amino acid (one-letter code) -> codon
# (one codon per amino acid; the figure below refers to this table as the "human optimized code")
# Also see:
# https://www.biostars.org/p/432189/
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4253638/

# The 20 canonical amino acids, four per row
dnacode = {
    "A": "GCC", "C": "TGC", "D": "GAC", "E": "GAG",
    "F": "TTC", "G": "GGC", "H": "CAC", "I": "ATC",
    "K": "AAG", "L": "CTG", "M": "ATG", "N": "AAC",
    "P": "CCC", "Q": "CAG", "R": "AGA", "S": "TCC",
    "T": "ACC", "V": "GTG", "W": "TGG", "Y": "TAC",
}

# Unknown amino acid
dnacode["X"] = "NNN"

# Ambiguity codes reuse the codon of one of the two residues they stand for
dnacode["B"] = dnacode["N"]  # N or D -> translated as N
dnacode["J"] = dnacode["L"]  # L or I -> translated as L
dnacode["Z"] = dnacode["E"]  # E or Q -> translated as E

Reverse translate RdRP amino acid sequences in the PalmDB:

In [None]:
# Download the RdRP amino acid sequences (FASTA file from the LSCHWCP_2023 GitHub repository)
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa

# File name of the FASTA file that is parsed with SeqIO further below
fasta = "palmdb_rdrp_seqs.fa"

In [None]:
# Parallel lists: position i in every list refers to the same FASTA record
ids, seqs, nn_seqs, cfc_seqs, seq_lens = [], [], [], [], []

for record in SeqIO.parse(fasta, "fasta"):
    # Normalize every residue to its capitalized one-letter code
    residues = [aa.capitalize() for aa in record.seq]

    # Reverse translate the AA sequence to the comma-free code
    cfc_seqs.append("".join(cfcode[residue] for residue in residues))

    # Reverse translate the AA sequence using the standard DNA code
    nn_seqs.append("".join(dnacode[residue] for residue in residues))

    # Keep the original amino acid sequence, its length, and the record ID
    seqs.append(record.seq)
    seq_lens.append(len(record.seq))
    ids.append(record.id)

In [None]:
# Number of parsed records
n_records = len(ids)
print(f"Number of sequences: {n_records}")

# Range of the amino acid sequence lengths
shortest, longest = min(seq_lens), max(seq_lens)
print(f"AA sequence lengths (min-max): {shortest} - {longest}")

# Check if all IDs are unique
n_unique_ids = len(set(ids))
print("IDs are unique: ", n_records == n_unique_ids)

# Check if all (comma-free encoded) sequences are unique
n_duplicate_seqs = len(cfc_seqs) - len(set(cfc_seqs))
print("Sequences are unique: ", n_duplicate_seqs == 0)
print("Number of shared sequences: ", n_duplicate_seqs)

### Plot the distances between sequences in each code space:

In [None]:
# Define number of sequences to compare (randomly selected without replacement)
n = 10000


def get_dist_perc(seqs, indices=None):
    """
    Compute the normalized Hamming distance for every pair in a subset of `seqs`.

    Args:
        seqs:    List of sequences (strings or Seq objects).
        indices: Optional list of positions in `seqs` selecting the subset to
                 compare. If omitted, `n` sequences are drawn at random
                 (without replacement) from `seqs`.

    Returns:
        List with one entry per unordered pair of the subset: the fraction (0-1)
        of differing positions. Sequences of unequal length are compared over the
        length of the shorter one (the longer sequence is truncated).

    Raises:
        ValueError: If `indices` is omitted and `n` is larger than `len(seqs)`
                    (raised by random.sample).
    """
    # Select the subset exactly once (previously a second, discarded sample was
    # drawn only to size the progress bar)
    if indices is None:
        subset = sample(seqs, n)
    else:
        subset = [seqs[i] for i in indices]

    # Convert each sequence to a list of characters once instead of once per pair
    subset = [list(seq) for seq in subset]

    # itertools.combinations returns an iterator without len(),
    # so the number of pairs is computed directly: k * (k - 1) / 2
    n_pairs = len(subset) * (len(subset) - 1) // 2

    dist_perc = []
    with tqdm(total=n_pairs, bar_format=TQDM_BAR_FORMAT) as pbar:
        for seq1, seq2 in itertools.combinations(subset, 2):
            # Compare only the overlapping prefix of the two sequences
            overlap = min(len(seq1), len(seq2))

            # scipy's hamming returns the fraction of differing positions
            # (multiply by the compared length to get the absolute Hamming distance)
            dist_perc.append(hamming(seq1[:overlap], seq2[:overlap]))

            pbar.update(1)

    return dist_perc


# Draw one random set of record positions and reuse it for all three encodings,
# so that the three distance distributions describe the same sequence pairs
# (seqs, nn_seqs and cfc_seqs are parallel lists)
sample_idx = sample(range(len(seqs)), n)

# Get Hamming distances between the n randomly selected AA sequences
dist_perc = get_dist_perc(seqs, sample_idx)

# Get Hamming distances between the same sequences after standard reverse translation
dist_perc_nn = get_dist_perc(nn_seqs, sample_idx)

# Get Hamming distances between the same sequences after reverse translation to comma-free code
dist_perc_cfc = get_dist_perc(cfc_seqs, sample_idx)

In [None]:
# Plot Hamming distances: one histogram per sequence representation
fig, axs = plt.subplots(figsize=(10, 5), ncols=3, sharey=True)

fig.suptitle(
    f"Hamming distance between {n} sequences\nrandomly selected from the PalmDB",
    fontsize=16,
)

# (distances, panel title, x-axis label) for each panel, left to right.
# The plotted values are fractions between 0 and 1 (matching the x-axis limits),
# so the axes are labelled as fractions rather than percentages.
panels = [
    (
        dist_perc,
        "Amino acids",
        "Fraction of differing amino acids",
    ),
    (
        dist_perc_nn,
        "After reverse translation\n(using human optimized code)",
        "Fraction of differing nucleotides",
    ),
    (
        dist_perc_cfc,
        "After reverse translation\nto comma-free code",
        "Fraction of differing nucleotides",
    ),
]

for ax, (distances, title, xlabel) in zip(axs, panels):
    ax.hist(distances, 100)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_xlim(0, 1.0)

# The y-axis is shared between panels, so only the left-most one is labelled
axs[0].set_ylabel("Frequency")

plt.tight_layout()

fig.savefig("palmdb_dists.png", dpi=300, bbox_inches="tight")

# plt.show() instead of fig.show(): Figure.show() emits a warning on
# non-GUI backends such as the notebook inline backend
plt.show()