In [78]:
import ast
import re

import numpy as np
import pandas as pd

# A small CSV of amino acids and some of their properties. The information
# is sourced from the PubChem API and the WHO.
# NOTE: the `squeeze=True` argument was dropped. It was deprecated in pandas 1.4
# and removed in pandas 2.0, and it only has an effect when the parsed data has
# a single column, which is not the case for this file.
amino_acids = pd.read_csv('AminoAcids.csv', index_col=0).T.to_dict()
codons = {}

for amino in amino_acids:
    # The CSV stores each codon collection as text, so it has to be parsed back
    # into a Python object. ast.literal_eval only accepts literals, so a
    # malformed or malicious cell cannot execute code the way eval() could.
    amino_acids[amino]["codons"] = ast.literal_eval(amino_acids[amino]["codons"])

    # Build the reverse lookup: codon -> amino acid key (the CSV index, e.g. "E").
    for codon in amino_acids[amino]["codons"]:
        codons[codon] = amino

# Sample test
print("Sample from amino_acid csv: \n")
print(amino_acids["E"], "\n")
print("Sample from codon dictionary: codons['GAG'] = {}".format(codons["GAG"]))

# Build a random DNA string between 23 and 99 bases long (the upper bound of
# randint is exclusive).
# Add letters to this list to experiment.
dna = list("ACGT")
dna_string = "".join(
    np.random.choice(dna, size=np.random.randint(low=23, high=100))
)
print("\nDNA string to be transposed: {}".format(dna_string))

# What follows is a little function to translate DNA into protein
def protein_shaker(dna, codon_table=None):
    """Translate a DNA string into a list of protein strings.

    The DNA is converted to RNA (every "T" becomes "U"), read in consecutive
    groups of three starting at the first character, and each group is looked
    up in the codon table. The concatenated result is then split on the "*"
    symbol, which separates one protein from the next.

    Args:
        dna: DNA sequence as a string.
        codon_table: Optional mapping of RNA codon -> amino-acid symbol.
            Defaults to the module-level ``codons`` dictionary.

    Returns:
        A list of the non-empty protein strings, in reading order. Groups that
        cannot be translated are kept in-line as "{fragment}" (fewer than
        three bases left over) or "{invalid codon}" (a triplet missing from
        the table).

    Side effects:
        Prints a warning for every fragment or unknown codon, and the number
        of proteins found.
    """
    if codon_table is None:
        codon_table = codons

    # First, we must replace the T in DNA to a U in RNA
    rna = dna.replace("T", "U")

    # Break into groups of 3 with the remaining tail (1 or 2 characters)
    rna_groups = re.findall(".{3}|.{1,2}", rna)

    protein = []
    for item in rna_groups:
        if len(item) != 3:
            # Leftover tail that is too short to be a codon
            print("{} is a fragment".format(item))
            protein.append("{fragment}")
        elif item in codon_table:
            protein.append(codon_table[item])
        else:
            # Full triplet, but not present in the codon table
            print("{} is not a valid codon".format(item))
            protein.append("{invalid codon}")

    # str.split leaves empty strings for empty input and for leading, trailing
    # or consecutive "*" symbols; those are not proteins, so drop them before
    # counting and returning.
    proteins = [chain for chain in "".join(protein).split("*") if chain]
    print("{} proteins found in DNA".format(len(proteins)))
    return proteins
    

# Translate the random strand generated above; as the last expression in the
# cell, the returned list of proteins is shown as the cell's output.
protein_shaker(dna_string)



Sample from amino_acid csv: 

{'amino_acid': 'glutamic acid', 'code': 'glu', 'composition': 'C5H7NO3', 'smiles': 'C(CC(=O)O)C(C(=O)O)N', 'codons': {'GAG', 'GAA'}, 'isoelectric': '3.15', 'carb_pKa': '2.1', 'amino_pKa': '9.47', 'side_pKa': '4.07', 'polarity': 'acidic', 'mono_mass': '129.0425931', 'avg_mass': '129.11398', 'cid_number': 'CID33032', 'properties': '{"hydrophilic", "acidic"}'} 

Sample from codon dictionary: codons['GAG'] = E

DNA string to be transposed: CTCCCGTACCTCTATTCCATAGGACAGCACGGTAACAAATAGCAATCGCCGGCGGGTCCTTGCGATTGAAGC
3 proteins found in DNA


['LPYLYSIGQHGNK', 'QSPAGPCD', 'S']