In [3]:
from Bio import Restriction
from Bio.Seq import Seq
from Bio import SeqIO
import contextlib
import io
from statistics import mean
from statistics import median
import json


# Test Multiple Sequence import from single fasta

In [4]:

test_seq = {}
with open("./dna_to_cut.fa") as dna:
    for record in SeqIO.parse(dna, "fasta"):
        test_seq[record.id] = record.seq
print(test_seq)

{'test1': Seq('TCATTAACTGTTGTTTGGTAGCACAAAAGTATTACCATGGTCCTAGAAGTTCGG...ATT'), 'test2': Seq('CGTTATGGCACCAGGGAGTTTAAGCCGAGTCAATGGAGCTCGCAATACAGAGTT...GCC'), 'test3': Seq('ACCCCGCTCGGGTATGGCAGAGAGAACGCCTTCTGAATTGTGCTATCCTTCGAC...CGG')}


# Test Creating json object of all commercial enzymes with all important properties

In [5]:
enzymes = {str(x): getattr(Restriction, str(x)) for x in sorted(list(Restriction.CommOnly))}
# print(enzymes)
enzyme_obj = {}

for enzyme in enzymes:
  # if "A" in enzymes[enzyme].site:
    # print(enzyme, enzymes[enzyme].site, enzymes[enzyme].overhang(), enzymes[enzyme].cut_once(), enzymes[enzyme].cut_twice(), enzymes[enzyme].inact_temp , enzymes[enzyme].is_methylable())#, [x for x in dir(enzymes[enzyme]) if not x.startswith("_")])
    enzyme_obj[str(enzyme)] = {
      "site": enzymes[enzyme].site,
      "overhang": enzymes[enzyme].overhang(),
      "cut_once": enzymes[enzyme].cut_once(),
      "cut_twice": enzymes[enzyme].cut_twice(),
      "inact_temp": enzymes[enzyme].inact_temp,
      "is_methylable": enzymes[enzyme].is_methylable(),
    } 
print(enzyme_obj)

# jsonify the enzyme object
with open("enzyme.json", "w") as f:
  json.dump(enzyme_obj, f, indent=2)
  
# Check the values of each object in the enzyme object
site_vals = set()
overhang_vals = set()
cut_once_vals = set()
cut_twice_vals = set()
inact_temp_vals = set()
is_methylable_vals = set()

for enzyme in enzyme_obj:
  site_vals.add(enzyme_obj[enzyme]["site"])
  overhang_vals.add(enzyme_obj[enzyme]["overhang"])
  cut_once_vals.add(enzyme_obj[enzyme]["cut_once"])
  cut_twice_vals.add(enzyme_obj[enzyme]["cut_twice"])
  inact_temp_vals.add(enzyme_obj[enzyme]["inact_temp"])
  is_methylable_vals.add(enzyme_obj[enzyme]["is_methylable"])

print(f"site_vals: {site_vals}, lenght: {len(site_vals)}")
print(f"overhang_vals: {overhang_vals}, lenght: {len(overhang_vals)}")
print(f"cut_once_vals: {cut_once_vals}, lenght: {len(cut_once_vals)}")
print(f"cut_twice_vals: {cut_twice_vals}, lenght: {len(cut_twice_vals)}")
print(f"inact_temp_vals: {inact_temp_vals}, lenght: {len(inact_temp_vals)}")
print(f"is_methylable_vals: {is_methylable_vals}, lenght: {len(is_methylable_vals)}")



{'AbaSI': {'site': 'C', 'overhang': "3' overhang", 'cut_once': True, 'cut_twice': False, 'inact_temp': 65, 'is_methylable': False}, 'FspEI': {'site': 'CC', 'overhang': "5' overhang", 'cut_once': True, 'cut_twice': False, 'inact_temp': 65, 'is_methylable': False}, 'AccII': {'site': 'CGCG', 'overhang': 'blunt', 'cut_once': True, 'cut_twice': False, 'inact_temp': 65, 'is_methylable': False}, 'AciI': {'site': 'CCGC', 'overhang': "5' overhang", 'cut_once': True, 'cut_twice': False, 'inact_temp': 65, 'is_methylable': True}, 'AfaI': {'site': 'GTAC', 'overhang': 'blunt', 'cut_once': True, 'cut_twice': False, 'inact_temp': 65, 'is_methylable': False}, 'AluBI': {'site': 'AGCT', 'overhang': 'blunt', 'cut_once': True, 'cut_twice': False, 'inact_temp': 65, 'is_methylable': True}, 'AluI': {'site': 'AGCT', 'overhang': 'blunt', 'cut_once': True, 'cut_twice': False, 'inact_temp': 65, 'is_methylable': True}, 'AoxI': {'site': 'GGCC', 'overhang': "5' overhang", 'cut_once': True, 'cut_twice': False, 'inact

# Test performing restriction enzyme digestion on multiple fasta lines and formatting output to str

# Test validating DNA sequence

In [6]:
from io import StringIO
import sys
from contextlib import redirect_stdout

output = ""
with io.StringIO() as buf, redirect_stdout(buf):
  for i,seq in enumerate(test_seq.keys()):
    output_message = f"Restriction Digest Analysis on Sequence: {seq}"
    if i!=0: print("\n")
    print("-"*(len(output_message)+3))
    print(output_message)
    print("-"*(len(output_message)+3), end="\n\n")
    
    test_analysis = Restriction.Analysis(Restriction.RestrictionBatch(enzymes), test_seq[seq], linear=True)
    
    test_analysis.print_as("map")
    
    test_analysis.print_that()
    
    output = buf.getvalue()
  
print(output)



-------------------------------------------------
Restriction Digest Analysis on Sequence: test1
-------------------------------------------------

 2 MspJI
 |                                                          
 | 4 AbaSI
 | |                                                        
 | |5 MseI SaqAI Tru9I Tru1I
 | ||                                                       
 | ||  8 AbaSI
 | ||  |                                                    
 | ||  |9 AbaSI
 | ||  ||                                                   
 | ||  ||10 HpyCH4III Bst4CI TaaI
 | ||  |||                                                  
 | ||  ||| 12 AbaSI
 | ||  ||| |                                                
 | ||  ||| | 14 AbaSI
 | ||  ||| | |                                              
 | ||  ||| | |     20 AbaSI AbaSI
 | ||  ||| | |     |                                        
 | ||  ||| | |     |  23 FspEI
 | ||  ||| | |     |  |                                     
 | ||  ||| | |     | 

In [16]:

def check_if_valid_fasta_file(file):
    try:
        with open(file, "r") as handle:
            print(handle)
            print(handle.read().startswith(">"))
            print(handle)
            # check if file starts with >
            if not (handle.read().startswith(">"):
                raise ValueError("Invalid fasta file")
            fasta = SeqIO.parse(handle, "fasta")
            # check if starts with >
            if not fasta:
                raise ValueError("Invalid fasta file")
            
            return True
    except:
        raise ValueError("Invalid fasta file")

def load_dna(file):
  try:
    dna_dict = {}
    # check if the file exists
    with open(file) as f:
      pass
    
    # Check if valid fasta file
    check_if_valid_fasta_file(file)
    
    # Read file into dictionary
    with open(file) as dna:
      for record in SeqIO.parse(dna, "fasta"):
        # Check if sequence is valid
        # Only valid if sequence contains A, T, G, C, U, or N
        if not set(record.seq).issubset("ATGCUatgcuNn"):
            raise ValueError("Invalid sequence")
          
        # Read sequence into dictionary
        dna_dict[record.id] = record.seq
        
    return dna_dict
  
  except Exception as e:
    print(f"{file} is not a valid fasta file")
    print(e)

# DNA sequences
dna_to_cut = load_dna("dna_to_cut.fa")
dna_to_not_cut = load_dna("dna_to_not_cut.fa")
print(dna_to_cut)
print(dna_to_not_cut)

<_io.TextIOWrapper name='dna_to_cut.fa' mode='r' encoding='UTF-8'>
True
<_io.TextIOWrapper name='dna_to_cut.fa' mode='r' encoding='UTF-8'>
dna_to_cut.fa is not a valid fasta file
Invalid fasta file
<_io.TextIOWrapper name='dna_to_not_cut.fa' mode='r' encoding='UTF-8'>
False
<_io.TextIOWrapper name='dna_to_not_cut.fa' mode='r' encoding='UTF-8'>
dna_to_not_cut.fa is not a valid fasta file
Invalid fasta file
None
None
