## We need to find a set of restriction enzymes (REs) such that:
1. one of the REs cuts human mtDNA exactly once
2. the other REs do not cut mtDNA but cut nuclear DNA into pieces shorter than 3-5 kb

In [29]:
import random
from collections import Counter
from multiprocessing import Pool
from typing import Dict, List

import pandas as pd
from Bio.Restriction import Analysis, AllEnzymes, RestrictionBatch
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import tqdm

In [30]:
# Notebook-wide configuration: a fragment-size threshold and input/output paths.

# presumably the maximum nuclear-DNA fragment length in bp (the goal statement
# mentions pieces shorter than 3-5 kb); not read by the cells shown here -- confirm
PIECES_SIZE = 5000
# empty placeholder; not read by the cells shown here
PATH_TO_HUMAN_GENOME = ""
# CSV path not read by the cells shown here.
# NOTE(review): the file name suggests the per-enzyme sequence counts -- confirm
PATH_TO_RE = "../data/processed/cuted_seqs_num.csv"
# FASTA with the human mitochondrial reference genome, loaded into ref_mt
PATH_TO_REF_MT = "../data/external/NC_012920.1.fasta"
# FASTA parsed record by record when searching for enzymes without a site
PATH_TO_SEQS_MT = "../data/raw/..."

# CSV that receives one (RE, SeqName) row per enzyme/sequence pair without a site
PATH_TO_OUT_RE_WITHOUT_SITE = "../data/processed/..."

In [4]:
# Size of Biopython's AllEnzymes collection, i.e. how many enzymes are screened.
len(AllEnzymes)

978

In [11]:
# Load the human mitochondrial reference genome. SeqIO.read expects the FASTA
# to hold exactly one record (it raises ValueError otherwise), whereas
# next(SeqIO.parse(...)) silently ignores any extra records and leaves a
# half-consumed parser holding the file open.
ref_mt = SeqIO.read(PATH_TO_REF_MT, "fasta")
ref_mt  # last expression of the cell, so the notebook displays the record

SeqRecord(seq=Seq('GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATG'), id='NC_012920.1', name='NC_012920.1', description='NC_012920.1 Homo sapiens mitochondrion, complete genome', dbxrefs=[])

## Search for the enzymes that don't cut mtDNA

In [None]:
def extract_RE_without_site_on_mt(rec: SeqRecord) -> List[Dict]:
    """List every restriction enzyme that has no site in *rec*.

    The record's sequence is analysed as circular (``linear=False``) against
    all enzymes in ``AllEnzymes``.

    Returns one dict per enzyme reported by ``Analysis.without_site()``, with
    the keys ``"RE"`` (the enzyme's ``repr``) and ``"SeqName"`` (the record's
    description).
    """
    analysis = Analysis(AllEnzymes, rec.seq, linear=False)
    return [
        {"RE": repr(enzyme), "SeqName": rec.description}
        for enzyme in analysis.without_site()
    ]

In [None]:
# Scan every mtDNA sequence for enzymes that have no site in it.
# Originally noted as: more than 1 hour and 10GB of RAM.

threads = 24  # number of worker processes in the pool
fasta = SeqIO.parse(PATH_TO_SEQS_MT, "fasta")
with Pool(threads) as p:
    # Pool.map would first turn the whole FASTA iterator into a list, keeping
    # every parsed record in memory at once. imap consumes the parser lazily,
    # in chunks, and still yields the results in input order; list() keeps
    # collection_of_pot_rs a list, as before.
    collection_of_pot_rs = list(
        p.imap(extract_RE_without_site_on_mt, fasta, chunksize=16)
    )

In [None]:
# Flatten the per-sequence result lists into one list of {"RE", "SeqName"} rows.
pot_rs = [row for rows in collection_of_pot_rs for row in rows]

# Name the columns explicitly so that an empty result still yields a frame
# with an "RE" column instead of failing on df.RE below.
df = pd.DataFrame(pot_rs, columns=["RE", "SeqName"])
df.to_csv(PATH_TO_OUT_RE_WITHOUT_SITE, index=False)

# Count, per enzyme, the number of rows (one per sequence) it appears in.
# NOTE(review): df only holds enzymes WITHOUT a site in a sequence, so
# "CuttedSeqs" is the number of sequences the enzyme does not cut; the column
# name is kept unchanged because later steps may read it -- confirm.
df_counts = df.RE.value_counts().reset_index()
df_counts.columns = ["RE", "CuttedSeqs"]
# NOTE(review): PATH_TO_OUT_CUTNUM is not defined in the configuration cell
# of this notebook section; confirm it is set before this cell runs.
df_counts.to_csv(PATH_TO_OUT_CUTNUM, index=False)
