## We need to find a set of restriction enzymes (REs) such that:
1. one of the REs cuts human mtDNA exactly once
2. the other REs do not cut mtDNA but cut nuclear DNA into pieces shorter than 3-5 kb

In [2]:
import glob
import random
import re
from collections import Counter
from multiprocessing import Pool
from typing import Dict, List

import pandas as pd
import tqdm
from Bio import SeqIO
from Bio.Restriction import Analysis, AllEnzymes, RestrictionBatch
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [1]:
# Presumably the maximum allowed length (bp) of nuclear DNA fragments after
# digestion (the stated goal is pieces shorter than 3-5 kb) — TODO confirm.
PIECES_SIZE = 5000
# Glob pattern: matches every *.fna file of the GCF_000001405.40 assembly.
PATH_TO_HUMAN_GENOME = "../data/external/GCF_000001405.40/ncbi_dataset/data/GCF_000001405.40/*.fna"
# PATH_TO_RE = "../data/processed/cuted_seqs_num.csv"
PATH_TO_REF_MT = "../data/external/NC_012920.1.fasta"
PATH_TO_SEQS_MT = "../data/raw/sequence.fasta"

# NOTE(review): placeholder path — set the real output file name before running.
PATH_TO_OUT_RE_WITHOUT_SITE = "../data/processed/..."
# Output table with the per-enzyme sequence counts. It was used further down
# without ever being defined (NameError). The file name is taken from the
# disabled PATH_TO_RE above — TODO confirm this is the intended target.
PATH_TO_OUT_CUTNUM = "../data/processed/cuted_seqs_num.csv"

In [3]:
# Number of restriction enzymes contained in the AllEnzymes collection
len(AllEnzymes)

978

In [4]:
# human_genome = SeqIO.parse(PATH_TO_HUMAN_GENOME, "fasta")
# NOTE(review): the line above is disabled; PATH_TO_HUMAN_GENOME is a glob
# pattern ("*.fna"), which presumably cannot be passed to SeqIO.parse as-is.
# All mitochondrial sequences to screen. SeqIO.parse is expected to yield
# records lazily, so `mt_seqs` can presumably be consumed only once — TODO confirm.
mt_seqs = SeqIO.parse(PATH_TO_SEQS_MT, "fasta")
# Reference mitochondrial genome: the first record of the reference FASTA file.
ref_mt = next(SeqIO.parse(PATH_TO_REF_MT, "fasta"))

## Search for the enzymes that don't cut mtDNA

In [5]:
def extract_RE_without_site_on_mt(rec: SeqRecord) -> List[Dict]:
    """List the enzymes that have no recognition site in ``rec``.

    The record's sequence is analysed against every enzyme in ``AllEnzymes``
    with ``linear=False``.

    Args:
        rec: sequence record to analyse.

    Returns:
        One dict per enzyme without a site, shaped as
        ``{"RE": <repr of the enzyme>, "SeqName": rec.description}``.
    """
    analysis = Analysis(AllEnzymes, rec.seq, linear=False)
    return [
        {"RE": repr(enzyme), "SeqName": rec.description}
        for enzyme in analysis.without_site()
    ]

In [13]:
# Quick check on the reference mitochondrial genome: enzymes with no site on it
pd.DataFrame(extract_RE_without_site_on_mt(ref_mt))

Unnamed: 0,RE,SeqName
0,PspXI,"NC_012920.1 Homo sapiens mitochondrion, comple..."
1,BssHII,"NC_012920.1 Homo sapiens mitochondrion, comple..."
2,SrfI,"NC_012920.1 Homo sapiens mitochondrion, comple..."
3,TspARh3I,"NC_012920.1 Homo sapiens mitochondrion, comple..."
4,Ple19I,"NC_012920.1 Homo sapiens mitochondrion, comple..."
...,...,...
101,CpoI,"NC_012920.1 Homo sapiens mitochondrion, comple..."
102,CciNI,"NC_012920.1 Homo sapiens mitochondrion, comple..."
103,Sth20745III,"NC_012920.1 Homo sapiens mitochondrion, comple..."
104,AdeI,"NC_012920.1 Homo sapiens mitochondrion, comple..."


In [None]:
# Heavy step: takes more than 1 hour and about 10 GB of RAM
threads = 24
with Pool(processes=threads) as worker_pool:
    # One list of "enzyme without site" rows per mitochondrial sequence
    collection_of_pot_rs = worker_pool.map(extract_RE_without_site_on_mt, mt_seqs)

In [None]:
# Flatten the per-sequence result lists into one list of rows
pot_rs = [
    row
    for rows_of_one_seq in collection_of_pot_rs
    for row in rows_of_one_seq
]

# One row per (enzyme, sequence) pair where the enzyme has no site
df = pd.DataFrame(pot_rs)
df.to_csv(PATH_TO_OUT_RE_WITHOUT_SITE, index=None)

In [None]:
# Per enzyme: number of rows in `df`, i.e. sequences in which it has no site.
# NOTE(review): the column is called "CuttedSeqs", yet `df` holds the enzymes
# WITHOUT a site — confirm the intended meaning of this column.
df_counts = (
    df["RE"]
    .value_counts()
    .rename_axis("RE")
    .reset_index(name="CuttedSeqs")
)
df_counts.to_csv(PATH_TO_OUT_CUTNUM, index=None)

In [None]:
# Enzymes that leave mtDNA intact: keep only those reported as site-free for
# every analysed mitochondrial sequence. (The former `...` placeholder made
# `["ClaI"] + REs_without_site_on_mt` fail with a TypeError.)
# NOTE(review): sequences are identified by their description ("SeqName");
# this assumes descriptions are unique per sequence — TODO confirm.
n_mt_seqs = df["SeqName"].nunique()
seqs_without_site_per_re = df.groupby("RE")["SeqName"].nunique()
# Sorted list of enzyme names, so downstream results are reproducible
REs_without_site_on_mt = sorted(
    seqs_without_site_per_re[seqs_without_site_per_re == n_mt_seqs].index
)

## Search for a minimal subset of REs that destroys nuclear DNA

1. first add ClaI to the set, as the RE that cuts mtDNA exactly once
2. drop RE duplicates
3. apply the approach from GO hw2

In [6]:
# Enzymes can be fetched from AllEnzymes by name; check what kind of object comes back
type(AllEnzymes.get("ClaI"))

RestrictionType

In [7]:
# Candidate pool: ClaI (the single mtDNA cutter) plus the enzymes without a site on mtDNA.
# NOTE(review): REs_without_site_on_mt must be a list for this concatenation to work.
maximal_subset = RestrictionBatch(["ClaI"] + REs_without_site_on_mt)

RestrictionBatch(['ClaI'])

In [10]:
# Fetch the ClaI enzyme object by name for closer inspection
ClaI = AllEnzymes.get("ClaI")

In [16]:
# Characteristics tuple of ClaI; its last element is the site string 'ATCGAT'
ClaI.charac

(2, -2, None, None, 'ATCGAT')

In [43]:
# We need to use isoschizomers (equischizomers + neoschizomers).
# The trailing `?` is IPython help syntax: it displays the method's docstring.
ClaI.isoschizomers?

[0;31mSignature:[0m [0mClaI[0m[0;34m.[0m[0misoschizomers[0m[0;34m([0m[0mbatch[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
List all isoschizomers of the enzyme.

Return a tuple of all the equischizomers and neoschizomers of RE.
If batch is supplied it is used instead of the default AllEnzymes.
[0;31mFile:[0m      ~/env_bio/lib/python3.8/site-packages/Bio/Restriction/Restriction.py
[0;31mType:[0m      method


In [39]:
# All isoschizomers of ClaI found in the default AllEnzymes batch
ClaI.isoschizomers()

[Bsa29I, BseCI, BshVI, BspDI, Bsu15I, BsuTUI]

In [50]:
def collect_isoschizomers(enzymes: RestrictionBatch) -> List[List[str]]:
    """Group enzymes into isoschizomer families.

    Each enzyme is visited once. An enzyme that has not yet appeared in a group
    opens a new group made of its own name followed by the names of all its
    isoschizomers (as returned by ``isoschizomers()``, which compares against
    the default ``AllEnzymes``).

    Enzymes are visited in name order. ``RestrictionBatch`` is a set, so its
    native iteration order is arbitrary; without sorting, both the order of the
    groups and the enzyme that leads each group could change between runs.

    Args:
        enzymes: collection of enzymes to group.

    Returns:
        A list of groups; every group is a list of enzyme names whose first
        element is the alphabetically first not-yet-seen member.
    """
    groups: List[List[str]] = []
    seen = set()
    for enzyme in sorted(enzymes, key=repr):
        name = repr(enzyme)
        if name in seen:
            # Already listed as an isoschizomer of an earlier enzyme
            continue

        partner_names = [repr(partner) for partner in enzyme.isoschizomers()]
        seen.add(name)
        seen.update(partner_names)
        groups.append([name] + partner_names)
    return groups

In [53]:
# Group every known enzyme with its isoschizomers and store one
# comma-separated group per line
full_isosh = collect_isoschizomers(AllEnzymes)
with open("../data/processed/full_isoschizomers.txt", "w") as fout:
    fout.writelines(",".join(group) + "\n" for group in full_isosh)

In [None]:
# Stream every record of the assembly. PATH_TO_HUMAN_GENOME is a glob pattern,
# so it is expanded first and each matching FASTA file is parsed in turn.
human_genome = (
    record
    for fasta_path in sorted(glob.glob(PATH_TO_HUMAN_GENOME))
    for record in SeqIO.parse(fasta_path, "fasta")
)

for chromosome in human_genome:
    # Only nuclear DNA is analysed here, so skip the mitochondrial record.
    # The word stem is matched because the substring "mt" does not occur in
    # a description such as "Homo sapiens mitochondrion, complete genome".
    if "mitochondri" in chromosome.description.lower():
        continue

    # re.split needs a string, not a SeqRecord. Split on runs of >= 50 N so
    # that no analysed piece spans such a run.
    for partof_chr in re.split("N{50,}", str(chromosome.seq)):
        if not partof_chr:
            # A run of N at either end of the record yields an empty piece
            continue
        ana = Analysis(maximal_subset, Seq(partof_chr))
        for enzyme, positions in ana.with_sites().items():
            # TODO: use values to sort and write to table,
            # after that analyse the table and create the subset
            pass

    # NOTE(review): stops after the first nuclear record while the loop body
    # is still a stub; remove once the table is actually filled.
    break