## We need to find set of REs that:
1. one of the REs must cut human mtDNA only one time
2. other REs must not cut mtDNA but must cut nuclear DNA to pieces less than 3-5kb

In [1]:
import re
import random
import glob
from collections import Counter
from multiprocessing import Pool
from typing import Dict, List

import pandas as pd
import matplotlib.pyplot as plt
from Bio.Restriction import Analysis, AllEnzymes, RestrictionBatch
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import tqdm

In [2]:
PIECES_SIZE = 5000
PATH_TO_HUMAN_GENOME = "../data/external/GCF_000001405.40/ncbi_dataset/data/GCF_000001405.40/*.fna"
# PATH_TO_RE = "../data/processed/cuted_seqs_num.csv"
PATH_TO_REF_MT = "../data/external/NC_012920.1.fasta"
PATH_TO_SEQS_MT = "../data/raw/sequence.fasta"

In [3]:
len(AllEnzymes)

978

In [4]:
# human_genome = SeqIO.parse(PATH_TO_HUMAN_GENOME, "fasta")
mt_seqs = SeqIO.parse(PATH_TO_SEQS_MT, "fasta")
ref_mt = next(SeqIO.parse(PATH_TO_REF_MT, "fasta"))

## Search the enzymes that don't cut mtDNA

In [5]:
def extract_RE_without_site_on_mt(rec: SeqRecord) -> List[Dict]:
    ana = Analysis(AllEnzymes, rec.seq, linear=False)
    data = []
    for restr_enz in ana.without_site():
        re_name = repr(restr_enz)
        one_data = {"RE": re_name, "SeqName": rec.description}
        data.append(one_data)
    return data

In [7]:
# 27 min and 10GB of RAM
threads = 24
with Pool(threads) as p:
    collection_of_pot_rs = p.map(extract_RE_without_site_on_mt, mt_seqs)

In [11]:
pot_rs = []
for xx in collection_of_pot_rs:
    for x in xx:
        pot_rs.append(x)

re_without_site = pd.DataFrame(pot_rs)
re_without_site.to_csv("../data/interim/re_without_site.csv", index=None)
re_without_site.head()

Unnamed: 0,RE,SeqName
0,ArsI,MK968879.1 Homo sapiens isolate YHL_TK036_F4b1...
1,BglII,MK968879.1 Homo sapiens isolate YHL_TK036_F4b1...
2,FspI,MK968879.1 Homo sapiens isolate YHL_TK036_F4b1...
3,PspOMII,MK968879.1 Homo sapiens isolate YHL_TK036_F4b1...
4,MteI,MK968879.1 Homo sapiens isolate YHL_TK036_F4b1...


In [16]:
re_maximal_subset = re_without_site.RE.value_counts().reset_index()
re_maximal_subset.columns = ["RE", "NotCuttedSeqsNum"]
re_maximal_subset["Percentage"] = re_maximal_subset.NotCuttedSeqsNum / len(collection_of_pot_rs) * 100
re_maximal_subset.to_csv("../data/processed/NotCuttedSeqsNum.csv", index=None)
re_maximal_subset.head()

Unnamed: 0,RE,NotCuttedSeqsNum,Percentage
0,Sse232I,56445,100.0
1,SwaI,56445,100.0
2,Sse8387I,56445,100.0
3,MreI,56445,100.0
4,MauBI,56445,100.0


## Search of minimal subset of REs that destroy muclear DNA

1. initially need to add to the set ClaI, as RE that cut mtDNA only one time
2. drop RE dublicates
3. apply approach from GO hw2

In [5]:
re_maximal_subset = pd.read_csv("../data/processed/NotCuttedSeqsNum.csv")
re_maximal_subset

Unnamed: 0,RE,NotCuttedSeqsNum,Percentage
0,Sse232I,56445,100.000000
1,SwaI,56445,100.000000
2,Sse8387I,56445,100.000000
3,MreI,56445,100.000000
4,MauBI,56445,100.000000
...,...,...,...
255,Cma23826I,1,0.001772
256,Acc65V,1,0.001772
257,PpiP13II,1,0.001772
258,Sse8647I,1,0.001772


In [6]:
# sample RE that don't cut most of mtDNA
cutoff = 95 # %
REs_without_site_on_mt = re_maximal_subset[re_maximal_subset.Percentage > cutoff].RE.values
print(REs_without_site_on_mt.shape)
REs_without_site_on_mt

(103,)


array(['Sse232I', 'SwaI', 'Sse8387I', 'MreI', 'MauBI', 'NotI', 'AscI',
       'SgrDI', 'AbsI', 'SmiI', 'SbfI', 'PalAI', 'MteI', 'CciNI', 'SgsI',
       'FspAI', 'SdaI', 'AsiSI', 'RigI', 'RgaI', 'FseI', 'SfaAI', 'SgfI',
       'NpeUS61II', 'MluI', 'SrfI', 'CspI', 'Rsr2I', 'Sth20745III',
       'RsrII', 'CpoI', 'GauT27I', 'TspARh3I', 'SfiI', 'Bsp460III',
       'McaTI', 'BssHII', 'PauI', 'PteI', 'Pst273I', 'BsePI', 'BoxI',
       'PshAI', 'BstPAI', 'Lmo370I', 'PvuI', 'Ple19I', 'BshTI', 'AgeI',
       'CspAI', 'PinAI', 'AsiGI', 'PspXI', 'Eco72I', 'PmaCI', 'AcvI',
       'PmlI', 'PspCI', 'BbrPI', 'SmaI', 'XmaI', 'Cfr9I', 'TspMI',
       'RpaB5I', 'Ssp714II', 'Ecl35734I', 'AdeI', 'DraIII', 'MspSC27II',
       'BtuMI', 'NruI', 'Bsp68I', 'RruI', 'BspGI', 'AspJHL3II', 'SstE37I',
       'UbaF13I', 'ArsI', 'BglII', 'TssI', 'PsrI', 'MabI', 'CsiI',
       'SexAI', 'DseDI', 'DrdI', 'AasI', 'PspOMII', 'MstI', 'NsbI',
       'FspI', 'Acc16I', 'RpaBI', 'UbaF12I', 'CspCI', 'PfrJS12V',
       'CcrNAIII'

In [7]:
# create sample batch (custom data structure)
ClaI = AllEnzymes.get("ClaI")
excess_maximal_subset = RestrictionBatch(REs_without_site_on_mt)
excess_maximal_subset

RestrictionBatch(['AasI', 'AbsI', 'Acc16I', 'AcvI', 'AdeI', 'AgeI', 'ArsI', 'AscI', 'AsiGI', 'AsiSI', 'AspJHL3II', 'BbrPI', 'BglII', 'BoxI', 'BsaBI', 'Bse8I', 'BseJI', 'BsePI', 'BshTI', 'Bsp460III', 'Bsp68I', 'BspGI', 'BssHII', 'BstPAI', 'BtuMI', 'CciNI', 'CcrNAIII', 'Cfr9I', 'CpoI', 'CsiI', 'CspAI', 'CspCI', 'CspI', 'DraIII', 'DrdI', 'DseDI', 'Ecl35734I', 'Eco72I', 'FseI', 'FspAI', 'FspI', 'GauT27I', 'HspMHR1II', 'Lmo370I', 'MabI', 'MauBI', 'McaTI', 'MluI', 'MreI', 'MspSC27II', 'MstI', 'MteI', 'NotI', 'NpeUS61II', 'NruI', 'NsbI', 'PalAI', 'PauI', 'PfrJS12V', 'PinAI', 'Ple19I', 'PmaCI', 'PmlI', 'PshAI', 'PspCI', 'PspOMII', 'PspXI', 'PsrI', 'Pst273I', 'PteI', 'PvuI', 'RgaI', 'RigI', 'RpaB5I', 'RpaBI', 'RruI', 'Rsr2I', 'RsrII', 'SalI', 'SbfI', 'SdaI', 'SexAI', 'SfaAI', 'SfiI', 'SgfI', 'SgrAII', 'SgrDI', 'SgsI', 'SmaI', 'SmiI', 'SrfI', 'Sse232I', 'Sse8387I', 'Ssp714II', 'SstE37I', 'Sth20745III', 'SwaI', 'TspARh3I', 'TspMI', 'TssI', 'UbaF12I', 'UbaF13I', 'XmaI'])

In [8]:
def collect_isoschizomers(enzymes: RestrictionBatch) -> List[List[str]]:
    """Search and collect all isoschizomers in passed enzymes"""
    data = []
    visited = set()
    for RE in enzymes:
        if repr(RE) in visited:
            continue

        visited.add(repr(RE))
        isosh_names = []
        for isosh in RE.isoschizomers():
            visited.add(repr(isosh))
            isosh_names.append(repr(isosh))
        data.append([repr(RE)] + isosh_names)
    return data

In [9]:
# # collect full collection of isoschizomers
# full_isosh = collect_isoschizomers(AllEnzymes)
# with open("../data/processed/full_isoschizomers.txt", "w") as fout:
#     for batch in full_isosh:
#         fout.write(",".join(batch) + "\n")

In [10]:
# collect collection of isoschizomers for used sample REs
sufficient_sample = []
full_isosh = collect_isoschizomers(excess_maximal_subset)
with open("../data/processed/used_re_isoschizomers.txt", "w") as fout:
    for batch in full_isosh:
        sufficient_sample.append(batch[0])
        fout.write(",".join(batch) + "\n")

print(len(sufficient_sample))
maximal_subset = RestrictionBatch(sufficient_sample)
maximal_subset

57


RestrictionBatch(['AbsI', 'ArsI', 'AspJHL3II', 'BglII', 'Bse8I', 'BshTI', 'Bsp460III', 'BspGI', 'BssHII', 'BstPAI', 'CcrNAIII', 'Cfr9I', 'CspCI', 'DraIII', 'DseDI', 'Ecl35734I', 'Eco72I', 'FspAI', 'GauT27I', 'HspMHR1II', 'Lmo370I', 'MauBI', 'MluI', 'MreI', 'MspSC27II', 'MstI', 'MteI', 'NotI', 'NpeUS61II', 'PalAI', 'PfrJS12V', 'PspOMII', 'PspXI', 'PsrI', 'Pst273I', 'PvuI', 'RgaI', 'RigI', 'RpaB5I', 'RpaBI', 'RruI', 'RsrII', 'SalI', 'SexAI', 'SfiI', 'SgrAII', 'SgrDI', 'SmiI', 'SrfI', 'Sse8387I', 'Ssp714II', 'SstE37I', 'Sth20745III', 'TspARh3I', 'TssI', 'UbaF12I', 'UbaF13I'])

In [11]:
for fp in glob.glob("../data/external/GCF_000001405.40/ncbi_dataset/data/GCF_000001405.40/*.fna"):
    if "chrMT" in fp:
        continue
    
    for record in SeqIO.parse(fp, format="fasta"):
        # ana = Analysis(maximal_subset, record.seq)
        # for enzyme, positions in ana.with_sites().items():
        #     # use values to sort and write to table
        #     # after that analize table and create subset
        #     break
            

        break
    break

In [12]:
ana = Analysis(maximal_subset, record.seq)

In [67]:
record, len(record)

(SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN'), id='NC_000014.9', name='NC_000014.9', description='NC_000014.9 Homo sapiens chromosome 14, GRCh38.p14 Primary Assembly', dbxrefs=[]),
 107043718)

In [18]:
data = []
for enzyme, positions in ana.with_sites().items():
    for pos in positions:
        data.append({"RE": repr(enzyme), "Pos": pos})
cutpos = pd.DataFrame(data).sort_values("Pos")
cutpos.to_csv("../data/interim/Chr1_cutpos.csv", index=None)
cutpos

Unnamed: 0,RE,Pos
159882,Eco72I,16000812
66326,DraIII,16000815
307962,Ecl35734I,16001785
353802,ArsI,16003465
353803,ArsI,16003497
...,...,...
159881,BstPAI,106883621
206916,AspJHL3II,106883656
247162,UbaF13I,106883659
125010,PspOMII,106883683


In [35]:
cutpos.reset_index(drop=True, inplace=True)

In [41]:
data = []
intervals = []
beg = 0
prev_RE = None
for i, row in cutpos.iterrows():
    end = row.Pos
    data.append({
        "Begin": beg,
        "End": end,
        "REbegin": prev_RE,
        "REend": row.RE,
        "Lenght": end - beg,
    })
    beg = end
    prev_RE = row.RE

df = pd.DataFrame(data)

In [42]:
df

Unnamed: 0,Begin,End,REbegin,REend,Lenght
0,0,16000812,,Eco72I,16000812
1,16000812,16000815,Eco72I,DraIII,3
2,16000815,16001785,DraIII,Ecl35734I,970
3,16001785,16003465,Ecl35734I,ArsI,1680
4,16003465,16003497,ArsI,ArsI,32
...,...,...,...,...,...
380076,106883553,106883621,BssHII,BstPAI,68
380077,106883621,106883656,BstPAI,AspJHL3II,35
380078,106883656,106883659,AspJHL3II,UbaF13I,3
380079,106883659,106883683,UbaF13I,PspOMII,24


In [47]:
df.Lenght.value_counts().sort_index().tail(10)

4545        1
4799        1
5086        1
5331        1
5424        1
12489       1
51955       1
100145      1
150255      1
16000812    1
Name: Lenght, dtype: int64

In [17]:
x = pd.Series(cutpos.Pos.values[1:] - cutpos.Pos.values[:-1])
y = x.value_counts()

In [None]:
# for fp in glob.glob("../data/external/GCF_000001405.40/ncbi_dataset/data/GCF_000001405.40/*.fna"):
#     if "chrMT" in fp:
#         continue
    
#     for record in SeqIO.parse(fp, format="fasta"):
#         for partof_chr in re.split("N{50,}", record):
#             if "mt" in partof_chr.description.lower():
#                 continue
#             ana = Analysis(maximal_subset, partof_chr.seq)
#             for enzyme, positions in ana.with_sites().items():
#                 # use values to sort and write to table
#                 # after that analize table and create subset
#                 pass
            

#         break