In [None]:
!pip install exrex

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting exrex
  Downloading exrex-0.10.5.tar.gz (4.8 kB)
Building wheels for collected packages: exrex
  Building wheel for exrex (setup.py) ... [?25l[?25hdone
  Created wheel for exrex: filename=exrex-0.10.5-py3-none-any.whl size=9174 sha256=636ce06c42fa8e256d68c5cd705af7cbb3b31ade5364f03074868250fab08474
  Stored in directory: /root/.cache/pip/wheels/ed/56/7f/7372a25ffef1298b673cdb2c557b75c676e105b249c4d08791
Successfully built exrex
Installing collected packages: exrex
Successfully installed exrex-0.10.5


In [None]:
import pandas as pd

In [None]:
import exrex

import random
from typing import List


class QuadruplexGenerator:
    """
    Quadruplex generator, uses all available detection regexes to generate
    new sequences containing quadruplexes.

    Every pattern in ``REGEX_LIST`` describes four runs of three to five ``G``
    or ``C`` separated by three loops of one to seven nucleotides taken from
    ``[ATC]``. Generated sequences accumulate in ``self.quadruplexes`` across
    calls to :meth:`run`.

    Source: Emilia Puig Lombardi, Arturo Londoño-Vallejo, A guide to computational methods for G-quadruplex prediction, Nucleic Acids Research, Volume 48, Issue 1, 10 January 2020, Pages 1–15, https://doi.org/10.1093/nar/gkz1097
    """

    # Detection regexes that are sampled to produce new sequences.
    REGEX_LIST: List[str] = [
        "G{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}G{3,5}",
        "G{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}C{3,5}",
        "G{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}C{3,5}",
        "G{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}G{3,5}",
        "G{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}C{3,5}",
        "G{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}G{3,5}",
        "C{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}C{3,5}",
        "C{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}G{3,5}",
        "C{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}G{3,5}",
        "C{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}C{3,5}",
        "C{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}G{3,5}",
        "C{3,5}[ATC]{1,7}C{3,5}[ATC]{1,7}G{3,5}[ATC]{1,7}C{3,5}",
    ]
    # Nucleotides that may overwrite a position when noise is placed.
    NOISE_NUCLEOTIDES: List[str] = ["A", "T"]

    def __init__(self) -> None:
        # All sequences produced so far; run() appends to this list.
        self.quadruplexes: List[str] = []

    def _place_random_noise(
        self,
        *,
        sequence: str,
        minimum_number_of_noise: int,
        maximum_number_of_noise: int,
    ) -> str:
        """
        Overwrite randomly chosen positions of the sequence with noise nucleotides.

        The number of substitutions is drawn uniformly from the inclusive
        range [minimum_number_of_noise, maximum_number_of_noise]. Every
        position of the sequence, the first one included, is equally likely
        to be chosen. A position can be chosen more than once and the
        replacement can equal the nucleotide already there, so the number of
        positions that actually change may be lower than the number drawn.

        :param sequence: generated sequence
        :param minimum_number_of_noise: minimum number of noise nucleotides
        :param maximum_number_of_noise: maximum number of noise nucleotides
        :return: sequence with random noise, same length as the input
        :raises ValueError: if minimum_number_of_noise is greater than
            maximum_number_of_noise
        """
        nucleotides = list(sequence)
        substitutions = random.randint(minimum_number_of_noise, maximum_number_of_noise)

        # Nothing can be overwritten in an empty sequence.
        if not nucleotides:
            return sequence

        for _ in range(substitutions):
            # randrange covers exactly the valid indexes 0 .. len - 1.
            position = random.randrange(len(nucleotides))
            nucleotides[position] = random.choice(self.NOISE_NUCLEOTIDES)

        return "".join(nucleotides)

    def _generate_quadruplexes(
        self,
        *,
        quadruplex_regex: str,
        limit: int,
        place_random_noise: bool = False,
        minimum_number_of_noise: int,
        maximum_number_of_noise: int,
    ) -> List[str]:
        """
        Generate quadruplex sequences based on a regex.

        Strings are sampled with ``exrex.getone`` until ``limit`` distinct
        ones have been collected. Distinctness is checked on the sampled
        string, before any noise is applied, so two returned sequences can
        still be equal after noise. The loop only stops once ``limit``
        distinct strings were seen; a limit larger than the number of strings
        the regex can produce therefore never terminates.

        :param quadruplex_regex: quadruplex detection regex used for generation
        :param limit: number of generated quadruplexes per one regex
        :param place_random_noise: place random noise to the sequence
        :param minimum_number_of_noise: minimum number of noise nucleotides
        :param maximum_number_of_noise: maximum number of noise nucleotides
        :return: list of ``limit`` quadruplex sequences (empty if limit <= 0)
        """
        local_quadruplexes: List[str] = []
        # Set membership keeps the duplicate check constant-time per sample.
        seen_sequences = set()

        while len(local_quadruplexes) < limit:
            generated_quadruplex = exrex.getone(quadruplex_regex)

            if generated_quadruplex in seen_sequences:
                continue
            seen_sequences.add(generated_quadruplex)

            if place_random_noise:
                generated_quadruplex = self._place_random_noise(
                    sequence=generated_quadruplex,
                    minimum_number_of_noise=minimum_number_of_noise,
                    maximum_number_of_noise=maximum_number_of_noise,
                )

            local_quadruplexes.append(generated_quadruplex)

        return local_quadruplexes

    def run(
        self,
        *,
        limit_per_regex: int = 10,
        place_random_noise: bool = False,
        minimum_number_of_noise: int = 0,
        maximum_number_of_noise: int = 3,
    ) -> None:
        """
        Run quadruplex generator for every regex in ``REGEX_LIST``.

        :param limit_per_regex: number of generated quadruplexes per one regex
        :param place_random_noise: place random noise to the sequence
        :param minimum_number_of_noise: minimum number of noise nucleotides
        :param maximum_number_of_noise: maximum number of noise nucleotides
        :return: None; the generated sequences are appended to
            ``self.quadruplexes``
        """

        for regex in self.REGEX_LIST:
            self.quadruplexes += self._generate_quadruplexes(
                quadruplex_regex=regex,
                limit=limit_per_regex,
                minimum_number_of_noise=minimum_number_of_noise,
                maximum_number_of_noise=maximum_number_of_noise,
                place_random_noise=place_random_noise,
            )


In [None]:
# Seed Python's random module so the noise placement is reproducible on a fresh kernel.
# NOTE(review): exrex presumably samples through the same module - confirm,
# otherwise the generated base sequences still differ between runs.
random.seed(42)

# 60000 sequences per regex, each with 1-10 noise substitutions.
generator = QuadruplexGenerator()
generator.run(
    limit_per_regex=60000,
    place_random_noise=True,
    minimum_number_of_noise=1,
    maximum_number_of_noise=10,
)

In [None]:
# Drop duplicates while keeping first-occurrence order. Iterating a set of
# strings can yield a different order in every interpreter run (string hash
# randomization), which would make the downstream padding irreproducible.
unique = list(dict.fromkeys(generator.quadruplexes))

In [None]:
def random_sequence(left: int, right: int) -> tuple:
    """
    Build two random nucleotide strings.

    :param left: length of the first string
    :param right: length of the second string
    :return: tuple ``(first, second)``; each character is picked with
        ``random.choice`` from A, T, G and C
    """
    alphabet = ["A", "T", "G", "C"]

    def draw(count: int) -> str:
        # One random.choice call per nucleotide, in order.
        return "".join(random.choice(alphabet) for _ in range(count))

    left_part = draw(left)
    right_part = draw(right)

    return left_part, right_part

In [None]:
def fifty(sequence: str, length: int = 50) -> str:
    """
    Pad a sequence with random nucleotides to a fixed total length.

    The sequence is kept intact and placed at a random offset; the missing
    nucleotides are split at random between a left and a right flank that
    are produced by ``random_sequence``.

    :param sequence: sequence to embed
    :param length: total length of the returned sequence (50 by default)
    :return: string of exactly ``length`` characters that contains ``sequence``
    :raises ValueError: if ``sequence`` is longer than ``length``
    """
    rest_length = length - len(sequence)
    if rest_length < 0:
        raise ValueError(
            f"sequence of length {len(sequence)} does not fit into {length} nucleotides"
        )

    # Random split of the padding between the two flanks.
    left = random.randint(0, rest_length)
    right = rest_length - left

    l_s, r_s = random_sequence(left, right)

    return l_s + sequence + r_s


In [None]:
def calc_score(sequence):
    """
    Score how strongly a sequence is dominated by G/C runs.

    The sequence is scanned left to right. Inside a run every ``G`` adds 1
    and every ``C`` subtracts 1 from a running value, so adjacent G and C
    offset each other. Any other character closes the run: the square of the
    running value is added to the total and the running value is reset. A run
    that reaches the end of the sequence is closed as well.

    :param sequence: nucleotide sequence
    :return: sum of squared run values divided by the sequence length
        (never negative); 0.0 for an empty sequence
    """
    if not sequence:
        return 0.0

    total = 0
    partial = 0

    for base in sequence:
        if base == 'C':
            partial -= 1
        elif base == 'G':
            partial += 1
        else:
            total += partial * partial
            partial = 0

    # Close a run that ends together with the sequence.
    total += partial * partial

    return total / len(sequence)

In [None]:
# Pad every unique generated sequence with random flanks via fifty().
unique_fifty = list(map(fifty, unique))

In [None]:
# Preview the first ten padded sequences.
unique_fifty[:10]

['TACGCAGTGGTCATTCCGTGTGCCACTCTCCCCCAAATTCACTGGTCCTC',
 'GTCTCTCTAGCGGTGTTATCCCCCGGGGGATTTATCCCACGACGAACGGT',
 'AGGCACACCATCGACGAGGGGTCCAACTCAGAGCATTCCTGTAATTGAGC',
 'GTGGGGGAAACGGGGGACACAACCACCTGGGGTAAGTACCTCCCTACTAC',
 'CAATACCGCTGGTTCGTCCGGGAAGGTGTGGGGACTTTAGGGGCCACTCT',
 'TCCCCTTGTGATTAAAGTAGTACCCCCTCCACGATTCAACCAACGGAAAC',
 'GTAGAGGCGAGCACTAGGAGACCTCCTTAGGGACCTCCCAATTGCGGGCT',
 'TGCTACAGATCCGGGTTACGGGACCCCCCCCCCATAGACTGTATCATAAC',
 'CATCCCAACGCCGTAGTTATGCCCCCAATCCTGGGGGACCCGTCAGCGAG',
 'GGGGACCCGGGCCTCTACCCCATCCTTGGGGCGATTAATCGATAAGGCTC']

In [None]:
# 600000 random sequences of length 50; only the left part of each pair is
# used because the right length is 0.
random_sequences = [random_sequence(50, 0)[0] for _ in range(600000)]

In [None]:
# Preview the first ten random sequences.
random_sequences[:10]

['TATTTTTACGAAGAAATGAGGCAACGCCTGTTTCCCACCAATGGCATGTG',
 'GCAATGCCTGAAGTCACAAGACAGAACTGCCTAGATTGGAGCCGAGCTCC',
 'CGCTCGTACGTGTTATGGAGGATCACCGTCTGTGTTAAAGTGGGGCAGGG',
 'ACTAGAGGCTACCCTTAACGTAATGTTGCATAACATGGTTCCTCTCCTCG',
 'CGCAAATCTAAATCGTTGGTGGCTCATGGAAGACGTTGATGATTGCCGTC',
 'GACGTATGGTTCTCTAGCGGCGGTCACCGTATTAAACGCGCGCGTAAAGG',
 'TCCTGCACGAAACTGATGGGGGGGCAATTGACAAGAATTTGCCTCCGAGT',
 'TTTAGAAGCGTAGCCATTGCTTTCAAAAGAGAAAGTGCGTCCTGCTGGAT',
 'TTGGGGGTGTGGTCTATCGGCTGCGACACCGCTACTGGTAGAATTGTATT',
 'CGACGTAATGGAAGAGCTTATACGTTCTTCCCCCGTGCGTAGTCCCGTTA']

In [None]:
# Reuse the generator's detection patterns instead of maintaining a second,
# hand-copied list that could drift apart from the class constant.
REGEX_LIST: List[str] = list(QuadruplexGenerator.REGEX_LIST)

In [None]:
# Collects the random sequences that are kept by the filter loop.
random_sequences_filtered = list()

In [None]:
import re

# Parse each detection pattern once instead of once per sequence.
compiled_patterns = [re.compile(pattern) for pattern in REGEX_LIST]

# Start from an empty result so that re-running this cell does not append
# the same sequences a second time.
random_sequences_filtered = []
counter = 0  # number of rejected random sequences

for rnd_seq in random_sequences:
    # Reject a sequence when its score exceeds 1.2 or when any detection
    # pattern matches it. A rejected sequence must not reach the negative
    # set, hence the `continue` before the append.
    score_too_high = calc_score(rnd_seq) > 1.2
    if score_too_high or any(pattern.search(rnd_seq) for pattern in compiled_patterns):
        counter += 1
        continue

    random_sequences_filtered.append(rnd_seq)

print(counter)

In [None]:
# Label every kept random sequence with flag 0.
random_sequences_with_flag = [[rnd_seq, 0] for rnd_seq in random_sequences_filtered]

In [None]:
# Preview the first ten labelled random sequences.
random_sequences_with_flag[:10]

In [None]:
# Label every padded generated sequence with flag 1.
unique_fifty_with_flag = [[padded, 1] for padded in unique_fifty]

In [None]:
# Preview the first ten labelled generated sequences.
unique_fifty_with_flag[:10]

[['TAACCCATTCCCATTTGGGGGAGGGGTTGAGAAACACGGTGACGCGAGAC', 1],
 ['GGTACGTGGCTTCCCCTAAAGGGGGTCCCCATCCCACTTCAGGGTGTCAA', 1],
 ['GTTCTAGGGACCCTGCGACCCCCTTCTCCCCCTTCCCCACCCCAGGGTAG', 1],
 ['ACCTAGCCCCCCCTCCCCCCTGGGGGCCCCCCGCCTGGCAGTCCCGAAGA', 1],
 ['CTCCCCTCACTAGTCTAGTACGTACGGGGGCTCAGGGCTTCCAGCCACTC', 1],
 ['TCTACAAGCGGCGTCCTCAATCCCCCATGGGACCCCCGGGCAGGCCCCGA', 1],
 ['GCCCGAGCATCAGGGGGAATATGGGGGAACCCACCCCACCCCCCGCATGA', 1],
 ['TGGCCGGGGGTAACCCCCCTTAAGGGGATCCCACCTGTTGGCCTTCGACG', 1],
 ['AATATCTTTTTGGGCCCGACCCCCTCTTTAGGGGCCCACCTTACCTAGGG', 1],
 ['GGAATGGAACAGGCTGGGGGACACATGGGGGCTTATGGGCCGGGGCGGTG', 1]]

In [None]:
# One combined list: the flag-1 rows first, followed by the flag-0 rows.
sequences = [*unique_fifty_with_flag, *random_sequences_with_flag]

In [None]:
# Preview the first ten rows of the combined list.
sequences[:10]

[['TAACCCATTCCCATTTGGGGGAGGGGTTGAGAAACACGGTGACGCGAGAC', 1],
 ['GGTACGTGGCTTCCCCTAAAGGGGGTCCCCATCCCACTTCAGGGTGTCAA', 1],
 ['GTTCTAGGGACCCTGCGACCCCCTTCTCCCCCTTCCCCACCCCAGGGTAG', 1],
 ['ACCTAGCCCCCCCTCCCCCCTGGGGGCCCCCCGCCTGGCAGTCCCGAAGA', 1],
 ['CTCCCCTCACTAGTCTAGTACGTACGGGGGCTCAGGGCTTCCAGCCACTC', 1],
 ['TCTACAAGCGGCGTCCTCAATCCCCCATGGGACCCCCGGGCAGGCCCCGA', 1],
 ['GCCCGAGCATCAGGGGGAATATGGGGGAACCCACCCCACCCCCCGCATGA', 1],
 ['TGGCCGGGGGTAACCCCCCTTAAGGGGATCCCACCTGTTGGCCTTCGACG', 1],
 ['AATATCTTTTTGGGCCCGACCCCCTCTTTAGGGGCCCACCTTACCTAGGG', 1],
 ['GGAATGGAACAGGCTGGGGGACACATGGGGGCTTATGGGCCGGGGCGGTG', 1]]

In [None]:
# Two-column frame: the nucleotide string and its 0/1 flag.
column_names = ["sequence", "g4flag"]
df = pd.DataFrame(sequences, columns=column_names)

In [None]:
from sklearn.utils import shuffle


In [None]:
# A fixed random_state makes the row order reproducible across reruns.
df = shuffle(df, random_state=42)

In [None]:
# Display the shuffled frame; the notebook shows only its first and last rows.
df

Unnamed: 0,sequence,g4flag
457240,ACCTGCCCACTCCCCCTTCCAACGGGGGTCTAGGGGGAGTATGTTC...,1
333291,GGGCTCGGAACCCCCACCTATGGGGGCTATCCCCCATATCTATGCA...,1
143038,GGAGGTGCGTACACGACTATTCGTCCCCCCACACATACTCTCTCCT...,1
295958,TTCTCTCAGGCAAGCCGACTCACTCAGGGGCCAACACTCCTGGGGG...,1
951772,GGATTCTCTGGGATAGCGTCTTGTAAGAGCTCCGGCTACCGAGCCG...,0
...,...,...
571467,GGGACAATACCCCTCACCCCATAATCGGGGCGATCGAAGCTCGGCC...,1
218614,TAACGGTAACCTACCGACCGGAGAGGGGGCCCCCTTACGGGGTTAA...,1
571370,GGGCCATGTATCCCCTCTGGGACCAAGGGATCTGGTTGTAAACCCC...,1
182381,CGCCCCATAACTCCTCCCCACTATCCCCTCCGCTTATACTTAACTA...,1


In [None]:
# Persist the shuffled dataset as CSV.
# NOTE(review): the target looks like a mounted Google Drive folder in Colab, but
# no mount call is visible in this notebook - confirm the path exists before running.
# NOTE(review): no `index=` argument is given, so pandas' default applies and the
# row index is written as the first, unnamed column. Because `sequences` lists the
# flag-1 rows before the flag-0 rows, that index reveals the label; consider
# `index=False` if no consumer relies on the column.
df.to_csv('/content/drive/MyDrive/Vyzkum_DNA/Quadtree/dataset.csv')