In [1]:

from google.colab import drive
import sys
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Put the project's sub-folders on sys.path (same entries, same order as
# four consecutive appends) so modules stored in them can be imported.
sys.path.extend([
    '/content/drive/My Drive/AbelBioToken-main/AbelBioToken-main/data',
    '/content/drive/My Drive/AbelBioToken-main/AbelBioToken-main/model',
    '/content/drive/My Drive/AbelBioToken-main/AbelBioToken-main/train',
    '/content/drive/My Drive/AbelBioToken-main/AbelBioToken-main/test',
])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import csv
from enum import Enum, auto
import random
import numpy as np
import generator

import os
import sys

# Explicitly define the directory where the files are located.
# The three YAML files below are read from it, and generate_dataset /
# shuffle_file resolve "aa_data.csv" relative to it.
explicit_dir = '/content/drive/My Drive/AbelBioToken-main/AbelBioToken-main-copy/data'
# NOTE(review): this append runs *after* `import generator` earlier in this
# cell, so it cannot affect where `generator` was imported from. It also
# points at the "-copy" tree, whereas the first cell adds folders from the
# non-copy tree -- confirm which checkout is supposed to supply the code.
sys.path.append('/content/drive/My Drive/AbelBioToken-main/AbelBioToken-main-copy')

# Module-level Generator instance shared by generate_dataset(), which calls
# gen.get(<sequence>) and feeds the result to get_token().
gen = generator.Generator(
    struc_file=os.path.join(explicit_dir, "structure.yaml"),
    set_file=os.path.join(explicit_dir, "sets.yaml"),
    cap_file=os.path.join(explicit_dir, "cap.yaml"),
)

# Lookup table from a 1-based digit (1..22) to a one-letter residue code.
# The letters are listed in the order of their codes; there is deliberately
# no entry for 0 (number_sequence substitutes a random non-zero digit).
aa_dict = {
    code: letter
    for code, letter in enumerate("ACDEFGHIKLMNOPQRSTUVWY", start=1)
}

class Category(Enum):
    """Kind of characteristic curve handed to process_curve.

    POT curves get two extra gradient-threshold points added to their
    zero crossings; CAP curves do not.
    """

    # presumably "potential" and "capacity/capacitance" -- TODO confirm naming
    POT = 1
    CAP = 2

def effective_num(num, sig_figs) -> int:
    """Return the leading ``sig_figs`` significant digits of ``num`` as an integer.

    The magnitude (power of ten) of ``num`` is discarded and the sign is kept,
    e.g. ``0.118 -> 118``, ``11.8 -> 118`` and ``-0.349 -> -349`` for
    ``sig_figs=3``.

    Args:
        num: Real number to convert (Python or NumPy scalar).
        sig_figs: Number of significant digits to keep.

    Returns:
        Signed integer made of the rounded significant digits; ``0`` if
        ``num`` is zero.
    """
    if num == 0:
        return 0
    # Position of the leading digit: 0 for 1..9.99, -1 for 0.1..0.999, 1 for 10..99.9, ...
    order = int(np.floor(np.log10(np.abs(num))))
    # Shift the decimal point so exactly `sig_figs` digits sit left of it.
    # The exponent is `sig_figs - order - 1` for any sign of `order`
    # (using abs(order) over-scales every |num| >= 10); a float base keeps
    # negative exponents legal.
    scaled = num * 10.0 ** (sig_figs - order - 1)
    # Round rather than truncate so float noise such as 28.999999999999996
    # does not lose a unit in the last digit.
    return int(np.round(scaled))

def get_zeros(curve: np.ndarray) -> np.ndarray:
    """Locate sign changes between consecutive samples of ``curve``.

    Index ``i`` is reported when ``sign(curve[i + 1])`` differs from
    ``sign(curve[i])`` and the two samples differ by at least ``1e-10``.

    Args:
        curve: Sampled curve values.

    Returns:
        Array of the indices ``i`` that satisfy both conditions.
    """
    step = np.diff(curve)
    sign_change = np.diff(np.sign(curve))
    # Ignore sign flips between samples that are numerically identical.
    is_crossing = (np.abs(step) >= 1e-10) & (sign_change != 0)
    return np.nonzero(is_crossing)[0]

def process_curve(curve: np.ndarray, ctg: Category):
    """Turn one sampled curve into a flat array of integer features.

    The result is the curve's sign-change indices (see ``get_zeros``)
    followed by ``[0, min, max, 0]``, where min/max are the curve extrema
    reduced to three significant digits by ``effective_num``.

    For ``Category.POT`` the first and last indices at which the absolute
    gradient crosses 0.00075 are merged into the sorted index list. If the
    gradient never crosses that threshold no extra indices are added
    (previously this case raised ``IndexError``).

    Args:
        curve: Sampled curve values.
            # assumes 1400 evenly spaced samples over 0..14 (the grid used
            # for the gradient spacing) -- TODO confirm against the generator
        ctg: Which kind of curve this is.

    Returns:
        1-D integer array: indices, then ``0, min, max, 0``.

    Raises:
        ValueError: If ``curve`` is empty (no minimum/maximum exists).
    """
    ph_array = np.linspace(0, 14, 1400)
    zero_points = get_zeros(curve)
    extreme = np.array(
        [0, effective_num(curve.min(), 3), effective_num(curve.max(), 3), 0]
    )
    if ctg == Category.POT:
        # Gradient with respect to the sampling grid's uniform spacing.
        temp_grad = np.gradient(curve, ph_array[1] - ph_array[0])
        # Indices where |gradient| crosses the 0.00075 threshold.
        extend_zeros = get_zeros(np.abs(temp_grad) - 0.00075)
        # Guard: a curve that is everywhere steeper or everywhere flatter
        # than the threshold yields no crossings to index into.
        if extend_zeros.size:
            zero_points = np.append(zero_points, (extend_zeros[0], extend_zeros[-1]))
            zero_points.sort()
    return np.append(zero_points, extreme)

def get_token(chara_curves: tuple):
    """Build the full token for a pair of characteristic curves.

    Args:
        chara_curves: Indexable whose element 0 is processed as a
            ``Category.POT`` curve and element 1 as a ``Category.CAP`` curve.

    Returns:
        1-D array: the POT features followed by the CAP features.
    """
    kinds = (Category.POT, Category.CAP)
    # Process in positional order: index 0 -> POT, index 1 -> CAP.
    parts = [
        process_curve(chara_curves[position], ctg=kind)
        for position, kind in enumerate(kinds)
    ]
    return np.append(parts[0], parts[1])

def number_sequence(num, base=23):
    """Encode a positive integer as a string of residue letters.

    ``num`` is split into base-``base`` digits, least-significant digit
    first, and each digit is mapped through ``aa_dict``. Because ``aa_dict``
    has no entry for 0, every zero digit is replaced by a random digit in
    ``1 .. base - 1`` drawn from NumPy's global RNG, so the result is not
    deterministic for numbers containing a zero digit.

    Args:
        num: Integer to encode; values ``<= 0`` give an empty string.
        base: Radix used for the digit expansion.

    Returns:
        One letter per digit, least-significant digit first.
    """
    digits = []
    remaining = num
    while remaining > 0:
        remaining, digit = divmod(remaining, base)
        if digit == 0:
            # No letter is assigned to 0: substitute a random valid digit.
            digit = np.random.randint(1, base)
        digits.append(digit)
    return "".join(aa_dict[digit] for digit in digits)

def generate_dataset(num, length: int, verbose: bool = True):
    """Append ``num`` randomly chosen sequences and their tokens to aa_data.csv.

    Distinct integers are sampled without replacement from all
    ``length``-digit base-23 numbers, i.e. ``[23**(length-1), 23**length)``.
    Each integer is encoded with ``number_sequence``, tokenised via
    ``get_token(gen.get(sequence))`` and written as one CSV row
    ``[sequence, *token]`` to ``<explicit_dir>/aa_data.csv`` (append mode).

    The integers are drawn with ``random.sample`` over a ``range`` so the
    population is never materialised in memory; seed the ``random`` module
    to make the draw reproducible.

    Args:
        num: Number of rows to generate.
        length: Number of base-23 digits (letters) per sequence; must be >= 1.
        verbose: When true (default), print every row as it is written.

    Returns:
        ``np.ndarray`` (int64) of the sampled integers, in write order.

    Raises:
        ValueError: If ``length < 1`` or ``num`` exceeds the population size
            (or is negative).
    """
    if length < 1:
        raise ValueError("length must be a positive integer")
    # Half-open range covering every number with exactly `length` base-23 digits.
    lower, upper = 23 ** (length - 1), 23**length
    rand_num = np.array(random.sample(range(lower, upper), num), dtype=np.int64)
    # newline="" lets the csv module control line endings, as its docs require.
    with open(
        os.path.join(explicit_dir, "aa_data.csv"), mode="a", newline=""
    ) as file:
        file_writer = csv.writer(file)
        for value in rand_num:
            aa_chain = number_sequence(value)
            output_list = [aa_chain] + get_token(gen.get(aa_chain)).tolist()
            if verbose:
                print(output_list)
            file_writer.writerow(output_list)
    return rand_num

def shuffle_file(filepath):
    """Shuffle the lines of a text file in place.

    The file ``<explicit_dir>/<filepath>`` is read completely, its lines
    are permuted with ``random.shuffle`` and the file is overwritten.

    Args:
        filepath: File name or path, joined onto ``explicit_dir``.
    """
    target = os.path.join(explicit_dir, filepath)
    with open(target, mode="r") as in_file:
        lines = in_file.readlines()
    # Only the final line can lack a terminator; add one so it is not fused
    # with whichever row ends up after it once the order changes.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"
    random.shuffle(lines)
    with open(target, mode="w") as out_file:
        out_file.writelines(lines)

if __name__ == "__main__":
    # Entry point: append 6000 random 3-letter sequences to the dataset.
    # Other runs used during development (enable as needed):
    #   shuffle_file("aa_data.csv")
    #   generate_dataset(400, 2)
    #   generate_dataset(10000, 5) ... generate_dataset(10000, 8)
    generate_dataset(6000, 3)


['DSD', 214, 362, 639, 916, 1064, 0, -118, 118, 0, 331, 947, 0, -349, 440, 0]
['WIH', 817, 965, 1113, 0, -118, 118, 0, 934, 0, -349, 440, 0]
['FUN', 197, 345, 493, 0, -118, 118, 0, 314, 0, -349, 440, 0]
['RYV', 821, 1000, 1147, 1245, 1323, 0, -292, 232, 0, 971, 1279, 0, -199, 201, 0]
['YVR', 801, 983, 1165, 0, -265, 265, 0, 928, 1038, 0, -526, 161, 0]
['ICT', 666, 818, 922, 1025, 1177, 0, -119, 119, 0, 784, 1059, 0, -352, 445, 0]
['FNK', 729, 877, 1025, 0, -118, 118, 0, 846, 0, -349, 440, 0]
['YGS', 799, 982, 1165, 0, -260, 260, 0, 927, 1038, 0, -519, 157, 0]
['KYY', 820, 1014, 1214, 0, -441, 426, 0, 972, 1087, 0, -253, 408, 0]
['YPF', 847, 1032, 1217, 0, -244, 244, 0, 975, 1090, 0, -498, 148, 0]
['MPG', 909, 1057, 1205, 0, -118, 118, 0, 1025, 0, -349, 440, 0]
['ONK', 729, 877, 1025, 0, -118, 118, 0, 846, 0, -349, 440, 0]
['NLW', 809, 957, 1105, 0, -118, 118, 0, 926, 0, -349, 440, 0]
['ONL', 729, 877, 1025, 0, -118, 118, 0, 846, 0, -349, 440, 0]
['DFV', 214, 362, 638, 914, 1062, 0, -11