In [None]:
!pip install lightgbm==3.3.2
!pip install pip install m2cgen

### FLAML - AUTO TUNING

In [None]:
!pip install flaml

In [None]:
from flaml import AutoML

In [None]:
# NOTE(review): X_train / y_train are only created by the train_test_split
# cell further down in this notebook, so on a fresh kernel this cell raises
# NameError unless the data-loading and split cells are executed first.
# Runs FLAML's AutoML search on the training data as a classification task;
# time_budget is presumably given in seconds — confirm against the FLAML docs.
automl = AutoML()
automl.fit(X_train, y_train, task='classification', time_budget=1500)

### LIGHTGBM - MODEL TRAINING

In [None]:
import csv
import numpy as np
import pandas as pd

In [None]:
# Location of the CSV with the labelled sequences. The path presumably points
# at a Google Drive mounted in Colab — adjust it when running elsewhere.
dataset = '/content/drive/MyDrive/Vyzkum_DNA/Quadtree/dataset.csv'

In [None]:
# Display the raw table as a quick sanity check; the frame is not stored.
pd.read_csv(dataset)

Unnamed: 0.1,Unnamed: 0,sequence,g4flag
0,457240,ACCTGCCCACTCCCCCTTCCAACGGGGGTCTAGGGGGAGTATGTTC...,1
1,333291,GGGCTCGGAACCCCCACCTATGGGGGCTATCCCCCATATCTATGCA...,1
2,143038,GGAGGTGCGTACACGACTATTCGTCCCCCCACACATACTCTCTCCT...,1
3,295958,TTCTCTCAGGCAAGCCGACTCACTCAGGGGCCAACACTCCTGGGGG...,1
4,951772,GGATTCTCTGGGATAGCGTCTTGTAAGAGCTCCGGCTACCGAGCCG...,0
...,...,...,...
1195978,571467,GGGACAATACCCCTCACCCCATAATCGGGGCGATCGAAGCTCGGCC...,1
1195979,218614,TAACGGTAACCTACCGACCGGAGAGGGGGCCCCCTTACGGGGTTAA...,1
1195980,571370,GGGCCATGTATCCCCTCTGGGACCAAGGGATCTGGTTGTAAACCCC...,1
1195981,182381,CGCCCCATAACTCCTCCCCACTATCCCCTCCGCTTATACTTAACTA...,1


In [None]:
def sequence_convertor(sequence_path: str) -> tuple:
    """
    Transform raw sequences into encoded numpy arrays.

    The file is read as comma-separated text. Its first line is skipped as a
    header. For every remaining non-blank row, the string in column 1 is
    encoded character by character ('C' -> 1, 'G' -> -1, anything else -> 0)
    and the value in column 2 is collected unchanged as the label.

    :param sequence_path: path to the CSV file to read
    :return: tuple ``(encoded, labels)`` of numpy arrays; ``encoded`` holds one
             list of codes per row, ``labels`` holds the column-2 values as the
             strings read from the file
    """
    base_codes = {'C': 1, 'G': -1}
    converted_sequences = []
    g4 = []

    # newline='' leaves newline handling to the csv module, as its docs advise.
    with open(sequence_path, newline='') as g4_csv:
        # Skip the header line; the default keeps an empty file from raising
        # StopIteration.
        next(g4_csv, None)

        for row in csv.reader(g4_csv, delimiter=','):
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # indexing such a row would raise IndexError.
            if not row:
                continue

            converted_sequences.append([base_codes.get(base, 0) for base in row[1]])
            g4.append(row[2])

    return np.array(converted_sequences), np.array(g4)

In [None]:
# Encode every sequence in the CSV: X receives the numeric codes, y the labels.
X, y = sequence_convertor(dataset)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report

In [None]:
# Dimensions of the encoded array: (number of rows, codes per sequence).
X.shape

(1195983, 50)

In [None]:
# Peek at the encoded values; sequence_convertor only emits -1, 0 and 1.
X

array([[ 0,  1,  1, ...,  1, -1, -1],
       [-1, -1, -1, ..., -1,  0,  0],
       [-1, -1,  0, ...,  0,  1,  1],
       ...,
       [-1, -1, -1, ...,  0,  0,  0],
       [ 1, -1,  1, ...,  0,  1, -1],
       [ 0,  0,  0, ..., -1,  1,  1]])

In [None]:
# Peek at the labels; they are kept as the strings read from the CSV, not ints.
y

array(['1', '1', '1', ..., '1', '1', '1'], dtype='<U1')

In [None]:
# Hold out 20% of the samples (X_rest / y_rest); the other 80% is the training set.
# A fixed random_state makes the split identical on every run of the notebook.
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Divide the hold-out set: 60% becomes the validation set, 40% the test set.
# A fixed random_state makes the split identical on every run of the notebook.
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.4, random_state=42)

In [None]:
# Sanity check: size of the training features after the split.
X_train.shape

(956786, 50)

In [None]:
# Sanity check: the number of training labels should match the rows of X_train.
y_train.shape

(956786,)

### TRAINING

In [None]:
import lightgbm as lgb

In [None]:
# Hyperparameters of the binary classifier, gathered in one place.
# NOTE(review): presumably taken from the FLAML search above — confirm.
lgbm_params = dict(
    objective="binary",
    n_estimators=1000,
    num_leaves=74,
    learning_rate=0.03744835808549148,
    colsample_bytree=0.817574864502621,
    min_child_samples=3,
    max_bin=127,
    reg_alpha=0.0033803043003857677,
    reg_lambda=0.7013136087939289,
    verbose=10,
)
model = lgb.LGBMClassifier(**lgbm_params)

# Fit on the training split; the validation split is passed as eval_set so the
# metric on it is logged during training.
validation_sets = [(X_val, y_val)]
model = model.fit(X_train, y_train, eval_set=validation_sets)

[LightGBM] [Info] Number of positive: 479579, number of negative: 477207
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.453193
[LightGBM] [Debug] init for col-wise cost 0.000015 seconds, init for row-wise cost 0.665635 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 150
[LightGBM] [Info] Number of data points in the train set: 956786, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501240 -> initscore=0.004958
[LightGBM] [Info] Start training from score 0.004958
[LightGBM] [Debug] Trained a tree with leaves = 74 and depth = 10
[1]	valid_0's binary_logloss: 0.677821
[LightGBM] [Debug] Trained a tree with leaves = 74 and depth = 9
[2]	valid_0's binary_logloss: 0.663917
[LightGBM] [Debug] Trained a tree with leaves = 74 and depth = 11
[3]	valid_0's binary_logloss: 0.649306
[LightGBM

In [None]:
# Save the fitted model's underlying booster to disk.
model.booster_.save_model('/content/drive/MyDrive/Vyzkum_DNA/Quadtree/quad_tree.txt')

### CONVERT TO JAVASCRIPT - FOR PREVIEW WEBSITE

In [None]:
import m2cgen as m2c
import sys

# The recursion limit is raised for the export only (presumably because m2cgen
# recurses deeply on a model of this size — confirm) and restored afterwards.
# Leaving it at the maximum for the rest of the session would let any later
# runaway recursion crash the kernel instead of raising RecursionError.
_previous_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(2147483647)
try:
    model_to_js = m2c.export_to_javascript(model)
finally:
    sys.setrecursionlimit(_previous_recursion_limit)

In [None]:
# Write the generated JavaScript source out for the preview website.
with open('/content/drive/MyDrive/quadtree.js', mode='w') as js_file:
    js_file.write(model_to_js)