In [3]:
""" Example script for defining and using custom models in AutoGluon Tabular """

from autogluon.core.utils import infer_problem_type
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config
from autogluon.core.data import LabelCleaner
from autogluon.core.models import AbstractModel

#########################
# Create a custom model #
#########################

# In this example, we create a custom Naive Bayes model for use in AutoGluon
class NaiveBayesModel(AbstractModel):
    # The `_preprocess` method takes the input data and transforms it to the internal representation usable by the model.
    # `_preprocess` is called by `preprocess` and is used during model fit and model inference.
    def _preprocess(self, X, **kwargs):
        # Drop category and object column dtypes, since NaiveBayes can't handle these dtypes.
        cat_columns = X.select_dtypes(['category', 'object']).columns
        X = X.drop(cat_columns, axis=1)
        # Add a fillna call to handle missing values.
        return super()._preprocess(X, **kwargs).fillna(0)

    # The `_fit` method takes the input training data (and optionally the validation data) and trains the model.
    def _fit(self, X, y, **kwargs):
        from sklearn.naive_bayes import GaussianNB
        # It is important to call `preprocess(X)` in `_fit` to replicate what will occur during inference.
        X = self.preprocess(X)
        self.model = GaussianNB(**self.params)
        self.model.fit(X, y)

In [4]:
# Example of a more optimized implementation that drops the invalid features earlier on to avoid having to make repeated checks.
class AdvancedNaiveBayesModel(AbstractModel):
    def _preprocess(self, X, **kwargs):
        # Add a fillna call to handle missing values.
        return super()._preprocess(X, **kwargs).fillna(0)

    def _fit(self, X, y, **kwargs):
        from sklearn.naive_bayes import GaussianNB
        X = self.preprocess(X)
        self.model = GaussianNB(**self.params)
        self.model.fit(X, y)

    # The `_get_default_auxiliary_params` method defines various model-agnostic parameters such as maximum memory usage and valid input column dtypes.
    # For most users who build custom models, they will only need to specify the valid/invalid dtypes to the model here.
    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            # Drop category and object column dtypes, since NaiveBayes can't handle these dtypes.
            ignored_type_group_raw=['category', 'object'],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params


In [49]:
import torch
import torch.nn as nn

import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import precision_score


class ConvNet(nn.Module):
    def __init__(self, input_channels,seq_len):
        super(ConvNet, self).__init__()
        # Input layer
        self.conv1 = nn.Conv1d(input_channels, 4, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(4)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv1d(4, 16, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(16)
        self.relu2 = nn.ReLU()
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(32)
        self.relu3 = nn.ReLU()

        
        # Hidden layer
        self.flatten = nn.Flatten()
        #self.fc1 = nn.Linear(440192, 128)
        self.fc1 = nn.Linear(32*seq_len, 128)
        self.bn5 = nn.BatchNorm1d(128)
        self.relu5 = nn.ReLU()
        self.fc2 = nn.Linear(128, 16)
        self.bn6 = nn.BatchNorm1d(16)
        self.relu6 = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        
        # Output layer
        self.fc3 = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)

        x = self.flatten(x)
        x = self.fc1(x)
        x = self.bn5(x)
        x = self.relu5(x)
        x = self.fc2(x)
        x = self.bn6(x)
        x = self.relu6(x)
        x = self.dropout(x)
        
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

    
class ConvNetModel(AbstractModel):
    # The `_preprocess` method takes the input data and transforms it to the internal representation usable by the model.
    # `_preprocess` is called by `preprocess` and is used during model fit and model inference.
    def _preprocess(self, X, **kwargs):
        # Perform preprocessing steps such as imputation, scaling, and one-hot encoding.
        # Add any custom preprocessing steps if needed.
        return super()._preprocess(X, **kwargs)

          
            
    def _fit(self, X, y, batch_size=32, epochs=10, lr=0.001,  **kwargs):
        X_train = torch.tensor(X.values, dtype=torch.float32)
        y_train = torch.tensor(y.values, dtype=torch.float32)
        #print("X_train.shape",X_train.shape)
        seq_len = X_train.shape[1]
        print("seq_len",seq_len)
        self.model = ConvNet(input_channels = 1, seq_len=seq_len)
        self.train(self.model, X_train, y_train, batch_size=batch_size, epochs=epochs, lr=lr)

            
    def predict(self, X):
        X_test = torch.tensor(X.values, dtype=torch.float32)
        X_test = X_test.unsqueeze(1)  # Adjust the input shape for Conv1d
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(X_test)
            preds = torch.round(outputs)
            predictions = preds.squeeze().tolist()
        return predictions
    
    def predict_proba(self, X):
        X_test = torch.tensor(X.values, dtype=torch.float32)
        X_test = X_test.unsqueeze(1)  # Adjust the input shape for Conv1d
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(X_test)
            predictions = outputs.squeeze().tolist()
        return predictions

In [6]:
################
# Loading Data #
################

# train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
# test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
# label = 'class'  # specifies which column do we want to predict
# train_data = train_data.head(1000)  # subsample for faster demo

from autogluon.core.utils.loaders import load_pd
import pandas as pd
train_data  = load_pd.load('./train.csv')
test_data  = load_pd.load('./test.csv')
concatenated_df = pd.concat([train_data,test_data], axis=0)
label = 'solubility'



In [7]:
from pandas import DataFrame
import torch
from autogluon.features.generators import AbstractFeatureGenerator
from autogluon.common.features.types import R_INT,R_FLOAT,R_OBJECT,R_CATEGORY,S_TEXT_AS_CATEGORY 
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence

class one_hot_Generator(AbstractFeatureGenerator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        # Here we can specify any logic we want to make a stateful feature generator based on the data.
        # Just call _transform since this isn't a stateful feature generator.
        X_out = self._transform(X)
        # return the output and the new special types of the data. For this generator, we don't add any new special types, so just return the input special types
        return X_out, self.feature_metadata_in.type_group_map_special

    def _transform(self, X: DataFrame) -> DataFrame:
        # Here we can specify the logic taken to convert input data to output data post-fit. Here we can reference any variables created during fit if the generator is stateful.
        # Because this feature generator is not stateful, we simply add k to all features.
        letter_to_int = {'C': 0, 'P': 1, 'R': 2, 'N': 3, 'F': 4, 'K': 5, 'A': 6, 'H': 7, 'Y': 8, 'V': 9, 'L': 10, 'D': 11, 'G': 12, 'E': 13, 'Q': 14, 'M': 15, 'T': 16, 'S': 17, 'I': 18, 'W': 19, 'X':20}

        def one_hot_encoding(sequence,letter_to_int):
            letter_sequence = [letter_to_int[letter] for letter in sequence]

            encoded_tensor  = torch.zeros((len(letter_to_int),len(sequence)), dtype=torch.int64)
            for i in range(len(letter_sequence)):
                encoded_tensor[letter_sequence[i],i] = 1
            return encoded_tensor
        
        # Convert the protein sequences to one-hot encoding
        
        one_hot_df = pd.DataFrame()

        # get the first column 
        column = X.iloc[:, 0]
        sequences = column.tolist()
        one_hot_seqs = []
        for seq in sequences:
            one_hot_seq = one_hot_encoding(seq,letter_to_int)
            one_hot_seqs.append(one_hot_seq.flatten().numpy())

        #print("one_hot_seqs size",one_hot_seqs.shape)
        # Create a dataframe with separate columns for each amino acid position
        max_length = max(len(seq) for seq in sequences)
        column_name = [f'aa{i}_{aa}' for aa in letter_to_int for i in range(1, max_length+1) ]


        #print("one_hot_seqs shape",one_hot_seqs.shape)
        df = pd.DataFrame(one_hot_seqs,columns = column_name)
        df  = df.fillna(value=0).astype("bool")
        return df

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        default_infer_features = dict(valid_raw_types=[R_OBJECT]) 
        print(default_infer_features)
        return default_infer_features  # This limits input features to only integers. We can assume that the input to _fit_transform and _transform only contain the data post-applying this filter.


In [8]:
from autogluon.features.generators import CategoryFeatureGenerator, AsTypeFeatureGenerator, BulkFeatureGenerator, DropUniqueFeatureGenerator, FillNaFeatureGenerator, PipelineFeatureGenerator, OneHotEncoderFeatureGenerator,IdentityFeatureGenerator
import copy
import sys
sys.path.append('/user/mahaohui/autoML/autogluon_examples')
from feature_generator import count_charge_Generator, net_charge_Generator, one_hot_Generator


train_feature_generator = PipelineFeatureGenerator(
    generators=[
        # Stage 1: Convert feature types to be the same as during fit. Does not need to be specified.
        # Stage 2: Fill NaN values of data. Does not need to be specified.
        [  # Stage 3: Add 5 to all int features and convert all object features to category features. Concatenate the outputs of each.
            # count_charge_Generator(),
            # net_charge_Generator(),
            one_hot_Generator(verbosity=3,features_in=['seq']),
            #OneHotEncoderFeatureGenerator(),
            #CategoryFeatureGenerator(),
            IdentityFeatureGenerator(infer_features_in_args=dict(
                valid_raw_types=[R_INT, R_FLOAT])),
        ],
        # Stage 4: Drop any features which are always the same value (useless). Does not need to be specified.
     ],
    verbosity=3
)
one_hot_all_data = train_feature_generator.fit_transform(X=concatenated_df)
print(one_hot_all_data)

{'valid_raw_types': ['object']}
      aa2_C  aa3_C  aa4_C  aa5_C  aa6_C  aa7_C  aa8_C  aa9_C  aa10_C  aa11_C  \
0     False  False  False  False  False  False  False  False   False   False   
1     False  False  False  False  False  False  False  False   False   False   
2     False  False  False  False  False  False  False  False   False   False   
3     False  False  False  False  False  False  False  False   False   False   
4     False  False  False  False  False  False  False  False   False   False   
...     ...    ...    ...    ...    ...    ...    ...    ...     ...     ...   
1318  False  False  False  False  False  False  False  False   False   False   
1319  False  False  False  False  False  False  False  False   False   False   
1320  False  False  False  False  False  False  False  False   False   False   
1321  False  False  False  False  False  False  False  False   False   False   
1322  False  False  False  False  False  False  False  False   False   False   

      .

In [9]:
one_hot_train_data = one_hot_all_data[:len(train_data)]
one_hot_test_data = one_hot_all_data[len(train_data):]
one_hot_test_data = one_hot_test_data.drop(["fold"],axis=1)
print(one_hot_train_data)
print(one_hot_test_data)

       aa2_C  aa3_C  aa4_C  aa5_C  aa6_C  aa7_C  aa8_C  aa9_C  aa10_C  aa11_C  \
0      False  False  False  False  False  False  False  False   False   False   
1      False  False  False  False  False  False  False  False   False   False   
2      False  False  False  False  False  False  False  False   False   False   
3      False  False  False  False  False  False  False  False   False   False   
4      False  False  False  False  False  False  False  False   False   False   
...      ...    ...    ...    ...    ...    ...    ...    ...     ...     ...   
11219  False  False  False  False  False  False  False  False   False   False   
11220  False  False  False  False  False  False  False  False   False   False   
11221  False  False  False  False  False  False  False  False   False   False   
11222  False  False  False  False  False  False  False  False   False   False   
11223  False  False  False  False  False  False  False  False   False   False   

       ...  aa576_W  aa627_

In [10]:
one_hot_valid_data1 = one_hot_train_data[one_hot_train_data["fold"] ==0.0]
one_hot_train_data1 = one_hot_train_data[one_hot_train_data["fold"] !=0.0]

In [11]:
one_hot_train_data1 = one_hot_train_data1.drop(["fold"],axis=1)
one_hot_valid_data1 = one_hot_valid_data1.drop(["fold"],axis=1)

In [30]:
print(one_hot_train_data1)
print(one_hot_valid_data1)

       aa2_C  aa3_C  aa4_C  aa5_C  aa6_C  aa7_C  aa8_C  aa9_C  aa10_C  aa11_C  \
1      False  False  False  False  False  False  False  False   False   False   
2      False  False  False  False  False  False  False  False   False   False   
3      False  False  False  False  False  False  False  False   False   False   
4      False  False  False  False  False  False  False  False   False   False   
5      False  False  False  False  False  False  False  False   False   False   
...      ...    ...    ...    ...    ...    ...    ...    ...     ...     ...   
11213  False  False  False  False  False  False  False  False   False   False   
11215  False  False  False  False  False  False  False  False   False   False   
11216  False  False  False  False  False  False  False  False   False   False   
11217  False  False  False  False  False  False  False  False   False   False   
11223  False  False  False  False  False  False  False  False   False   False   

       ...  aa546_W  aa576_

In [50]:
custom_hyperparameters = {ConvNetModel: {}}
# custom_hyperparameters = {NaiveBayesModel: [{}, {'var_smoothing': 0.00001}, {'var_smoothing': 0.000002}]}  # Train 3 NaiveBayes models with different hyperparameters
predictor = TabularPredictor(label=label,eval_metric="precision")
predictor.fit(train_data=one_hot_train_data1, tuning_data=one_hot_valid_data1, hyperparameters=custom_hyperparameters,feature_generator=None, )  # Train a single default NaiveBayesModel


No path specified. Models will be saved in: "AutogluonModels/ag-20230627_104219/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230627_104219/"
AutoGluon Version:  0.7.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Train Data Rows:    8281
Train Data Columns: 13756
Tuning Data Rows:    2943
Tuning Data Columns: 13756
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['b

seq_len 13756


		'ConvNet' object has no attribute 'model'
Detailed Traceback:
Traceback (most recent call last):
  File "/user/mahaohui/miniconda3/envs/autoML/lib/python3.9/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1502, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/user/mahaohui/miniconda3/envs/autoML/lib/python3.9/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1447, in _train_single
    model = model.fit(X=X, y=y, X_val=X_val, y_val=y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/user/mahaohui/miniconda3/envs/autoML/lib/python3.9/site-packages/autogluon/core/models/abstract/abstract_model.py", line 703, in fit
    out = self._fit(**kwargs)
  File "/tmp/ipykernel_2117/3397548642.py", line 106, in _fit
    self.model.fit(X, y, batch_size=batch_size, epochs=epochs, lr=lr)
  File "/tmp/ipykernel_2117/3397548642.py", line 72, in fit
    optimizer = opt

ValueError: AutoGluon did not successfully train any models

In [123]:
################################################
# Training custom model using TabularPredictor #
################################################

custom_hyperparameters = {ConvNetModel: {}}
# custom_hyperparameters = {NaiveBayesModel: [{}, {'var_smoothing': 0.00001}, {'var_smoothing': 0.000002}]}  # Train 3 NaiveBayes models with different hyperparameters
predictor = TabularPredictor(label=label, eval_metric="precision")
predictor.fit(train_data=one_hot_train_data1, tuning_data=one_hot_valid_data1, hyperparameters=custom_hyperparameters,feature_generator=None,)  # Train a single default NaiveBayesModel


No path specified. Models will be saved in: "AutogluonModels/ag-20230626_075506/"
Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "AutogluonModels/ag-20230626_075506/"
AutoGluon Version:  0.7.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Train Data Rows:    8281
Train Data Columns: 13756
Tuning Data Rows:    2943
Tuning Data Columns: 13756
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting IdentityFeatureGenerator...
	I

seq_len 13756
Epoch 1 loss: 0.6859306526738543
Epoch 2 loss: 0.6000387591215991
Epoch 3 loss: 0.46730088968147604
Epoch 4 loss: 0.3355218250622121
Epoch 5 loss: 0.2332828393162683
Epoch 6 loss: 0.17486146613666714
Epoch 7 loss: 0.12697389985384175
Epoch 8 loss: 0.11056849756097609
Epoch 9 loss: 0.09588737711454778
Epoch 10 loss: 0.08587926510692567


	0.7088	 = Validation score   (accuracy)
	1573.04s	 = Training   runtime
	11.31s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 110.46s of the -1480.53s of remaining time.
	0.7088	 = Validation score   (accuracy)
	0.03s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1601.66s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20230626_075506/")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2b50b8995520>

In [124]:
predictor.evaluate(one_hot_test_data, silent=True)

{'accuracy': 0.6326530612244898,
 'balanced_accuracy': 0.5077271223351983,
 'mcc': 0.03790849756936191,
 'roc_auc': 0.5186949694075584,
 'f1': 0.7698863636363636,
 'precision': 0.6401574803149607,
 'recall': 0.9655581947743468}

In [126]:
predictor.evaluate(one_hot_valid_data1, silent=True)

{'accuracy': 0.708800543662929,
 'balanced_accuracy': 0.5137198144802362,
 'mcc': 0.13176302073190968,
 'roc_auc': 0.5268012162340809,
 'f1': 0.8278770837517574,
 'precision': 0.7067901234567902,
 'recall': 0.9990305380513815}

In [22]:
from autogluon.common import space
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config

custom_hyperparameters = get_hyperparameter_config('default')

# custom_hyperparameters[CustomRandomForestModel] = best_model_info['hyperparameters']

print(custom_hyperparameters)

predictor = TabularPredictor(label=label)
predictor.fit(train_data=one_hot_train_data1, tuning_data=one_hot_valid_data1, hyperparameters=custom_hyperparameters,feature_generator=None, eval_metric="precision",time_limit=360)  # Train a single default NaiveBayesModel


{'NN_TORCH': {}, 'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'], 'CAT': {}, 'XGB': {}, 'FASTAI': {}, 'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], 'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], 'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}]}


In [28]:
from autogluon.common import space
from autogluon.tabular import TabularPredictor

ConvNetOptions = {
    "batch_size": 128,
    'lr': space.Real(1e-4, 1e-2, default=5e-4, log=True),
    "epochs": space.Int(lower=5, upper=10, default=5),
}

CustomHyperparameters = {ConvNetModel : ConvNetOptions}

TuneKwargs = {
    "searcher": "bayes",
    "scheduler": "local",
    "num_trials": 3,
}

predictor = TabularPredictor(label=label)
predictor.fit(train_data=one_hot_train_data1,
              tuning_data=one_hot_valid_data1,
              hyperparameters=CustomHyperparameters,
              hyperparameter_tune_kwargs=TuneKwargs,
              feature_generator=None)

No path specified. Models will be saved in: "AutogluonModels/ag-20230627_040057/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230627_040057/"
AutoGluon Version:  0.7.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Train Data Rows:    8281
Train Data Columns: 13756
Tuning Data Rows:    2943
Tuning Data Columns: 13756
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['b

seq_len 13756
Epoch 1 loss: 0.6719854345848394
Epoch 2 loss: 0.5943438645250113
Epoch 3 loss: 0.4359022209117579
Epoch 4 loss: 0.28266195161629093
Epoch 5 loss: 0.17684369366760402
Epoch 6 loss: 0.14072937192034352
Epoch 7 loss: 0.11817425430225309
Epoch 8 loss: 0.10011605590278673
Epoch 9 loss: 0.09068134251274457
Epoch 10 loss: 0.08874948418793059


Fitted model: ConvNetModel ...
	0.6925	 = Validation score   (accuracy)
	1464.77s	 = Training   runtime
	8.43s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6925	 = Validation score   (accuracy)
	0.01s	 = Training   runtime
	0.01s	 = Validation runtime
AutoGluon training complete, total runtime = 1521.97s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20230627_040057/")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2b002eccd7f0>

In [24]:
# train customised model with other models together 
from autogluon.common import space
CustomRandomForestModel

ConvNetoptions =  {
    "batch_size":128, 
    'lr': space.Real(1e-4, 1e-2, default=5e-4, log=True),
    "epochs":space.Int(lower=5, upper=10, default=5), 
}

custom_hyperparameters = {ConvNetModel: ConvNetoptions}

tune_kwargs = {
                  "searcher": "bayes", # bayes
                  "scheduler": "local",
                  "num_trials": 3,
              }
predictor = TabularPredictor(label=label)
predictor.fit(train_data=one_hot_train_data1, 
              tuning_data=one_hot_valid_data1, 
              hyperparameters=custom_hyperparameters, 
              hyperparameter_tune_kwargs= "auto",
              feature_generator=None, )  # Train a single default NaiveBayesModel


No path specified. Models will be saved in: "AutogluonModels/ag-20230627_022636/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230627_022636/"
AutoGluon Version:  0.7.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Train Data Rows:    8281
Train Data Columns: 13756
Tuning Data Rows:    2943
Tuning Data Columns: 13756
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['b

seq_len 13756


KeyboardInterrupt: 

In [None]:
predictor.evaluate(one_hot_test_data, silent=True)

In [None]:
predictor.evaluate(one_hot_valid_data1, silent=True)

In [33]:
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config

# Now we can add the custom model with tuned hyperparameters to be trained alongside the default models:
custom_hyperparameters = get_hyperparameter_config('default')

custom_hyperparameters[ConvNetModel] = {}

print(custom_hyperparameters)

predictor = TabularPredictor(label=label, eval_metric="precision")
predictor.fit(train_data=one_hot_train_data1, tuning_data=one_hot_valid_data1, hyperparameters=custom_hyperparameters,feature_generator=None)  # Train a single default NaiveBayesModel


No path specified. Models will be saved in: "AutogluonModels/ag-20230627_053839/"


{'NN_TORCH': {}, 'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'], 'CAT': {}, 'XGB': {}, 'FASTAI': {}, 'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], 'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], 'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}], <class '__main__.ConvNetModel'>: {}}


	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230627_053839/"
AutoGluon Version:  0.7.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Train Data Rows:    8281
Train Data Columns: 13756
Tuning Data Rows:    2943
Tuning Data Columns: 13756
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 =

seq_len 13756
Epoch 1 loss: 0.6715952516064163
Epoch 2 loss: 0.5869934253914412
Epoch 3 loss: 0.4224137973415759
Epoch 4 loss: 0.26641654200101084
Epoch 5 loss: 0.1738036938387063
Epoch 6 loss: 0.13570972154451202
Epoch 7 loss: 0.11850019416576901
Epoch 8 loss: 0.09886133019603038
Epoch 9 loss: 0.08607578083020888
Epoch 10 loss: 0.08309740766290893


	0.7136	 = Validation score   (precision)
	754.92s	 = Training   runtime
	6.55s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7452	 = Validation score   (precision)
	2.37s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 4038.58s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20230627_053839/")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2b002cee0340>

In [34]:
predictor.evaluate(one_hot_test_data, silent=True)

{'precision': 0.6659663865546218,
 'accuracy': 0.6024187452758881,
 'balanced_accuracy': 0.5459232300087407,
 'mcc': 0.09835253929687982,
 'roc_auc': 0.5585503281465277,
 'f1': 0.7068004459308808,
 'recall': 0.7529691211401425}

In [35]:
predictor.evaluate(one_hot_valid_data1, silent=True)

{'precision': 0.7452443118239462,
 'accuracy': 0.745837580699966,
 'balanced_accuracy': 0.5961780615167673,
 'mcc': 0.3092416007181366,
 'roc_auc': 0.6757819041113999,
 'f1': 0.8423271500843171,
 'recall': 0.9684924866698982}

In [37]:
predictor.leaderboard(one_hot_test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI,0.666316,0.745054,40.64863,70.726684,2515.404765,40.64863,70.726684,2515.404765,1,True,8
1,WeightedEnsemble_L2,0.665966,0.745244,40.922761,71.059309,2695.919962,0.01163,0.004929,2.373953,2,True,12
2,XGBoost,0.65131,0.723614,0.262501,0.327696,178.141245,0.262501,0.327696,178.141245,1,True,9
3,CatBoost,0.646192,0.715955,8.91242,12.076163,282.817101,8.91242,12.076163,282.817101,1,True,5
4,LightGBM,0.645032,0.713106,0.245141,0.42156,25.254385,0.245141,0.42156,25.254385,1,True,2
5,LightGBMXT,0.645032,0.713106,0.375744,0.474071,26.496801,0.375744,0.474071,26.496801,1,True,1
6,ConvNetModel,0.642646,0.713561,5.233721,6.548399,754.924919,5.233721,6.548399,754.924919,1,True,11
7,LightGBMLarge,0.641252,0.711378,0.338435,0.466935,37.443775,0.338435,0.466935,37.443775,1,True,10
8,ExtraTreesGini,0.640398,0.706023,0.695148,0.203592,18.604183,0.695148,0.203592,18.604183,1,True,6
9,RandomForestGini,0.63908,0.705299,0.61218,0.193828,18.308598,0.61218,0.193828,18.308598,1,True,3
