In [1]:
"""
Example script for defining and using FeatureGenerators in AutoGluon Tabular.
FeatureGenerators act to clean and prepare the data to maximize predictive accuracy in downstream models.
FeatureGenerators are stateful data preprocessors which take input data (pandas DataFrame) and output transformed data (pandas DataFrame).
FeatureGenerators are first fit on training data through the .fit_transform() function, and then transform new data through the .transform() function.
These generators can do anything from filling NaN values (FillNaFeatureGenerator), dropping duplicate features (DropDuplicatesFeatureGenerator), generating ngram features from text (TextNgramFeatureGenerator), and much more.
In AutoGluon's TabularPredictor, the input data is transformed via a FeatureGenerator before entering a machine learning model. Some models use this transformed input directly and others perform further transformations before making predictions.

This example is intended for advanced users that have a strong understanding of feature engineering and data preparation.
Most users can get strong performance without specifying custom feature generators due to the generic and powerful default feature generator used by AutoGluon.
An advanced user may wish to create a custom feature generator to:
    1. Experiment with different preprocessing pipelines to improve model quality.
    2. Have full control over what data is being sent to downstream models.
    3. Migrate existing pipelines into AutoGluon for ease of use and deployment.
    4. Contribute new feature generators to AutoGluon.
"""

################
# Loading Data #
################

import pandas as pd
import os
import torch
from autogluon.tabular import TabularDataset, TabularPredictor
import numpy as np

# train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv')  # can be local CSV file as well, returns Pandas DataFrame
# test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv')  # another Pandas DataFrame
# label = 'class'  # specifies which column do we want to predict
# sample_train_data = train_data.head(100)  # subsample for faster demo

# # Separate features and labels
# # Make sure to not include your label/target column when sending input to the feature generators, or else the label will be transformed as well.
# X = sample_train_data.drop(columns=[label])
# y = sample_train_data[label]

# X_test = test_data.drop(columns=[label])
# y_test = test_data[label]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from autogluon.core.utils.loaders import load_pd
train_data  = load_pd.load('./train.csv')
test_data  = load_pd.load('./test.csv')
train_data.head()
label = 'solubility'
splitted_valid_data = train_data[train_data["fold"] ==0.0]
splitted_train_data = train_data[train_data["fold"] !=0.0]
# valid_data = valid_data[["seq","solubility"]]
# train_data = train_data[["seq","solubility"]]
# train_data = train_data[:1000]
X = train_data[["seq"]]
y = train_data[label]
X_test = test_data[["seq","solubility"]]
y_test = test_data[label]


In [3]:
print(len(train_data))
print(len(test_data))

11224
1323


In [4]:
concatenated_df = pd.concat([train_data,test_data], axis=0)

In [5]:
# concatenated_df  = concatenated_df.fillna(value=-1)
print(concatenated_df)

               sid  solubility  \
0     AaCD00331182           1   
1     AaCD00331183           1   
2     AaCD00331184           1   
3     AaCD00331185           1   
4     AaCD00331621           1   
...            ...         ...   
1318          ZR72           1   
1319          ZR74           1   
1320          ZR75           1   
1321          ZR78           1   
1322          ZR93           1   

                                                    seq  fold  
0     MTYKDGTYSSDGTYTSPNGLETVGVELTLAADKVSAVNITVHPSNP...   0.0  
1     MTAMNILVLGSDSRGSSDADVEANTATDQRADTLMLVHVPADRKKT...   1.0  
2     MKAEGNTAMNILVLGSDSRGSSDADVEANTATDQRADTLMLVHVPA...   1.0  
3     MQSFNSGSTKIHNAFPEESTRPQKAEGNTAMNILVLGSDSRGSSDA...   1.0  
4     MNAPVKFEYFKNPKNRELTAVELEAFAKELDQIKQEVLDDIGEKDA...   2.0  
...                                                 ...   ...  
1318  MGGYKGIKADGGKVNQAKQLAAKIAKDIEACQKQTQQLAEYIEGSD...   NaN  
1319  MAFTLSAIQQAHQQFTGVDFPKLFKAFKDMGMTYNIVNIQDGTATY...   NaN  
1320  MASKYGIND

In [6]:
concatenated_df.dtypes

sid            object
solubility      int64
seq            object
fold          float64
dtype: object

In [7]:
concatenated_df.head()

Unnamed: 0,sid,solubility,seq,fold
0,AaCD00331182,1,MTYKDGTYSSDGTYTSPNGLETVGVELTLAADKVSAVNITVHPSNP...,0.0
1,AaCD00331183,1,MTAMNILVLGSDSRGSSDADVEANTATDQRADTLMLVHVPADRKKT...,1.0
2,AaCD00331184,1,MKAEGNTAMNILVLGSDSRGSSDADVEANTATDQRADTLMLVHVPA...,1.0
3,AaCD00331185,1,MQSFNSGSTKIHNAFPEESTRPQKAEGNTAMNILVLGSDSRGSSDA...,1.0
4,AaCD00331621,1,MNAPVKFEYFKNPKNRELTAVELEAFAKELDQIKQEVLDDIGEKDA...,2.0


In [101]:
import numpy as np
import pandas as pd

# Example protein sequences
sequences = ['MDEKRRAQHNEV', 'MDEKRRAQH', 'MDEKRRAQHNEVKD']

def one_hot_encoding(sequence):
    letter_to_int = {'C': 0, 'P': 1, 'R': 2, 'N': 3, 'F': 4, 'K': 5, 'A': 6, 'H': 7, 'Y': 8, 'V': 9, 'L': 10, 'D': 11, 'G': 12, 'E': 13, 'Q': 14, 'M': 15, 'T': 16, 'S': 17, 'I': 18, 'W': 19, 'X':20}
    letter_sequence = [letter_to_int[letter] for letter in sequence]

    encoded_tensor  = torch.zeros((len(sequence),len(letter_to_int)), dtype=torch.int64)
    for i in range(len(letter_sequence)):
        encoded_tensor[i,letter_sequence[i]] = 1
    return encoded_tensor
letter_to_int = {'C': 0, 'P': 1, 'R': 2, 'N': 3, 'F': 4, 'K': 5, 'A': 6, 'H': 7, 'Y': 8, 'V': 9, 'L': 10, 'D': 11, 'G': 12, 'E': 13, 'Q': 14, 'M': 15, 'T': 16, 'S': 17, 'I': 18, 'W': 19, 'X':20}


# 
# Create a dictionary to map amino acids to one-hot vectors
aa_dict = {'A': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'C': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'D': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'E': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'F': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'G': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'H': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'I': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'K': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'L': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'M': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           'N': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
           'P': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
           'Q': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
           'R': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
           'S': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
           'T': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
           'V': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
           'W': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
           'Y': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}

# Convert the protein sequences to one-hot encoding
one_hot_seqs = []
for seq in sequences:
    #one_hot_seq = np.array([aa_dict[aa] for aa in seq]) 
    one_hot_seq = one_hot_encoding(seq)
    print(np.shape(one_hot_seq))
    print(one_hot_seq)
    
    one_hot_seqs.append(one_hot_seq.flatten())
    print(np.shape(one_hot_seq.flatten()))

# Create a dataframe with separate columns for each amino acid position
df = pd.DataFrame(one_hot_seqs, columns=[f'aa{i}_{aa}' for i in range(1, len(one_hot_seq)+1) for aa in letter_to_int])
print(np.shape(df))
print(df)

torch.Size([12, 21])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
torch.Size([252])
torch.Size([9, 21])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0

In [8]:
#####################################
# Create a custom feature generator #
#####################################

from pandas import DataFrame
import torch
from autogluon.features.generators import AbstractFeatureGenerator
from autogluon.common.features.types import R_INT,R_FLOAT,R_OBJECT,R_CATEGORY,S_TEXT_AS_CATEGORY 
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence

# Feature generator to add k to all values of integer features.
class PlusKFeatureGenerator(AbstractFeatureGenerator):
    def __init__(self, k, **kwargs):
        super().__init__(**kwargs)
        self.k = k

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        # Here we can specify any logic we want to make a stateful feature generator based on the data.
        # Just call _transform since this isn't a stateful feature generator.
        X_out = self._transform(X)
        # return the output and the new special types of the data. For this generator, we don't add any new special types, so just return the input special types
        return X_out, self.feature_metadata_in.type_group_map_special

    def _transform(self, X: DataFrame) -> DataFrame:
        # Here we can specify the logic taken to convert input data to output data post-fit. Here we can reference any variables created during fit if the generator is stateful.
        # Because this feature generator is not stateful, we simply add k to all features.
        return X + self.k

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        default_infer_features = dict(valid_raw_types=[R_CATEGORY]) 
        print(default_infer_features)
        return default_infer_features  # This limits input features to only integers. We can assume that the input to _fit_transform and _transform only contain the data post-applying this filter.


    
class net_charge_Generator(AbstractFeatureGenerator):
    def __init__(self,  **kwargs):
        super().__init__(**kwargs)

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        # Here we can specify any logic we want to make a stateful feature generator based on the data.
        # Just call _transform since this isn't a stateful feature generator.
        X_out = self._transform(X)
        # return the output and the new special types of the data. For this generator, we don't add any new special types, so just return the input special types
        return X_out, self.feature_metadata_in.type_group_map_special

    def _transform(self, X: DataFrame) -> DataFrame:
        # Here we can specify the logic taken to convert input data to output data post-fit. Here we can reference any variables created during fit if the generator is stateful.
        # Because this feature generator is not stateful, we simply add k to all features.

        def net_charge(seq):
            # Define the pKa values of the amino acids at pH 7.4
            pKa = {'D': 3.9, 'E': 4.3, 'H': 6.0, 'C': 8.3, 'Y': 10.1, 'K': 10.8, 'R': 12.5,
                   'A': 0, 'G': 0, 'I': 0, 'L': 0, 'M': 0, 'F': 0, 'P': 0, 'S': 0, 'T': 0,
                   'W': 0, 'V': 0}
            # Count the number of each type of amino acid in the sequence
            aa_count = {aa: seq.count(aa) for aa in pKa.keys()}

            # Calculate the net charge of the sequence using the pKa values
            net_charge = sum([-1 * aa_count[aa] * (10 ** (-pKa[aa])) for aa in ['D', 'E']]) \
                         + sum([aa_count[aa] * (10 ** (-pKa[aa])) for aa in ['K', 'R', 'H']]) \
                         + sum([aa_count[aa] for aa in ['C', 'Y', 'K', 'R']])
            return net_charge
    
        print("X:",X)
        df = pd.DataFrame(columns=['net_charge'])
        for column in X.columns:
            df['net_charge'] = X['seq'].apply(net_charge)
        
        return df

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        default_infer_features = dict(valid_raw_types=[R_OBJECT]) 
        print(default_infer_features)
        return default_infer_features  # This limits input features to only integers. We can assume that the input to _fit_transform and _transform only contain the data post-applying this filter.


class count_charge_Generator(AbstractFeatureGenerator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        # Here we can specify any logic we want to make a stateful feature generator based on the data.
        # Just call _transform since this isn't a stateful feature generator.
        X_out = self._transform(X)
        # return the output and the new special types of the data. For this generator, we don't add any new special types, so just return the input special types
        return X_out, self.feature_metadata_in.type_group_map_special

    def _transform(self, X: DataFrame) -> DataFrame:
        # Here we can specify the logic taken to convert input data to output data post-fit. Here we can reference any variables created during fit if the generator is stateful.
        # Because this feature generator is not stateful, we simply add k to all features.

        def count_chargeed(seq):
            charged = ['D','E','K','R','H']
            polar = ['S','T','T','Q','C']
            aromatic = ['Y']

            hdrophobic = ['A','V','L','I','M','F','W']
            neutral = ['P','G']

            charged_counter = 0
            polar_counter = 0
            aromatic_counter = 0
            hdrophobic_counter = 0
            neutral_counter = 0

            for c in seq:
                if c in charged:
                    charged_counter+=1
                elif c in polar:
                    polar_counter+=1
                elif c in aromatic:
                    aromatic_counter+=1
                elif c in hdrophobic:
                    hdrophobic_counter+=1
                elif c in neutral:
                    neutral_counter+=1
            return (charged_counter,polar_counter,aromatic_counter,hdrophobic_counter,neutral_counter)

        df = pd.DataFrame(columns=['charged', 'polar', 'aromatic', 'hdrophobic', 'neutral'])
        for column in X.columns:
            df[['charged', 'polar', 'aromatic', 'hdrophobic', 'neutral']] = X[column].apply(count_chargeed).tolist()
        return df

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        default_infer_features = dict(valid_raw_types=[R_OBJECT]) 
        print(default_infer_features)
        return default_infer_features  # This limits input features to only integers. We can assume that the input to _fit_transform and _transform only contain the data post-applying this filter.

    
class one_hot_Generator(AbstractFeatureGenerator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        # Here we can specify any logic we want to make a stateful feature generator based on the data.
        # Just call _transform since this isn't a stateful feature generator.
        X_out = self._transform(X)
        # return the output and the new special types of the data. For this generator, we don't add any new special types, so just return the input special types
        return X_out, self.feature_metadata_in.type_group_map_special

    def _transform(self, X: DataFrame) -> DataFrame:
        # Here we can specify the logic taken to convert input data to output data post-fit. Here we can reference any variables created during fit if the generator is stateful.
        # Because this feature generator is not stateful, we simply add k to all features.
        letter_to_int = {'C': 0, 'P': 1, 'R': 2, 'N': 3, 'F': 4, 'K': 5, 'A': 6, 'H': 7, 'Y': 8, 'V': 9, 'L': 10, 'D': 11, 'G': 12, 'E': 13, 'Q': 14, 'M': 15, 'T': 16, 'S': 17, 'I': 18, 'W': 19, 'X':20}

        def one_hot_encoding(sequence,letter_to_int):
            letter_sequence = [letter_to_int[letter] for letter in sequence]

            encoded_tensor  = torch.zeros((len(letter_to_int),len(sequence)), dtype=torch.int64)
            for i in range(len(letter_sequence)):
                encoded_tensor[letter_sequence[i],i] = 1
            return encoded_tensor
        
        # Convert the protein sequences to one-hot encoding
        
        one_hot_df = pd.DataFrame()

        # get the first column 
        column = X.iloc[:, 0]
        sequences = column.tolist()
        one_hot_seqs = []
        for seq in sequences:
            one_hot_seq = one_hot_encoding(seq,letter_to_int)
            one_hot_seqs.append(one_hot_seq.flatten().numpy())

        #print("one_hot_seqs size",one_hot_seqs.shape)
        # Create a dataframe with separate columns for each amino acid position
        max_length = max(len(seq) for seq in sequences)
        column_name = [f'aa{i}_{aa}' for aa in letter_to_int for i in range(1, max_length+1) ]


        #print("one_hot_seqs shape",one_hot_seqs.shape)
        df = pd.DataFrame(one_hot_seqs,columns = column_name)
        df  = df.fillna(value=0).astype("bool")
        return df

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        default_infer_features = dict(valid_raw_types=[R_OBJECT]) 
        print(default_infer_features)
        return default_infer_features  # This limits input features to only integers. We can assume that the input to _fit_transform and _transform only contain the data post-applying this filter.


In [9]:
# one_hot_train = one_hot_Generator(verbosity=3,features_in=['seq'])
# one_hot_train_data = one_hot_train.fit_transform(X=train_data)
# one_hot_train_data  = one_hot_train_data.fillna(value=0, downcast='infer')
# print(one_hot_train_data.head(5))

In [3]:
from autogluon.features.generators import CategoryFeatureGenerator, AsTypeFeatureGenerator, BulkFeatureGenerator, DropUniqueFeatureGenerator, FillNaFeatureGenerator, PipelineFeatureGenerator, OneHotEncoderFeatureGenerator,IdentityFeatureGenerator
import copy


train_feature_generator = PipelineFeatureGenerator(
    generators=[
        # Stage 1: Convert feature types to be the same as during fit. Does not need to be specified.
        # Stage 2: Fill NaN values of data. Does not need to be specified.
        [  # Stage 3: Add 5 to all int features and convert all object features to category features. Concatenate the outputs of each.
            # count_charge_Generator(),
            # net_charge_Generator(),
            one_hot_Generator(verbosity=3,features_in=['seq']),
            #OneHotEncoderFeatureGenerator(),
            #CategoryFeatureGenerator(),
            IdentityFeatureGenerator(infer_features_in_args=dict(
                valid_raw_types=[R_INT, R_FLOAT])),
        ],
        # Stage 4: Drop any features which are always the same value (useless). Does not need to be specified.
     ],
    verbosity=3
)

one_hot_all_data = train_feature_generator.fit_transform(X=concatenated_df)
print(one_hot_all_data)

{'valid_raw_types': ['object']}
{'valid_raw_types': ['object']}


NameError: name 'R_INT' is not defined

In [11]:
print(one_hot_all_data.dtypes)

aa2_C            bool
aa3_C            bool
aa4_C            bool
aa5_C            bool
aa6_C            bool
               ...   
aa749_W          bool
aa776_W          bool
aa810_W          bool
solubility       int8
fold          float64
Length: 13758, dtype: object


In [12]:
one_hot_train_data = one_hot_all_data[:len(train_data)]
one_hot_test_data = one_hot_all_data[len(train_data):]
print(one_hot_train_data)
print(one_hot_test_data)

       aa2_C  aa3_C  aa4_C  aa5_C  aa6_C  aa7_C  aa8_C  aa9_C  aa10_C  aa11_C  \
0      False  False  False  False  False  False  False  False   False   False   
1      False  False  False  False  False  False  False  False   False   False   
2      False  False  False  False  False  False  False  False   False   False   
3      False  False  False  False  False  False  False  False   False   False   
4      False  False  False  False  False  False  False  False   False   False   
...      ...    ...    ...    ...    ...    ...    ...    ...     ...     ...   
11219  False  False  False  False  False  False  False  False   False   False   
11220  False  False  False  False  False  False  False  False   False   False   
11221  False  False  False  False  False  False  False  False   False   False   
11222  False  False  False  False  False  False  False  False   False   False   
11223  False  False  False  False  False  False  False  False   False   False   

       ...  aa576_W  aa627_

In [13]:
one_hot_valid_data1 = one_hot_train_data[one_hot_train_data["fold"] ==0.0]
one_hot_train_data1 = one_hot_train_data[one_hot_train_data["fold"] !=0.0]

In [14]:
one_hot_train_data1 = one_hot_train_data1.drop(["fold"],axis=1)
# one_hot_train_data1.astype(bool)                                            

In [15]:
one_hot_valid_data1 = one_hot_valid_data1.drop(["fold"],axis=1)
# one_hot_valid_data1.astype(bool)

In [16]:
print(one_hot_train_data1.dtypes)
print(one_hot_valid_data1.dtypes)

aa2_C         bool
aa3_C         bool
aa4_C         bool
aa5_C         bool
aa6_C         bool
              ... 
aa745_W       bool
aa749_W       bool
aa776_W       bool
aa810_W       bool
solubility    int8
Length: 13757, dtype: object
aa2_C         bool
aa3_C         bool
aa4_C         bool
aa5_C         bool
aa6_C         bool
              ... 
aa745_W       bool
aa749_W       bool
aa776_W       bool
aa810_W       bool
solubility    int8
Length: 13757, dtype: object


In [17]:
# DO THE ENCODING BEFOTRE SPLITING TRAIN VALID!!!!!!


In [18]:
predictor = TabularPredictor(label='solubility',eval_metric="precision").fit(train_data=one_hot_train_data1, tuning_data=one_hot_valid_data1, feature_generator=None)

No path specified. Models will be saved in: "AutogluonModels/ag-20230620_022326/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230620_022326/"
AutoGluon Version:  0.7.0
Python Version:     3.9.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 10 13:32:12 UTC 2021
Train Data Rows:    8281
Train Data Columns: 13756
Tuning Data Rows:    2943
Tuning Data Columns: 13756
Label Column: solubility
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['b

In [21]:
predictor.evaluate(one_hot_test_data, silent=True)

{'precision': 0.6659663865546218,
 'accuracy': 0.6024187452758881,
 'balanced_accuracy': 0.5459232300087407,
 'mcc': 0.09835253929687982,
 'roc_auc': 0.5585503281465277,
 'f1': 0.7068004459308808,
 'recall': 0.7529691211401425}

In [22]:
predictor.evaluate(one_hot_valid_data1, silent=True)

{'precision': 0.7452443118239462,
 'accuracy': 0.745837580699966,
 'balanced_accuracy': 0.5961780615167673,
 'mcc': 0.3092416007181366,
 'roc_auc': 0.6757819041113999,
 'f1': 0.8423271500843171,
 'recall': 0.9684924866698982}