In [1]:
!pip install transformers
!pip install simpletransformers
!pip install wandb

In [2]:
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [7]:
# Load the raw table; the SMILES strings are the model inputs.
df = pd.read_csv('../input/chemical-explosion-dataset/final_dict.csv')
x = df['SMILES']

# Binary target as a one-column boolean frame: True where column "Y" exceeds 4.
y = (df["Y"] > 4).to_frame().rename(columns={"Y": "Strong_explosive"})

# Same target as a 0/1 integer NumPy array (still one column wide).
Y = y.astype(int).to_numpy(dtype=int)

In [8]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# identical between runs.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    x, Y, test_size=0.3, random_state=1
)

# Assemble 'text' / 'labels' frames; the one-column label arrays are
# flattened to 1-D before being used as a column.
Xtrain = dict(text=xtrain, labels=ytrain.squeeze())
Xvalid = dict(text=xvalid, labels=yvalid.squeeze())
train_df, valid_df = pd.DataFrame(Xtrain), pd.DataFrame(Xvalid)

In [11]:
# Number of training rows divided by 32 (the value also used for
# 'train_batch_size' in the model arguments), i.e. batches per epoch.
len(xtrain) / 32

In [12]:
# Build a two-class RoBERTa-type classifier on top of the pretrained
# 'DeepChem/ChemBERTa-77M-MTR' checkpoint.
model = ClassificationModel(
    'roberta',
    'DeepChem/ChemBERTa-77M-MTR',
    num_labels=2,
    args={
        # --- evaluation while training ---
        'evaluate_during_training': True,
        'evaluate_during_training_verbose': True,
        'evaluate_during_training_steps': 70,
        # Fixed: this key used to be 'use_cached_eval_features ' (trailing
        # space), which does not match the real option name, so the setting
        # was silently never applied.
        'use_cached_eval_features': True,
        # --- early stopping ---
        # "acc" must match the name of the extra metric passed as a keyword
        # argument to train_model (acc=...); higher is better, hence
        # 'early_stopping_metric_minimize' is False.
        'use_early_stopping': True,
        'early_stopping_delta': 0.001,
        'early_stopping_metric': "acc",
        'early_stopping_metric_minimize': False,
        'early_stopping_patience': 70,
        # --- optimisation / input settings ---
        'max_seq_length': 128,
        'num_train_epochs': 150,
        'train_batch_size': 32,
        'learning_rate': 4e-5,
        # --- housekeeping ---
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        # NOTE(review): confirm that 'auto_weights' is an option recognised by
        # the installed simpletransformers version; unknown keys are not
        # guaranteed to have any effect.
        'auto_weights': True,
    },
)

In [13]:
import logging

# Keep the "transformers" logger at WARNING so only warnings and errors from
# it are emitted ...
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# ... while the root logger is configured at INFO level.
logging.basicConfig(level=logging.INFO)

In [14]:
# Fine-tune on the training frame, evaluating on the validation frame.
# The extra keyword `acc` registers accuracy as an additional metric under
# the name "acc"; the run is logged to the given Weights & Biases project.
out = model.train_model(
    train_df,
    eval_df=valid_df,
    args={'wandb_project': 'chemical_explosion_classification'},
    acc=metrics.accuracy_score,
)

In [18]:
# Rebind `model` to a classifier loaded from ./outputs/best_model
# (presumably the best checkpoint written during training — verify the
# directory exists before running this cell).
model = ClassificationModel(
    "roberta",
    "./outputs/best_model",
    args={'wandb_project': 'chemical_explosion_classification'},
)

In [19]:
# Evaluate the currently bound `model` on the validation frame, with accuracy
# reported under the key "acc", then print the resulting metrics.
result, model_outputs, wrong_predictions = model.eval_model(
    valid_df,
    acc=metrics.accuracy_score,
)
print(result)

In [20]:
!sudo apt-get install git-lfs
!git lfs install
!git config --global credential.helper store

In [23]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; needed before the push_to_hub calls
# later in the notebook. The token is entered at runtime, not hardcoded here.
notebook_login()

In [24]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the saved checkpoint as a plain transformers model (this rebinds
# `model`) and upload it to the Hub repository.
best_model_dir = "./outputs/best_model"
model = AutoModelForSequenceClassification.from_pretrained(best_model_dir)
model.push_to_hub("Parsa/Chemical_explosion_classification")

In [25]:
# Load the tokenizer saved alongside the checkpoint and upload it to the same
# Hub repository as the model.
tokenizer = AutoTokenizer.from_pretrained(
    "./outputs/best_model"
)
tokenizer.push_to_hub(
    "Parsa/Chemical_explosion_classification"
)