### Load data

In [None]:
import json
import pandas as pd

# Load the query collection: a JSON object mapping each query string to a
# record whose 'indexType' field is used as the class label.
# The encoding is spelled out so decoding does not depend on the platform default.
with open('casbert_resources/omex_classifier_data.json', 'r', encoding='utf-8') as fp:
    final_data = json.load(fp)

# Keep only entries whose indexType is not -1
# (presumably -1 marks unlabelled queries -- TODO confirm against the data producer).
data = {'queries': [], 'labels': []}
for q, v in final_data.items():
    if v['indexType'] != -1:
        data['queries'].append(q)
        data['labels'].append(v['indexType'])

df = pd.DataFrame(data)

#### Divide into train, validation, and test data (proportion 3:3:4)

In [None]:
import numpy as np
# Shuffle once with a fixed seed, then cut at 30% and 60% of the rows to get
# the 3:3:4 train / evaluation / test split.
# Positional slicing is used instead of np.split, which on a DataFrame goes
# through the deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1, random_state=0)
train_end, eval_end = int(.3 * len(df)), int(.6 * len(df))
df_train = shuffled.iloc[:train_end]
df_eval = shuffled.iloc[train_end:eval_end]
df_test = shuffled.iloc[eval_end:]
print(df_train.shape, df_eval.shape, df_test.shape)

#### Augmenting the training data
Since the data size is relatively small, we augment df_train using the nlpaug package.

In [None]:
import nlpaug.augmenter.word as naw

# Contextual word-embedding augmenter that substitutes words (action="substitute")
# using the allenai/scibert_scivocab_uncased checkpoint.
aug = naw.ContextualWordEmbsAug(model_path='allenai/scibert_scivocab_uncased', action="substitute")

In [None]:
# Number of augmented variants to generate per original query, per label.
# Each label gets `times` variants, scaled up by how under-represented it is
# relative to the most frequent label, so the classes end up roughly balanced.
times = 20
label_counts = df_train['labels'].value_counts().sort_index()
if label_counts.empty:
    raise ValueError("df_train contains no labelled rows; cannot compute augmentation factors")

# Kept under their original names; a label missing from the split counts as 0.
zero, one = int(label_counts.get(0, 0)), int(label_counts.get(1, 0))
maxNumData = int(label_counts.max())

# Built from the labels actually present, so an absent class no longer causes
# a ZeroDivisionError and labels other than 0/1 get a factor as well.
multiplicator = {
    label: int(maxNumData / count * times)
    for label, count in label_counts.items()
}
multiplicator

In [None]:
from tqdm import tqdm

# Generate augmented variants for every original training query and append
# them to the training set. Frames are collected in a list and concatenated
# once at the end; concatenating inside the loop copies the growing frame on
# every iteration.
# NOTE: this cell rebinds df_train, so re-running it augments the already
# augmented data. Re-run the split cell first if a fresh start is needed.
augmented_frames = [df_train]
for query, label in tqdm(zip(df_train['queries'], df_train['labels']), total=df_train.shape[0]):
    augmented_text = aug.augment(query, n=multiplicator[label])
    # Some nlpaug versions appear to return a bare string for a single variant;
    # normalise to a list so len() counts variants rather than characters.
    if isinstance(augmented_text, str):
        augmented_text = [augmented_text]

    augmented_frames.append(
        pd.DataFrame({'queries': augmented_text, 'labels': [label] * len(augmented_text)})
    )

df_tr = pd.concat(augmented_frames, ignore_index=True, axis=0)

df_train = df_tr

In [None]:
# Class balance per split, label 1 first then label 0.
# The training split prints plain row counts; the other two print shape tuples.
for label in (1, 0):
    print(df_train[df_train['labels'] == label].shape[0])
for frame in (df_eval, df_test):
    for label in (1, 0):
        print(frame[frame['labels'] == label].shape)

In [None]:
# Write the train, evaluation and test splits to CSV for further use.
csv_targets = {
    'casbert_resources/omex_classifier_train.csv': df_train,
    'casbert_resources/omex_classifier_evaluate.csv': df_eval,
    'casbert_resources/omex_classifier_test.csv': df_test,
}
for csv_path, frame in csv_targets.items():
    frame.to_csv(csv_path)

#### TRAINING

In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer of the bert-base-cased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
from datasets import Dataset
# Wrap each pandas split in a datasets.Dataset, in train / eval / test order.
train_datasets, eval_datasets, test_datasets = (
    Dataset.from_pandas(frame) for frame in (df_train, df_eval, df_test)
)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``queries`` field of ``examples`` with the notebook tokenizer.

    Sequences are padded with ``padding="max_length"`` and truncated with
    ``truncation=True``. Returns whatever the tokenizer call returns.
    """
    queries = examples["queries"]
    encoded = tokenizer(queries, padding="max_length", truncation=True)
    return encoded

# Apply tokenize_function to each split in batched mode, in train / eval / test order.
train_datasets, eval_datasets, test_datasets = (
    split.map(tokenize_function, batched=True)
    for split in (train_datasets, eval_datasets, test_datasets)
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load bert-base-cased for sequence classification with two output labels (num_labels=2).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

In [None]:
from transformers import TrainingArguments

# Training configuration: output goes to "test_trainer", evaluation runs once
# per epoch, and training lasts 10 epochs.
# num_train_epochs is passed to the constructor instead of being assigned onto
# the object afterwards, so it is set the same way as every other argument.
training_args = TrainingArguments(
    "test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=10,
)
training_args

In [None]:
from transformers import Trainer

# Trainer for fine-tuning: trains on train_datasets and evaluates on eval_datasets.
# No compute_metrics callback is supplied to this Trainer.
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=train_datasets, 
    eval_dataset=eval_datasets
)

In [None]:
# Run fine-tuning with the Trainer built from model, training_args and the train/eval datasets.
trainer.train()

#### EVALUATION

In [None]:
import numpy as np

try:
    from datasets import load_metric
except ImportError:
    # load_metric is deprecated and not available in newer `datasets` releases;
    # accuracy is then computed directly with NumPy in compute_metrics.
    load_metric = None

metric = load_metric("accuracy") if load_metric is not None else None


def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-element sequence ``(logits, labels)``; the predicted
            class is the argmax of ``logits`` over the last axis.

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    if metric is not None:
        return metric.compute(predictions=predictions, references=labels)
    # Fallback: fraction of predictions that equal the reference labels.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
# Rebuild the Trainer (rebinding `trainer`) with the test split as eval_dataset
# and compute_metrics attached, then evaluate; the result is the cell output.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=test_datasets,
    compute_metrics=compute_metrics,
)
trainer.evaluate()

In [None]:
# Tokenize a single example query, returning PyTorch tensors (return_tensors="pt").
inputs = tokenizer("version of Concentration of property of sodium in cytosol of part of cardiac myocyte", return_tensors="pt")

In [None]:
            inputs

In [None]:
import torch

# Run the example query through the fine-tuned model, on the GPU when one is available.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# .to(device) handles both the CPU and the GPU case, so no separate branch is needed.
model = model.to(device)
# Inference only: switch to eval mode and skip autograd bookkeeping.
model.eval()

inputs = inputs.to(device)
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Show the raw logits, their softmax over the last axis, and the argmax index.
logits = outputs.logits
print(logits)
print(logits.softmax(dim=-1).tolist())
print(logits.argmax(dim=-1).tolist())

In [None]:
# Save the trained model so it can be reused later.
# Only the model is written by this cell; the tokenizer is not saved here.
model.save_pretrained('casbert_resources/omex_trained_model')