In [2]:
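# Dependencies: uncomment to install them into the kernel's environment.
# (%pip targets the running kernel; consider pinning versions for reproducibility.)
# %pip install transformers
# %pip install seqeval
# %pip install tensorboardx
# %pip install simpletransformers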

In [1]:
import pandas as pd
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    """Return the macro-averaged F1 score of ``preds`` against ``labels``.

    Thin two-argument wrapper around ``sklearn.metrics.f1_score`` that fixes
    ``average='macro'``.

    Args:
        labels: Ground-truth class labels.
        preds: Predicted class labels, aligned with ``labels``.

    Returns:
        Whatever ``f1_score`` returns for ``average='macro'``.
    """
    macro_f1 = f1_score(labels, preds, average='macro')
    return macro_f1

In [2]:
# load data and format it for simpletransformers

def _load_split(path):
    """Read one JSON split and return it with columns ``text`` and ``labels``.

    Only the ``problem`` and ``label`` columns of the file are kept; they are
    renamed to ``text`` and ``labels`` respectively.

    Args:
        path: Location of a records-oriented JSON file.

    Returns:
        A DataFrame with exactly the columns ``text`` and ``labels``.
    """
    split_df = pd.read_json(path, orient="records")
    return split_df.loc[:, ["problem", "label"]].rename(
        columns={"problem": "text", "label": "labels"}
    )


# One helper for all three splits so the formatting cannot drift between them.
train_df = _load_split("data/AS/train.json")
val_df = _load_split("data/AS/validation.json")
test_df = _load_split("data/AS/test.json")

In [4]:
# Preview the formatted training frame (columns: text, labels).
train_df

Unnamed: 0,text,labels
0,products have similarities which can be analyz...,8
1,learn filters in order to capture local corr...,5
2,a successful point cloud registration often li...,5
3,integrating large intelligent reflecting surfa...,5
4,the indicator diagram is an important basis to...,2
...,...,...
44601,semantic segmentation is an important visual p...,5
44602,"people start posting tweets containing texts, ...",3
44603,"for the investigation of the steganography, mo...",5
44604,semantic text matching is one of the most impo...,0


In [11]:
# calculate class weights: weight = (mean class size) / (size of this class),
# so classes with fewer training examples get larger weights
num_classes = 15

# Count each label once; the original re-ran value_counts() for every class.
label_counts = train_df["labels"].value_counts()
mean_class_size = len(train_df) / num_classes
weights_dict = {
    label: mean_class_size / label_counts[label] for label in label_counts.index
}

# weights[i] is the weight of class i. A class that never occurs in train_df
# keeps the placeholder 0.
# NOTE(review): assumes labels are integers in [0, num_classes) -- confirm.
weights = [0] * num_classes
for label, weight in weights_dict.items():
    weights[label] = weight
weights

[0.7940542946150422,
 1.528911739502999,
 1.517985366683682,
 1.448481896411755,
 1.7090421455938696,
 0.20291595587399067,
 5.377456298975286,
 0.4881374480192602,
 0.72530081300813,
 1.5336427711878975,
 0.9240936399419929,
 3.8075970977379425,
 8.1921028466483,
 3.8027280477408354,
 4.366715614292707]

In [None]:
# Model / training configuration passed to simpletransformers.
# Class weights are NOT part of this dict; they are given to the
# ClassificationModel constructor through its `weight` argument below.
args = {
    # optimisation
    "train_batch_size": 32,
    "num_train_epochs": 21,
    "learning_rate": 4e-5,
    # checkpointing / output directory handling
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "overwrite_output_dir": True,
    "reprocess_input_data": False,
    # evaluation
    "evaluate_during_training": True,
    "eval_batch_size": 32,
}

# Build the classifier: SciBERT (uncased) checkpoint loaded as a 'bert' model,
# with one output per class and the per-class weights computed earlier.
model = ClassificationModel(
    'bert',
    'allenai/scibert_scivocab_uncased',
    weight=weights,
    num_labels=num_classes,
    args=args,
)

In [None]:
# Train on train_df; val_df is supplied as eval_df, and macro-F1 / accuracy
# are passed as the extra metrics `f1` and `acc`.
model.train_model(
    train_df,
    eval_df=val_df,
    f1=f1_multiclass,
    acc=accuracy_score,
)

In [None]:
# Evaluate on the test split with the same extra metrics (`f1`, `acc`),
# then print the returned result.
result, model_outputs, wrong_predictions = model.eval_model(
    test_df,
    f1=f1_multiclass,
    acc=accuracy_score,
)
print(result)

In [None]:
# Get predictions for the test texts; `predict` takes a plain Python list.
test_texts = test_df["text"].tolist()
preds, raw = model.predict(test_texts)